<a href="https://colab.research.google.com/github/BengiNouri/Project2/blob/main/clean_ml_el.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the car listings CSV into the notebook-wide `data` frame.
# NOTE(review): absolute Google Drive path; no drive-mount call is visible in
# this notebook, so this presumably needs Drive mounted beforehand — confirm.
csv_path = '/content/drive/MyDrive/Colab Notebooks/new_car_data_el123.csv'
data = pd.read_csv(csv_path)

In [None]:
# Columns whose 'Yes'/'No' values are recoded to 1/0 below.
# NOTE(review): 'El-SÃ¦der' and 'KlimaanlÃ¦g' look like mis-decoded 'æ'
# characters. They are left untouched because they must match the CSV header
# exactly — confirm against the file before changing them.
columns_to_replace = [
    'GPS', 'Range (Electric)', 'Trailer Hitch', '4WD', 'Parkingsensor',
    'Cruise Control', 'Antispin', 'ESP', 'El-SÃ¦der', 'Leather Interior',
    'Glass Roof', 'Headup Display', 'Isofix', 'KlimaanlÃ¦g', 'Regnsensor',
    'Soltag', 'Sports Package', 'Seatwarmer', 'Xenon Lights', 'Non-Smoker',
    'One-Owner', 'Service OK', 'Nysynet', 'Demo Car',
    'Partly Leather Interior', 'CVR/Engros', 'Full Leather',
    'Adaptive Cruise Control',
]

# Recode the flags. `replace` on object columns only yields numeric dtypes via
# pandas' silent downcasting, which is deprecated (pandas 2.2+). The explicit
# `infer_objects()` keeps the result numeric once that downcasting is removed
# and is a no-op where the columns are already numeric.
data[columns_to_replace] = (
    data[columns_to_replace]
    .replace({'Yes': 1, 'No': 0})
    .infer_objects()
)

In [None]:
# Feature list for modelling: every column of `data` except the target
# ('Price') and the two columns that are not used as model inputs.
non_feature_columns = ('Price', 'Link', 'Segment')
selected_features = data.columns.tolist()
for column_name in non_feature_columns:
    # list.remove raises ValueError if the column is missing from the frame.
    selected_features.remove(column_name)


In [None]:
# Restrict the data to rows whose Segment is 'Premium'.
p_data = data.loc[data['Segment'].eq('Premium')]

# 80/20 train-test split of the Premium rows with a fixed seed.
pX_train, pX_test, py_train, py_test = train_test_split(
    p_data[selected_features], p_data['Price'], test_size=0.2, random_state=42,
)

In [None]:
# KNN regression on the Premium split.
# The scaler is fitted on the training features only and then applied to both
# the training and the test features.
scaler = StandardScaler()
scaler.fit(pX_train)
X_train_scaled = scaler.transform(pX_train)
X_test_scaled = scaler.transform(pX_test)

# 5-nearest-neighbour regressor trained on the scaled features.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, py_train)
predictions = knn.predict(X_test_scaled)

# Test-set metrics.
r_squared = r2_score(py_test, predictions)
mae = mean_absolute_error(py_test, predictions)
rmse = np.sqrt(mean_squared_error(py_test, predictions))

for label, value in (
    ("R² Score", r_squared),
    ("Mean Absolute Error", mae),
    ("Root Mean Squared Error", rmse),
):
    print(f"{label}: {value}")

In [None]:
# Baseline XGBoost on the Premium split: only n_estimators is set, nothing is
# tuned. (Original timing note: ~6 sec.)
xgb_model = XGBRegressor(n_estimators=600)
xgb_model.fit(pX_train, py_train)
predictions_xgb = xgb_model.predict(pX_test)

# Test-set metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(py_test, predictions_xgb)
rmse = np.sqrt(mse)
mae = mean_absolute_error(py_test, predictions_xgb)
r_squared = r2_score(py_test, predictions_xgb)
for label, value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

# Rank the features by the fitted model's importance scores and show the top N.
N = 25  # Number of Features
feature_importances = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': pX_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(f"Top {N} important features:")
print(feature_importance_df.head(N))

In [None]:
# XGBoost with a hyperparameter grid search on the Premium split.
# (Original timing note: ~32 min.)

# Candidate values per hyperparameter; GridSearchCV tries every combination.
param_grid = dict(
    min_child_weight=[5, 10],
    gamma=[0, 2],
    subsample=[0.6, 1.0],
    colsample_bytree=[0.6, 1.0],
    max_depth=[3, 8, 12],
)

xgb_model = XGBRegressor(n_estimators=600)

# 5-fold cross-validated search scored by negative RMSE.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=0,
)
grid_search.fit(pX_train, py_train)
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(pX_test)

mse = mean_squared_error(py_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(py_test, predictions)
r_squared = r2_score(py_test, predictions)
for label, value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

# Rank the features by the best model's importance scores and show the top N.
N = 25  # Number of Features
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': pX_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(f"Top {N} important features:")
print(feature_importance_df.head(N))

In [None]:
# Random Forest on the Premium split. (Original timing note: ~2 min.)
rf_model = RandomForestRegressor(n_estimators=600, random_state=42)
rf_model.fit(pX_train, py_train)
rpredictions = rf_model.predict(pX_test)

# Test-set metrics. The MSE is stored as `rf_mse` (it used to be assigned to
# `rmse`, which left a global named "rmse" holding a mean squared error).
rr_squared = r2_score(py_test, rpredictions)
rmae = mean_absolute_error(py_test, rpredictions)
rf_mse = mean_squared_error(py_test, rpredictions)
rrmse = np.sqrt(rf_mse)

print("Random Forest Regression Model Evaluation:")
print("R-squared:", rr_squared)
print("Mean Absolute Error:", rmae)
print("Mean Squared Error:", rf_mse)
print("Root Mean Squared Error:", rrmse)

# Feature importance for the random forest: rank and show the top N.
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': pX_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
N = 25  # Number of Features
print("Top", N, "important features:")
print(feature_importance_df.head(N))

# First tree of the fitted forest, used as an illustrative example.
tree = rf_model.estimators_[0]

# Plot the top three levels of that tree.
# feature_names is passed as a plain list: scikit-learn 1.3.x validates this
# parameter as a list and rejects a pandas Index.
plt.figure(figsize=(20,10))
plot_tree(tree, filled=True, feature_names=list(pX_train.columns), max_depth=3, precision=2, proportion=True)
plt.title('Decision Tree from Random Forest')
plt.show()


In [None]:
# Linear regression on a hand-picked feature subset of the Premium segment.
# (Original timing note: ~1 sec.)
l_features = [
    'KM Driven',
    'Year',
    'Horsepower',
    'Km/L',
    'Brand_Mercedes',
    'Range (Electric)',
]
lX_train, lX_test, ly_train, ly_test = train_test_split(
    p_data[l_features], p_data['Price'], test_size=0.2, random_state=42,
)

# Fit on the training split, then predict the held-out test split.
linear_model = LinearRegression().fit(lX_train, ly_train)
linear_predictions = linear_model.predict(lX_test)

# Test-set metrics; RMSE is the square root of the MSE.
linear_mse = mean_squared_error(ly_test, linear_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_mae = mean_absolute_error(ly_test, linear_predictions)
linear_r_squared = r2_score(ly_test, linear_predictions)

print("Linear Regression Model Evaluation:")
for label, value in (
    ("R-squared:", linear_r_squared),
    ("Mean Absolute Error:", linear_mae),
    ("Mean Squared Error:", linear_mse),
    ("Root Mean Squared Error:", linear_rmse),
):
    print(label, value)

In [None]:
# Restrict the data to rows whose Segment is 'Mid-Range'.
m_data = data.loc[data['Segment'].eq('Mid-Range')]

# 80/20 train-test split of the Mid-Range rows with a fixed seed.
mX_train, mX_test, my_train, my_test = train_test_split(
    m_data[selected_features], m_data['Price'], test_size=0.2, random_state=42,
)

In [None]:
# KNN regression on the Mid-Range split.
# The scaler is fitted on the training features only and then applied to both
# the training and the test features.
scaler = StandardScaler()
scaler.fit(mX_train)
X_train_scaled = scaler.transform(mX_train)
X_test_scaled = scaler.transform(mX_test)

# 5-nearest-neighbour regressor; n_neighbors is the knob to adjust.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, my_train)
predictions = knn.predict(X_test_scaled)

# Test-set metrics.
r_squared = r2_score(my_test, predictions)
mae = mean_absolute_error(my_test, predictions)
rmse = np.sqrt(mean_squared_error(my_test, predictions))

for label, value in (
    ("R² Score", r_squared),
    ("Mean Absolute Error", mae),
    ("Root Mean Squared Error", rmse),
):
    print(f"{label}: {value}")

In [None]:
# Baseline XGBoost on the Mid-Range split: only n_estimators is set, nothing
# is tuned. (Original timing note: ~6 sec.)
xgb_model = XGBRegressor(n_estimators=600)
xgb_model.fit(mX_train, my_train)
predictions_xgb = xgb_model.predict(mX_test)

# Test-set metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(my_test, predictions_xgb)
rmse = np.sqrt(mse)
mae = mean_absolute_error(my_test, predictions_xgb)
r_squared = r2_score(my_test, predictions_xgb)
for label, value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

# Rank the features by the fitted model's importance scores and show the top N.
N = 25  # Number of Features
feature_importances = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': mX_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(f"Top {N} important features:")
print(feature_importance_df.head(N))

In [None]:
# XGBoost with a hyperparameter grid search on the Mid-Range split.
# (Original timing note: ~32 min.)

# Candidate values per hyperparameter; GridSearchCV tries every combination.
param_grid = dict(
    min_child_weight=[5, 10],
    gamma=[0, 2],
    subsample=[0.6, 1.0],
    colsample_bytree=[0.6, 1.0],
    max_depth=[3, 8, 12],
)

xgb_model = XGBRegressor(n_estimators=600)

# 5-fold cross-validated search scored by negative RMSE.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=0,
)
grid_search.fit(mX_train, my_train)
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(mX_test)

mse = mean_squared_error(my_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(my_test, predictions)
r_squared = r2_score(my_test, predictions)
for label, value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

# Rank the features by the best model's importance scores and show the top N.
N = 25  # Number of Features
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': mX_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(f"Top {N} important features:")
print(feature_importance_df.head(N))

In [None]:
# Random Forest on the Mid-Range split. (Original timing note: ~2 min.)
rf_model = RandomForestRegressor(n_estimators=600, random_state=42)
rf_model.fit(mX_train, my_train)
rpredictions = rf_model.predict(mX_test)

# Test-set metrics. The MSE is stored as `rf_mse` (it used to be assigned to
# `rmse`, which left a global named "rmse" holding a mean squared error).
rr_squared = r2_score(my_test, rpredictions)
rmae = mean_absolute_error(my_test, rpredictions)
rf_mse = mean_squared_error(my_test, rpredictions)
rrmse = np.sqrt(rf_mse)

print("Random Forest Regression Model Evaluation:")
print("R-squared:", rr_squared)
print("Mean Absolute Error:", rmae)
print("Mean Squared Error:", rf_mse)
print("Root Mean Squared Error:", rrmse)

# Feature importance for the random forest: rank and show the top N.
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': mX_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
N = 25  # Number of Features
print("Top", N, "important features:")
print(feature_importance_df.head(N))

# First tree of the fitted forest, used as an illustrative example.
tree = rf_model.estimators_[0]

# Plot the top three levels of that tree.
# feature_names is passed as a plain list: scikit-learn 1.3.x validates this
# parameter as a list and rejects a pandas Index.
plt.figure(figsize=(20,10))
plot_tree(tree, filled=True, feature_names=list(mX_train.columns), max_depth=3, precision=2, proportion=True)
plt.title('Decision Tree from Random Forest')
plt.show()

In [None]:
# Linear regression on a hand-picked feature subset of the Mid-Range segment.
# (Original timing note: ~1 sec.)
l_features = [
    'KM Driven',
    'Year',
    'Horsepower',
    'Km/L',
    'Gear_Automatgear',
    'Range (Electric)',
    'Adaptive Cruise Control',
]
lX_train, lX_test, ly_train, ly_test = train_test_split(
    m_data[l_features], m_data['Price'], test_size=0.2, random_state=42,
)

# Fit on the training split, then predict the held-out test split.
linear_model = LinearRegression().fit(lX_train, ly_train)
linear_predictions = linear_model.predict(lX_test)

# Test-set metrics; RMSE is the square root of the MSE.
linear_mse = mean_squared_error(ly_test, linear_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_mae = mean_absolute_error(ly_test, linear_predictions)
linear_r_squared = r2_score(ly_test, linear_predictions)

print("Linear Regression Model Evaluation:")
for label, value in (
    ("R-squared:", linear_r_squared),
    ("Mean Absolute Error:", linear_mae),
    ("Mean Squared Error:", linear_mse),
    ("Root Mean Squared Error:", linear_rmse),
):
    print(label, value)

In [None]:
# Restrict the data to rows whose Segment is 'Economy'.
e_data = data.loc[data['Segment'].eq('Economy')]

# 80/20 train-test split of the Economy rows with a fixed seed.
eX_train, eX_test, ey_train, ey_test = train_test_split(
    e_data[selected_features], e_data['Price'], test_size=0.2, random_state=42,
)

In [None]:
# KNN regression on the Economy split.
# The scaler is fitted on the training features only and then applied to both
# the training and the test features.
scaler = StandardScaler()
scaler.fit(eX_train)
X_train_scaled = scaler.transform(eX_train)
X_test_scaled = scaler.transform(eX_test)

# 5-nearest-neighbour regressor trained on the scaled features.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, ey_train)
predictions = knn.predict(X_test_scaled)

# Test-set metrics.
r_squared = r2_score(ey_test, predictions)
mae = mean_absolute_error(ey_test, predictions)
rmse = np.sqrt(mean_squared_error(ey_test, predictions))

for label, value in (
    ("R² Score", r_squared),
    ("Mean Absolute Error", mae),
    ("Root Mean Squared Error", rmse),
):
    print(f"{label}: {value}")

In [None]:
# Baseline XGBoost on the Economy split: only n_estimators is set, nothing is
# tuned. (Original timing note: ~6 sec.)
xgb_model = XGBRegressor(n_estimators=600)
xgb_model.fit(eX_train, ey_train)
predictions_xgb = xgb_model.predict(eX_test)

# Test-set metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(ey_test, predictions_xgb)
rmse = np.sqrt(mse)
mae = mean_absolute_error(ey_test, predictions_xgb)
r_squared = r2_score(ey_test, predictions_xgb)
for label, value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

# Rank the features by the fitted model's importance scores and show the top N.
N = 25
feature_importances = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': eX_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(f"Top {N} important features:")
print(feature_importance_df.head(N))

In [None]:
# XGBoost with a hyperparameter grid search on the Economy split.
# (Original timing note: ~32 min.)

# Candidate values per hyperparameter; GridSearchCV tries every combination.
param_grid = dict(
    min_child_weight=[5, 10],
    gamma=[0, 2],
    subsample=[0.6, 1.0],
    colsample_bytree=[0.6, 1.0],
    max_depth=[3, 8, 12],
)

xgb_model = XGBRegressor(n_estimators=600)

# 5-fold cross-validated search scored by negative RMSE.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=0,
)
grid_search.fit(eX_train, ey_train)
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(eX_test)

mse = mean_squared_error(ey_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(ey_test, predictions)
r_squared = r2_score(ey_test, predictions)
for label, value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(label, value)

# Rank the features by the best model's importance scores and show the top N.
N = 25  # Number of Features
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': eX_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(f"Top {N} important features:")
print(feature_importance_df.head(N))

In [None]:
# Random Forest on the Economy split. (Original timing note: ~2 min.)
rf_model = RandomForestRegressor(n_estimators=600, random_state=42)
rf_model.fit(eX_train, ey_train)
rpredictions = rf_model.predict(eX_test)

# Test-set metrics. The MSE is stored as `rf_mse` (it used to be assigned to
# `rmse`, which left a global named "rmse" holding a mean squared error).
rr_squared = r2_score(ey_test, rpredictions)
rmae = mean_absolute_error(ey_test, rpredictions)
rf_mse = mean_squared_error(ey_test, rpredictions)
rrmse = np.sqrt(rf_mse)

print("Random Forest Regression Model Evaluation:")
print("R-squared:", rr_squared)
print("Mean Absolute Error:", rmae)
print("Mean Squared Error:", rf_mse)
print("Root Mean Squared Error:", rrmse)

# Feature importance for the random forest: rank and show the top N.
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': eX_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
N = 25  # Number of Features
print("Top", N, "important features:")
print(feature_importance_df.head(N))

# First tree of the fitted forest, used as an illustrative example.
tree = rf_model.estimators_[0]

# Plot the top three levels of that tree.
# feature_names is passed as a plain list: scikit-learn 1.3.x validates this
# parameter as a list and rejects a pandas Index.
plt.figure(figsize=(20,10))
plot_tree(tree, filled=True, feature_names=list(eX_train.columns), max_depth=3, precision=2, proportion=True)
plt.title('Decision Tree from Random Forest')
plt.show()


In [None]:
# Linear regression on a hand-picked feature subset of the Economy segment.
# (Original timing note: ~1 sec.)
l_features = [
    'KM Driven',
    'Year',
    'Horsepower',
    'Km/L',
    'Gear_Automatgear',
    'Range (Electric)',
    'Adaptive Cruise Control',
]
lX_train, lX_test, ly_train, ly_test = train_test_split(
    e_data[l_features], e_data['Price'], test_size=0.2, random_state=42,
)

# Fit on the training split, then predict the held-out test split.
linear_model = LinearRegression().fit(lX_train, ly_train)
linear_predictions = linear_model.predict(lX_test)

# Test-set metrics; RMSE is the square root of the MSE.
linear_mse = mean_squared_error(ly_test, linear_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_mae = mean_absolute_error(ly_test, linear_predictions)
linear_r_squared = r2_score(ly_test, linear_predictions)

print("Linear Regression Model Evaluation:")
for label, value in (
    ("R-squared:", linear_r_squared),
    ("Mean Absolute Error:", linear_mae),
    ("Mean Squared Error:", linear_mse),
    ("Root Mean Squared Error:", linear_rmse),
):
    print(label, value)