<a href="https://colab.research.google.com/github/BengiNouri/Project2/blob/main/final_ml_el.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
# Load the dataset from CSV. The path points inside /content/drive, so this
# presumably expects Google Drive to be mounted already — confirm before running.
csv_path = '/content/drive/MyDrive/imputed_cardata.csv'
data = pd.read_csv(csv_path)

In [3]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value.
# NOTE: this rebinds `data`, so re-running the cell without reloading the CSV
# fails because the listed columns no longer exist.
categorical_columns = ['Brand', 'Gear', 'Fuel_Type', 'Model']
data = pd.get_dummies(data, columns=categorical_columns)

In [4]:
# Start from every column, then take out the ones that must not be model inputs:
# 'Price' is the prediction target and 'Link' is excluded from modelling.
selected_features = list(data.columns)
for excluded_column in ('Price', 'Link'):
    selected_features.remove(excluded_column)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data[selected_features],
    data['Price'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Baseline XGBoost: 600 trees, every other hyperparameter left at the library
# default (author's timing note: ~2 sec)
xgb_model = XGBRegressor(n_estimators=600)
xgb_model.fit(X_train, y_train)
predictions_xgb = xgb_model.predict(X_test)

# Hold-out metrics on the test split
r_squared = r2_score(y_test, predictions_xgb)
mae = mean_absolute_error(y_test, predictions_xgb)
mse = mean_squared_error(y_test, predictions_xgb)
rmse = np.sqrt(mse)

# Report the metrics, one per line
for metric_label, metric_value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(metric_label, metric_value)

# Rank the input columns by the fitted model's importance scores and show the top N
N = 25  # Number of features to display
feature_importances = xgb_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print("Top", N, "important features:")
print(feature_importance_df.head(N))

R-squared: 0.8970661530405906
Mean Absolute Error: 20372.09643977076
Mean Squared Error: 1074767603.686463
Root Mean Squared Error: 32783.648419394434
Top 25 important features:
              Feature  Importance
88          Model_ID.    0.109601
102     Model_Model 3    0.074346
104     Model_Model X    0.067176
86           Model_GT    0.056104
33          Brand_BMW    0.040885
45     Brand_Mercedes    0.040623
31         Horsepower    0.039881
82       Model_EQE350    0.038993
92          Model_ID7    0.033569
32         Brand_Audi    0.031920
117    Model_Vivaro-e    0.030622
43           Brand_MG    0.030000
2    Range (Electric)    0.027863
1                Km/L    0.023692
85        Model_Enyaq    0.021918
18     Sports Package    0.019034
50     Brand_Polestar    0.018783
35        Brand_Cupra    0.018243
111          Model_Q4    0.016602
120      Model_Zafira    0.015448
95         Model_Kona    0.012813
64        Model_Ariya    0.011758
97          Model_MG4    0.010053
138   

In [6]:
# XGBoost tuned by exhaustive grid search (author's timing note: ~4 min)

# Candidate values per hyperparameter; every combination is evaluated
param_grid = {
    'colsample_bytree': [0.6, 1.0],
    'gamma': [0, 2],
    'max_depth': [3, 8, 12],
    'min_child_weight': [5, 10],
    'subsample': [0.6, 1.0],
}

# NOTE: this rebinds `xgb_model` to a fresh, unfitted estimator; the fitted
# result of the search is exposed as `best_model` below.
xgb_model = XGBRegressor(n_estimators=600)

# 5-fold cross-validation on the training split, selecting by (negated) RMSE
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=0,
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Hold-out metrics
r_squared = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

# Report the metrics, one per line
for metric_label, metric_value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(metric_label, metric_value)

# Rank the input columns by the tuned model's importance scores and show the top N
N = 25  # Number of features to display
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print("Top", N, "important features:")
print(feature_importance_df.head(N))

Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 1.0}
R-squared: 0.9063747545815448
Mean Absolute Error: 20431.03852184256
Mean Squared Error: 977573302.0318407
Root Mean Squared Error: 31266.168649705716
Top 25 important features:
                     Feature  Importance
88                 Model_ID.    0.079890
18            Sports Package    0.062647
111                 Model_Q4    0.058810
50            Brand_Polestar    0.056821
89                 Model_ID3    0.055749
32                Brand_Audi    0.050373
31                Horsepower    0.046583
2           Range (Electric)    0.039299
95                Model_Kona    0.036852
1                       Km/L    0.036642
45            Brand_Mercedes    0.035554
33                 Brand_BMW    0.032342
55              Brand_Toyota    0.029423
102            Model_Model 3    0.022842
43                  Brand_MG    0.021058
35               Brand_Cupra    0.020823
4           

In [7]:
# Random forest with 600 trees and a fixed seed (author's timing note: ~10 sec)
rf_model = RandomForestRegressor(n_estimators=600, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
rpredictions = rf_model.predict(X_test)

# Hold-out metrics. Each name says what it holds: previously the MSE was stored
# in a variable called `rmse`, which both mislabelled the value and overwrote
# the XGBoost RMSE computed in the earlier cells.
rf_r_squared = r2_score(y_test, rpredictions)
rf_mae = mean_absolute_error(y_test, rpredictions)
rf_mse = mean_squared_error(y_test, rpredictions)
rf_rmse = np.sqrt(rf_mse)

print("Random Forest Regression Model Evaluation:")
print("R-squared:", rf_r_squared)
print("Mean Absolute Error:", rf_mae)
print("Mean Squared Error:", rf_mse)
print("Root Mean Squared Error:", rf_rmse)

# Feature importance for the random forest: rank the input columns by the
# fitted model's importance scores and show the top N
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
N = 25  # Number of features to display
print("Top", N, "important features:")
print(feature_importance_df.head(N))

Random Forest Regression Model Evaluation:
R-squared: 0.890537079661891
Mean Absolute Error: 21059.866416124292
Mean Squared Error: 1142939898.3864286
Root Mean Squared Error: 33807.39413776857
Top 25 important features:
                     Feature  Importance
2           Range (Electric)    0.314459
31                Horsepower    0.207263
1                       Km/L    0.123879
0                  KM Driven    0.119392
88                 Model_ID.    0.019346
18            Sports Package    0.009506
104            Model_Model X    0.009301
82              Model_EQE350    0.008482
32                Brand_Audi    0.008099
45            Brand_Mercedes    0.007573
111                 Model_Q4    0.007522
12                Glass Roof    0.006125
33                 Brand_BMW    0.006124
43                  Brand_MG    0.006095
13            Headup Display    0.005788
26   Partly Leather Interior    0.005383
30                      Year    0.005214
66                Model_Born    0.005116


In [11]:
# Linear regression model on a small hand-picked feature subset (author's timing note: ~0 sec)
l_features = ['KM Driven', 'Year', 'Horsepower', 'Km/L', 'Sports Package', 'Range (Electric)']

# Dedicated split for this model: 20% held out, fixed seed for reproducibility
lX_train, lX_test, ly_train, ly_test = train_test_split(data[l_features], data['Price'], test_size=0.2, random_state=42)

linear_model = LinearRegression()

# Train the model
linear_model.fit(lX_train, ly_train)

# Make predictions on the test set
linear_predictions = linear_model.predict(lX_test)

# Metrics. Score against `ly_test`, the targets produced by this cell's own
# split alongside `lX_test`. The earlier code used `y_test` from the
# full-feature split, which only lines up with these predictions while both
# splits share the same row count and seed.
linear_r_squared = r2_score(ly_test, linear_predictions)
linear_mae = mean_absolute_error(ly_test, linear_predictions)
linear_mse = mean_squared_error(ly_test, linear_predictions)
linear_rmse = np.sqrt(linear_mse)

print("Linear Regression Model Evaluation:")
print("R-squared:", linear_r_squared)
print("Mean Absolute Error:", linear_mae)
print("Mean Squared Error:", linear_mse)
print("Root Mean Squared Error:", linear_rmse)

Linear Regression Model Evaluation:
R-squared: 0.5902605968113104
Mean Absolute Error: 49883.09272372648
Mean Squared Error: 4278229654.378749
Root Mean Squared Error: 65408.17727454839
