<a href="https://colab.research.google.com/github/BengiNouri/Project2/blob/main/final_ml_bdh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
# Read the source CSV into a DataFrame.
# NOTE(review): the path lives under /content/drive, so this presumably relies
# on Google Drive already being mounted in the Colab runtime -- no mount step
# is visible in this notebook; confirm before a fresh "Run All".
csv_path = '/content/drive/MyDrive/CleanDdh(1).csv'
data = pd.read_csv(csv_path)

In [3]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per category, named "<column>_<category>"
# (e.g. 'Gear_Automatgear', 'Brand_Porsche', 'Model_i3').
# Caveat: `data` is overwritten, so re-running this cell without reloading the
# CSV fails because the original columns no longer exist.
categorical_columns = ['Brand', 'Gear', 'Fuel_Type', 'Model']
data = pd.get_dummies(data, columns=categorical_columns)

In [4]:
# Build the list of model inputs: start from every column of the encoded frame
# and take out the ones that must not be used as predictors.
#   'Price' -> the regression target
#   'Link'  -> excluded from modelling by the original author
excluded_columns = ('Price', 'Link')
selected_features = list(data.columns)
for excluded_column in excluded_columns:
    # list.remove raises ValueError if the column is missing, which surfaces
    # schema changes in the CSV immediately.
    selected_features.remove(excluded_column)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
feature_frame = data[selected_features]
price_target = data['Price']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, price_target, test_size=0.2, random_state=42
)


In [7]:
# Baseline XGBoost: no tuning, only the number of boosting rounds is set
# (original author's timing note: ~6 sec)
xgb_model = XGBRegressor(n_estimators=600)
xgb_model.fit(X_train, y_train)
predictions_xgb = xgb_model.predict(X_test)

# Test-set metrics, evaluated in the order R^2, MAE, MSE; RMSE is derived
# from the MSE afterwards.
r_squared, mae, mse = (
    metric(y_test, predictions_xgb)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)
rmse = np.sqrt(mse)

# Report the metrics, one per line
for metric_label, metric_value in (
    ("R-squared:", r_squared),
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(metric_label, metric_value)

# Feature importance: pair every training column with the importance score
# reported by the fitted model and list the strongest ones first.
feature_importances = xgb_model.feature_importances_
N = 25  # Number of features to show
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print("Top", N, "important features:")
print(feature_importance_df.head(N))

R-squared: 0.8842752475425635
Mean Absolute Error: 24906.46034926732
Mean Squared Error: 1493825623.5922046
Root Mean Squared Error: 38650.040408674926
Top 25 important features:
                     Feature  Importance
60          Gear_Automatgear    0.202291
44                  Brand_MG    0.031731
46            Brand_Mercedes    0.029807
30                Horsepower    0.028822
63          Fuel_Type_Diesel    0.016220
634                 Model_i3    0.015926
50                Brand_Opel    0.014657
52             Brand_Porsche    0.014423
28   Adaptive Cruise Control    0.014274
278         Model_California    0.012844
161                Model_911    0.011389
58                  Brand_VW    0.009643
412             Model_Kodiaq    0.009486
609               Model_XC60    0.009150
582          Model_Traveller    0.008408
146                  Model_6    0.008247
198             Model_Arteon    0.007827
129               Model_5008    0.007781
4                        4WD    0.007182
2

In [6]:
# XGBoost with hyperparameter search (original author's timing note: ~32 min)
# The grid below has 2 * 2 * 2 * 2 * 3 = 48 combinations, each scored with
# 5-fold cross-validation on the training set.

# Defining grid
param_grid = {
    'min_child_weight': [5, 10],
    'gamma': [0, 2],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0],
    'max_depth': [3, 8, 12],
}

# Template estimator for the search. It deliberately gets its own name:
# GridSearchCV only fits clones of the estimator it is given, so binding this
# object to `xgb_model` would replace the fitted baseline model with an
# estimator that has never been trained.
xgb_search_template = XGBRegressor(n_estimators=600)

grid_search = GridSearchCV(
    estimator=xgb_search_template,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # higher is better, i.e. lowest RMSE wins
    verbose=0,
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# The winning configuration, as exposed by the search, evaluated on the
# held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Metrics
r_squared = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
# Print Metrics
print("R-squared:", r_squared)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Feature Importance of the tuned model, strongest features first
feature_importances = best_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
N = 25  # Number of Features
print("Top", N, "important features:")
print(feature_importance_df.head(N))

Best Parameters: {'colsample_bytree': 0.6, 'gamma': 0, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 1.0}
R-squared: 0.8611779330398579
Mean Absolute Error: 26771.68505618751
Mean Squared Error: 1791975842.1722767
Root Mean Squared Error: 42331.73563855227
Top 25 important features:
                     Feature  Importance
60          Gear_Automatgear    0.271548
61         Gear_Manuelt gear    0.083528
52             Brand_Porsche    0.024937
30                Horsepower    0.023986
44                  Brand_MG    0.017475
46            Brand_Mercedes    0.017227
28   Adaptive Cruise Control    0.015451
0                  KM Driven    0.014002
64          Fuel_Type_Hybrid    0.013301
4                        4WD    0.013149
17            Sports Package    0.013066
634                 Model_i3    0.012773
609               Model_XC60    0.011924
180                 Model_A6    0.011682
50                Brand_Opel    0.011072
198             Model_Arteon    0.010682
19          

In [8]:
# Random Forest (original author's timing note: ~2 min)
rf_model = RandomForestRegressor(n_estimators=600, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
rf_predictions = rf_model.predict(X_test)

# Test-set metrics, stored under rf_-prefixed names. The MSE must not be
# stored in a variable called `rmse`: that name is used for the XGBoost
# root-mean-squared error in the earlier cells, and it would end up holding
# a mean squared error instead.
rf_r_squared = r2_score(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)

print("Random Forest Regression Model Evaluation:")
print("R-squared:", rf_r_squared)
print("Mean Absolute Error:", rf_mae)
print("Mean Squared Error:", rf_mse)
print("Root Mean Squared Error:", rf_rmse)

# Feature Importance For Random Forest, strongest features first
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
N = 25  # Number of Features
print("Top", N, "important features:")
print(feature_importance_df.head(N))

Random Forest Regression Model Evaluation:
R-squared: 0.8598919183336144
Mean Absolute Error: 27149.13154669192
Mean Squared Error: 1808576281.4016407
Root Mean Squared Error: 42527.35921029709
Top 25 important features:
                     Feature  Importance
0                  KM Driven    0.236648
30                Horsepower    0.227534
61         Gear_Manuelt gear    0.164563
60          Gear_Automatgear    0.147800
1                       Km/L    0.029816
29                      Year    0.026154
46            Brand_Mercedes    0.009657
4                        4WD    0.006419
63          Fuel_Type_Diesel    0.005859
28   Adaptive Cruise Control    0.004866
6             Cruise Control    0.003915
19              Xenon Lights    0.003878
52             Brand_Porsche    0.002961
13                    Isofix    0.002924
11                Glass Roof    0.002760
5              Parkingsensor    0.002699
278         Model_California    0.002675
2                        GPS    0.002596


In [10]:
# Linear regression model (original author's timing note: ~1 sec)
# Uses a small hand-picked subset of the columns instead of the full
# one-hot encoded feature set.
l_features = ['KM Driven', 'Year', 'Horsepower', 'Km/L', 'Gear_Automatgear', 'Fuel_Type_Diesel', 'Adaptive Cruise Control']
lX_train, lX_test, ly_train, ly_test = train_test_split(data[l_features], data['Price'], test_size=0.2, random_state=42)

linear_model = LinearRegression()

# Train the model
linear_model.fit(lX_train, ly_train)

# Make predictions on the test set
linear_predictions = linear_model.predict(lX_test)

# Metrics -- scored against this cell's own target split (ly_test). Scoring
# against `y_test` from the earlier split only works while both splits happen
# to use the same rows, test_size and random_state, and it makes the cell
# depend on another cell having been run first.
linear_r_squared = r2_score(ly_test, linear_predictions)
linear_mae = mean_absolute_error(ly_test, linear_predictions)
linear_mse = mean_squared_error(ly_test, linear_predictions)
linear_rmse = np.sqrt(linear_mse)

print("Linear Regression Model Evaluation:")
print("R-squared:", linear_r_squared)
print("Mean Absolute Error:", linear_mae)
print("Mean Squared Error:", linear_mse)
print("Root Mean Squared Error:", linear_rmse)

Linear Regression Model Evaluation:
R-squared: 0.6821651675024251
Mean Absolute Error: 43425.21350715471
Mean Squared Error: 4102750766.5625896
Root Mean Squared Error: 64052.71865083159
