# Hyperparameter Optimization for XG-Boost in Scikit-learn - Car Price Prediction

Predict the MSRP (Manufacturer's Suggested Retail Price) using the used car prices dataset. Train an XGBoost model and compare the non-optimised RMSE and R2 values to those obtained through a model using GridSearch for hyperparameter tuning.
1. Load the “used_car_price.csv” dataset
3. Split the data into 75% for training and 25% for testing
4. Train an XG-Boost model and obtain the RMSE and R2 values (No optimisation)
5. Perform GridSearch hyperparameter optimization and obtain the RMSE and R2 values

## Import dependencies

In [68]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
from sklearn.model_selection import GridSearchCV

## 1) Load the “used_car_price.csv” dataset

In [6]:
# import csv file using pandas 
car_df = pd.read_csv('data/used_car_price.csv')

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,3.5,6,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,2.0,4,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,2.4,4,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,3.2,6,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,3.5,6,225,18,24,3880,115,197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Volvo,C70 LPT convertible 2dr,Sedan,Europe,Front,40565,2.4,5,197,21,28,3450,105,186
424,Volvo,C70 HPT convertible 2dr,Sedan,Europe,Front,42565,2.3,5,242,20,26,3450,105,186
425,Volvo,S80 T6 4dr,Sedan,Europe,Front,45210,2.9,6,268,19,26,3653,110,190
426,Volvo,V40,Wagon,Europe,Front,26135,1.9,4,170,22,29,2822,101,180


In [11]:
# explore the dataframe
car_df.head(5)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,3.5,6,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,2.0,4,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,2.4,4,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,3.2,6,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,3.5,6,225,18,24,3880,115,197


In [12]:
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         428 non-null    object 
 1   Model        428 non-null    object 
 2   Type         428 non-null    object 
 3   Origin       428 non-null    object 
 4   DriveTrain   428 non-null    object 
 5   MSRP         428 non-null    int64  
 6   EngineSize   428 non-null    float64
 7   Cylinders    428 non-null    int64  
 8   Horsepower   428 non-null    int64  
 9   MPG_City     428 non-null    int64  
 10  MPG_Highway  428 non-null    int64  
 11  Weight       428 non-null    int64  
 12  Wheelbase    428 non-null    int64  
 13  Length       428 non-null    int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 46.9+ KB


In [13]:
car_df.describe()

Unnamed: 0,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
count,428.0,428.0,428.0,428.0,428.0,428.0,428.0,428.0,428.0
mean,32774.85514,3.196729,5.799065,215.885514,20.060748,26.843458,3577.953271,108.154206,186.36215
std,19431.716674,1.108595,1.559679,71.836032,5.238218,5.741201,758.983215,8.311813,14.357991
min,10280.0,1.3,3.0,73.0,10.0,12.0,1850.0,89.0,143.0
25%,20334.25,2.375,4.0,165.0,17.0,24.0,3104.0,103.0,178.0
50%,27635.0,3.0,6.0,210.0,19.0,26.0,3474.5,107.0,187.0
75%,39205.0,3.9,6.0,255.0,21.25,29.0,3977.75,112.0,194.0
max,192465.0,8.3,12.0,500.0,60.0,66.0,7190.0,144.0,238.0


## 2) Split the data into 75% for training and 25% for testing

In [45]:
# Perform One-Hot Encoding for categorical data
car_df = pd.get_dummies(car_df, columns=["Make", "Model", "Type", "Origin", "DriveTrain"])

In [46]:
# obtain input features
X = car_df.drop("MSRP", axis = 1)
X

Unnamed: 0,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Make_Acura,Make_Audi,...,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon,Origin_Asia,Origin_Europe,Origin_USA,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,3.5,6,265,17,23,4451,106,189,1,0,...,0,0,0,0,1,0,0,1,0,0
1,2.0,4,200,24,31,2778,101,172,1,0,...,1,0,0,0,1,0,0,0,1,0
2,2.4,4,200,22,29,3230,105,183,1,0,...,1,0,0,0,1,0,0,0,1,0
3,3.2,6,270,20,28,3575,108,186,1,0,...,1,0,0,0,1,0,0,0,1,0
4,3.5,6,225,18,24,3880,115,197,1,0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,2.4,5,197,21,28,3450,105,186,0,0,...,1,0,0,0,0,1,0,0,1,0
424,2.3,5,242,20,26,3450,105,186,0,0,...,1,0,0,0,0,1,0,0,1,0
425,2.9,6,268,19,26,3653,110,190,0,0,...,1,0,0,0,0,1,0,0,1,0
426,1.9,4,170,22,29,2822,101,180,0,0,...,0,0,0,1,0,1,0,0,1,0


In [47]:
# obtain output feature
y = car_df["MSRP"]
y

0      36945
1      23820
2      26990
3      33195
4      43755
       ...  
423    40565
424    42565
425    45210
426    26135
427    35145
Name: MSRP, Length: 428, dtype: int64

In [48]:
# convert pandas Series to numpy array
X = np.array(X)
y = np.array(y)

In [49]:
# split into training data and test data
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.25)

In [51]:
# check shape of X_train data
X_train.shape

(321, 483)

In [53]:
# check shape of y_train data
y_train.shape

(321,)

## 4) Train an XG-Boost model and obtain the RMSE and R2 values (No optimisation)

In [54]:
# train the model
model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 1, max_depth = 20, n_estimators = 500)
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='', learning_rate=1,
             max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=20,
             max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=0, reg_alpha=0, ...)

In [66]:
# make predictions on the test data
y_predict = model.predict(X_test)

In [67]:
# obtain Key Performance Indicators (RMSE and R2)
# NOTE: This compares predictions for x_test data contained in y_predict against actual values in y_test
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE, '\nR2 =', r2) 

RMSE = 8497.68 
R2 = 0.7269612669007198


## 5) Perform GridSearch hyperparameter optimization and obtain the RMSE and R2 values

In [70]:
# define hyperparameter combinations to be evaluated
parameters_grid = { 'max_depth': [3, 10, 20], 
                   'learning_rate': [0.1, 0.5],
                   'n_estimators': [100, 500],
                   'colsample_bytree': [0.3, 0.7]}

In [71]:
# train the models
model = xgb.XGBRegressor()
xgb_gridsearch = GridSearchCV(estimator = model, 
                              param_grid = parameters_grid, 
                              scoring = 'neg_mean_squared_error', # we use'neg_mean_squared_error' because we are trying to minimize the error.
                              cv = 5, 
                              verbose = 5)
xgb_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100, score=-37217869.432, total=   0.7s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100, score=-289480785.622, total=   0.5s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100, score=-29708599.750, total=   0.5s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.7s remaining:    0.0s


[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100, score=-20553226.719, total=   0.5s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.2s remaining:    0.0s


[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100, score=-38131619.177, total=   0.5s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500 
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500, score=-35642221.098, total=   2.4s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500 
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500, score=-283602161.572, total=   2.5s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500 
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500, score=-27160031.363, total=   2.4s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500 
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500, score=-20068695.571, total=   2.4s
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500 
[CV]  colsample_bytree=0.3, learning_rate=0.1, ma

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  4.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_...
                                    n_jobs=None, num_parallel_tree=None,
                                    objective='reg:squarederror',
                                    predictor=None, random_state=None,
                                    reg_alpha=None, ...),
             iid='deprec

In [72]:
# print parameter combination for 'best' model
xgb_gridsearch.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.5,
 'max_depth': 3,
 'n_estimators': 100}

In [73]:
# make predictions on the test data using the 'best' model
y_predict = xgb_gridsearch.predict(X_test)

In [76]:
# obtain Key Performance Indicators (RMSE and R2)
# NOTE: This compares predictions for x_test data contained in y_predict against actual values in y_test
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE, '\nR2 =', r2) 

RMSE = 7782.636 
R2 = 0.7709781535127014
