# Tuning Hyperparameters using Car Data

# Import Data & Libraries

In [105]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
%matplotlib inline

# Project

Using the used car prices dataset included in the course package, perform the following:
1. Load the “used_car_price.csv” dataset 
3. Split the data into 75% for training and 25% for testing 
4. Train an XG-Boost model in Scikit-Learn
5. Assess trained XG-Boost model performance using RMSE and R2 
6. Perform hyperparameter optimization using GridSearch; choose any reasonable values for max_depth, learning_rate, n_estimators, and colsample_bytree. Use 5 cross-validation folds.  
7. Perform hyperparameter optimization using RandomSearch; choose any reasonable values for max_depth, learning_rate, n_estimators, and colsample_bytree. Use 5 cross-validation folds and 100 iterations.  
8. Compare the optimization strategies using RMSE and R2. Feel free to introduce any additional optimization strategy. Comment on your results.

# Regression Optimization Using GridSearch

In [106]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [107]:
# Load the used-car listings from disk into a DataFrame
car_df = pd.read_csv(filepath_or_buffer="used_car_price.csv")

In [108]:
# Preview the first five rows of the dataset
car_df.head(n=5)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,3.5,6,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,2.0,4,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,2.4,4,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,3.2,6,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,3.5,6,225,18,24,3880,115,197


In [109]:
# One-hot encode the categorical columns "Make", "Model", "Type", "Origin", and "DriveTrain".
# dtype=float is requested explicitly: pandas >= 2.0 emits boolean dummy columns by
# default, and a frame mixing bool with numeric columns converts to an object-dtype
# array under np.array(...) instead of the float matrix the estimators expect.
car_df = pd.get_dummies(
    car_df,
    columns=["Make", "Model", "Type", "Origin", "DriveTrain"],
    dtype=float,
)

In [110]:
# Separate the target (MSRP) from the predictor columns
y = car_df["MSRP"]
X = car_df.drop(columns=["MSRP"])

In [111]:
# Materialise the feature table as a NumPy array (copying is np.array's default)
X = np.array(X, copy=True)

In [112]:
# Materialise the target values as a NumPy array (copying is np.array's default)
y = np.array(y, copy=True)

In [113]:
# Hold out 25% of the samples for testing and train on the remaining 75%.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts, so the tuning strategies are compared
# on the same train/test partition.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [114]:
# Display the training feature matrix (NumPy abbreviates the repr of large arrays)
X_train

array([[  3.2,   6. , 215. , ...,   0. ,   0. ,   1. ],
       [  2.2,   4. , 140. , ...,   0. ,   1. ,   0. ],
       [  3.2,   6. , 290. , ...,   0. ,   0. ,   1. ],
       ...,
       [  2.6,   6. , 168. , ...,   1. ,   0. ,   0. ],
       [  6.8,  10. , 310. , ...,   1. ,   0. ,   0. ],
       [  3.2,   6. , 221. , ...,   0. ,   0. ,   1. ]])

In [115]:
# Display the held-out test feature matrix (NumPy abbreviates the repr of large arrays)
X_test

array([[  4.7,   8. , 235. , ...,   1. ,   0. ,   0. ],
       [  2.4,   4. , 150. , ...,   0. ,   1. ,   0. ],
       [  4.6,   8. , 224. , ...,   0. ,   0. ,   1. ],
       ...,
       [  4.3,   8. , 300. , ...,   0. ,   0. ,   1. ],
       [  3. ,   6. , 225. , ...,   0. ,   0. ,   1. ],
       [  3. ,   6. , 220. , ...,   0. ,   1. ,   0. ]])

In [116]:
#Using just core XGBoost to estimate a prediction.
!pip install xgboost
import xgboost as xgb
model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 1, max_depth = 3, n_estimators = 500)
model.fit(X_train, y_train)

# Make predictions on the test data using the text regressors
y_predict = model.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

#Assess Model Performance
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2) 


RMSE = 8540.773 
R2 = 0.7953782827504423


In [117]:
# Use XGBoost with GridSearchCV to tune the regressor.
# Grid search is exhaustive: every combination in the grid below
# (3 * 2 * 2 * 2 = 24 candidates) is evaluated with 5-fold cross validation,
# i.e. 120 fits. There is no "number of iterations" setting for grid search;
# that concept belongs to randomized search.
from sklearn.model_selection import GridSearchCV
# Candidate values for each hyperparameter
parameters_grid = { 'max_depth': [3, 10, 20], 
                   'learning_rate': [0.1, 0.5],
                   'n_estimators': [100, 500],
                   'colsample_bytree': [0.3, 0.7]}

model = xgb.XGBRegressor()

# scoring='neg_mean_squared_error' ranks the candidates by negated MSE
# (higher is better), so the candidate with the lowest cross-validated
# mean squared error is selected as the best one.
xgb_gridsearch = GridSearchCV(estimator = model, 
                              param_grid = parameters_grid, 
                              scoring = 'neg_mean_squared_error',  
                              cv = 5, 
                              verbose = 5)

xgb_gridsearch.fit(X_train, y_train)

# Report the winning combination so this strategy can be compared with the others
print('Best parameters =', xgb_gridsearch.best_params_)

# predict() delegates to the best estimator, refit on the full training set
# (GridSearchCV's default refit=True)
y_predict = xgb_gridsearch.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

# Assess the tuned model on the held-out test set
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-47875344.185 total time=   0.0s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-227604528.843 total time=   0.0s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-28767924.871 total time=   0.0s
[CV 4/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-44624157.233 total time=   0.0s
[CV 5/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=100;, score=-54761345.646 total time=   0.0s
[CV 1/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-44902549.329 total time=   0.2s
[CV 2/5] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-214404414.393 total time=   0.2s
[CV 3/5] END colsample_bytree=0.3, learning_rate=0.1, max_dep