# <div align="center"> Machine Learning Models

#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_rows',2000)

In [2]:
df = pd.read_csv('../Clean_Data/mercedes_esp.csv')

#### Encoding and transforming variables

In [3]:
df_t = pd.get_dummies(df, columns=['Model','Fuel_type'])
df_t['Km'] = StandardScaler().fit_transform(df_t[['Km']])
df_t['Year'] = StandardScaler().fit_transform(df_t[['Year']])
df_t['Power'] = StandardScaler().fit_transform(df_t[['Power']])

In [4]:
df_t.columns

Index(['Brand', 'Year', 'Km', 'Power', 'Price', 'Model_A', 'Model_B',
       'Model_C', 'Model_CL', 'Model_CLA', 'Model_CLC', 'Model_CLS',
       'Model_Citan', 'Model_E', 'Model_EQC400', 'Model_G', 'Model_GL',
       'Model_GLA', 'Model_GLB', 'Model_GLC', 'Model_GLE', 'Model_GLK',
       'Model_GLS', 'Model_ML', 'Model_MarcoPolo', 'Model_R', 'Model_S',
       'Model_SL', 'Model_SLC', 'Model_SLK', 'Model_Viano', 'Fuel_type_Diesel',
       'Fuel_type_Electrico', 'Fuel_type_Electro/Diesel',
       'Fuel_type_Electro/Gasolina', 'Fuel_type_Gasolina'],
      dtype='object')

In [5]:
X =  df_t.drop(columns=['Price','Brand'])
y = df_t.loc[:,'Price']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/8, random_state=0)

In [7]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((2841, 34), (2841,), (406, 34), (406,))

-------------------------------------------------------------------------------------------------------------------------------

### KNeighborsRegressor

In [15]:
from sklearn.neighbors import KNeighborsRegressor

In [16]:
clf =GridSearchCV(KNeighborsRegressor(weights='distance'),param_grid={
    'n_neighbors':range(2,35),'leaf_size':range(5,15)},scoring='neg_mean_absolute_error',
    cv=5, return_train_score=False) 

In [17]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [18]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_leaf_size,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
74,0.021875,0.007653059,0.028121,0.006248593,7,10,"{'leaf_size': 7, 'n_neighbors': 10}",-2947.360361,-2886.426654,-3043.841543,-3242.84675,-3057.630317,-3035.621125,121.27934,1
8,0.021875,0.007653001,0.031243,7.951894e-06,5,10,"{'leaf_size': 5, 'n_neighbors': 10}",-2947.360361,-2886.426654,-3043.841543,-3242.84675,-3057.630317,-3035.621125,121.27934,1
41,0.022289,0.006212068,0.03168,0.000354773,6,10,"{'leaf_size': 6, 'n_neighbors': 10}",-2947.360361,-2886.426654,-3043.841543,-3242.84675,-3057.630317,-3035.621125,121.27934,1
107,0.028128,0.006249885,0.024997,0.007652962,8,10,"{'leaf_size': 8, 'n_neighbors': 10}",-2947.360361,-2886.426654,-3043.841543,-3242.84675,-3057.630317,-3035.621125,121.27934,1
305,0.019262,0.007272077,0.023944,0.007060512,14,10,"{'leaf_size': 14, 'n_neighbors': 10}",-2947.3941,-2886.760586,-3044.552443,-3243.281254,-3060.61705,-3036.521086,121.463338,5
173,0.015625,7.921814e-07,0.024997,0.007654169,10,10,"{'leaf_size': 10, 'n_neighbors': 10}",-2947.3941,-2886.760586,-3044.552443,-3243.281254,-3060.61705,-3036.521086,121.463338,5
272,0.021875,0.007652767,0.021872,0.00765265,13,10,"{'leaf_size': 13, 'n_neighbors': 10}",-2947.3941,-2886.760586,-3044.552443,-3243.281254,-3060.61705,-3036.521086,121.463338,5
239,0.018751,0.006248093,0.024996,0.007652787,12,10,"{'leaf_size': 12, 'n_neighbors': 10}",-2947.3941,-2886.760586,-3044.552443,-3243.281254,-3060.61705,-3036.521086,121.463338,5
140,0.015626,1.427329e-06,0.025,0.007657384,9,10,"{'leaf_size': 9, 'n_neighbors': 10}",-2947.3941,-2886.760586,-3044.552443,-3243.281254,-3060.61705,-3036.521086,121.463338,5
206,0.021875,0.007652339,0.021872,0.007652592,11,10,"{'leaf_size': 11, 'n_neighbors': 10}",-2947.3941,-2886.760586,-3044.552443,-3243.281254,-3060.61705,-3036.521086,121.463338,5


-------------------------------------------------------------------------------------------------------------------------------

###  DecisionTreeRegressor

In [19]:
from sklearn.tree import DecisionTreeRegressor

In [20]:
clf =GridSearchCV(DecisionTreeRegressor(),param_grid={
    'criterion':('mse','friedman_mse','mae'),'splitter':('best','random'),'max_depth':range(5,30,3)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [21]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [22]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,0.009374,0.007654,0.0,0.0,mse,14,random,"{'criterion': 'mse', 'max_depth': 14, 'splitte...",-3338.806421,-3149.307897,-3467.748397,-3604.010129,-3680.398761,-3448.054321,189.672633,1
41,0.221855,0.022965,0.00625,0.007654422,mae,11,random,"{'criterion': 'mae', 'max_depth': 11, 'splitte...",-3255.585387,-3240.134683,-3552.082746,-3643.287852,-3550.968254,-3448.411785,167.214563,2
40,0.215613,0.011697,0.00625,0.007654247,mae,11,best,"{'criterion': 'mae', 'max_depth': 11, 'splitte...",-3403.839789,-3451.787852,-3555.245599,-3804.649648,-3597.279541,-3562.560486,139.519833,3
43,0.256232,0.032172,0.0,0.0,mae,14,random,"{'criterion': 'mae', 'max_depth': 14, 'splitte...",-3678.881162,-3214.027289,-3850.521127,-3656.301937,-3421.411817,-3564.228666,222.091743,4
23,0.006254,0.007659,0.0,0.0,friedman_mse,11,random,"{'criterion': 'friedman_mse', 'max_depth': 11,...",-3429.199264,-3336.335183,-3635.132673,-3942.106224,-3535.940637,-3575.742796,208.865767,5
25,0.0,0.0,0.00625,0.007654247,friedman_mse,14,random,"{'criterion': 'friedman_mse', 'max_depth': 14,...",-3420.194276,-3448.6079,-3670.721929,-3703.488321,-3638.926644,-3576.387814,118.058203,6
38,0.223043,0.007609,0.001245,0.0008004973,mae,8,best,"{'criterion': 'mae', 'max_depth': 8, 'splitter...",-3303.597711,-3495.769366,-3583.538732,-3848.016725,-3705.959436,-3587.376394,184.895774,7
47,0.221859,0.011692,0.003125,0.006249619,mae,20,random,"{'criterion': 'mae', 'max_depth': 20, 'splitte...",-3544.213028,-3390.161092,-3643.96831,-3631.805458,-3770.112875,-3596.052152,125.65812,8
9,0.009182,0.006338,0.001039,0.0006373778,mse,17,random,"{'criterion': 'mse', 'max_depth': 17, 'splitte...",-3512.599693,-3347.201929,-3758.896108,-3817.768448,-3631.467599,-3613.586755,169.872869,9
22,0.009119,0.006773,0.0004,0.0004896873,friedman_mse,11,best,"{'criterion': 'friedman_mse', 'max_depth': 11,...",-3582.907826,-3391.74113,-3756.454577,-3690.085674,-3726.150463,-3629.467934,132.536415,10


-------------------------------------------------------------------------------------------------------------------------------

###  RandomForestRegressor

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
clf =GridSearchCV(DecisionTreeRegressor(),param_grid={
    'criterion':('mse','friedman_mse','mae'),'splitter':('best','random'),'max_depth':range(12,25)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [25]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [26]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,0.00625,0.007654,0.0,0.0,friedman_mse,13,random,"{'criterion': 'friedman_mse', 'max_depth': 13,...",-3186.862087,-3380.285396,-3518.043643,-3531.233767,-3436.929975,-3410.670974,124.713683,1
57,0.240604,0.025385,0.0,0.0,mae,14,random,"{'criterion': 'mae', 'max_depth': 14, 'splitte...",-3340.270246,-3327.902289,-3736.204225,-3554.300176,-3351.380952,-3462.011578,160.46444,2
55,0.224984,0.007654,0.00625,0.007654,mae,13,random,"{'criterion': 'mae', 'max_depth': 13, 'splitte...",-3382.351232,-3219.283451,-3654.614437,-3827.085387,-3550.536155,-3526.774132,210.95171,3
33,0.003125,0.00625,0.003125,0.00625,friedman_mse,15,random,"{'criterion': 'friedman_mse', 'max_depth': 15,...",-3420.484715,-3323.114485,-3595.483957,-3588.598497,-3749.492297,-3535.43479,148.691849,4
53,0.209364,0.028973,0.003125,0.00625,mae,12,random,"{'criterion': 'mae', 'max_depth': 12, 'splitte...",-3602.090669,-3165.957746,-3725.760563,-3788.427817,-3414.660494,-3539.379458,226.139025,5
1,0.001199,0.002398,0.000211,0.000422,mse,12,random,"{'criterion': 'mse', 'max_depth': 12, 'splitte...",-3464.026307,-3201.83341,-3986.252103,-3497.580544,-3555.825545,-3541.103582,253.50083,6
37,0.00625,0.007654,0.0,0.0,friedman_mse,17,random,"{'criterion': 'friedman_mse', 'max_depth': 17,...",-3299.034296,-3269.897747,-3822.177157,-3694.592444,-3667.193632,-3550.579055,223.678836,7
5,0.00625,0.007654,0.003125,0.006249,mse,14,random,"{'criterion': 'mse', 'max_depth': 14, 'splitte...",-3628.628991,-3463.041259,-3665.197097,-3560.849467,-3443.667866,-3552.276936,87.650469,8
63,0.297392,0.030223,0.0004,0.0008,mae,17,random,"{'criterion': 'mae', 'max_depth': 17, 'splitte...",-3462.059859,-3474.410211,-3673.872359,-3652.516725,-3499.309524,-3552.433736,91.478122,9
35,0.00625,0.007654,0.003125,0.00625,friedman_mse,16,random,"{'criterion': 'friedman_mse', 'max_depth': 16,...",-3378.707614,-3294.705281,-3726.242692,-3759.848143,-3626.584379,-3557.217622,187.19984,10


### Gradient Boosting

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

In [30]:
clf =GridSearchCV(GradientBoostingRegressor(),param_grid={
    'loss':('huber','lad'),'learning_rate':(0.05,0.1),'n_estimators':(75,90),
    'criterion':('mse','mae'),'max_depth':range(3,8,2)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [31]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [32]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_learning_rate,param_loss,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
16,1.274619,0.065701,0.0006,0.001199055,mse,0.1,huber,7,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2743.827762,-2831.84353,-3007.704704,-3227.982418,-2901.89039,-2942.649761,166.803121,1
15,0.698464,0.009477,0.0,0.0,mse,0.1,huber,5,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2748.792065,-2838.66919,-3033.169107,-3190.00722,-2905.290322,-2943.185581,154.380389,2
20,0.478091,0.007655,0.003125,0.006249714,mse,0.1,lad,5,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2749.2874,-2910.186531,-3204.099268,-3047.355123,-2863.561741,-2954.898012,157.013845,3
23,1.012043,0.060952,0.003924,0.006051186,mse,0.1,lad,7,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2728.838179,-2926.201287,-3167.248475,-3127.009197,-2846.034246,-2959.066277,166.376667,4
21,0.602452,0.05619,0.003125,0.006249714,mse,0.1,lad,5,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2703.976797,-2944.295919,-3233.911534,-3027.776838,-2889.964197,-2959.985057,173.390161,5
17,1.416382,0.028325,0.0,0.0,mse,0.1,huber,7,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2803.838167,-2839.704998,-3044.49583,-3231.329573,-2898.170094,-2963.507732,157.084757,6
22,0.82884,0.025709,0.0,0.0,mse,0.1,lad,7,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2727.581093,-2936.509204,-3152.424456,-3083.812403,-2922.054578,-2964.476347,147.150749,7
5,1.623561,0.028341,0.003125,0.006250381,mse,0.05,huber,7,90,"{'criterion': 'mse', 'learning_rate': 0.05, 'l...",-2787.481465,-2856.65549,-3078.867551,-3212.914582,-2891.503772,-2965.484572,156.894451,8
14,0.61779,0.009653,0.0004,0.0007998466,mse,0.1,huber,5,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2749.70078,-2883.779749,-3081.282053,-3211.475176,-2942.801879,-2973.807928,159.578739,9
29,25.271679,1.971736,0.000607,0.001214123,mae,0.05,huber,7,90,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",-2720.281858,-2883.699632,-3077.659109,-3180.776567,-3024.522875,-2977.388008,160.327153,10


-------------------------------------------------------------------------------------------------------------------------------

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
reg = LinearRegression()

In [10]:
cross_val_score(reg,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-3391.9264008677046