# <div align="center"> Machine Learning Models

#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
pd.set_option('display.max_rows',2000)

In [2]:
df = pd.read_csv('mercedes_esp.csv')

#### Encoding and transforming variables

In [3]:
df_t = pd.get_dummies(df, columns=['Model','Fuel_type'])

In [4]:
df_t.columns

Index(['Brand', 'Year', 'Km', 'Power', 'Price', 'Model_A', 'Model_B',
       'Model_C', 'Model_CL', 'Model_CLA', 'Model_CLC', 'Model_CLS',
       'Model_Citan', 'Model_E', 'Model_EQC400', 'Model_G', 'Model_GL',
       'Model_GLA', 'Model_GLB', 'Model_GLC', 'Model_GLE', 'Model_GLK',
       'Model_GLS', 'Model_ML', 'Model_MarcoPolo', 'Model_R', 'Model_S',
       'Model_SL', 'Model_SLC', 'Model_SLK', 'Model_Viano', 'Fuel_type_Diesel',
       'Fuel_type_Electrico', 'Fuel_type_Electro/Diesel',
       'Fuel_type_Electro/Gasolina', 'Fuel_type_Gasolina'],
      dtype='object')

In [5]:
X =  df_t.drop(columns=['Price','Brand'])
y = df_t.loc[:,'Price']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/8, random_state=0)

In [7]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((2841, 34), (2841,), (406, 34), (406,))

-------------------------------------------------------------------------------------------------------------------------------

### KNeighborsRegressor

In [8]:
from sklearn.neighbors import KNeighborsRegressor

In [9]:
clf =GridSearchCV(KNeighborsRegressor(weights='distance'),param_grid={
    'n_neighbors':range(2,35),'leaf_size':range(5,15)},scoring='neg_mean_absolute_error',
    cv=5, return_train_score=False) 

In [10]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [11]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_leaf_size,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
302,0.021874,0.007652826,0.006249,0.007653312,14,7,"{'leaf_size': 14, 'n_neighbors': 7}",-6659.093906,-6358.640464,-6866.672788,-6537.462085,-6810.858365,-6646.545522,184.608074,1
269,0.01875,0.006248498,0.00937,0.007650337,13,7,"{'leaf_size': 13, 'n_neighbors': 7}",-6659.093906,-6358.640464,-6866.672788,-6537.462085,-6810.858365,-6646.545522,184.608074,1
137,0.024999,0.00765304,0.003124,0.006248569,9,7,"{'leaf_size': 9, 'n_neighbors': 7}",-6659.093906,-6358.640464,-6866.672788,-6537.462085,-6810.858365,-6646.545522,184.608074,1
170,0.024998,0.00765302,0.003125,0.006249714,10,7,"{'leaf_size': 10, 'n_neighbors': 7}",-6659.093906,-6358.640464,-6866.672788,-6537.462085,-6810.858365,-6646.545522,184.608074,1
236,0.028123,0.006248236,0.003124,0.00624876,12,7,"{'leaf_size': 12, 'n_neighbors': 7}",-6659.093906,-6358.640464,-6866.672788,-6537.462085,-6810.858365,-6646.545522,184.608074,1
203,0.01723,0.007600089,0.009378,0.007657466,11,7,"{'leaf_size': 11, 'n_neighbors': 7}",-6659.093906,-6358.640464,-6866.672788,-6537.462085,-6810.858365,-6646.545522,184.608074,1
38,0.028124,0.006248474,0.003125,0.006249523,6,7,"{'leaf_size': 6, 'n_neighbors': 7}",-6661.268615,-6358.550002,-6867.203027,-6539.183369,-6810.675163,-6647.376035,184.558742,7
104,0.01875,0.006248546,0.009374,0.007653526,8,7,"{'leaf_size': 8, 'n_neighbors': 7}",-6661.268615,-6358.550002,-6867.203027,-6539.183369,-6810.675163,-6647.376035,184.558742,7
5,0.022238,0.007779432,0.009652,0.005857789,5,7,"{'leaf_size': 5, 'n_neighbors': 7}",-6661.268615,-6358.550002,-6867.203027,-6539.183369,-6810.675163,-6647.376035,184.558742,7
71,0.034376,0.006238371,0.003125,0.006249809,7,7,"{'leaf_size': 7, 'n_neighbors': 7}",-6661.268615,-6358.550002,-6867.203027,-6539.183369,-6810.675163,-6647.376035,184.558742,7


-------------------------------------------------------------------------------------------------------------------------------

###  DecisionTreeRegressor

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
clf =GridSearchCV(DecisionTreeRegressor(),param_grid={
    'criterion':('mse','friedman_mse','mae'),'splitter':('best','random'),'max_depth':range(5,30,3)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [14]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [15]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
23,0.003125,0.006249,0.003129,0.006258,friedman_mse,11,random,"{'criterion': 'friedman_mse', 'max_depth': 11,...",-3344.429389,-3476.928111,-3759.235514,-3473.468627,-3543.577302,-3519.527789,136.113042,1
9,0.006249,0.007654,0.0,0.0,mse,17,random,"{'criterion': 'mse', 'max_depth': 17, 'splitte...",-3459.45244,-3624.067729,-3860.420482,-3370.991805,-3386.119799,-3540.210451,183.557753,2
22,0.008148,0.009039,0.000585,0.000479,friedman_mse,11,best,"{'criterion': 'friedman_mse', 'max_depth': 11,...",-3590.714448,-3374.810041,-3768.603662,-3249.517312,-3723.780432,-3541.485179,200.131818,3
4,0.009374,0.007654,0.0,0.0,mse,11,best,"{'criterion': 'mse', 'max_depth': 11, 'splitte...",-3507.832003,-3361.248421,-3777.947278,-3347.92111,-3749.327095,-3548.855182,184.350685,4
5,0.006254,0.007659,0.0,0.0,mse,11,random,"{'criterion': 'mse', 'max_depth': 11, 'splitte...",-3417.165711,-3242.276946,-3800.1339,-3412.280848,-3948.044247,-3563.980331,265.078831,5
40,0.243542,0.010502,0.000403,0.000806,mae,11,best,"{'criterion': 'mae', 'max_depth': 11, 'splitte...",-3609.756591,-3432.852993,-3713.977113,-3490.136444,-3626.339789,-3574.612586,100.563688,6
25,0.006799,0.000402,0.000996,7e-06,friedman_mse,14,random,"{'criterion': 'friedman_mse', 'max_depth': 14,...",-3299.177456,-3646.315153,-3980.262644,-3526.361047,-3480.795795,-3586.582419,226.261422,7
7,0.00625,0.007654,0.0,0.0,mse,14,random,"{'criterion': 'mse', 'max_depth': 14, 'splitte...",-3354.528726,-3484.091118,-3736.213491,-3414.665034,-3961.800489,-3590.259772,226.724244,8
38,0.237364,0.009669,0.001401,0.0008,mae,8,best,"{'criterion': 'mae', 'max_depth': 8, 'splitter...",-3710.563269,-3448.578345,-3661.519366,-3599.134683,-3609.212148,-3605.801562,88.152945,9
45,0.265606,0.009881,0.0,0.0,mae,17,random,"{'criterion': 'mae', 'max_depth': 17, 'splitte...",-3560.884007,-3774.974472,-3831.448063,-3349.139965,-3580.283451,-3619.345992,171.559025,10


-------------------------------------------------------------------------------------------------------------------------------

###  RandomForestRegressor

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
clf =GridSearchCV(DecisionTreeRegressor(),param_grid={
    'criterion':('mse','friedman_mse','mae'),'splitter':('best','random'),'max_depth':range(12,25)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [18]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [19]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,0.00625,0.007654,0.0,0.0,mse,14,random,"{'criterion': 'mse', 'max_depth': 14, 'splitte...",-3399.148439,-3436.935848,-3607.885693,-3289.701756,-3510.188562,-3448.77206,106.76619,1
27,0.003125,0.00625,0.0,0.0,friedman_mse,12,random,"{'criterion': 'friedman_mse', 'max_depth': 12,...",-3288.912941,-3277.62665,-3713.103089,-3482.788502,-3568.525053,-3466.191247,166.551697,2
29,0.009374,0.007654,0.0,0.0,friedman_mse,13,random,"{'criterion': 'friedman_mse', 'max_depth': 13,...",-3268.964938,-3627.584998,-3845.702693,-3272.939756,-3497.60692,-3502.559861,219.405925,3
59,0.28728,0.057523,0.0,0.0,mae,15,random,"{'criterion': 'mae', 'max_depth': 15, 'splitte...",-3295.367311,-3504.482394,-3776.963028,-3374.577465,-3600.494718,-3510.376983,169.643591,4
31,0.00625,0.007654,0.0,0.0,friedman_mse,14,random,"{'criterion': 'friedman_mse', 'max_depth': 14,...",-3328.012857,-3501.704576,-3769.581267,-3524.039526,-3527.842668,-3530.236179,140.739847,5
65,0.271856,0.028986,0.0,0.0,mae,18,random,"{'criterion': 'mae', 'max_depth': 18, 'splitte...",-3496.756591,-3506.214789,-3695.086268,-3189.925176,-3847.519366,-3547.100438,220.876275,6
61,0.262481,0.033364,0.00625,0.007654,mae,16,random,"{'criterion': 'mae', 'max_depth': 16, 'splitte...",-3616.746924,-3338.189261,-3854.34507,-3234.643486,-3809.301937,-3570.645336,247.588747,7
0,0.010396,0.001201,0.001402,0.000492,mse,12,best,"{'criterion': 'mse', 'max_depth': 12, 'splitte...",-3598.942223,-3413.298192,-3833.240076,-3295.761695,-3719.388305,-3572.126098,196.055787,8
52,0.231233,0.00625,0.003125,0.00625,mae,12,best,"{'criterion': 'mae', 'max_depth': 12, 'splitte...",-3627.981547,-3494.585387,-3720.84331,-3445.180458,-3608.951585,-3579.508457,98.435784,9
1,0.005336,0.005722,0.0002,0.0004,mse,12,random,"{'criterion': 'mse', 'max_depth': 12, 'splitte...",-3421.977861,-3553.628197,-3803.055946,-3392.961873,-3728.313807,-3579.987537,162.727176,10


### Gradient Boosting

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
clf =GridSearchCV(GradientBoostingRegressor(),param_grid={
    'loss':('huber','lad'),'learning_rate':(0.05,0.1),'n_estimators':(75,90),
    'criterion':('mse','mae'),'max_depth':range(3,8,2)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [22]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [23]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_learning_rate,param_loss,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
16,1.201827,0.025941,0.003125,0.00625,mse,0.1,huber,7,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2806.406841,-2767.375582,-3058.243941,-2900.85981,-3051.002525,-2916.77774,120.652476,1
17,1.409448,0.077782,0.00625,0.007655,mse,0.1,huber,7,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2830.249731,-2772.764986,-3058.852326,-2878.499681,-3057.935487,-2919.660442,118.119539,2
15,0.725965,0.036731,0.0,0.0,mse,0.1,huber,5,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2806.720829,-2880.620945,-2988.558932,-2874.786384,-3078.478475,-2925.833113,95.992479,3
39,31.358314,2.195734,0.003724,0.006062,mae,0.1,huber,5,90,"{'criterion': 'mae', 'learning_rate': 0.1, 'lo...",-2800.375805,-2863.061899,-3004.048311,-2881.970925,-3163.15571,-2942.52253,128.569516,4
14,0.599954,0.021187,0.0,0.0,mse,0.1,huber,5,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2818.765911,-2900.498709,-3018.298678,-2866.561088,-3129.35031,-2946.694939,112.615667,5
38,23.637338,0.742293,0.00625,0.007654,mae,0.1,huber,5,75,"{'criterion': 'mae', 'learning_rate': 0.1, 'lo...",-2840.368715,-2885.369294,-3001.326775,-2906.229768,-3166.929867,-2960.044884,116.017659,6
5,1.681731,0.05814,0.007052,0.007156,mse,0.05,huber,7,90,"{'criterion': 'mse', 'learning_rate': 0.05, 'l...",-2867.326547,-2818.808081,-3122.769347,-2902.640201,-3095.390018,-2961.386839,123.79691,7
20,0.481215,0.006259,0.0,0.0,mse,0.1,lad,5,75,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2896.551755,-2863.749871,-2960.371666,-2994.190344,-3094.393713,-2961.85147,80.626813,8
21,0.560944,0.012534,0.003125,0.00625,mse,0.1,lad,5,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2912.306628,-2856.96154,-2942.000437,-3029.41876,-3085.215931,-2965.180659,81.950429,9
23,1.058854,0.073019,0.003126,0.006251,mse,0.1,lad,7,90,"{'criterion': 'mse', 'learning_rate': 0.1, 'lo...",-2901.490281,-2830.745294,-3039.846645,-2902.17944,-3168.466874,-2968.545707,120.782583,10


-------------------------------------------------------------------------------------------------------------------------------

### Linear Regression

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
reg = LinearRegression()

In [26]:
cross_val_score(reg,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-3391.9264008678447