# <div align="center"> Machine Learning Models

#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_rows',2000)

In [2]:
mercedes = pd.read_csv('mercedes_esp.csv')
audi = pd.read_csv('audi_esp.csv')
land_rover = pd.read_csv('land_rover_esp.csv')
bmw = pd.read_csv('BMW_esp.csv')
porsche = pd.read_csv('porsche_esp.csv')

In [3]:
df = pd.concat([mercedes,audi,land_rover,bmw,porsche],axis = 0)
df=df.reset_index(drop=True)

#### Encoding and transforming variables

In [4]:
df_t = pd.get_dummies(df, columns=['Brand','Model','Fuel_type'])
df_t['Km'] = StandardScaler().fit_transform(df_t[['Km']])
df_t['Year'] = StandardScaler().fit_transform(df_t[['Year']])
df_t['Power'] = StandardScaler().fit_transform(df_t[['Power']])

In [5]:
X =  df_t.drop(columns='Price')
y = df_t.loc[:,'Price']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/8, random_state=0)

In [7]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((11418, 94), (11418,), (1632, 94), (1632,))

-------------------------------------------------------------------------------------------------------------------------------

### KNeighborsRegressor

In [8]:
from sklearn.neighbors import KNeighborsRegressor

In [9]:
clf =GridSearchCV(KNeighborsRegressor(weights='distance'),param_grid={
    'n_neighbors':range(14,16),'leaf_size':range(6,11)},scoring='neg_mean_absolute_error',
    cv=5, return_train_score=False)

In [10]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [11]:
results.sort_values(by='rank_test_score',ascending=True).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_leaf_size,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,0.286355,0.026282,0.662969,0.028418,9,14,"{'leaf_size': 9, 'n_neighbors': 14}",-2830.959029,-2741.002213,-2941.899462,-2855.660842,-2865.310146,-2846.966338,64.666866,1
8,0.265606,9e-06,0.617488,0.00702,10,14,"{'leaf_size': 10, 'n_neighbors': 14}",-2830.959029,-2741.002213,-2941.899462,-2855.660842,-2865.310146,-2846.966338,64.666866,1
0,0.314441,0.030652,0.747085,0.049324,6,14,"{'leaf_size': 6, 'n_neighbors': 14}",-2830.679027,-2741.63522,-2942.333851,-2855.720214,-2865.430756,-2847.159814,64.610001,3
2,0.287487,0.007643,0.723963,0.006641,7,14,"{'leaf_size': 7, 'n_neighbors': 14}",-2830.679027,-2741.63522,-2942.333851,-2855.720214,-2865.430756,-2847.159814,64.610001,3
4,0.322858,0.030688,0.806429,0.05197,8,14,"{'leaf_size': 8, 'n_neighbors': 14}",-2830.679027,-2741.63522,-2942.333851,-2855.720214,-2865.430756,-2847.159814,64.610001,3


In [12]:
kng = KNeighborsRegressor(n_neighbors=14,weights='distance',leaf_size=10)

In [13]:
cross_val_score(kng,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-2846.966338285062

In [14]:
kng.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=14, p=2,
                    weights='distance')

In [15]:
y_pred = kng.predict(X_test)
d = {'Price':y_test, 'Predictions':y_pred}
pred_kng = pd.DataFrame(data=d)

In [16]:
pred_kng['Price-Prediction']=pred_kng['Price']-pred_kng['Predictions']
pred_kng['Brand']=df.loc[pred_kng.index,'Brand']
pred_kng['Model']=df.loc[pred_kng.index,'Model']
pred_kng['Year']=df.loc[pred_kng.index,'Year']
pred_kng['Km']=df.loc[pred_kng.index,'Km']
pred_kng['Power']=df.loc[pred_kng.index,'Power']
pred_kng['Fuel_type']=df.loc[pred_kng.index,'Fuel_type']

In [17]:
pred_kng.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
12796,44490,71456.447672,-26966.447672,Porsche,Cayenne,2016,58914,420,Gasolina
11711,31135,50080.457312,-18945.457312,BMW,X5,2018,30952,231,Diesel
8300,25500,44233.333333,-18733.333333,LandRover,DiscoverySport,2019,15000,150,Diesel
12964,54800,72370.680999,-17570.680999,Porsche,Macan,2018,22000,340,Gasolina
11170,37999,52689.433815,-14690.433815,BMW,Serie7,2017,48000,408,Gasolina
12685,48990,62476.166097,-13486.166097,Porsche,Cayenne,2015,66000,420,Gasolina
12956,51900,64939.25606,-13039.25606,Porsche,Cayenne,2017,40000,262,Diesel
12977,69975,82619.634179,-12644.634179,Porsche,Cayenne,2018,60000,340,Gasolina
8217,52290,64878.893572,-12588.893572,LandRover,RangeRoverSport,2018,21340,258,Diesel
2291,36900,49191.327985,-12291.327985,Mercedes-Benz,GLC,2017,46000,320,Electro/Gasolina


### Gradient Boosting

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

In [33]:
clf =GridSearchCV(GradientBoostingRegressor(loss='huber'),param_grid={
    'n_estimators':range(75,125,10),
    'criterion':('friedman_mse','mse'),'max_depth':range(8,12)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [34]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [35]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,6.598281,0.113391,0.014503,0.002239943,mse,8,115,"{'criterion': 'mse', 'max_depth': 8, 'n_estima...",-2744.899927,-2748.603871,-2780.360939,-2752.03499,-2755.520114,-2756.283968,12.545263,1
4,6.583626,0.107144,0.015624,8.792443e-07,friedman_mse,8,115,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2760.642591,-2731.933742,-2777.387395,-2759.17325,-2761.664101,-2758.160216,14.676362,2
23,6.078979,0.130078,0.012498,0.00624907,mse,8,105,"{'criterion': 'mse', 'max_depth': 8, 'n_estima...",-2746.950558,-2739.254754,-2772.931428,-2764.117969,-2783.264311,-2761.303804,16.233061,3
3,6.138774,0.149262,0.012498,0.006249142,friedman_mse,8,105,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2776.454085,-2745.958885,-2771.0805,-2760.390746,-2758.691588,-2762.515161,10.591523,4
22,5.571189,0.122224,0.015622,9.772254e-07,mse,8,95,"{'criterion': 'mse', 'max_depth': 8, 'n_estima...",-2770.997562,-2731.576179,-2787.698534,-2747.312002,-2780.14398,-2763.545651,20.976061,5
1,5.037487,0.118152,0.009374,0.007653799,friedman_mse,8,85,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2762.814261,-2748.212435,-2788.021579,-2786.742762,-2783.2281,-2773.803828,15.710298,6
2,5.688583,0.197779,0.012498,0.006249118,friedman_mse,8,95,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2771.799427,-2768.500338,-2792.566607,-2749.648225,-2787.727008,-2774.048321,15.239657,7
26,6.442318,0.079287,0.009374,0.007654033,mse,9,85,"{'criterion': 'mse', 'max_depth': 9, 'n_estima...",-2765.003897,-2786.851137,-2824.17085,-2782.945048,-2739.901517,-2779.77449,27.709899,8
27,7.061813,0.153201,0.015624,1.101896e-06,mse,9,95,"{'criterion': 'mse', 'max_depth': 9, 'n_estima...",-2791.382978,-2774.393549,-2824.449183,-2772.468679,-2745.267948,-2781.592467,26.027258,9
8,7.681779,0.234555,0.015622,1.836906e-06,friedman_mse,9,105,"{'criterion': 'friedman_mse', 'max_depth': 9, ...",-2788.392212,-2779.282261,-2818.764086,-2781.959688,-2747.616594,-2783.202968,22.692858,10


In [36]:
gbr = GradientBoostingRegressor(loss='huber',criterion='mse',max_depth=8,n_estimators=115)

In [37]:
cross_val_score(gbr,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-2762.0463650188576

In [38]:
gbr.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mse', init=None,
                          learning_rate=0.1, loss='huber', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=115,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

-------------------------------------------------------------------------------------------------------------------------------

In [39]:
y_pred = gbr.predict(X_test)

In [40]:
d = {'Price':y_test, 'Predictions':y_pred}

In [41]:
pred_gbr = pd.DataFrame(data=d)

In [42]:
pred_gbr['Price-Prediction']=pred_gbr['Price']-pred_gbr['Predictions']
pred_gbr['Brand']=df.loc[pred_gbr.index,'Brand']
pred_gbr['Model']=df.loc[pred_gbr.index,'Model']
pred_gbr['Year']=df.loc[pred_gbr.index,'Year']
pred_gbr['Km']=df.loc[pred_gbr.index,'Km']
pred_gbr['Power']=df.loc[pred_gbr.index,'Power']
pred_gbr['Fuel_type']=df.loc[pred_gbr.index,'Fuel_type']

In [43]:
pred_gbr.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
11170,37999,58817.125684,-20818.125684,BMW,Serie7,2017,48000,408,Gasolina
12964,54800,71205.586434,-16405.586434,Porsche,Macan,2018,22000,340,Gasolina
12796,44490,60015.683787,-15525.683787,Porsche,Cayenne,2016,58914,420,Gasolina
2291,36900,49137.644198,-12237.644198,Mercedes-Benz,GLC,2017,46000,320,Electro/Gasolina
8300,25500,37389.83203,-11889.83203,LandRover,DiscoverySport,2019,15000,150,Diesel
8217,52290,63832.080684,-11542.080684,LandRover,RangeRoverSport,2018,21340,258,Diesel
1943,42700,54056.418422,-11356.418422,Mercedes-Benz,GLC,2016,23546,320,Electro/Gasolina
5459,17445,27875.808454,-10430.808454,Audi,A4,2016,88610,272,Diesel
5582,66900,77204.537315,-10304.537315,Audi,S8,2016,40000,605,Gasolina
9665,13000,23251.597411,-10251.597411,BMW,X3,2014,200000,258,Diesel


###  DecisionTreeRegressor

In [8]:
from sklearn.tree import DecisionTreeRegressor

In [9]:
clf =GridSearchCV(DecisionTreeRegressor(splitter='random'),param_grid={
    'criterion':('mse','friedman_mse'),'max_depth':range(7,25,1)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [10]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [11]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,0.049996,0.006238562,0.003125,0.006249,friedman_mse,17,"{'criterion': 'friedman_mse', 'max_depth': 17}",-3495.348268,-3371.316204,-3496.49856,-3304.689689,-3404.897427,-3414.55003,73.858714,1
29,0.056246,0.007654469,0.0,0.0,friedman_mse,18,"{'criterion': 'friedman_mse', 'max_depth': 18}",-3478.729276,-3374.821096,-3376.369782,-3423.071408,-3420.060019,-3414.610316,38.100002,2
30,0.053121,0.007660948,0.003125,0.006249,friedman_mse,19,"{'criterion': 'friedman_mse', 'max_depth': 19}",-3400.160415,-3295.737394,-3535.939438,-3446.977515,-3452.380888,-3426.23913,78.560082,3
10,0.05313,0.007648333,0.003121,0.006242,mse,17,"{'criterion': 'mse', 'max_depth': 17}",-3572.661363,-3309.704656,-3367.925124,-3468.346364,-3443.448375,-3432.417176,89.775958,4
11,0.053125,0.007650572,0.003121,0.006241,mse,18,"{'criterion': 'mse', 'max_depth': 18}",-3539.208956,-3343.175164,-3471.961816,-3463.024781,-3413.518998,-3446.177943,65.237371,5
7,0.053121,0.007653818,0.003124,0.006248,mse,14,"{'criterion': 'mse', 'max_depth': 14}",-3521.295945,-3385.487267,-3396.694122,-3448.22316,-3503.738632,-3451.087825,54.720384,6
9,0.056246,0.007653098,0.003125,0.00625,mse,16,"{'criterion': 'mse', 'max_depth': 16}",-3332.157776,-3398.473431,-3497.55654,-3552.073442,-3496.917805,-3455.435799,79.089734,7
33,0.061832,0.009887361,0.004371,0.006122,friedman_mse,22,"{'criterion': 'friedman_mse', 'max_depth': 22}",-3531.471115,-3443.729202,-3428.55791,-3424.002005,-3462.933245,-3458.138695,39.111639,8
31,0.068742,0.007657969,0.003124,0.006249,friedman_mse,20,"{'criterion': 'friedman_mse', 'max_depth': 20}",-3638.734154,-3366.777044,-3408.535756,-3375.190597,-3553.441559,-3468.535822,108.484694,9
8,0.056247,0.007654429,0.009373,0.007653,mse,15,"{'criterion': 'mse', 'max_depth': 15}",-3449.211689,-3492.85046,-3494.500316,-3542.923114,-3366.940775,-3469.285271,59.145571,10


In [12]:
dtr = DecisionTreeRegressor(splitter='random',criterion='mse',max_depth=18)

In [13]:
cross_val_score(dtr,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-3468.9775314400817

In [14]:
dtr.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=18,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='random')

In [15]:
y_pred = dtr.predict(X_test)

In [16]:
d = {'Price':y_test, 'Predictions':y_pred}

In [17]:
pred_dtr = pd.DataFrame(data=d)

In [18]:
pred_dtr['Price-Prediction']=pred_dtr['Price']-pred_dtr['Predictions']
pred_dtr['Brand']=df.loc[pred_dtr.index,'Brand']
pred_dtr['Model']=df.loc[pred_dtr.index,'Model']
pred_dtr['Year']=df.loc[pred_dtr.index,'Year']
pred_dtr['Km']=df.loc[pred_dtr.index,'Km']
pred_dtr['Power']=df.loc[pred_dtr.index,'Power']
pred_dtr['Fuel_type']=df.loc[pred_dtr.index,'Fuel_type']

In [19]:
pred_dtr.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
4372,38900,69000.0,-30100.0,Audi,S8,2013,173000,519,Gasolina
12425,50000,75900.0,-25900.0,Porsche,Panamera,2012,89000,430,Gasolina
2578,36490,61254.0,-24764.0,Mercedes-Benz,E,2018,14333,189,Diesel
12956,51900,75000.0,-23100.0,Porsche,Cayenne,2017,40000,262,Diesel
11801,22760,41890.0,-19130.0,BMW,Serie3,2019,112878,258,Gasolina
8300,25500,44233.333333,-18733.333333,LandRover,DiscoverySport,2019,15000,150,Diesel
2436,44199,61900.0,-17701.0,Mercedes-Benz,GLC,2018,11513,211,Gasolina
11711,31135,48697.2,-17562.2,BMW,X5,2018,30952,231,Diesel
11478,33356,50414.285714,-17058.285714,BMW,Serie4,2018,56356,313,Diesel
12931,45000,61710.0,-16710.0,Porsche,Cayenne,2017,110000,385,Diesel


###  RandomForestRegressor

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
clf =GridSearchCV(RandomForestRegressor(criterion='mse'),param_grid={
    'n_estimators':range(90,110,5)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [22]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [23]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,4.458748,0.163477,0.062501,7e-06,95,{'n_estimators': 95},-2950.461049,-2952.247369,-2940.598467,-2954.046545,-2880.182172,-2935.50712,28.05116,1
2,4.593689,0.086619,0.074997,0.00625,100,{'n_estimators': 100},-2957.76418,-2933.437723,-2968.202149,-2944.698538,-2883.858693,-2937.592257,29.322098,2
3,4.978259,0.193082,0.074993,0.006247,105,{'n_estimators': 105},-2979.016324,-2933.148258,-2956.4404,-2954.923219,-2894.305515,-2943.566743,28.588763,3
0,4.18483,0.11051,0.062494,8e-06,90,{'n_estimators': 90},-2967.423126,-2938.626032,-2960.993695,-2956.594199,-2904.203168,-2945.568044,22.787786,4


In [24]:
rfr = RandomForestRegressor(criterion='mse',n_estimators=105)

In [25]:
cross_val_score(rfr,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-2933.33762355053

In [26]:
rfr.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=105, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [27]:
y_pred = rfr.predict(X_test)

In [28]:
d = {'Price':y_test, 'Predictions':y_pred}

In [29]:
pred_rfr = pd.DataFrame(data=d)

In [30]:
pred_rfr['Price-Prediction']=pred_rfr['Price']-pred_rfr['Predictions']
pred_rfr['Brand']=df.loc[pred_rfr.index,'Brand']
pred_rfr['Model']=df.loc[pred_rfr.index,'Model']
pred_rfr['Year']=df.loc[pred_rfr.index,'Year']
pred_rfr['Km']=df.loc[pred_rfr.index,'Km']
pred_rfr['Power']=df.loc[pred_rfr.index,'Power']
pred_rfr['Fuel_type']=df.loc[pred_rfr.index,'Fuel_type']

In [31]:
pred_rfr.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
11170,37999,59712.2,-21713.2,BMW,Serie7,2017,48000,408,Gasolina
8300,25500,44296.326531,-18796.326531,LandRover,DiscoverySport,2019,15000,150,Diesel
12964,54800,72907.571429,-18107.571429,Porsche,Macan,2018,22000,340,Gasolina
8217,52290,67703.390476,-15413.390476,LandRover,RangeRoverSport,2018,21340,258,Diesel
2284,34400,49192.403628,-14792.403628,Mercedes-Benz,GLC,2017,13500,211,Gasolina
2291,36900,49457.52381,-12557.52381,Mercedes-Benz,GLC,2017,46000,320,Electro/Gasolina
11801,22760,35278.469841,-12518.469841,BMW,Serie3,2019,112878,258,Gasolina
6371,31990,44195.27619,-12205.27619,Audi,A4,2018,63000,252,Gasolina
12796,44490,56590.692063,-12100.692063,Porsche,Cayenne,2016,58914,420,Gasolina
12956,51900,63874.285714,-11974.285714,Porsche,Cayenne,2017,40000,262,Diesel


# ------------------------------------------------------------------------------------------------------------------

# --------------------------------------------------------------------------------------