# <div align="center"> Machine Learning Models

#### Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
pd.set_option('display.max_rows',2000)

In [3]:
mercedes = pd.read_csv('mercedes_esp.csv')
audi = pd.read_csv('audi_esp.csv')
land_rover = pd.read_csv('land_rover_esp.csv')
bmw = pd.read_csv('BMW_esp.csv')
porsche = pd.read_csv('porsche_esp.csv')

In [5]:
df = pd.concat([mercedes,audi,land_rover,bmw,porsche],axis = 0)
df=df.reset_index(drop=True)

#### Encoding and transforming variables

In [6]:
df_t = pd.get_dummies(df, columns=['Brand','Model','Fuel_type'])

In [7]:
X =  df_t.drop(columns='Price')
y = df_t.loc[:,'Price']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/8, random_state=0)

In [9]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((11418, 94), (11418,), (1632, 94), (1632,))

-------------------------------------------------------------------------------------------------------------------------------

### KNeighborsRegressor

In [37]:
from sklearn.neighbors import KNeighborsRegressor

In [38]:
clf =GridSearchCV(KNeighborsRegressor(weights='distance'),param_grid={
    'n_neighbors':range(14,16),'leaf_size':range(6,11)},scoring='neg_mean_absolute_error',
    cv=5, return_train_score=False)

In [39]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [40]:
results.sort_values(by='rank_test_score',ascending=True).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_leaf_size,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,0.268693,0.006182,0.074228,0.006051,9,14,"{'leaf_size': 9, 'n_neighbors': 14}",-6070.205596,-6025.485649,-6226.870674,-6245.925569,-6028.002147,-6119.297927,97.110427,1
8,0.294744,0.028791,0.089005,0.013675,10,14,"{'leaf_size': 10, 'n_neighbors': 14}",-6070.205596,-6025.485649,-6226.870674,-6245.925569,-6028.002147,-6119.297927,97.110427,1
0,0.325188,0.026747,0.081404,0.006032,6,14,"{'leaf_size': 6, 'n_neighbors': 14}",-6071.492315,-6025.115236,-6226.127213,-6246.498875,-6028.02922,-6119.452572,97.034211,3
2,0.299974,0.006249,0.084373,0.007654,7,14,"{'leaf_size': 7, 'n_neighbors': 14}",-6071.492315,-6025.115236,-6226.127213,-6246.498875,-6028.02922,-6119.452572,97.034211,3
4,0.296853,0.009881,0.084365,0.007651,8,14,"{'leaf_size': 8, 'n_neighbors': 14}",-6071.492315,-6025.115236,-6226.127213,-6246.498875,-6028.02922,-6119.452572,97.034211,3


In [41]:
kng = KNeighborsRegressor(n_neighbors=14,weights='distance',leaf_size=10)

In [42]:
cross_val_score(kng,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-6119.297926909554

In [43]:
kng.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=14, p=2,
                    weights='distance')

In [47]:
y_pred = kng.predict(X_test)
d = {'Price':y_test, 'Predictions':y_pred}
pred_kng = pd.DataFrame(data=d)

In [48]:
pred_kng['Price-Prediction']=pred_kng['Price']-pred_kng['Predictions']
pred_kng['Brand']=df.loc[pred_kng.index,'Brand']
pred_kng['Model']=df.loc[pred_kng.index,'Model']
pred_kng['Year']=df.loc[pred_kng.index,'Year']
pred_kng['Km']=df.loc[pred_kng.index,'Km']
pred_kng['Power']=df.loc[pred_kng.index,'Power']
pred_kng['Fuel_type']=df.loc[pred_kng.index,'Fuel_type']

In [49]:
pred_kng.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
12655,55999,78174.958177,-22175.958177,Porsche,Cayenne,2015,86000,420,Gasolina
6916,44900,64952.633315,-20052.633315,Audi,S3,2019,13000,300,Gasolina
6136,40800,60319.149394,-19519.149394,Audi,S3,2017,11000,310,Gasolina
8881,15000,34333.690389,-19333.690389,BMW,Serie5,2011,120000,252,Diesel
8300,25500,44233.333333,-18733.333333,LandRover,DiscoverySport,2019,15000,150,Diesel
11180,30900,49157.176345,-18257.176345,BMW,Serie2,2017,38000,252,Gasolina
4014,19000,37070.614127,-18070.614127,Audi,A6,2012,90000,245,Electro/Gasolina
4103,31490,49550.032636,-18060.032636,Audi,S5,2013,69990,333,Gasolina
9904,21680,39473.387463,-17793.387463,BMW,Serie5,2015,25000,190,Diesel
10243,29999,47418.068612,-17419.068612,BMW,Serie1,2015,49000,326,Gasolina


### Gradient Boosting

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
clf =GridSearchCV(GradientBoostingRegressor(loss='huber'),param_grid={
    'n_estimators':range(75,125,10),
    'criterion':('friedman_mse','mse'),'max_depth':range(8,12)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [22]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [23]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,6.749214,0.150763,0.012494,0.006247072,mse,8,115,"{'criterion': 'mse', 'max_depth': 8, 'n_estima...",-2767.963687,-2728.966812,-2751.002612,-2760.195069,-2775.855585,-2756.796753,16.171368,1
4,6.708597,0.136002,0.015623,5.091228e-07,friedman_mse,8,115,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2763.48142,-2756.538072,-2778.650334,-2764.62626,-2752.236503,-2763.106518,9.004022,2
22,5.66157,0.162139,0.012497,0.006248689,mse,8,95,"{'criterion': 'mse', 'max_depth': 8, 'n_estima...",-2754.583136,-2743.093958,-2786.100922,-2767.544198,-2781.686541,-2766.601751,16.159951,3
23,6.137683,0.087337,0.015628,7.785745e-06,mse,8,105,"{'criterion': 'mse', 'max_depth': 8, 'n_estima...",-2773.126807,-2754.966387,-2776.821274,-2760.54052,-2777.420626,-2768.575123,9.129209,4
2,5.924463,0.252658,0.012172,0.006118736,friedman_mse,8,95,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2778.905124,-2758.71584,-2775.000099,-2778.323609,-2772.099438,-2772.608822,7.366069,5
28,7.742993,0.19746,0.012498,0.006248808,mse,9,105,"{'criterion': 'mse', 'max_depth': 9, 'n_estima...",-2775.848843,-2769.959021,-2806.825151,-2776.623949,-2735.060616,-2772.863516,22.864562,6
0,4.651675,0.163172,0.012494,0.006247096,friedman_mse,8,75,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2770.61826,-2751.945919,-2790.060169,-2768.714453,-2788.291711,-2773.926103,14.056389,7
3,6.19685,0.166895,0.009337,0.007788331,friedman_mse,8,105,"{'criterion': 'friedman_mse', 'max_depth': 8, ...",-2768.897735,-2764.363081,-2785.313931,-2778.26087,-2780.010917,-2775.369307,7.639627,8
9,8.382612,0.162614,0.015624,1.364488e-05,friedman_mse,9,115,"{'criterion': 'friedman_mse', 'max_depth': 9, ...",-2788.374813,-2781.652953,-2804.175254,-2757.526217,-2751.542293,-2776.654306,19.576804,9
8,8.236921,0.54154,0.015622,1.328315e-06,friedman_mse,9,105,"{'criterion': 'friedman_mse', 'max_depth': 9, ...",-2786.917889,-2772.035182,-2811.099294,-2773.620161,-2741.037388,-2776.941983,22.761774,10


In [24]:
gbr = GradientBoostingRegressor(loss='huber',criterion='mse',max_depth=8,n_estimators=115)

In [25]:
cross_val_score(gbr,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-2767.314711806356

In [26]:
gbr.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mse', init=None,
                          learning_rate=0.1, loss='huber', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=115,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

-------------------------------------------------------------------------------------------------------------------------------

In [32]:
y_pred = gbr.predict(X_test)

In [33]:
d = {'Price':y_test, 'Predictions':y_pred}

In [34]:
pred_gbr = pd.DataFrame(data=d)

In [35]:
pred_gbr['Price-Prediction']=pred_gbr['Price']-pred_gbr['Predictions']
pred_gbr['Brand']=df.loc[pred_gbr.index,'Brand']
pred_gbr['Model']=df.loc[pred_gbr.index,'Model']
pred_gbr['Year']=df.loc[pred_gbr.index,'Year']
pred_gbr['Km']=df.loc[pred_gbr.index,'Km']
pred_gbr['Power']=df.loc[pred_gbr.index,'Power']
pred_gbr['Fuel_type']=df.loc[pred_gbr.index,'Fuel_type']

In [36]:
pred_gbr.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
11170,37999,59388.107296,-21389.107296,BMW,Serie7,2017,48000,408,Gasolina
12796,44490,60790.816449,-16300.816449,Porsche,Cayenne,2016,58914,420,Gasolina
12964,54800,70646.575502,-15846.575502,Porsche,Macan,2018,22000,340,Gasolina
2291,36900,49357.454568,-12457.454568,Mercedes-Benz,GLC,2017,46000,320,Electro/Gasolina
2898,66660,78982.588844,-12322.588844,Mercedes-Benz,EQC400,2019,13000,408,Electrico
11478,33356,45414.439763,-12058.439763,BMW,Serie4,2018,56356,313,Diesel
8217,52290,64072.094007,-11782.094007,LandRover,RangeRoverSport,2018,21340,258,Diesel
8300,25500,36646.547304,-11146.547304,LandRover,DiscoverySport,2019,15000,150,Diesel
12931,45000,55290.285598,-10290.285598,Porsche,Cayenne,2017,110000,385,Diesel
1943,42700,52829.896259,-10129.896259,Mercedes-Benz,GLC,2016,23546,320,Electro/Gasolina


###  DecisionTreeRegressor

In [50]:
from sklearn.tree import DecisionTreeRegressor

In [51]:
clf =GridSearchCV(DecisionTreeRegressor(splitter='random'),param_grid={
    'criterion':('mse','friedman_mse'),'max_depth':range(7,25,1)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [52]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [53]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.056246,0.007654247,0.0,0.0,mse,18,"{'criterion': 'mse', 'max_depth': 18}",-3512.874758,-3308.192849,-3332.209078,-3418.342376,-3451.747082,-3404.673229,75.723023,1
30,0.053122,0.007654344,0.003125,0.006249,friedman_mse,19,"{'criterion': 'friedman_mse', 'max_depth': 19}",-3444.299387,-3384.663797,-3399.273966,-3338.345064,-3539.724625,-3421.261368,68.208849,2
31,0.053126,0.007659019,0.006246,0.007649,friedman_mse,20,"{'criterion': 'friedman_mse', 'max_depth': 20}",-3445.299597,-3356.813661,-3373.903966,-3416.305154,-3546.898263,-3427.844128,67.181242,3
29,0.049992,0.006251579,0.003125,0.006249,friedman_mse,18,"{'criterion': 'friedman_mse', 'max_depth': 18}",-3542.66032,-3297.583343,-3390.170109,-3443.500894,-3494.218514,-3433.626636,84.915803,4
9,0.053335,0.009080281,0.001199,0.001469,mse,16,"{'criterion': 'mse', 'max_depth': 16}",-3439.973781,-3466.681039,-3488.930416,-3380.45482,-3401.553496,-3435.51871,40.073474,5
26,0.043748,0.006248522,0.003121,0.006242,friedman_mse,15,"{'criterion': 'friedman_mse', 'max_depth': 15}",-3441.610668,-3445.045414,-3549.850637,-3394.349264,-3370.86746,-3440.344688,61.575834,6
25,0.046872,4.422006e-07,0.0,0.0,friedman_mse,14,"{'criterion': 'friedman_mse', 'max_depth': 14}",-3472.22927,-3329.431716,-3426.19499,-3499.738721,-3537.35718,-3452.990376,71.635237,7
10,0.046248,0.001245929,0.003125,0.00625,mse,17,"{'criterion': 'mse', 'max_depth': 17}",-3525.643449,-3385.747189,-3542.74804,-3368.86563,-3465.096551,-3457.620172,70.672358,8
12,0.053121,0.007653468,0.006249,0.007654,mse,19,"{'criterion': 'mse', 'max_depth': 19}",-3494.888289,-3349.779433,-3534.659172,-3439.187294,-3497.912584,-3463.285354,64.430308,9
33,0.056246,0.007645647,0.003125,0.00625,friedman_mse,22,"{'criterion': 'friedman_mse', 'max_depth': 22}",-3495.110986,-3425.668196,-3432.884634,-3426.513521,-3561.605978,-3468.356663,53.371616,10


In [58]:
dtr = DecisionTreeRegressor(splitter='random',criterion='mse',max_depth=18)

In [59]:
cross_val_score(dtr,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-3439.2349294720734

In [60]:
dtr.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=18,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='random')

In [61]:
y_pred = dtr.predict(X_test)

In [62]:
d = {'Price':y_test, 'Predictions':y_pred}

In [63]:
pred_dtr = pd.DataFrame(data=d)

In [64]:
pred_dtr['Price-Prediction']=pred_dtr['Price']-pred_dtr['Predictions']
pred_dtr['Brand']=df.loc[pred_dtr.index,'Brand']
pred_dtr['Model']=df.loc[pred_dtr.index,'Model']
pred_dtr['Year']=df.loc[pred_dtr.index,'Year']
pred_dtr['Km']=df.loc[pred_dtr.index,'Km']
pred_dtr['Power']=df.loc[pred_dtr.index,'Power']
pred_dtr['Fuel_type']=df.loc[pred_dtr.index,'Fuel_type']

In [65]:
pred_dtr.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
12408,43990,75900.0,-31910.0,Porsche,Panamera,2011,92400,420,Gasolina
12425,50000,75900.0,-25900.0,Porsche,Panamera,2012,89000,430,Gasolina
12956,51900,75000.0,-23100.0,Porsche,Cayenne,2017,40000,262,Diesel
12796,44490,67320.0,-22830.0,Porsche,Cayenne,2016,58914,420,Gasolina
467,18000,38500.0,-20500.0,Mercedes-Benz,Viano,2012,96000,163,Diesel
11170,37999,58000.0,-20001.0,BMW,Serie7,2017,48000,408,Gasolina
11244,31500,50700.0,-19200.0,BMW,Serie5,2018,65132,252,Gasolina
11801,22760,41890.0,-19130.0,BMW,Serie3,2019,112878,258,Gasolina
12703,54000,72900.0,-18900.0,Porsche,Cayenne,2015,81277,416,Electro/Gasolina
8300,25500,44233.333333,-18733.333333,LandRover,DiscoverySport,2019,15000,150,Diesel


###  RandomForestRegressor

In [54]:
from sklearn.ensemble import RandomForestRegressor

In [55]:
clf =GridSearchCV(RandomForestRegressor(criterion='mse'),param_grid={
    'n_estimators':range(90,110,5)},
    scoring='neg_mean_absolute_error', cv=5, return_train_score=False) 

In [56]:
clf.fit(X_train,y_train)
results=pd.DataFrame(clf.cv_results_)

In [57]:
results.sort_values(by='rank_test_score',ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,5.610817,0.127397,0.086757,0.006304,105,{'n_estimators': 105},-2958.499203,-2929.447503,-2960.554984,-2936.618936,-2882.64423,-2933.552971,28.175799,1
0,4.455226,0.175277,0.05986,0.005286,90,{'n_estimators': 90},-2966.725625,-2928.245946,-2957.422395,-2934.943895,-2885.916849,-2934.650942,28.154737,2
2,5.274526,0.22096,0.102726,0.057407,100,{'n_estimators': 100},-2964.345206,-2931.796001,-2950.768078,-2961.360697,-2893.592691,-2940.372534,26.019877,3
1,4.932243,0.424331,0.071676,0.014116,95,{'n_estimators': 95},-2967.553589,-2937.160342,-2948.049116,-2951.029731,-2902.630437,-2941.284643,21.642273,4


In [67]:
rfr = RandomForestRegressor(criterion='mse',n_estimators=105)

In [68]:
cross_val_score(rfr,X_train,y_train,cv=5,scoring='neg_mean_absolute_error').mean()

-2939.414337068339

In [69]:
rfr.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=105, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [70]:
y_pred = rfr.predict(X_test)

In [71]:
d = {'Price':y_test, 'Predictions':y_pred}

In [72]:
pred_rfr = pd.DataFrame(data=d)

In [73]:
pred_rfr['Price-Prediction']=pred_rfr['Price']-pred_rfr['Predictions']
pred_rfr['Brand']=df.loc[pred_rfr.index,'Brand']
pred_rfr['Model']=df.loc[pred_rfr.index,'Model']
pred_rfr['Year']=df.loc[pred_rfr.index,'Year']
pred_rfr['Km']=df.loc[pred_rfr.index,'Km']
pred_rfr['Power']=df.loc[pred_rfr.index,'Power']
pred_rfr['Fuel_type']=df.loc[pred_rfr.index,'Fuel_type']

In [74]:
pred_rfr.sort_values(by='Price-Prediction')

Unnamed: 0,Price,Predictions,Price-Prediction,Brand,Model,Year,Km,Power,Fuel_type
11170,37999,61566.714286,-23567.714286,BMW,Serie7,2017,48000,408,Gasolina
8300,25500,42995.725624,-17495.725624,LandRover,DiscoverySport,2019,15000,150,Diesel
12964,54800,70413.32381,-15613.32381,Porsche,Macan,2018,22000,340,Gasolina
8217,52290,67154.190476,-14864.190476,LandRover,RangeRoverSport,2018,21340,258,Diesel
2284,34400,48960.634921,-14560.634921,Mercedes-Benz,GLC,2017,13500,211,Gasolina
11801,22760,36668.73288,-13908.73288,BMW,Serie3,2019,112878,258,Gasolina
2291,36900,50008.838095,-13108.838095,Mercedes-Benz,GLC,2017,46000,320,Electro/Gasolina
9665,13000,25020.809524,-12020.809524,BMW,X3,2014,200000,258,Diesel
12931,45000,56720.444444,-11720.444444,Porsche,Cayenne,2017,110000,385,Diesel
12956,51900,63616.952381,-11716.952381,Porsche,Cayenne,2017,40000,262,Diesel
