# 1. Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import joblib
from sklearn.preprocessing import LabelEncoder
import pickle
import ml_utils as mlu

# Reading data

In [None]:
# Load the three pre-split datasets (train / test / validation) from the
# working directory, in that order.
train, test, validation = map(
    pd.read_csv,
    ("use_to_train.csv", "use_to_test.csv", "use_to_val.csv"),
)

In [None]:
# Select variables
# Earlier hand-picked feature subsets, kept for reference:
# feature_names_number = ['fuel_consumption_km_l', 'horsepower', 'displacement', 'gears', 'torque', 'passengers', 'doors', 'wheels', 'km', 'age']
# feature_names_category = ['touch_screen', 'navigation_system','rear_sensor',  'start_stop', 'turbo', 'seat_material', 'trunk_opening',  'body_type', 'electric_parking_brake', 'electric_locks']

# feature_names_number = ['age', 'km', 'fuel_consumption_km_l', 'horsepower', 'displacement', 'torque']
# feature_names_category = ['push_start','start_stop', 'turbo', 'electric_locks', 'navigation_system']

# Current approach: derive the feature lists from the column dtypes of `train`.
numeric_frame = train.select_dtypes(include='number')
feature_names_number = numeric_frame.columns.tolist()
# 'price' is the regression target, so it must not be used as a feature.
feature_names_number.remove('price')

object_frame = train.select_dtypes(include='object')
feature_names_category = object_frame.columns.tolist()

# Print each list under its own label.
for label, names in (
    ('feature_names_number', feature_names_number),
    ('feature_names_category', feature_names_category),
):
    print(label)
    print(names)

feature_names_number
['fuel_consumption_km_l', 'horsepower', 'displacement', 'gears', 'torque', 'passengers', 'doors', 'wheels', 'km', 'age']
feature_names_category
['touch_screen', 'navigation_system', 'push_start', 'rear_cupholders', 'sunroof', 'rear_sensor', 'start_stop', 'turbo', 'seat_material', 'trunk_opening', 'headlights', 'body_type', 'electric_parking_brake', 'electric_locks']


# 2. Partition of data

In [None]:
# Every split uses the same feature columns (numeric first, then categorical)
# and the same target column, 'price'.
feature_columns = feature_names_number + feature_names_category

X_train, y_train = train[feature_columns], train['price']
X_test, y_test = test[feature_columns], test['price']
X_val, y_val = validation[feature_columns], validation['price']

# 3. Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('null_replacement', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler()),
]
attributes_number = Pipeline(numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ('null_replacement', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
attributes_category = Pipeline(categorical_steps)

# Route each group of columns through its own sub-pipeline.
attributes_preprocessing = ColumnTransformer([
    ('number', attributes_number, feature_names_number),
    ('category', attributes_category, feature_names_category),
])

# 4. Evaluate Model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
def get_metrics_pd(y_values, preds, model_name):
    """Compute regression metrics for one model as a labelled pandas Series.

    Parameters
    ----------
    y_values : array-like
        Ground-truth target values.
    preds : array-like
        Predicted values, aligned with ``y_values``.
    model_name : str
        Label stored as the Series ``name`` so that concatenating several
        results along ``axis=1`` yields one column per model.

    Returns
    -------
    pandas.Series
        Values for 'MSE', 'R^2', 'MAE' and 'RMSE', each rounded to 3 decimals.
    """
    raw_mse = mean_squared_error(y_values, preds)
    mse = np.round(raw_mse, 3)
    r2 = np.round(r2_score(y_values, preds), 3)
    mae = np.round(mean_absolute_error(y_values, preds), 3)
    # Take the root of the unrounded MSE: rooting the rounded value would
    # collapse small errors to 0 and compound the rounding error.
    rmse = np.round(np.sqrt(raw_mse), 3)
    return pd.Series(
        [mse, r2, mae, rmse],
        index=['MSE', 'R^2', 'MAE', 'RMSE'],
        name=model_name,
    )

# 5. Hyperparameters

In [None]:
# !pip install catboost
# !pip install optuna

# Extreme

In [None]:
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, 
                                  HuberRegressor, BayesianRidge, TheilSenRegressor, 
                                  RANSACRegressor, PassiveAggressiveRegressor, 
                                  OrthogonalMatchingPursuit)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Fixed seed for every estimator that has a random component, so the metric
# tables are reproducible after Restart & Run All.
RANDOM_SEED = 42

# Candidate regressors, keyed by the label used in the metric tables.
# All hyperparameters are library defaults except the seed and CatBoost's
# verbosity.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_SEED),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=RANDOM_SEED),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=RANDOM_SEED),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor(random_state=RANDOM_SEED),
    'LGBMRegressor': LGBMRegressor(random_state=RANDOM_SEED),
    # verbose=0 suppresses CatBoost's per-iteration "learn: ..." lines, which
    # otherwise flood the training cell's output.
    'CatBoostRegressor': CatBoostRegressor(random_state=RANDOM_SEED, verbose=0),
    'HuberRegressor': HuberRegressor(),
    'BayesianRidge': BayesianRidge(),
    'TheilSenRegressor': TheilSenRegressor(random_state=RANDOM_SEED),
    'RANSACRegressor': RANSACRegressor(random_state=RANDOM_SEED),
    'PassiveAggressiveRegressor': PassiveAggressiveRegressor(random_state=RANDOM_SEED),
    'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit()
}

In [None]:
%%time
# Fit every candidate model behind the shared preprocessing step and score it
# on the train, test and validation splits.
#
# Metrics are collected in plain lists and concatenated once per split at the
# end; growing a DataFrame with pd.concat inside the loop re-copies the table
# on every iteration and starts from an empty frame.
# NOTE(review): mlu.get_metrics_pd presumably returns a Series named after the
# model (the tables downstream are labelled by model) - confirm in ml_utils.
train_metrics_by_model = []
test_metrics_by_model = []
val_metrics_by_model = []

for name, model in models.items():
    print(f'Training {name}')
    model_pipeline = Pipeline(steps=[
        ('preprocess', attributes_preprocessing),
        ('model', model)
    ])

    model_pipeline.fit(X_train, y_train)

    print("Getting train metrics:")
    train_metrics_by_model.append(
        mlu.get_metrics_pd(y_train, model_pipeline.predict(X_train), name)
    )

    print("Getting test metrics:")
    test_metrics_by_model.append(
        mlu.get_metrics_pd(y_test, model_pipeline.predict(X_test), name)
    )

    print("Getting validation metrics:")
    val_metrics_by_model.append(
        mlu.get_metrics_pd(y_val, model_pipeline.predict(X_val), name)
    )

# One column per model, one row per metric. Fall back to an empty frame when
# `models` is empty, because pd.concat rejects an empty list.
df_train_metrics = pd.concat(train_metrics_by_model, axis=1) if train_metrics_by_model else pd.DataFrame()
df_test_metrics = pd.concat(test_metrics_by_model, axis=1) if test_metrics_by_model else pd.DataFrame()
df_val_metrics = pd.concat(val_metrics_by_model, axis=1) if val_metrics_by_model else pd.DataFrame()

38:	learn: 63427.0371855	total: 123ms	remaining: 3.04s
39:	learn: 62668.8038299	total: 125ms	remaining: 3s
40:	learn: 62027.9178576	total: 127ms	remaining: 2.97s
41:	learn: 61290.3367467	total: 129ms	remaining: 2.93s
42:	learn: 60553.5787925	total: 130ms	remaining: 2.9s
43:	learn: 59909.1836764	total: 132ms	remaining: 2.87s
44:	learn: 59290.3831996	total: 134ms	remaining: 2.84s
45:	learn: 58706.9071487	total: 135ms	remaining: 2.81s
46:	learn: 58127.5409602	total: 137ms	remaining: 2.78s
47:	learn: 57547.5992487	total: 139ms	remaining: 2.75s
48:	learn: 56906.9685913	total: 140ms	remaining: 2.73s
49:	learn: 56376.0592397	total: 142ms	remaining: 2.7s
50:	learn: 55930.6369509	total: 144ms	remaining: 2.68s
51:	learn: 55393.6558395	total: 146ms	remaining: 2.65s
52:	learn: 54850.0018488	total: 147ms	remaining: 2.63s
53:	learn: 54372.0519059	total: 149ms	remaining: 2.61s
54:	learn: 53826.8784103	total: 151ms	remaining: 2.59s
55:	learn: 53341.9114409	total: 152ms	remaining: 2.57s
56:	learn: 5288

## Training metrics

In [None]:
# The training loop produces one column per model. Flip to one row per model
# only while the metric names are still on the index, so re-running this cell
# does not transpose the table back and fail with KeyError: 'RMSE'.
if 'RMSE' in df_train_metrics.index:
    df_train_metrics = df_train_metrics.T
# Best model (lowest RMSE) first.
df_train_metrics = df_train_metrics.sort_values(by='RMSE')
df_train_metrics

Unnamed: 0,MSE,R^2,MAE,RMSE
DecisionTreeRegressor,627534.7,1.0,132.337,792.171
XGBRegressor,7659561.0,1.0,1975.037,2767.591
CatBoostRegressor,161298900.0,0.99,9543.504,12700.35
RandomForestRegressor,332325300.0,0.979,11695.288,18229.792
LGBMRegressor,423150900.0,0.973,12219.654,20570.631
GradientBoostingRegressor,841126900.0,0.946,21351.745,29002.187
KNeighborsRegressor,2355314000.0,0.848,31737.247,48531.58
LinearRegression,2814722000.0,0.819,37750.962,53053.956
Lasso,2814724000.0,0.819,37750.002,53053.974
Ridge,2817439000.0,0.819,37782.349,53079.552


## Test metrics

In [None]:
# The training loop produces one column per model. Flip to one row per model
# only while the metric names are still on the index, so re-running this cell
# does not transpose the table back and fail with KeyError: 'RMSE'.
if 'RMSE' in df_test_metrics.index:
    df_test_metrics = df_test_metrics.T
# Best model (lowest RMSE) first.
df_test_metrics = df_test_metrics.sort_values(by='RMSE')
df_test_metrics

Unnamed: 0,MSE,R^2,MAE,RMSE
CatBoostRegressor,2027566000.0,0.897,27004.779,45028.504
GradientBoostingRegressor,2485331000.0,0.873,31524.158,49853.097
LGBMRegressor,2490445000.0,0.873,30367.526,49904.361
XGBRegressor,2677129000.0,0.863,30892.098,51740.983
RandomForestRegressor,2823976000.0,0.856,32882.582,53141.098
BayesianRidge,4097034000.0,0.791,44108.849,64008.08
Ridge,4140021000.0,0.789,44182.956,64342.992
Lasso,4161280000.0,0.788,44197.938,64507.98
LinearRegression,4162009000.0,0.788,44200.637,64513.634
TheilSenRegressor,4298016000.0,0.781,43647.185,65559.253


## Validation metrics

In [None]:
# The training loop produces one column per model. Flip to one row per model
# only while the metric names are still on the index, so re-running this cell
# does not transpose the table back and fail with KeyError: 'RMSE'.
if 'RMSE' in df_val_metrics.index:
    df_val_metrics = df_val_metrics.T
# Best model (lowest RMSE) first.
df_val_metrics = df_val_metrics.sort_values(by='RMSE')
df_val_metrics

Unnamed: 0,MSE,R^2,MAE,RMSE
CatBoostRegressor,1595510000.0,0.887,26071.341,39943.84
XGBRegressor,2159432000.0,0.847,28953.559,46469.685
RandomForestRegressor,2182417000.0,0.845,30221.749,46716.342
LGBMRegressor,2282189000.0,0.838,29494.251,47772.263
GradientBoostingRegressor,2368786000.0,0.832,31509.79,48670.176
HuberRegressor,2969031000.0,0.789,37003.684,54488.813
TheilSenRegressor,2974646000.0,0.789,38700.989,54540.318
PassiveAggressiveRegressor,3030075000.0,0.785,37304.428,55046.115
BayesianRidge,3049265000.0,0.784,40075.444,55220.148
Ridge,3118013000.0,0.779,40356.276,55839.169


In [None]:
# Put the three per-split tables side by side, aligned on the model index.
# Column blocks appear in train, test, validation order, so the metric names
# repeat once per split. The model index then becomes a 'Model' column.
df_full_metris = (
    pd.concat([df_train_metrics, df_test_metrics, df_val_metrics], axis=1)
    .rename_axis('Model')
    .reset_index()
)
df_full_metris

Unnamed: 0,Model,MSE,R^2,MAE,RMSE,MSE.1,R^2.1,MAE.1,RMSE.1,MSE.2,R^2.2,MAE.2,RMSE.2
0,DecisionTreeRegressor,627534.7,1.0,132.337,792.171,6716132000.0,0.657,46840.812,81952.01,4661619000.0,0.669,43318.763,68276.049
1,XGBRegressor,7659561.0,1.0,1975.037,2767.591,2677129000.0,0.863,30892.098,51740.983,2159432000.0,0.847,28953.559,46469.685
2,CatBoostRegressor,161298900.0,0.99,9543.504,12700.35,2027566000.0,0.897,27004.779,45028.504,1595510000.0,0.887,26071.341,39943.84
3,RandomForestRegressor,332325300.0,0.979,11695.288,18229.792,2823976000.0,0.856,32882.582,53141.098,2182417000.0,0.845,30221.749,46716.342
4,LGBMRegressor,423150900.0,0.973,12219.654,20570.631,2490445000.0,0.873,30367.526,49904.361,2282189000.0,0.838,29494.251,47772.263
5,GradientBoostingRegressor,841126900.0,0.946,21351.745,29002.187,2485331000.0,0.873,31524.158,49853.097,2368786000.0,0.832,31509.79,48670.176
6,KNeighborsRegressor,2355314000.0,0.848,31737.247,48531.58,4786587000.0,0.756,44032.479,69185.162,3266764000.0,0.768,38078.038,57155.616
7,LinearRegression,2814722000.0,0.819,37750.962,53053.956,4162009000.0,0.788,44200.637,64513.634,3146115000.0,0.777,40518.071,56090.241
8,Lasso,2814724000.0,0.819,37750.002,53053.974,4161280000.0,0.788,44197.938,64507.98,3145392000.0,0.777,40513.426,56083.796
9,Ridge,2817439000.0,0.819,37782.349,53079.552,4140021000.0,0.789,44182.956,64342.992,3118013000.0,0.779,40356.276,55839.169


In [None]:
# Transposed view of the combined metrics table: one column per model.
df_full_metris.transpose()


Unnamed: 0,DecisionTreeRegressor,XGBRegressor,CatBoostRegressor,RandomForestRegressor,LGBMRegressor,GradientBoostingRegressor,KNeighborsRegressor,LinearRegression,Lasso,Ridge,BayesianRidge,TheilSenRegressor,HuberRegressor,PassiveAggressiveRegressor,RANSACRegressor,OrthogonalMatchingPursuit,AdaBoostRegressor,ElasticNet,SVR
MSE,627534.7,7659561.0,161298900.0,332325300.0,423150900.0,841126900.0,2355314000.0,2814722000.0,2814724000.0,2817439000.0,2841880000.0,2975687000.0,3155678000.0,3496532000.0,3631866000.0,3759143000.0,3827813000.0,3985817000.0,16392570000.0
R^2,1.0,1.0,0.99,0.979,0.973,0.946,0.848,0.819,0.819,0.819,0.817,0.809,0.797,0.775,0.766,0.758,0.754,0.744,-0.055
MAE,132.337,1975.037,9543.504,11695.29,12219.65,21351.74,31737.25,37750.96,37750.0,37782.35,38062.06,37461.93,35487.86,37561.58,37676.83,43958.08,52842.59,42974.04,92741.44
RMSE,792.171,2767.591,12700.35,18229.79,20570.63,29002.19,48531.58,53053.96,53053.97,53079.55,53309.29,54549.86,56175.42,59131.48,60264.96,61311.85,61869.32,63133.33,128033.5
MSE,6716132000.0,2677129000.0,2027566000.0,2823976000.0,2490445000.0,2485331000.0,4786587000.0,4162009000.0,4161280000.0,4140021000.0,4097034000.0,4298016000.0,4620665000.0,4688456000.0,5158664000.0,4870913000.0,5307495000.0,5130688000.0,21073440000.0
R^2,0.657,0.863,0.897,0.856,0.873,0.873,0.756,0.788,0.788,0.789,0.791,0.781,0.764,0.761,0.737,0.752,0.729,0.738,-0.075
MAE,46840.81,30892.1,27004.78,32882.58,30367.53,31524.16,44032.48,44200.64,44197.94,44182.96,44108.85,43647.18,42233.48,42971.25,44583.15,49540.54,58438.83,46700.45,97696.22
RMSE,81952.01,51740.98,45028.5,53141.1,49904.36,49853.1,69185.16,64513.63,64507.98,64342.99,64008.08,65559.25,67975.48,68472.3,71823.84,69791.93,72852.56,71628.82,145166.9
MSE,4661619000.0,2159432000.0,1595510000.0,2182417000.0,2282189000.0,2368786000.0,3266764000.0,3146115000.0,3145392000.0,3118013000.0,3049265000.0,2974646000.0,2969031000.0,3030075000.0,3338361000.0,3485567000.0,4508074000.0,3311408000.0,14471400000.0
R^2,0.669,0.847,0.887,0.845,0.838,0.832,0.768,0.777,0.777,0.779,0.784,0.789,0.789,0.785,0.763,0.753,0.68,0.765,-0.027


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a817877f-bc24-4404-a1bc-b1f4f599a592' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>