# Packages

In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
from sklearn.preprocessing import LabelEncoder
import pickle
import scripts.ml_utils as mlu
import fs

# Reading data

In [50]:
IMTERIM_DIR = fs.open_fs("../data/interim")
TRAIN_CSV_DIR = IMTERIM_DIR.getsyspath("use_to_train.csv")
TEST_CSV_DIR = IMTERIM_DIR.getsyspath("use_to_test.csv")
VALIDATION_CSV_DIR = IMTERIM_DIR.getsyspath("use_to_val.csv")

In [51]:
NEW_MODELS_DIR = fs.open_fs("../models/new")

In [52]:
train = pd.read_csv(TRAIN_CSV_DIR)
test = pd.read_csv(TEST_CSV_DIR)
validation = pd.read_csv(VALIDATION_CSV_DIR)

In [53]:
# Best
# feature_names_number = ['fuel_consumption_km_l', 'horsepower', 'displacement', 'gears', 'torque', 'max_speed', 'passengers', 'doors', 'wheels', 'km', 'age']
# feature_names_category = ['touch_screen', 'navigation_system','rear_sensor', 'sunroof' , 'start_stop', 'turbo', 'seat_material', 'trunk_opening',  'body_type', 'electric_parking_brake', 'electric_locks']

# feature_names_number = ['age', 'km', 'fuel_consumption_km_l', 'horsepower', 'displacement', 'torque']
# feature_names_category = ['push_start','start_stop', 'turbo', 'electric_locks', 'navigation_system']

# Test
feature_names_number = ['fuel_consumption_km_l', 'horsepower', 'displacement', 'gears', 'torque', 'doors', 'km', 'age', 'wheels']
feature_names_category = ['touch_screen', 'navigation_system','rear_sensor',  'start_stop', 'turbo', 'seat_material', 'trunk_opening',  'body_type', 'electric_parking_brake', 'electric_locks']

# feature_names_number = train.select_dtypes(include='number').columns.tolist()
# feature_names_number.remove('price')
# feature_names_category = train.select_dtypes(include='object').columns.tolist()

print('feature_names_number')
print(feature_names_number)
print('feature_names_category')
print(feature_names_category)

feature_names_number
['fuel_consumption_km_l', 'horsepower', 'displacement', 'gears', 'torque', 'doors', 'km', 'age', 'wheels']
feature_names_category
['touch_screen', 'navigation_system', 'rear_sensor', 'start_stop', 'turbo', 'seat_material', 'trunk_opening', 'body_type', 'electric_parking_brake', 'electric_locks']


# Split

In [54]:
X_train = train[feature_names_number + feature_names_category].copy()
y_train = train['price'].copy()
X_test = test[feature_names_number + feature_names_category].copy()
y_test = test['price'].copy()
X_val = validation[feature_names_number + feature_names_category].copy()
y_val = validation['price'].copy()

# Scale

In [55]:
rob_scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
    rob_scaler.fit_transform(X_train[feature_names_number]),
    columns=feature_names_number,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(
    rob_scaler.transform(X_test[feature_names_number]),
    columns=feature_names_number,
    index=X_test.index
)

X_val_scaled = pd.DataFrame(
    rob_scaler.transform(X_val[feature_names_number]),
    columns=feature_names_number,
    index=X_val.index
)

# Concatenar las variables categóricas que no fueron transformadas
X_train = pd.concat([X_train_scaled, X_train[feature_names_category]], axis=1)
X_test = pd.concat([X_test_scaled, X_test[feature_names_category]], axis=1)
X_val = pd.concat([X_val_scaled, X_val[feature_names_category]], axis=1)

# Training

## Catboost (predicting expected value)

In [56]:
ctReg = CatBoostRegressor(iterations=100, learning_rate=0.3, bootstrap_type =  "MVS")
# ctReg = CatBoostRegressor()

In [57]:
model = ctReg.fit(X_train, y_train,
        cat_features=feature_names_category, 
        # eval_set=(X_val, y_val), 
        verbose=True
)

0:	learn: 104678.1227753	total: 27.8ms	remaining: 2.75s
1:	learn: 88707.0842110	total: 47.2ms	remaining: 2.31s
2:	learn: 79017.9836556	total: 50.1ms	remaining: 1.62s
3:	learn: 70313.8904395	total: 52.4ms	remaining: 1.26s
4:	learn: 63843.4139362	total: 54.7ms	remaining: 1.04s
5:	learn: 59077.8076681	total: 56.9ms	remaining: 891ms
6:	learn: 54923.3460183	total: 59.1ms	remaining: 785ms
7:	learn: 50960.0915445	total: 62.5ms	remaining: 718ms
8:	learn: 48474.7098303	total: 64.9ms	remaining: 656ms
9:	learn: 46766.9072339	total: 67.8ms	remaining: 610ms
10:	learn: 45048.0549225	total: 70.6ms	remaining: 571ms
11:	learn: 44218.1033943	total: 73.2ms	remaining: 537ms
12:	learn: 43241.0074560	total: 75.9ms	remaining: 508ms
13:	learn: 42616.3630929	total: 79.2ms	remaining: 486ms
14:	learn: 41644.4648095	total: 81.9ms	remaining: 464ms
15:	learn: 41006.1792684	total: 87.9ms	remaining: 462ms
16:	learn: 40566.4834369	total: 91.7ms	remaining: 448ms
17:	learn: 39800.4673337	total: 94.8ms	remaining: 432ms
1

### Evaluating train

### Evaluating test

In [58]:
preds_train = model.predict(X_train)
# preds_train

In [59]:
preds_test = model.predict(X_test)
# preds_test

In [60]:
mlu.get_metrics_pd(y_train, preds_train, 'Catbost Regressor Train')

Unnamed: 0,Catbost Regressor Train
MSE,430321774.34
R^2,0.972
MAE,15284.78
RMSE,20744.2


In [61]:
mlu.get_metrics_pd(y_test, preds_test, 'Catbost Regressor Test')

Unnamed: 0,Catbost Regressor Test
MSE,2272609017.74
R^2,0.884
MAE,31019.7
RMSE,47671.89


In [110]:
CATBOOST_REGRESSOR_DIR = NEW_MODELS_DIR.getsyspath('1_catboost_regressor.joblib')
joblib.dump(model, CATBOOST_REGRESSOR_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/1_catboost_regressor.joblib']

## Replanning:
We have 1992 observations, of which there are 364 different prices. Therefore, we will proceed to perform a better regression with a multi-quantile loss function.

In [62]:
y_train.nunique()

371

## Catboost (predicting interval)

In [63]:
quantile_levels = [0.5, 0.75]
quantile_str = str(quantile_levels).replace('[','').replace(']','')

model = CatBoostRegressor(
    loss_function=f'MultiQuantile:alpha={quantile_str}',
    thread_count= 4,
    cat_features= feature_names_category,
    bootstrap_type =  "MVS",
    # iterations=26, learning_rate=0.1
    iterations=1000, learning_rate=0.3
)
_ = model.fit(X_train, y_train, verbose=True)

0:	learn: 35479.5354167	total: 6.51ms	remaining: 6.51s
1:	learn: 29215.6291494	total: 11.9ms	remaining: 5.96s
2:	learn: 25133.7583226	total: 16.8ms	remaining: 5.59s
3:	learn: 22480.8051433	total: 22.7ms	remaining: 5.66s
4:	learn: 20536.3242359	total: 29.8ms	remaining: 5.93s
5:	learn: 18969.2566186	total: 38.6ms	remaining: 6.4s
6:	learn: 17746.8518557	total: 55ms	remaining: 7.8s
7:	learn: 16731.0750076	total: 59.6ms	remaining: 7.39s
8:	learn: 15985.3722812	total: 64ms	remaining: 7.05s
9:	learn: 15389.3100675	total: 68.3ms	remaining: 6.76s
10:	learn: 14911.2871558	total: 74ms	remaining: 6.66s
11:	learn: 14525.5708358	total: 78.5ms	remaining: 6.46s
12:	learn: 14157.2049180	total: 82.9ms	remaining: 6.29s
13:	learn: 13507.7577013	total: 88.1ms	remaining: 6.21s
14:	learn: 13237.7485952	total: 92.5ms	remaining: 6.08s
15:	learn: 12940.9036873	total: 96.8ms	remaining: 5.95s
16:	learn: 12725.2250267	total: 101ms	remaining: 5.85s
17:	learn: 12540.1944507	total: 106ms	remaining: 5.76s
18:	learn: 1

### Evaluating train

In [64]:
y_train_predict = model.predict(X_train)

predictions = y_train.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_train_predict[:, 0]
predictions["pi_upper"] = y_train_predict[:, 1]
predictions["avg"] = ((predictions.pi_median + predictions.pi_upper)/2)

In [65]:
predictions

Unnamed: 0,y_true,pi_median,pi_upper,avg
0,451999,437653.783536,429163.755854,433408.769695
1,281999,304231.248362,309457.152611,306844.200486
2,224999,224920.801169,226670.342280,225795.571725
3,171999,171247.611605,178448.453247,174848.032426
4,199999,201286.107347,195481.933111,198384.020229
...,...,...,...,...
932,319999,327964.518345,335705.278755,331834.898550
933,161999,161970.381097,162000.254637,161985.317867
934,377999,369504.296445,376712.459725,373108.378085
935,285999,273664.354342,283704.087917,278684.221130


In [66]:
mlu.get_metrics_pd(predictions.y_true, predictions['pi_median'], 'Catbost Regressor Median')

Unnamed: 0,Catbost Regressor Median
MSE,422905292.46
R^2,0.973
MAE,13047.6
RMSE,20564.66


In [67]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'Catbost Regressor AVG')

Unnamed: 0,Catbost Regressor AVG
MSE,477532227.06
R^2,0.969
MAE,14285.37
RMSE,21852.51


### Evaluating test

In [68]:
y_test_predict = model.predict(X_test)

predictions = y_test.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_test_predict[:, 0]
predictions["pi_upper"] = y_test_predict[:, 1]
predictions["avg"] = ((predictions["pi_median"] + predictions["pi_upper"])/2)

In [69]:
predictions["y_true"].nunique()
# len(predictions["y_true"])

270

In [70]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'Catbost Regressor')

Unnamed: 0,Catbost Regressor
MSE,2579279828.1
R^2,0.868
MAE,33458.68
RMSE,50786.61


In [71]:
predictions

Unnamed: 0,y_true,pi_median,pi_upper,avg
0,206999,220285.355900,231202.889122,225744.122511
1,192999,188084.542816,193931.398867,191007.970841
2,791999,669100.996540,785237.260426,727169.128483
3,418999,455373.259370,488039.900148,471706.579759
4,210999,220663.236358,236372.013907,228517.625132
...,...,...,...,...
463,593999,694353.474054,701536.489778,697944.981916
464,335999,347354.707068,347096.186434,347225.446751
465,285999,269353.502780,272784.941106,271069.221943
466,462999,436279.360038,431363.221404,433821.290721


In [72]:
mlu.get_metrics_pd(predictions.y_true, predictions.pi_median, 'Catbost Regressor Median')

Unnamed: 0,Catbost Regressor Median
MSE,2492298455.32
R^2,0.873
MAE,32750.09
RMSE,49922.93


In [73]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'Catbost Regressor AVG')

Unnamed: 0,Catbost Regressor AVG
MSE,2579279828.1
R^2,0.868
MAE,33458.68
RMSE,50786.61


### Evaluating val

In [74]:
y_val_predict = model.predict(X_val)

predictions = y_val.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_val_predict[:, 0]
predictions["pi_upper"] = y_val_predict[:, 1]
predictions["avg"] = ((predictions["pi_median"] + predictions["pi_upper"])/2)

In [75]:
predictions

Unnamed: 0,y_true,pi_median,pi_upper,avg
0,172999,167720.828629,176342.951814,172031.890222
1,181999,181930.735171,189267.294360,185599.014766
2,390999,357517.816903,335256.531151,346387.174027
3,620999,702850.241514,688787.971332,695819.106423
4,229999,239980.497692,259042.918281,249511.707987
...,...,...,...,...
464,360999,349733.681226,390628.442478,370181.061852
465,125999,149877.386270,156307.573421,153092.479846
466,182999,187591.754681,195490.472225,191541.113453
467,206999,189105.004867,189037.877851,189071.441359


In [76]:
mlu.get_metrics_pd(predictions.y_true, predictions.pi_median, 'Catbost Regressor Median')

Unnamed: 0,Catbost Regressor Median
MSE,2147050791.77
R^2,0.848
MAE,30619.2
RMSE,46336.28


In [77]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'Catbost Regressor AVG')

Unnamed: 0,Catbost Regressor AVG
MSE,2402295463.27
R^2,0.83
MAE,32075.87
RMSE,49013.22


#### Saving model

In [78]:
CATBOOST_INTERVAL_DIR = NEW_MODELS_DIR.getsyspath('2_catboost_interval.sav')
joblib.dump(model, CATBOOST_INTERVAL_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/2_catboost_interval.sav']

In [79]:
#To Load
loaded_model = pickle.load(open(CATBOOST_INTERVAL_DIR, 'rb'))

## LGBM

### Label encoder

In [80]:
label_encoders = {}
for col in X_test.columns: 
    if X_test[col].dtype == 'object':
        label_encoders[col] = LabelEncoder()
        X_test[col + '_encoded'] = label_encoders[col].fit_transform(X_test[col])

In [81]:
encoded_cols = list(X_train.filter(regex='_encoded').columns)
encoded_cols = list(X_test.filter(regex='_encoded').columns)
encoded_cols = list(X_val.filter(regex='_encoded').columns)

In [82]:
X_train = X_train[encoded_cols + feature_names_number]
X_test = X_test[encoded_cols + feature_names_number]
X_val = X_val[encoded_cols + feature_names_number]

In [83]:
len(X_test.columns)

9

### Model

In [84]:
quantile_alphas = [0.5, 0.75]

lgb_quantile_alphas = {}
for quantile_alpha in quantile_alphas:
    lgb = LGBMRegressor(objective='quantile', alpha=quantile_alpha, n_estimators=1000, learning_rate=0.3)
    lgb.fit(X_test, y_test)
    lgb_quantile_alphas[quantile_alpha] = lgb

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 379
[LightGBM] [Info] Number of data points in the train set: 468, number of used features: 9
[LightGBM] [Info] Start training from score 283999.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 379
[LightGBM] [Info] Number of data points in the train set: 468, number of used features: 9
[LightGBM] [Info] Start training from score 374999.000000


In [85]:
lgb_quantile_alphas

{0.5: LGBMRegressor(alpha=0.5, learning_rate=0.3, n_estimators=1000,
               objective='quantile'),
 0.75: LGBMRegressor(alpha=0.75, learning_rate=0.3, n_estimators=1000,
               objective='quantile')}

In [86]:
lgb_quantile_alphas[0.75]

### Evaluating train

In [87]:
y_train_predict_5 = lgb_quantile_alphas[0.5].predict(X_train)
y_train_predict_75 = lgb_quantile_alphas[0.75].predict(X_train)

predictions = y_train.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_train_predict_5
predictions["pi_upper"] =y_train_predict_75
predictions["avg"] = ((predictions.pi_median + predictions.pi_upper)/2)

In [88]:
mlu.get_metrics_pd(predictions.y_true, predictions.pi_median, 'Catbost Regressor Median')

Unnamed: 0,Catbost Regressor Median
MSE,2876558979.03
R^2,0.815
MAE,35058.81
RMSE,53633.56


In [89]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'Catbost Regressor AVG')

Unnamed: 0,Catbost Regressor AVG
MSE,2873605110.72
R^2,0.815
MAE,35961.07
RMSE,53606.02


### Evaluating test

In [90]:
y_test_predict_5 = lgb_quantile_alphas[0.5].predict(X_test)
y_test_predict_75 = lgb_quantile_alphas[0.75].predict(X_test)

predictions = y_test.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_test_predict_5
predictions["pi_upper"] = y_test_predict_75
predictions["avg"] = ((predictions.pi_median + predictions.pi_upper)/2)

In [91]:
mlu.get_metrics_pd(predictions.y_true, predictions.pi_median, 'LGBM Regressor Median')

Unnamed: 0,LGBM Regressor Median
MSE,1053991149.75
R^2,0.946
MAE,12514.31
RMSE,32465.23


In [92]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'LGBM Regressor AVG')

Unnamed: 0,LGBM Regressor AVG
MSE,819028782.01
R^2,0.958
MAE,14171.73
RMSE,28618.68


#### Saving model

In [93]:
LGBM_INTERVAL_DIR = NEW_MODELS_DIR.getsyspath('2_lgbm_interval.sav')
joblib.dump(lgb_quantile_alphas, LGBM_INTERVAL_DIR)

['2_lgbm_interval.sav']

In [94]:
#To Load
loaded_model = pickle.load(open(LGBM_INTERVAL_DIR, 'rb'))

In [95]:
loaded_model[0.5]

## Regression


### Label encoder

In [96]:
label_encoders = {}
for col in X_test.columns: 
    if X_test[col].dtype == 'object':
        label_encoders[col] = LabelEncoder()
        X_test[col + '_encoded'] = label_encoders[col].fit_transform(X_test[col])

In [97]:
encoded_cols = list(X_train.filter(regex='_encoded').columns)
encoded_cols = list(X_test.filter(regex='_encoded').columns)
encoded_cols = list(X_val.filter(regex='_encoded').columns)

In [98]:
X_train = X_train[encoded_cols + feature_names_number]
X_test = X_test[encoded_cols + feature_names_number]
X_val = X_val[encoded_cols + feature_names_number]

### Model

In [99]:
quantile_alphas = [0.5, 0.75]

qr_quantile_alphas = {}
for quantile_alpha in quantile_alphas:
    qr = QuantileRegressor(quantile=quantile_alpha, alpha=0.1)
    qr.fit(X_test, y_test)
    qr_quantile_alphas[quantile_alpha] = qr

In [100]:
qr_quantile_alphas

{0.5: QuantileRegressor(alpha=0.1),
 0.75: QuantileRegressor(alpha=0.1, quantile=0.75)}

In [101]:
qr_quantile_alphas[0.75]

### Evaluating train

In [102]:
y_train_predict_5 = qr_quantile_alphas[0.5].predict(X_train)
y_train_predict_75 = qr_quantile_alphas[0.75].predict(X_train)

predictions = y_train.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_train_predict_5
predictions["pi_upper"] = y_train_predict_75
predictions["avg"] = ((predictions.pi_median + predictions.pi_upper)/2)

In [103]:
mlu.get_metrics_pd(predictions.y_true, predictions.pi_median, 'LGBM Regressor Median')

Unnamed: 0,LGBM Regressor Median
MSE,7185386683.46
R^2,0.538
MAE,57607.5
RMSE,84766.66


In [104]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'LGBM Regressor AVG')

Unnamed: 0,LGBM Regressor AVG
MSE,6696399282.63
R^2,0.569
MAE,61186.65
RMSE,81831.53


### Evaluating test

In [105]:
y_test_predict_5 = qr_quantile_alphas[0.5].predict(X_test)
y_test_predict_75 = qr_quantile_alphas[0.75].predict(X_test)

predictions = y_test.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = y_test_predict_5
predictions["pi_upper"] = y_test_predict_75
predictions["avg"] = ((predictions.pi_median + predictions.pi_upper)/2)

In [106]:
mlu.get_metrics_pd(predictions.y_true, predictions.pi_median, 'LGBM Regressor Median')

Unnamed: 0,LGBM Regressor Median
MSE,7994678158.52
R^2,0.592
MAE,55846.65
RMSE,89412.96


In [107]:
mlu.get_metrics_pd(predictions.y_true, predictions.avg, 'LGBM Regressor AVG')

Unnamed: 0,LGBM Regressor AVG
MSE,6884461387.82
R^2,0.649
MAE,59036.18
RMSE,82972.65


In [108]:
LGBM_INTERVAL_DIR = NEW_MODELS_DIR.getsyspath('2_regression_interval.sav')
joblib.dump(qr_quantile_alphas, LGBM_INTERVAL_DIR)

['2_regression_interval.sav']