# Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import optuna
from sklearn.model_selection import KFold
from sklearn.base import clone
import joblib
from sklearn.metrics import root_mean_squared_error, r2_score
from typing import List, Dict, Optional
import scripts.ml_utils as mlu
import fs

# Reading data

In [5]:
IMTERIM_DIR = fs.open_fs("../data/interim")
TRAIN_CSV_DIR = IMTERIM_DIR.getsyspath("use_to_train.csv")
TEST_CSV_DIR = IMTERIM_DIR.getsyspath("use_to_test.csv")
VALIDATION_CSV_DIR = IMTERIM_DIR.getsyspath("use_to_val.csv")

In [6]:
NEW_MODELS_DIR = fs.open_fs("../models/new")

In [7]:
train = pd.read_csv(TRAIN_CSV_DIR)
test = pd.read_csv(TEST_CSV_DIR)
validation = pd.read_csv(VALIDATION_CSV_DIR)

In [None]:
# var_int = train.select_dtypes(include=['int', 'float']).columns.tolist()
# feature_names_int = train.drop('price', axis=1).select_dtypes(include=['int', 'float']).columns.tolist()
# feature_names_cat = train.select_dtypes(include=['object']).columns.tolist()

In [8]:
feature_names_int = ['horsepower', 'displacement', 'torque', 'wheels', 'km', 'age']
feature_names_cat = ['navigation_system', 'rear_sensor', 'push_start', 'turbo', 'body_type']
# # var_int = feature_names_int


# Split

In [9]:
X_train = train.drop('price', axis = 1)
y_train = train['price']
X_val = validation.drop('price', axis = 1)
y_val = validation['price']

In [10]:
X_test = test.drop('price', axis = 1)
y_test = test['price']

# Pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.base import RegressorMixin, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OrdinalEncoder

In [12]:
# podriamos cambiar el nombre de la función porque hay una función con el mismo nombre en sklearn
def make_pipeline(
    regressor: RegressorMixin,
    feature_names_int: List[str], 
    feature_names_cat: List[str], 
    *, 
    cat_encoder: Optional[TransformerMixin] = None
) -> Pipeline:
    """
    Create a sklearn machine learning pipeline for preprocessing and regression.

    This function constructs a pipeline that preprocesses integer and categorical columns,
    then applies a regression model. Integer columns are scaled using `RobustScaler`, 
    and categorical columns are encoded using `OrdinalEncoder` or a custom encoder if provided.
    The regression step uses the provided regressor.

    Parameters
    ----------
    regressor : sklearn.base.RegressorMixin
        A scikit-learn compatible regressor that will be used as the final step of the pipeline.
    
    feature_names_int : list of str
        List of names of integer columns to be scaled.
    
    feature_names_cat : list of str
        List of names of categorical columns to be encoded.
    
    cat_encoder : sklearn.base.TransformerMixin, optional
        A transformer for encoding categorical features. If not provided, `OrdinalEncoder` 
        will be used by default.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A scikit-learn `Pipeline` object that performs preprocessing and regression.
        
    Notes
    -----
    - Columns not included in `feature_names_int` or `feature_names_cat` will be dropped

    Examples
    --------
    >>> from sklearn.linear_model import LinearRegression
    >>> feature_names_int = ['age', 'salary']
    >>> feature_names_cat = ['gender', 'profession']
    >>> pipeline = make_pipeline(LinearRegression(), feature_names_int, feature_names_cat)
    >>> pipeline.fit(X_train, y_train)
    >>> predictions = pipeline.predict(X_test)
    """
    return Pipeline((
        (
            "preprocessing",
            ColumnTransformer((
                ("int", RobustScaler(), feature_names_int),
                (
                    "cat",
                    OrdinalEncoder() if cat_encoder is None else cat_encoder,
                    feature_names_cat,
                ),
            ),
            verbose_feature_names_out=False),
        ),
        ("regressor", regressor),
    )).set_output(transform="pandas")

# Model comparison (no tuning)

## Catboost (predicting expected value)

In [13]:
from catboost import CatBoostRegressor

In [14]:
cat_reg = CatBoostRegressor(
    iterations=100,
    learning_rate=0.3,
    bootstrap_type =  "MVS",
    cat_features=feature_names_cat
)

catboost_pipeline = make_pipeline(cat_reg, feature_names_int, feature_names_cat, cat_encoder='passthrough')
catboost_pipeline.fit(X_train, y_train)

0:	learn: 104653.0236578	total: 48.5ms	remaining: 4.8s
1:	learn: 92407.7801589	total: 50.9ms	remaining: 2.49s
2:	learn: 80351.5487205	total: 53.2ms	remaining: 1.72s
3:	learn: 72555.3830345	total: 55.8ms	remaining: 1.34s
4:	learn: 66673.1261827	total: 58.8ms	remaining: 1.12s
5:	learn: 61792.6520823	total: 60.7ms	remaining: 951ms
6:	learn: 59295.5256495	total: 62.4ms	remaining: 829ms
7:	learn: 56573.4889009	total: 64ms	remaining: 736ms
8:	learn: 54508.0036312	total: 65.7ms	remaining: 665ms
9:	learn: 53108.8616919	total: 67.7ms	remaining: 610ms
10:	learn: 52150.1281192	total: 69.3ms	remaining: 561ms
11:	learn: 51037.3439947	total: 70.8ms	remaining: 519ms
12:	learn: 50221.0841572	total: 72.3ms	remaining: 484ms
13:	learn: 49258.9353632	total: 74ms	remaining: 454ms
14:	learn: 48097.0497997	total: 75.7ms	remaining: 429ms
15:	learn: 47889.4188183	total: 76.8ms	remaining: 403ms
16:	learn: 47333.4314287	total: 78.5ms	remaining: 383ms
17:	learn: 46568.3405680	total: 80.2ms	remaining: 365ms
18:	le

In [15]:
preds_val = catboost_pipeline.predict(X_val)
mlu.get_metrics_pd(y_val, preds_val, 'CatBoost Pipeline Validation')

Unnamed: 0,CatBoost Pipeline Validation
MSE,2667709358.05
R^2,0.811
MAE,33029.95
RMSE,51649.87


In [16]:
# save model
CATBOOST_DIR = NEW_MODELS_DIR.getsyspath('3_catboost_bcu.joblib')
joblib.dump(catboost_pipeline, CATBOOST_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/3_catboost_bcu.joblib']

## Catboost (predicting interval)

In [17]:
quantile_levels = [0.5, 0.75]
quantile_str = str(quantile_levels).replace('[','').replace(']','')

cat_int_reg = CatBoostRegressor(
    loss_function=f'MultiQuantile:alpha={quantile_str}',
    thread_count= 4,
    cat_features= feature_names_cat,
    bootstrap_type =  "MVS",
    # iterations=26, learning_rate=0.1
    iterations=1000, learning_rate=0.3
)
cat_int_pipeline = make_pipeline(cat_int_reg, feature_names_int, feature_names_cat, cat_encoder='passthrough')
cat_int_pipeline.fit(X_train, y_train)

0:	learn: 35660.6186747	total: 3.38ms	remaining: 3.38s
1:	learn: 29936.3602451	total: 7.04ms	remaining: 3.51s
2:	learn: 25878.6662261	total: 10.4ms	remaining: 3.44s
3:	learn: 22467.3144834	total: 13.5ms	remaining: 3.35s
4:	learn: 20277.3724367	total: 17ms	remaining: 3.39s
5:	learn: 18368.5207284	total: 19.9ms	remaining: 3.3s
6:	learn: 16984.7645902	total: 22.8ms	remaining: 3.23s
7:	learn: 16279.0603452	total: 25.8ms	remaining: 3.2s
8:	learn: 15723.4955916	total: 29.3ms	remaining: 3.22s
9:	learn: 15293.0381011	total: 32ms	remaining: 3.17s
10:	learn: 14874.2109815	total: 34.7ms	remaining: 3.12s
11:	learn: 14637.9370484	total: 37.3ms	remaining: 3.07s
12:	learn: 14435.6764774	total: 40.3ms	remaining: 3.06s
13:	learn: 14297.0750204	total: 43.4ms	remaining: 3.06s
14:	learn: 13928.2231723	total: 47.3ms	remaining: 3.11s
15:	learn: 13759.7752682	total: 50.6ms	remaining: 3.11s
16:	learn: 13572.3527239	total: 53.6ms	remaining: 3.1s
17:	learn: 13304.1456487	total: 56.6ms	remaining: 3.09s
18:	learn

In [18]:
inter_pred = cat_int_pipeline.predict(X_train)

predictions = y_train.to_frame(name="y_true") # the "ground truth" column
predictions["pi_median"] = inter_pred[:, 0]
predictions["pi_upper"] = inter_pred[:, 1]
predictions["avg"] = ((predictions.pi_median + predictions.pi_upper)/2)
predictions

Unnamed: 0,y_true,pi_median,pi_upper,avg
0,451999,453621.217289,465998.332332,459809.774810
1,281999,283515.207976,276730.783842,280122.995909
2,224999,224456.056119,226504.041646,225480.048882
3,171999,189828.001832,191933.105431,190880.553631
4,199999,195865.932351,199880.709152,197873.320751
...,...,...,...,...
932,319999,322194.692601,327053.522898,324624.107750
933,161999,161976.653061,162023.914230,162000.283645
934,377999,366626.371402,377811.957859,372219.164630
935,285999,273746.711700,285002.354658,279374.533179


In [19]:
preds_val = cat_int_pipeline.predict(X_val)

mlu.get_metrics_pd(y_val, preds_val[:, 0], 'Catboost Interval Pipeline Validation')

Unnamed: 0,Catboost Interval Pipeline Validation
MSE,2339653769.68
R^2,0.834
MAE,30683.91
RMSE,48369.97


In [20]:
# save model
CATBOOST_INTERVAL_DIR = NEW_MODELS_DIR.getsyspath('3_catboost_interval_bcu.joblib')
joblib.dump(cat_int_pipeline, CATBOOST_INTERVAL_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/3_catboost_interval_bcu.joblib']

## LGBM

In [21]:
lgbm_reg = LGBMRegressor(
    objective='quantile',
    alpha=0.5
)

lgbm_pipeline = make_pipeline(lgbm_reg, feature_names_int, feature_names_cat)
lgbm_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000297 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 534
[LightGBM] [Info] Number of data points in the train set: 937, number of used features: 11
[LightGBM] [Info] Start training from score 281999.000000


In [22]:
preds_val = lgbm_pipeline.predict(X_val)
mlu.get_metrics_pd(y_val, preds_val, 'LGBM Pipeline Validation')

Unnamed: 0,LGBM Pipeline Validation
MSE,2699288978.85
R^2,0.809
MAE,32586.79
RMSE,51954.68


In [23]:
# save model
LGBM_DIR = NEW_MODELS_DIR.getsyspath('3_lgbm_bcu.joblib')
joblib.dump(lgbm_pipeline, LGBM_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/3_lgbm_bcu.joblib']

# Hyperparameters

In [24]:
from sklearn.base import clone

## Catboost

In [25]:
def catboost_objective(trial:optuna.trial.Trial) -> float:
    params = {
        "regressor__n_estimators": trial.suggest_int('regressor__n_estimators', 100, 1000, log=True),
        "regressor__learning_rate": trial.suggest_float("regressor__learning_rate", 1e-3, 0.3, log=True),
        "regressor__depth": trial.suggest_int("regressor__depth", 1, 16),
        "regressor__subsample": trial.suggest_float("regressor__subsample", 0.05, 1.0),
        "regressor__colsample_bylevel": trial.suggest_float("regressor__colsample_bylevel", 0.05, 1.0),
        "regressor__min_data_in_leaf": trial.suggest_int("regressor__min_data_in_leaf", 1, 100)
    }
    
    quantile_levels = [0.5, 0.75]
    quantile_str = str(quantile_levels).replace('[','').replace(']','')

    model = CatBoostRegressor(
        loss_function=f'MultiQuantile:alpha={quantile_str}',
        thread_count= 4,
        cat_features= feature_names_cat,
        bootstrap_type =  "MVS",
        verbose=0
    )
    pipeline = make_pipeline(model, feature_names_int, feature_names_cat, cat_encoder='passthrough')
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    rmse = root_mean_squared_error(y_test, predictions[:, 0])

    return rmse

In [26]:
%%time
cat_study = optuna.create_study(direction='minimize', study_name='catboost')
cat_study.optimize(catboost_objective, n_trials=30)

[I 2024-10-20 14:59:12,151] A new study created in memory with name: catboost
[I 2024-10-20 15:04:39,866] Trial 0 finished with value: 75655.92161184641 and parameters: {'regressor__n_estimators': 613, 'regressor__learning_rate': 0.07672277391279707, 'regressor__depth': 15, 'regressor__subsample': 0.2774824255245674, 'regressor__colsample_bylevel': 0.4881405178891243, 'regressor__min_data_in_leaf': 83}. Best is trial 0 with value: 75655.92161184641.
[I 2024-10-20 15:04:40,286] Trial 1 finished with value: 107252.468361755 and parameters: {'regressor__n_estimators': 349, 'regressor__learning_rate': 0.0037014862706037555, 'regressor__depth': 5, 'regressor__subsample': 0.42682540782305756, 'regressor__colsample_bylevel': 0.14434758159827915, 'regressor__min_data_in_leaf': 92}. Best is trial 0 with value: 75655.92161184641.
[I 2024-10-20 15:05:05,481] Trial 2 finished with value: 119648.71447484223 and parameters: {'regressor__n_estimators': 124, 'regressor__learning_rate': 0.0031750576816

CPU times: user 24min 58s, sys: 16min 50s, total: 41min 48s
Wall time: 19min 19s


In [27]:
print('Best hyperparameters:', cat_study.best_params)
print('Best RMSE:', cat_study.best_value)

Best hyperparameters: {'regressor__n_estimators': 668, 'regressor__learning_rate': 0.13553706356635456, 'regressor__depth': 8, 'regressor__subsample': 0.8655892179736383, 'regressor__colsample_bylevel': 0.3802259827034561, 'regressor__min_data_in_leaf': 12}
Best RMSE: 49141.21584908728


### verify

In [28]:
cat_int_reg = CatBoostRegressor(
    loss_function=f'MultiQuantile:alpha={quantile_str}',
    thread_count= 4,
    cat_features= feature_names_cat,
    bootstrap_type =  "MVS"
)
cat_int_pipeline = make_pipeline(cat_int_reg, feature_names_int, feature_names_cat, cat_encoder='passthrough')
cat_int_pipeline.set_params(**cat_study.best_params)
cat_int_pipeline.fit(X_train, y_train)

0:	learn: 40322.0170951	total: 18.8ms	remaining: 12.5s
1:	learn: 36715.6651516	total: 30.4ms	remaining: 10.1s
2:	learn: 33377.1500881	total: 43.9ms	remaining: 9.73s
3:	learn: 29960.4944760	total: 53.7ms	remaining: 8.91s
4:	learn: 27551.5447891	total: 63.2ms	remaining: 8.38s
5:	learn: 25375.3496891	total: 70.2ms	remaining: 7.75s
6:	learn: 23951.8139913	total: 74.4ms	remaining: 7.03s
7:	learn: 22365.7975291	total: 80.8ms	remaining: 6.67s
8:	learn: 21129.6801175	total: 86.1ms	remaining: 6.3s
9:	learn: 20833.2525954	total: 87.5ms	remaining: 5.75s
10:	learn: 19669.0349867	total: 92ms	remaining: 5.5s
11:	learn: 18764.4708731	total: 96.9ms	remaining: 5.3s
12:	learn: 18235.1425528	total: 100ms	remaining: 5.04s
13:	learn: 17546.3551926	total: 103ms	remaining: 4.81s
14:	learn: 16978.0725548	total: 112ms	remaining: 4.88s
15:	learn: 16607.3510553	total: 115ms	remaining: 4.68s
16:	learn: 16024.6248142	total: 123ms	remaining: 4.71s
17:	learn: 15591.4262520	total: 127ms	remaining: 4.57s
18:	learn: 15

In [29]:
preds_val = cat_int_pipeline.predict(X_val)
mlu.get_metrics_pd(y_val, preds_val[:, 0], 'Catboost Pipeline Optuna Metrics')

Unnamed: 0,Catboost Pipeline Optuna Metrics
MSE,2538002892.9
R^2,0.82
MAE,31505.12
RMSE,50378.6


In [30]:
# save model
CATBOOST_OPTUNA_DIR = NEW_MODELS_DIR.getsyspath('3_catboost_optuna_bcu.joblib')
joblib.dump(cat_int_pipeline, CATBOOST_OPTUNA_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/3_catboost_optuna_bcu.joblib']

## LGBM

In [31]:
def lgbm_objective(trial: optuna.trial.Trial) -> float:
    params = {
        'regressor__n_estimators': trial.suggest_int('regressor__n_estimators', 100, 1000, log=True),
        'regressor__learning_rate': trial.suggest_float('regressor__learning_rate', 1e-3, 0.5, log=True),
        #'num_leaves': trial.suggest_int('num_leaves', 8, 256, log=True),
        'regressor__max_depth': trial.suggest_int('regressor__max_depth', 5, 16, log=True),
        'regressor__colsample_bytree': trial.suggest_float("regressor__colsample_bytree", 0.1, 1),
        'regressor__reg_alpha': trial.suggest_float('regressor__reg_alpha', 1e-8, 100, log=True),
        'regressor__reg_lambda': trial.suggest_float('regressor__reg_lambda', 1e-8, 100,log=True),
        'regressor__min_split_gain': trial.suggest_float('regressor__min_split_gain', 1e-8, 100,log=True),
        'regressor__subsample': trial.suggest_float("regressor__subsample", 0.1, 1),
        'regressor__min_child_samples': trial.suggest_int('regressor__min_child_samples', 20, 1000, log=True)}

    model = LGBMRegressor(
        objective='quantile',
        alpha=0.5,
        verbose=0
    )
    pipeline = make_pipeline(model, feature_names_int, feature_names_cat)
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    rmse = root_mean_squared_error(y_test, predictions)
    return rmse

In [32]:
%%time
lgbm_study = optuna.create_study(direction='minimize', study_name='lgbm')
lgbm_study.optimize(lgbm_objective, n_trials=30)

[I 2024-10-20 15:22:16,734] A new study created in memory with name: lgbm
[I 2024-10-20 15:22:16,792] Trial 0 finished with value: 145205.77794536258 and parameters: {'regressor__n_estimators': 222, 'regressor__learning_rate': 0.0013518878599362136, 'regressor__max_depth': 6, 'regressor__colsample_bytree': 0.5551330472326685, 'regressor__reg_alpha': 0.0025962783646572476, 'regressor__reg_lambda': 0.004701471361784799, 'regressor__min_split_gain': 2.5085272430400848e-05, 'regressor__subsample': 0.21110975356320702, 'regressor__min_child_samples': 482}. Best is trial 0 with value: 145205.77794536258.




[I 2024-10-20 15:22:17,041] Trial 1 finished with value: 69271.78239042893 and parameters: {'regressor__n_estimators': 416, 'regressor__learning_rate': 0.3528519602898652, 'regressor__max_depth': 8, 'regressor__colsample_bytree': 0.7880662971870225, 'regressor__reg_alpha': 0.0049398267768667355, 'regressor__reg_lambda': 1.2440087034369057e-05, 'regressor__min_split_gain': 1.6650192600975816e-05, 'regressor__subsample': 0.6102365637014188, 'regressor__min_child_samples': 121}. Best is trial 1 with value: 69271.78239042893.




[I 2024-10-20 15:22:17,186] Trial 2 finished with value: 115398.06081223358 and parameters: {'regressor__n_estimators': 305, 'regressor__learning_rate': 0.003728314130314806, 'regressor__max_depth': 13, 'regressor__colsample_bytree': 0.2944923241311966, 'regressor__reg_alpha': 1.162597799672733e-06, 'regressor__reg_lambda': 0.9550217350543614, 'regressor__min_split_gain': 0.00498341357505542, 'regressor__subsample': 0.569345719627468, 'regressor__min_child_samples': 165}. Best is trial 1 with value: 69271.78239042893.




[I 2024-10-20 15:22:17,554] Trial 3 finished with value: 63572.43295561769 and parameters: {'regressor__n_estimators': 380, 'regressor__learning_rate': 0.06727516824085869, 'regressor__max_depth': 16, 'regressor__colsample_bytree': 0.6542174158285076, 'regressor__reg_alpha': 1.6610928351724746e-07, 'regressor__reg_lambda': 0.014583519536680134, 'regressor__min_split_gain': 3.8568029336219044e-07, 'regressor__subsample': 0.9582753595054128, 'regressor__min_child_samples': 66}. Best is trial 3 with value: 63572.43295561769.




[I 2024-10-20 15:22:17,768] Trial 4 finished with value: 84868.77789874664 and parameters: {'regressor__n_estimators': 304, 'regressor__learning_rate': 0.14228701941387498, 'regressor__max_depth': 7, 'regressor__colsample_bytree': 0.48324746030008625, 'regressor__reg_alpha': 0.0032082794933017337, 'regressor__reg_lambda': 5.433878755191258, 'regressor__min_split_gain': 8.114179685556032e-05, 'regressor__subsample': 0.22290180804397483, 'regressor__min_child_samples': 264}. Best is trial 3 with value: 63572.43295561769.
[I 2024-10-20 15:22:17,841] Trial 5 finished with value: 145205.77794536258 and parameters: {'regressor__n_estimators': 192, 'regressor__learning_rate': 0.20053021342619343, 'regressor__max_depth': 5, 'regressor__colsample_bytree': 0.686203147954633, 'regressor__reg_alpha': 0.5696617224748223, 'regressor__reg_lambda': 0.01578018484837322, 'regressor__min_split_gain': 0.004578018136606538, 'regressor__subsample': 0.517376851871798, 'regressor__min_child_samples': 755}. Be



[I 2024-10-20 15:22:17,993] Trial 6 finished with value: 139622.4909403232 and parameters: {'regressor__n_estimators': 155, 'regressor__learning_rate': 0.001013248983829831, 'regressor__max_depth': 8, 'regressor__colsample_bytree': 0.4777946876343182, 'regressor__reg_alpha': 79.2794613263384, 'regressor__reg_lambda': 5.453531995341913e-07, 'regressor__min_split_gain': 1.936835178915111e-06, 'regressor__subsample': 0.8729629360134409, 'regressor__min_child_samples': 31}. Best is trial 3 with value: 63572.43295561769.
[I 2024-10-20 15:22:18,078] Trial 7 finished with value: 87860.95632417839 and parameters: {'regressor__n_estimators': 189, 'regressor__learning_rate': 0.28927160080745584, 'regressor__max_depth': 8, 'regressor__colsample_bytree': 0.9358003470420106, 'regressor__reg_alpha': 0.0003788032154801415, 'regressor__reg_lambda': 0.0960202002823174, 'regressor__min_split_gain': 6.35107253654277, 'regressor__subsample': 0.888556093735882, 'regressor__min_child_samples': 31}. Best is 



[I 2024-10-20 15:22:18,145] Trial 8 finished with value: 145205.77794536258 and parameters: {'regressor__n_estimators': 127, 'regressor__learning_rate': 0.016083891913625886, 'regressor__max_depth': 5, 'regressor__colsample_bytree': 0.5114948380598845, 'regressor__reg_alpha': 2.2056890007768405e-08, 'regressor__reg_lambda': 2.554426785628939e-08, 'regressor__min_split_gain': 6.034844344430548e-05, 'regressor__subsample': 0.23025721988891482, 'regressor__min_child_samples': 548}. Best is trial 3 with value: 63572.43295561769.
[I 2024-10-20 15:22:18,311] Trial 9 finished with value: 104932.19298219291 and parameters: {'regressor__n_estimators': 564, 'regressor__learning_rate': 0.05791330465038165, 'regressor__max_depth': 9, 'regressor__colsample_bytree': 0.20737009990579403, 'regressor__reg_alpha': 55.897733253914176, 'regressor__reg_lambda': 7.768783884225387e-07, 'regressor__min_split_gain': 0.018150228600949826, 'regressor__subsample': 0.3339511413205108, 'regressor__min_child_samples



[I 2024-10-20 15:22:19,053] Trial 10 finished with value: 67062.59314839036 and parameters: {'regressor__n_estimators': 951, 'regressor__learning_rate': 0.020692152202797937, 'regressor__max_depth': 15, 'regressor__colsample_bytree': 0.9389662855885266, 'regressor__reg_alpha': 3.530228934898749e-06, 'regressor__reg_lambda': 0.00012712273239190054, 'regressor__min_split_gain': 1.1940994029623567e-08, 'regressor__subsample': 0.7856268420916672, 'regressor__min_child_samples': 77}. Best is trial 3 with value: 63572.43295561769.




[I 2024-10-20 15:22:19,864] Trial 11 finished with value: 66269.21998434096 and parameters: {'regressor__n_estimators': 934, 'regressor__learning_rate': 0.021305379783415335, 'regressor__max_depth': 16, 'regressor__colsample_bytree': 0.9513532342859419, 'regressor__reg_alpha': 1.1017432218914835e-06, 'regressor__reg_lambda': 0.00018088647180287582, 'regressor__min_split_gain': 1.0853530946979204e-08, 'regressor__subsample': 0.9875766864911445, 'regressor__min_child_samples': 76}. Best is trial 3 with value: 63572.43295561769.




[I 2024-10-20 15:22:20,440] Trial 12 finished with value: 64792.750118644915 and parameters: {'regressor__n_estimators': 792, 'regressor__learning_rate': 0.05281673784474362, 'regressor__max_depth': 12, 'regressor__colsample_bytree': 0.7696945644261042, 'regressor__reg_alpha': 8.976841234840237e-08, 'regressor__reg_lambda': 63.875706766774954, 'regressor__min_split_gain': 1.5060501860367676e-08, 'regressor__subsample': 0.9812975052313448, 'regressor__min_child_samples': 71}. Best is trial 3 with value: 63572.43295561769.




[I 2024-10-20 15:22:21,320] Trial 13 finished with value: 60464.89987493684 and parameters: {'regressor__n_estimators': 578, 'regressor__learning_rate': 0.07509236683313009, 'regressor__max_depth': 11, 'regressor__colsample_bytree': 0.732125626098439, 'regressor__reg_alpha': 1.958437047064431e-08, 'regressor__reg_lambda': 58.62414651400406, 'regressor__min_split_gain': 3.3099675059677156e-07, 'regressor__subsample': 0.7324172027708546, 'regressor__min_child_samples': 20}. Best is trial 13 with value: 60464.89987493684.




[I 2024-10-20 15:22:22,026] Trial 14 finished with value: 59948.27636320748 and parameters: {'regressor__n_estimators': 503, 'regressor__learning_rate': 0.06356251367117921, 'regressor__max_depth': 11, 'regressor__colsample_bytree': 0.7013671336326299, 'regressor__reg_alpha': 1.0963877018310553e-08, 'regressor__reg_lambda': 23.667781594627865, 'regressor__min_split_gain': 5.091984500586278e-07, 'regressor__subsample': 0.6929480629963936, 'regressor__min_child_samples': 22}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:22,948] Trial 15 finished with value: 65862.82337226973 and parameters: {'regressor__n_estimators': 606, 'regressor__learning_rate': 0.009818898271507512, 'regressor__max_depth': 11, 'regressor__colsample_bytree': 0.8324613630422343, 'regressor__reg_alpha': 5.252533468818828e-05, 'regressor__reg_lambda': 40.0518917081256, 'regressor__min_split_gain': 6.77765105661135e-07, 'regressor__subsample': 0.7092882924970044, 'regressor__min_child_samples': 20}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:23,280] Trial 16 finished with value: 66245.1683362344 and parameters: {'regressor__n_estimators': 554, 'regressor__learning_rate': 0.1082541969392275, 'regressor__max_depth': 10, 'regressor__colsample_bytree': 0.35822620457485577, 'regressor__reg_alpha': 1.2274882935537366e-08, 'regressor__reg_lambda': 0.6064750036558476, 'regressor__min_split_gain': 0.5696485913578923, 'regressor__subsample': 0.46309836088058265, 'regressor__min_child_samples': 21}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:23,878] Trial 17 finished with value: 62103.94328183322 and parameters: {'regressor__n_estimators': 432, 'regressor__learning_rate': 0.039654968621902716, 'regressor__max_depth': 13, 'regressor__colsample_bytree': 0.6458820221703043, 'regressor__reg_alpha': 1.551974459883957e-05, 'regressor__reg_lambda': 5.239330589095085, 'regressor__min_split_gain': 1.630782771484199e-07, 'regressor__subsample': 0.7136479700948452, 'regressor__min_child_samples': 37}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:24,748] Trial 18 finished with value: 69830.7228202679 and parameters: {'regressor__n_estimators': 706, 'regressor__learning_rate': 0.006864663586760549, 'regressor__max_depth': 10, 'regressor__colsample_bytree': 0.836584907609439, 'regressor__reg_alpha': 0.0587727819022247, 'regressor__reg_lambda': 83.07343670294466, 'regressor__min_split_gain': 0.0005321953123755467, 'regressor__subsample': 0.40746603062431397, 'regressor__min_child_samples': 20}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:24,995] Trial 19 finished with value: 60380.57983073107 and parameters: {'regressor__n_estimators': 100, 'regressor__learning_rate': 0.4581145107232095, 'regressor__max_depth': 11, 'regressor__colsample_bytree': 0.7161335661132968, 'regressor__reg_alpha': 1.0000480259839533e-07, 'regressor__reg_lambda': 0.4855677623243522, 'regressor__min_split_gain': 1.8211503987035846e-06, 'regressor__subsample': 0.6590331922787672, 'regressor__min_child_samples': 40}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:25,170] Trial 20 finished with value: 71725.20559174451 and parameters: {'regressor__n_estimators': 101, 'regressor__learning_rate': 0.380160237433435, 'regressor__max_depth': 14, 'regressor__colsample_bytree': 0.5984218885115689, 'regressor__reg_alpha': 2.4009131150156386e-07, 'regressor__reg_lambda': 0.18940719769546047, 'regressor__min_split_gain': 1.1200010910501745e-05, 'regressor__subsample': 0.6507811327688082, 'regressor__min_child_samples': 127}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:25,751] Trial 21 finished with value: 62416.01691402445 and parameters: {'regressor__n_estimators': 467, 'regressor__learning_rate': 0.1348151355308881, 'regressor__max_depth': 11, 'regressor__colsample_bytree': 0.6972228951476414, 'regressor__reg_alpha': 1.1945270894876062e-08, 'regressor__reg_lambda': 5.220660580338994, 'regressor__min_split_gain': 1.1452253147170308e-07, 'regressor__subsample': 0.7839838609261153, 'regressor__min_child_samples': 43}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:26,209] Trial 22 finished with value: 62242.105540553006 and parameters: {'regressor__n_estimators': 270, 'regressor__learning_rate': 0.0350352092275283, 'regressor__max_depth': 10, 'regressor__colsample_bytree': 0.7615427638956835, 'regressor__reg_alpha': 1.4647276304618003e-07, 'regressor__reg_lambda': 7.7989697294047575, 'regressor__min_split_gain': 3.495010148728189e-06, 'regressor__subsample': 0.6825277185244507, 'regressor__min_child_samples': 28}. Best is trial 14 with value: 59948.27636320748.




[I 2024-10-20 15:22:27,256] Trial 23 finished with value: 58204.0800849659 and parameters: {'regressor__n_estimators': 683, 'regressor__learning_rate': 0.09646324731613264, 'regressor__max_depth': 12, 'regressor__colsample_bytree': 0.8861708628050526, 'regressor__reg_alpha': 4.310201011996871e-06, 'regressor__reg_lambda': 1.0917433544407043, 'regressor__min_split_gain': 1.2212697811600724e-07, 'regressor__subsample': 0.806948538127418, 'regressor__min_child_samples': 27}. Best is trial 23 with value: 58204.0800849659.




[I 2024-10-20 15:22:28,021] Trial 24 finished with value: 60146.55291722883 and parameters: {'regressor__n_estimators': 709, 'regressor__learning_rate': 0.22690815946678008, 'regressor__max_depth': 12, 'regressor__colsample_bytree': 0.869884146429936, 'regressor__reg_alpha': 2.791541324584543e-05, 'regressor__reg_lambda': 0.12036411201348059, 'regressor__min_split_gain': 8.509407444569534e-08, 'regressor__subsample': 0.8622845437776754, 'regressor__min_child_samples': 54}. Best is trial 23 with value: 58204.0800849659.




[I 2024-10-20 15:22:28,640] Trial 25 finished with value: 59932.51634163509 and parameters: {'regressor__n_estimators': 671, 'regressor__learning_rate': 0.22674602850652806, 'regressor__max_depth': 13, 'regressor__colsample_bytree': 0.8788592421026136, 'regressor__reg_alpha': 0.00010946858754767329, 'regressor__reg_lambda': 0.061539309727713376, 'regressor__min_split_gain': 1.5321066860269882e-07, 'regressor__subsample': 0.8619673632013517, 'regressor__min_child_samples': 51}. Best is trial 23 with value: 58204.0800849659.




[I 2024-10-20 15:22:29,904] Trial 26 finished with value: 58381.77805830046 and parameters: {'regressor__n_estimators': 780, 'regressor__learning_rate': 0.11531452189817858, 'regressor__max_depth': 14, 'regressor__colsample_bytree': 0.8822306813732697, 'regressor__reg_alpha': 0.00016664246650331053, 'regressor__reg_lambda': 0.014549113411270085, 'regressor__min_split_gain': 0.00023413900957922233, 'regressor__subsample': 0.837121046207936, 'regressor__min_child_samples': 25}. Best is trial 23 with value: 58204.0800849659.
[I 2024-10-20 15:22:30,175] Trial 27 finished with value: 76896.90717598062 and parameters: {'regressor__n_estimators': 795, 'regressor__learning_rate': 0.09919285869364111, 'regressor__max_depth': 14, 'regressor__colsample_bytree': 0.9945144421505236, 'regressor__reg_alpha': 0.0002857153474882521, 'regressor__reg_lambda': 0.0016120447213153093, 'regressor__min_split_gain': 0.37323627313780783, 'regressor__subsample': 0.8032117123414458, 'regressor__min_child_samples'



[I 2024-10-20 15:22:31,253] Trial 28 finished with value: 58240.35713713406 and parameters: {'regressor__n_estimators': 679, 'regressor__learning_rate': 0.21021242607386625, 'regressor__max_depth': 14, 'regressor__colsample_bytree': 0.9088910150362564, 'regressor__reg_alpha': 0.04348229046625311, 'regressor__reg_lambda': 0.017921462379380186, 'regressor__min_split_gain': 0.0002518281491007448, 'regressor__subsample': 0.9084391778707122, 'regressor__min_child_samples': 28}. Best is trial 23 with value: 58204.0800849659.
[I 2024-10-20 15:22:31,637] Trial 29 finished with value: 78289.24735653395 and parameters: {'regressor__n_estimators': 361, 'regressor__learning_rate': 0.1574567324326514, 'regressor__max_depth': 15, 'regressor__colsample_bytree': 0.9935673370804171, 'regressor__reg_alpha': 0.035252604499204226, 'regressor__reg_lambda': 0.008529218844327502, 'regressor__min_split_gain': 0.00037648824617634923, 'regressor__subsample': 0.11656565238028455, 'regressor__min_child_samples': 

CPU times: user 53.4 s, sys: 1.28 s, total: 54.7 s
Wall time: 14.9 s


In [33]:
print('Best hyperparameters:', lgbm_study.best_params)
print('Best RMSE:', lgbm_study.best_value)

Best hyperparameters: {'regressor__n_estimators': 683, 'regressor__learning_rate': 0.09646324731613264, 'regressor__max_depth': 12, 'regressor__colsample_bytree': 0.8861708628050526, 'regressor__reg_alpha': 4.310201011996871e-06, 'regressor__reg_lambda': 1.0917433544407043, 'regressor__min_split_gain': 1.2212697811600724e-07, 'regressor__subsample': 0.806948538127418, 'regressor__min_child_samples': 27}
Best RMSE: 58204.0800849659


### verify

In [34]:
lgbm_pipeline = clone(lgbm_pipeline)
lgbm_pipeline.set_params(**lgbm_study.best_params)
lgbm_pipeline.fit(X_train, y_train)



In [35]:
preds_val = lgbm_pipeline.predict(X_val)
mlu.get_metrics_pd(y_val, preds_val, 'LGBM Pipeline Optuna Metrics')

Unnamed: 0,LGBM Pipeline Optuna Metrics
MSE,2622243619.06
R^2,0.814
MAE,32007.01
RMSE,51207.85


In [36]:
# save model
LGBM_OPTUNA_DIR = NEW_MODELS_DIR.getsyspath('3_lgbm_optuna_bcu.joblib')
joblib.dump(cat_int_pipeline, LGBM_OPTUNA_DIR)

['/home/lenovo/Documents/MCD/ml1/models/new/3_lgbm_optuna_bcu.joblib']