In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
import optuna
from sklearn.ensemble import StackingRegressor
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('Train_recoded.csv')

In [3]:
#columnas_encoded = ['brand', 'model', 'transmission', 'fuel_type', 'ext_col', 'int_col']
#df = pd.get_dummies(df, columns=columnas_encoded, drop_first=True)

In [4]:
y = df['price']
X = df.drop(['price', 'engine', 'id'], axis = 1).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 42)

eval_set = [(X_train, y_train), (X_test, y_test)]

In [9]:

# Definición del espacio de búsqueda para XGBRegressor
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10)
    }

    # Create and fit the model
    model = XGBRegressor(**params, eval_metric='rmse', use_label_encoder=False)
    model.fit(X_train, y_train, verbose=False)
    
    # Predict and calculate the mean squared error
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    return mse

# Execute the optimization for XGBRegressor using Optuna
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=100)

# Retrieve the best hyperparameters for XGBRegressor
best_xgb_params = study_xgb.best_params
best_xgb_score = study_xgb.best_value

print('Mejores hiperparámetros para XGB:', best_xgb_params)
print('Mejor puntaje (MSE):', best_xgb_score)

# Create a new XGBRegressor with the best hyperparameters
best_xgb = XGBRegressor(**best_xgb_params, eval_metric='rmse', use_label_encoder=False)

# Train the model using the best hyperparameters
best_xgb.fit(X_train, y_train)

# Definición del espacio de búsqueda para CatBoostRegressor
def objective_cat(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100, log=True),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'random_seed': 42,
        'verbose': 0
    }

    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, verbose=0)
    
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    return mse

# Ejecutar la optimización para CatBoost
study = optuna.create_study(direction='minimize')
study.optimize(objective_cat, n_trials=100)

# Obtener los mejores hiperparámetros para CatBoost
best_cat_params = study.best_params
print('Mejores hiperparámetros para CatBoost:', best_cat_params)

cat_best = CatBoostRegressor(**best_cat_params)

# Definir el espacio de búsqueda para LGBMRegressor
def objective_lgbm(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 0.5),
        'verbose': -1  # Suppress the output
    }

    # Create and fit the model
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    
    # Predict and calculate the mean squared error
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    return mse

# Execute the optimization for LGBM using Optuna
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=100)

# Retrieve the best hyperparameters for LGBM
best_lgbm_params = study_lgbm.best_params
best_lgbm_score = study_lgbm.best_value

print('Mejores hiperparámetros para LGBM:', best_lgbm_params)
print('Mejor puntaje (MSE):', best_lgbm_score)

# Create a new LGBMRegressor with the best hyperparameters
best_lgbm = LGBMRegressor(**best_lgbm_params)

# Train the model using the best hyperparameters
best_lgbm.fit(X_train, y_train)

[I 2024-09-26 05:42:43,140] A new study created in memory with name: no-name-b57056f8-95de-47cc-aa8d-b652d7fa7d69
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-26 05:42:43,804] Trial 0 finished with value: 4767100657.401426 and parameters: {'n_estimators': 166, 'max_depth': 4, 'gamma': 2.141010821738531, 'reg_alpha': 4, 'reg_lambda': 8.38547099515091, 'min_child_weight': 10, 'colsample_bytree': 0.8175091275026369, 'subsample': 0.9483613156138468, 'learning_rate': 0.14524004593330783, 'scale_pos_weight': 5.696185898016408}. Best is trial 0 with value: 4767100657.401426.
Parameters: { "use_label_encoder" } are not used.

[I 2024-09-26 05:42:45,183] Trial 1 finished with value: 4775810641.432676 and parameters: {'n_estimators': 288, 'max_depth': 3, 'gamma': 1.0112062902896541, 'reg_alpha': 9, 'reg_lambda': 6.693284566757805, 'min_child_weight': 2, 'colsample_bytree': 0.7961846668778562, 'subsample': 0.8295986415730108, 'learning_rate': 0.17184241490985516, 'scale_pos_weigh

Mejores hiperparámetros para XGB: {'n_estimators': 161, 'max_depth': 4, 'gamma': 2.020691956310642, 'reg_alpha': 3, 'reg_lambda': 3.9173660396959056, 'min_child_weight': 7, 'colsample_bytree': 0.5746542339385834, 'subsample': 0.847695122206775, 'learning_rate': 0.11689572008097264, 'scale_pos_weight': 5.843682968112533}
Mejor puntaje (MSE): 4743204682.873531


[I 2024-09-26 05:44:04,137] A new study created in memory with name: no-name-446a342e-c7b3-44b6-8dd5-d6b966f0c987
[I 2024-09-26 05:44:15,731] Trial 0 finished with value: 5320044169.004744 and parameters: {'iterations': 780, 'learning_rate': 0.0906213107316255, 'depth': 8, 'l2_leaf_reg': 2.818740036840395e-05, 'border_count': 122}. Best is trial 0 with value: 5320044169.004744.
[I 2024-09-26 05:44:23,524] Trial 1 finished with value: 4775397238.936294 and parameters: {'iterations': 765, 'learning_rate': 0.23611507066880744, 'depth': 5, 'l2_leaf_reg': 67.22002891234743, 'border_count': 71}. Best is trial 1 with value: 4775397238.936294.
[I 2024-09-26 05:44:29,394] Trial 2 finished with value: 5746140736.432706 and parameters: {'iterations': 432, 'learning_rate': 0.23214650213476484, 'depth': 7, 'l2_leaf_reg': 1.900534109181725e-05, 'border_count': 130}. Best is trial 1 with value: 4775397238.936294.
[I 2024-09-26 05:44:32,547] Trial 3 finished with value: 4831954030.585116 and parameter

Mejores hiperparámetros para CatBoost: {'iterations': 441, 'learning_rate': 0.04964035320856025, 'depth': 9, 'l2_leaf_reg': 51.73348398486648, 'border_count': 232}


[I 2024-09-26 05:57:07,531] Trial 0 finished with value: 4776614588.255268 and parameters: {'n_estimators': 662, 'learning_rate': 0.012145937266774699, 'max_depth': 6, 'num_leaves': 21, 'reg_alpha': 0.18947098608123064, 'reg_lambda': 0.02008107570763218}. Best is trial 0 with value: 4776614588.255268.
[I 2024-09-26 05:57:09,513] Trial 1 finished with value: 5217603846.618408 and parameters: {'n_estimators': 617, 'learning_rate': 0.21786976257871588, 'max_depth': 8, 'num_leaves': 28, 'reg_alpha': 0.29322595628199066, 'reg_lambda': 0.03446792051773084}. Best is trial 0 with value: 4776614588.255268.
[I 2024-09-26 05:57:12,590] Trial 2 finished with value: 4925556960.781752 and parameters: {'n_estimators': 1006, 'learning_rate': 0.07202757460989723, 'max_depth': 6, 'num_leaves': 22, 'reg_alpha': 0.22156582829336707, 'reg_lambda': 0.4724541774954352}. Best is trial 0 with value: 4776614588.255268.
[I 2024-09-26 05:57:15,044] Trial 3 finished with value: 5310896017.841191 and parameters: {'

Mejores hiperparámetros para LGBM: {'n_estimators': 309, 'learning_rate': 0.10301843188596381, 'max_depth': 3, 'num_leaves': 21, 'reg_alpha': 0.41047498110076963, 'reg_lambda': 0.001551435817382256}
Mejor puntaje (MSE): 4758789930.471998


In [10]:
# Crear un VotingRegressor con los mejores modelos
estimators = [
    ('xgb', best_xgb),
    ('cat', cat_best),
    ('lgbm', best_lgbm)
]

stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge()
)

# Entrenar el modelo de stacking
stacking_model.fit(X_train, y_train)

# Predecir y evaluar el modelo de stacking
stacking_pred = stacking_model.predict(X_test)
stacking_mse = mean_squared_error(y_test, stacking_pred)
print(f'Stacking MSE: {stacking_mse}')

Parameters: { "use_label_encoder" } are not used.



0:	learn: 79815.4400378	total: 21.8ms	remaining: 9.57s
1:	learn: 79383.1027662	total: 41.7ms	remaining: 9.15s
2:	learn: 78993.0156124	total: 61.6ms	remaining: 8.99s
3:	learn: 78626.0106737	total: 79.4ms	remaining: 8.67s
4:	learn: 78296.9041092	total: 98ms	remaining: 8.54s
5:	learn: 77989.0878759	total: 117ms	remaining: 8.47s
6:	learn: 77715.3445913	total: 135ms	remaining: 8.37s
7:	learn: 77463.1836107	total: 153ms	remaining: 8.3s
8:	learn: 77232.3451267	total: 172ms	remaining: 8.27s
9:	learn: 77014.7087457	total: 191ms	remaining: 8.24s
10:	learn: 76817.2578194	total: 209ms	remaining: 8.18s
11:	learn: 76636.0220342	total: 228ms	remaining: 8.15s
12:	learn: 76462.5906645	total: 248ms	remaining: 8.17s
13:	learn: 76310.1427463	total: 268ms	remaining: 8.18s
14:	learn: 76164.4926990	total: 287ms	remaining: 8.16s
15:	learn: 76031.1138427	total: 307ms	remaining: 8.15s
16:	learn: 75904.3857803	total: 326ms	remaining: 8.13s
17:	learn: 75786.2769056	total: 347ms	remaining: 8.15s
18:	learn: 75687.9

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



0:	learn: 79306.2200460	total: 19.7ms	remaining: 8.66s
1:	learn: 78869.5470240	total: 38.4ms	remaining: 8.43s
2:	learn: 78480.5843356	total: 55.8ms	remaining: 8.14s
3:	learn: 78101.5264389	total: 71.9ms	remaining: 7.86s
4:	learn: 77763.3952443	total: 89.2ms	remaining: 7.78s
5:	learn: 77463.0530917	total: 106ms	remaining: 7.71s
6:	learn: 77188.8858741	total: 122ms	remaining: 7.59s
7:	learn: 76942.7768955	total: 139ms	remaining: 7.53s
8:	learn: 76709.8732587	total: 155ms	remaining: 7.46s
9:	learn: 76486.7484808	total: 173ms	remaining: 7.44s
10:	learn: 76291.2416947	total: 188ms	remaining: 7.36s
11:	learn: 76100.2254305	total: 206ms	remaining: 7.36s
12:	learn: 75918.3991590	total: 223ms	remaining: 7.33s
13:	learn: 75760.2065389	total: 240ms	remaining: 7.32s
14:	learn: 75611.3938821	total: 258ms	remaining: 7.33s
15:	learn: 75480.9968113	total: 275ms	remaining: 7.29s
16:	learn: 75356.9928495	total: 293ms	remaining: 7.3s
17:	learn: 75237.0556644	total: 311ms	remaining: 7.32s
18:	learn: 75126

In [11]:
y_pred = stacking_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

rmse = np.sqrt(mse)

mae = mean_absolute_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error (MSE): 4751628502.26175
Root Mean Squared Error (RMSE): 68932.05714514655
Mean Absolute Error (MAE): 19149.24678238976
R^2 Score: 0.1622475000535667


In [12]:
df_t = pd.read_csv('Test_recoded.csv')

df_test = df_t.drop(columns = ['engine', 'id'])

df_test = df_test.astype(int)

prediccion = stacking_model.predict(df_test)

df_predict = pd.DataFrame({
    'id': df_t['id'].astype(int),
    'price': prediccion
})

df_predict.to_csv('prediccion_stack_2.csv', index = False)