# 🤖 CatBoost model

**best_parmms = Best Parameters: {'depth': 4, 'learning_rate': 0.1119422098928344, 'l2_leaf_reg': 0.4985501174669087, 'random_strength': 0.07988380994191854, 'bagging_temperature': 0.139837561570809, 'iterations': 310}**

In [25]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
import catboost as cb
import optuna 
import lightgbm as lgb

In [27]:
# --- Cargar el DataFrame desde el archivo CSV ---
data= pd.read_csv('../data/data_imputed/data_processed_newdata2.csv')

In [28]:
'''
Seleccionamos todas las filas de 'data' donde el valor de la columna 'x_e_out [-]' no es nulo, y lo almacenamos en un nuevo dataframe 'y_no_null'.
'''
no_null = data[data['x_e_out [-]'].notnull()]
no_null.reset_index(drop=True, inplace=True)
no_null= no_null.drop('id', axis=1)

'''
Seleccionamos todas las filas de 'data' donde el valor de la columna 'x_e_out [-]' es nulo, y lo almacenamos en un nuevo dataframe 'target_null'.
'''
null = data[data['x_e_out [-]'].isnull()]
null.reset_index(drop=True, inplace=True)

In [29]:
# Creamos una copia del DataFrame original
no_null = no_null.copy()

In [31]:
# Variables independientes
X = no_null.drop('x_e_out [-]', axis=1)
# Variable dependiente
y = no_null['x_e_out [-]']

import re 
# Preprocess column names
X.columns = [re.sub(r'[\[\]<>\s]+', '_', col) for col in X.columns]
X.columns

import re 
# Preprocess column names
y = pd.DataFrame(y)
y.columns = [re.sub(r'[\[\]<>\s]+', '_', col) for col in y.columns]
y.columns

# Dividimos los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
def objective(trial, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    # Define the hyperparameter search space
    params = {
        'loss_function': 'RMSE',
        'depth': trial.suggest_int('depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 0.1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 0.5),
        'grow_policy': 'Lossguide',
        'iterations': trial.suggest_int('iterations', 100, 500),
        'early_stopping_rounds': 100,
        'verbose': False
    }

    
    X = no_null.drop('x_e_out [-]', axis=1)
    y = no_null['x_e_out [-]']
    
    import re 
    # Preprocess column names
    X.columns = [re.sub(r'[\[\]<>\s]+', '_', col) for col in X.columns]
    # Preprocess column names
    y = pd.DataFrame(y)
    y.columns = [re.sub(r'[\[\]<>\s]+', '_', col) for col in y.columns]

    # Dividimos los datos en entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Entrenamos el modelo
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    
    # Predecimos los valores de 'y' para los datos de prueba
    y_pred = model.predict(X_test)
    
    # Calculamos el rmse
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    return rmse

# Creamos el estudio
study_catboost_rmse = optuna.create_study(direction='minimize')

# Buscamo los mejores hiperparámetros
study_catboost_rmse.optimize(objective, n_trials=100)

# Mejores hiperparámetros
best_params = study_catboost_rmse.best_params
best_rmse = study_catboost_rmse.best_value
print(f"Best Parameters: {best_params}")
print(f"Best RMSE: {best_rmse}")

[32m[I 2023-05-29 00:00:17,771][0m A new study created in memory with name: no-name-eff5460c-1d97-4f47-89a9-d4fb0d2f8e0c[0m
[32m[I 2023-05-29 00:00:19,489][0m Trial 0 finished with value: 0.07469819674403377 and parameters: {'depth': 4, 'learning_rate': 0.1119422098928344, 'l2_leaf_reg': 0.4985501174669087, 'random_strength': 0.07988380994191854, 'bagging_temperature': 0.139837561570809, 'iterations': 310}. Best is trial 0 with value: 0.07469819674403377.[0m
[32m[I 2023-05-29 00:00:21,576][0m Trial 1 finished with value: 0.07506370009825239 and parameters: {'depth': 6, 'learning_rate': 0.11797493671956065, 'l2_leaf_reg': 0.5053781640651808, 'random_strength': 0.018129894607592702, 'bagging_temperature': 0.33309986440257205, 'iterations': 297}. Best is trial 0 with value: 0.07469819674403377.[0m
[32m[I 2023-05-29 00:00:23,165][0m Trial 2 finished with value: 0.07508525158698041 and parameters: {'depth': 8, 'learning_rate': 0.16805043272602863, 'l2_leaf_reg': 0.983886644522208

Best Parameters: {'depth': 4, 'learning_rate': 0.1119422098928344, 'l2_leaf_reg': 0.4985501174669087, 'random_strength': 0.07988380994191854, 'bagging_temperature': 0.139837561570809, 'iterations': 310}
Best RMSE: 0.07469819674403377


#### Utilizamos los mejores hiperparámetros para entrenar el modelo LightGBM

In [34]:
X_train.shape

(16983, 17)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
catboost_model = CatBoostRegressor(**best_params)
catboost_model.fit(X_train, y_train)
y_pred_catboost = catboost_model.predict(X_test)
# Calculamos rmse
rmse = mean_squared_error(y_test, y_pred_catboost, squared=False)
print("RMSE:", rmse)

0:	learn: 0.0975589	total: 4.21ms	remaining: 1.3s
1:	learn: 0.0947072	total: 7.54ms	remaining: 1.16s
2:	learn: 0.0923642	total: 10.8ms	remaining: 1.11s
3:	learn: 0.0904278	total: 14.1ms	remaining: 1.08s
4:	learn: 0.0887313	total: 17.3ms	remaining: 1.05s
5:	learn: 0.0873584	total: 20.2ms	remaining: 1.02s
6:	learn: 0.0861662	total: 23.8ms	remaining: 1.03s
7:	learn: 0.0852405	total: 27.1ms	remaining: 1.02s
8:	learn: 0.0843773	total: 30.7ms	remaining: 1.03s
9:	learn: 0.0836976	total: 33.5ms	remaining: 1s
10:	learn: 0.0830846	total: 36.3ms	remaining: 987ms
11:	learn: 0.0825154	total: 39.6ms	remaining: 983ms
12:	learn: 0.0820330	total: 42.4ms	remaining: 968ms
13:	learn: 0.0815551	total: 45.6ms	remaining: 964ms
14:	learn: 0.0812044	total: 48.3ms	remaining: 949ms
15:	learn: 0.0808292	total: 50.9ms	remaining: 935ms
16:	learn: 0.0805256	total: 53.8ms	remaining: 928ms
17:	learn: 0.0802445	total: 56.6ms	remaining: 918ms
18:	learn: 0.0800043	total: 59ms	remaining: 904ms
19:	learn: 0.0798115	total: 

In [36]:
# Calculamos el RMSE
rmse = mean_squared_error(y_test, y_pred_catboost, squared=False)
print("RMSE:", rmse)

RMSE: 0.07500536527008682
