In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the feature-selected training set; the first CSV column is used as the row index.
data = pd.read_csv('../data/FS_final_train.csv', index_col=0)

# Target (log of the sale price) and the remaining columns as predictors.
y = data['log_SalePrice']
X = data.drop(columns=['log_SalePrice'])

# Standardise every predictor and discard rows that still contain NaN afterwards.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows in this notebook, so test-set statistics leak into the scaling.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
).dropna()

# Remove from the target the rows that were dropped from the scaled features.
# (`y_scaled` is only row-aligned with `X_scaled`; its values are not rescaled.)
dropped_rows = y.index.difference(X_scaled.index)
y_scaled = y.drop(index=dropped_rows)

In [None]:
# Correlation heat map of the standardised features (lower triangle only).
# The Pearson matrix is computed once and reused for both the plotted values
# and the upper-triangle mask instead of being recomputed inside the call.
feature_corr = X_scaled.corr(method='pearson')
upper_triangle_mask = np.triu(np.ones_like(feature_corr, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(
    feature_corr,
    mask=upper_triangle_mask,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 8},
    cmap='coolwarm',
    linewidths=0.5,
    cbar=False,
)
plt.title('Mapa de calor de correlación')
plt.show()

In [None]:
# # De las correlaciones altas nos quedamos con las que tienen mas correlacion con la target
# corr_matrix = X_scaled.corr()

# #Eliminar las diagonales
# corr_matrix = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# corr_matrix = corr_matrix.stack().reset_index()
# corr_matrix.columns = ['Feature 1', 'Feature 2', 'Correlation']

# # De las parejas con abs(correlacion > 0.9) nos quedamos con la que tiene mas correlacion con la target
# corr_matrix = corr_matrix[corr_matrix['Correlation'].abs() > 0.9]
# corr_matrix = corr_matrix.sort_values(by='Correlation', ascending=False)

# # de la pareja eliminar la que menos corr tenga con la target y, usar corrwith
# features_to_drop = []
# for index, row in corr_matrix.iterrows():
#     feature1 = row['Feature 1']
#     feature2 = row['Feature 2']
#     if y.corr(X[feature1]) < y.corr(X[feature2]):
#         features_to_drop.append(feature1)
#     else:
#         features_to_drop.append(feature2)

# X_scaled = X_scaled.drop(columns=features_to_drop)

### Empleando X_scaled

In [None]:
import optuna
import xgboost as xgb

# Hold out 20% of the standardised rows as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

# Shared Optuna configuration
def create_study(name, version=None, seed=None):
    '''
    Create, or reopen if it already exists, the SQLite-backed Optuna study.

    The study minimises two objectives. No pruner is configured: Optuna does
    not support pruning for multi-objective studies and the objective functions
    in this notebook never report intermediate values, so a pruner would be
    dead configuration.
    -------
    Parameters:
    - name: str, prefix used for both the study name and the database file.
    - version: suffix appended to the study name and the database file name.
    - seed: optional int forwarded to the TPE sampler so a search can be
      reproduced; the default (None) keeps the sampler unseeded.
    -------
    Returns:
    - optuna.Study stored at ../models/{name}_housing_{version}.db
    '''
    study_id = f'{name}_housing_{version}'
    return optuna.create_study(
        study_name=study_id,
        storage=f'sqlite:///../models/{study_id}.db',
        directions=['minimize', 'minimize'],
        sampler=optuna.samplers.TPESampler(seed=seed),
        load_if_exists=True,
    )

def objective_xgb(trial):
    '''
    Optuna objective: 5-fold cross-validated XGBoost regression on the
    notebook-level training split (`X_train`, `y_train`).
    -------
    Parameters:
    - trial: optuna.Trial used to sample the hyper-parameters.
    -------
    Returns:
    - (mean_rmse, diff_rmse): the lowest mean validation RMSE across boosting
      rounds, and the absolute train/validation RMSE gap at that same round.
    '''
    # Search space. `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform` and samples from the same log-uniform range.
    params_xgb = {
        'objective': 'reg:squarederror',
        'device': 'cuda',
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'tree_method': 'hist',
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0, log=True),
        'n_jobs': -1,
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'max_bin': trial.suggest_int('max_bin', 256, 1024),
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'eval_metric': 'rmse'
    }

    # Training split in DMatrix format.
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=X_train.columns.tolist())

    # Cross-validation with early stopping. Per-round logging is disabled:
    # with many trials it would flood the notebook output.
    result_xgb = xgb.cv(
        params_xgb,
        dtrain,
        num_boost_round=1000,
        nfold=5,
        stratified=False,
        early_stopping_rounds=100,
        seed=42,
        verbose_eval=False,
        as_pandas=True
    )

    # Best (lowest) mean validation RMSE, and the train/validation gap measured
    # at that same boosting round rather than between the minima of two
    # possibly different rounds.
    best_row = result_xgb['test-rmse-mean'].idxmin()
    mean_rmse = float(result_xgb.loc[best_row, 'test-rmse-mean'])
    diff_rmse = float(np.abs(mean_rmse - result_xgb.loc[best_row, 'train-rmse-mean']))
    return mean_rmse, diff_rmse

# Reopen (or create) the stored study for the feature-selected data.
# The optimisation call is kept commented out; the trials are read from storage.
study_xgb = create_study(name='xgb_FS', version='1.1')
# study_xgb.optimize(objective_xgb, n_trials=100, show_progress_bar=True, gc_after_trial=True)


In [None]:
# Study summary. `best_trial`, `best_value` and `best_params` are only available
# for single-objective studies, so those calls stay disabled and the five trials
# with the lowest first objective are shown instead.
print(f'Direcciones de optimización: {study_xgb.directions}')
print(f'Número de ensayos: {len(study_xgb.trials)}')
# print(f'Mejor ensayo: {study_xgb.best_trial}')  # single-objective studies only
# print(f'Mejor valor: {study_xgb.best_value}')  # single-objective studies only
# print(f'Mejor hiperparámetros: {study_xgb.best_params}')  # single-objective studies only
study_xgb_df = study_xgb.trials_dataframe()
study_xgb_df.sort_values(by='values_0', ascending=True).head(5)

In [None]:
def obtain_best_params(study_xgb, weights:np.ndarray) -> tuple:
    '''
    Select one trial of a multi-objective study by the weighted average of its
    objective values and return its hyper-parameters and objective values.
    -------
    Parameters:
    - study_xgb: optuna.Study, the study to read trials from.
    - weights: np.ndarray, one weight per optimised metric.
    -------
    Returns:
    - best_params: dict, hyper-parameters of the selected trial.
    - best_values: objective values of the selected trial.
    -------
    Raises:
    - ValueError: if no trial has a complete set of objective values.
    '''
    study_df = study_xgb.trials_dataframe()
    value_cols = [col for col in study_df.columns if col.startswith('values_')]
    if study_df.empty or not value_cols:
        raise ValueError('The study has no trials with objective values.')

    # Trials without objective values (failed, running) yield NaN and are ignored.
    weighted = pd.Series(
        np.average(study_df[value_cols].to_numpy(dtype=float), weights=weights, axis=1),
        index=study_df.index,
    )
    if weighted.isna().all():
        raise ValueError('The study has no trials with objective values.')

    # Resolve the winner through its trial number and look it up among ALL
    # trials: the weighted minimum is not guaranteed to be on the Pareto front
    # (`best_trials`) when a weight is zero or objective values are tied.
    best_number = int(study_df.loc[weighted.idxmin(), 'number'])
    best_trial = next(trial for trial in study_xgb.trials if trial.number == best_number)
    return best_trial.params, best_trial.values

In [None]:
# Weight the validation RMSE at 0.75 and the train/validation gap at 0.25
# when choosing the trial to train with.
best_params, best_values = obtain_best_params(
    study_xgb,
    weights=np.array([0.75, 0.25]),
)
print(f'Mejores hiperparámetros: {best_params}')
print(f'Mejores valores: Mean rmse Validation: {best_values[0]}, Mean Diff (Val - Train) rmse: {best_values[1]}')

In [None]:
import numpy as np
import xgboost as xgb
from xgboost import callback
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Helper that builds a DMatrix
def create_dmatrix(X, y):
    """Wrap features `X` and labels `y` in an `xgb.DMatrix`, naming the features after `X`'s columns."""
    column_names = X.columns.tolist()
    return xgb.DMatrix(X, label=y, feature_names=column_names)

# Carve a validation fold out of the training split; it is the only data
# allowed to drive early stopping.
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# DMatrix containers for every split.
dtrain = create_dmatrix(X_train, y_train)  # full training split; not used by the fit below
dtrain_val = create_dmatrix(X_train_val, y_train_val)
dval = create_dmatrix(X_val, y_val)
dtest = create_dmatrix(X_test, y_test)

# Model parameters: tuned hyper-parameters plus the fixed training settings.
params = {
    **best_params,
    'objective': 'reg:squarederror',
    'eval_metric': ['rmse'],
    'random_state': 42,
    'n_jobs': -1,
}

# Training configuration. The test split is evaluated for monitoring only.
num_boost_round = 1000
early_stopping_rounds = 300
evals = [(dtrain_val, 'train'), (dval, 'val'), (dtest, 'test')]

# Early stopping is bound explicitly to the validation split, never to the test split.
EarlyStopping_callback = callback.EarlyStopping(metric_name='rmse', data_name='val', maximize=False, save_best=True, rounds=early_stopping_rounds, min_delta=0.001)

# Train the model. `early_stopping_rounds` is deliberately NOT passed to
# `xgb.train`: doing so makes it append a second, default EarlyStopping callback
# that monitors the last entry of `evals` (the test split), which would let the
# test data decide when training stops.
evals_result = {}
model = xgb.train(
    params,
    dtrain_val,
    num_boost_round=num_boost_round,
    evals=evals,
    evals_result=evals_result,
    callbacks=[EarlyStopping_callback],
    verbose_eval=100  # print metrics every 100 rounds
)

n_rounds = len(evals_result['train']['rmse'])

# `best_iteration` is a 0-based round index, so it indexes the metric history directly.
best_round = model.best_iteration
train_rmse_at_best = evals_result['train']['rmse'][best_round]
val_rmse_at_best = evals_result['val']['rmse'][best_round]
test_rmse_at_best = evals_result['test']['rmse'][best_round]

print(f"\nMejor ronda: {best_round}")
print(f"Mejor rmse en train: {train_rmse_at_best:.6f}")
print(f"Mejor rmse en validación: {val_rmse_at_best:.6f}")
print(f"Mejor rmse en test: {test_rmse_at_best:.6f}")
print(f"Diferencia entre train y test: {test_rmse_at_best - train_rmse_at_best:.6f}")

import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn styling
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)

fig, ax = plt.subplots(figsize=(12, 6))

# Learning curves; the x axis counts boosting rounds starting at 1.
rounds_axis = range(1, n_rounds + 1)
for split_name, legend_label in (('train', 'Train'), ('val', 'Validación'), ('test', 'Test')):
    sns.lineplot(x=rounds_axis, y=evals_result[split_name]['rmse'], label=legend_label, ax=ax)

# `best_round` holds the model's 0-based `best_iteration`: it indexes the metric
# lists directly and sits at position best_round + 1 on the 1-based x axis.
best_round_x = best_round + 1
train_rmse_at_best = evals_result['train']['rmse'][best_round]
test_rmse_at_best = evals_result['test']['rmse'][best_round]

# Vertical marker at the best round
ax.axvline(x=best_round_x, color='r', linestyle='--', label='Best Iteration')

# Annotate the train/test gap halfway between the two curves
ax.text(best_round_x, (test_rmse_at_best + train_rmse_at_best) / 2,
        f"Gap: {test_rmse_at_best - train_rmse_at_best:.6f}",
        color='black', ha='center', va='center', backgroundcolor='white')

# Limit the y axis to the 0 - 0.5 range
ax.set_ylim(0, 0.5)

# Labels and title
ax.set_xlabel('Iterations')
ax.set_ylabel('RMSE')
ax.set_title('XGBoost Learning Curve')

ax.legend(loc='best')

plt.tight_layout()
plt.show()

### Empleando X

In [None]:
import optuna
import xgboost as xgb

# Hold out 20% of the unscaled rows as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shared Optuna configuration
def create_study(name, version=None, seed=None):
    '''
    Create, or reopen if it already exists, the SQLite-backed Optuna study.

    The study minimises two objectives. No pruner is configured: Optuna does
    not support pruning for multi-objective studies and the objective functions
    in this notebook never report intermediate values, so a pruner would be
    dead configuration.
    -------
    Parameters:
    - name: str, prefix used for both the study name and the database file.
    - version: suffix appended to the study name and the database file name.
    - seed: optional int forwarded to the TPE sampler so a search can be
      reproduced; the default (None) keeps the sampler unseeded.
    -------
    Returns:
    - optuna.Study stored at ../models/{name}_housing_{version}.db
    '''
    study_id = f'{name}_housing_{version}'
    return optuna.create_study(
        study_name=study_id,
        storage=f'sqlite:///../models/{study_id}.db',
        directions=['minimize', 'minimize'],
        sampler=optuna.samplers.TPESampler(seed=seed),
        load_if_exists=True,
    )

def objective_xgb(trial):
    '''
    Optuna objective: 5-fold cross-validated XGBoost regression on the
    notebook-level training split (`X_train`, `y_train`).
    -------
    Parameters:
    - trial: optuna.Trial used to sample the hyper-parameters.
    -------
    Returns:
    - (mean_rmse, diff_rmse): the lowest mean validation RMSE across boosting
      rounds, and the absolute train/validation RMSE gap at that same round.
    '''
    # Search space. `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform` and samples from the same log-uniform range.
    params_xgb = {
        'objective': 'reg:squarederror',
        'device': 'gpu',
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'tree_method': 'hist',
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.5, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 1.0, log=True),
        'n_jobs': -1,
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'max_bin': trial.suggest_int('max_bin', 256, 1024),
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'eval_metric': 'rmse'
    }

    # Training split in DMatrix format.
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=X_train.columns.tolist())

    # Cross-validation with early stopping. Per-round logging is disabled:
    # with many trials it would flood the notebook output.
    result_xgb = xgb.cv(
        params_xgb,
        dtrain,
        num_boost_round=1000,
        nfold=5,
        early_stopping_rounds=100,
        seed=42,
        verbose_eval=False,
        as_pandas=True
    )

    # Best (lowest) mean validation RMSE, and the train/validation gap measured
    # at that same boosting round rather than between the minima of two
    # possibly different rounds.
    best_row = result_xgb['test-rmse-mean'].idxmin()
    mean_rmse = float(result_xgb.loc[best_row, 'test-rmse-mean'])
    diff_rmse = float(np.abs(mean_rmse - result_xgb.loc[best_row, 'train-rmse-mean']))
    return mean_rmse, diff_rmse

# Create (or reopen) the study for the unscaled features and run 100 trials.
study_xgb = create_study(name='xgb_NO_FS', version='1.0')
study_xgb.optimize(
    objective_xgb,
    n_trials=100,
    show_progress_bar=True,
    gc_after_trial=True,
)

In [None]:
# Study summary. `best_trial`, `best_value` and `best_params` are only available
# for single-objective studies, so those calls stay disabled and the five trials
# with the lowest first objective are shown instead.
print(f'Direcciones de optimización: {study_xgb.directions}')
print(f'Número de ensayos: {len(study_xgb.trials)}')
# print(f'Mejor ensayo: {study_xgb.best_trial}')  # single-objective studies only
# print(f'Mejor valor: {study_xgb.best_value}')  # single-objective studies only
# print(f'Mejor hiperparámetros: {study_xgb.best_params}')  # single-objective studies only
study_xgb_df = study_xgb.trials_dataframe()
study_xgb_df.sort_values(by='values_0', ascending=True).head(5)

In [None]:

def obtain_best_params(study_xgb, weights:np.ndarray) -> tuple:
    '''
    Select one trial of a multi-objective study by the weighted average of its
    objective values and return its hyper-parameters and objective values.
    -------
    Parameters:
    - study_xgb: optuna.Study, the study to read trials from.
    - weights: np.ndarray, one weight per optimised metric.
    -------
    Returns:
    - best_params: dict, hyper-parameters of the selected trial.
    - best_values: objective values of the selected trial.
    -------
    Raises:
    - ValueError: if no trial has a complete set of objective values.
    '''
    study_df = study_xgb.trials_dataframe()
    value_cols = [col for col in study_df.columns if col.startswith('values_')]
    if study_df.empty or not value_cols:
        raise ValueError('The study has no trials with objective values.')

    # Trials without objective values (failed, running) yield NaN and are ignored.
    weighted = pd.Series(
        np.average(study_df[value_cols].to_numpy(dtype=float), weights=weights, axis=1),
        index=study_df.index,
    )
    if weighted.isna().all():
        raise ValueError('The study has no trials with objective values.')

    # Resolve the winner through its trial number and look it up among ALL
    # trials: the weighted minimum is not guaranteed to be on the Pareto front
    # (`best_trials`) when a weight is zero or objective values are tied.
    best_number = int(study_df.loc[weighted.idxmin(), 'number'])
    best_trial = next(trial for trial in study_xgb.trials if trial.number == best_number)
    return best_trial.params, best_trial.values

# Weight the validation RMSE at 0.75 and the train/validation gap at 0.25
# when choosing the trial to train with.
best_params, best_values = obtain_best_params(
    study_xgb,
    weights=np.array([0.75, 0.25]),
)
print(f'Mejores hiperparámetros: {best_params}')
print(f'Mejores valores: Mean rmse Validation: {best_values[0]}, Mean Diff (Val - Train) rmse: {best_values[1]}')

In [None]:

import numpy as np
import xgboost as xgb
from xgboost import callback
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Helper that builds a DMatrix
def create_dmatrix(X, y):
    """Wrap features `X` and labels `y` in an `xgb.DMatrix`, naming the features after `X`'s columns."""
    column_names = X.columns.tolist()
    return xgb.DMatrix(X, label=y, feature_names=column_names)

# Carve a validation fold out of the training split; it is the only data
# allowed to drive early stopping.
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# DMatrix containers for every split.
dtrain = create_dmatrix(X_train, y_train)  # full training split; not used by the fit below
dtrain_val = create_dmatrix(X_train_val, y_train_val)
dval = create_dmatrix(X_val, y_val)
dtest = create_dmatrix(X_test, y_test)

# Model parameters: tuned hyper-parameters plus the fixed training settings.
params = {
    **best_params,
    'objective': 'reg:squarederror',
    'eval_metric': ['rmse'],
    'random_state': 42,
    'n_jobs': -1,
}

# Training configuration. The test split is evaluated for monitoring only.
num_boost_round = 1000
early_stopping_rounds = 300
evals = [(dtrain_val, 'train'), (dval, 'val'), (dtest, 'test')]

# Early stopping is bound explicitly to the validation split, never to the test split.
EarlyStopping_callback = callback.EarlyStopping(metric_name='rmse', data_name='val', maximize=False, save_best=True, rounds=early_stopping_rounds, min_delta=0.001)

# Train the model. `early_stopping_rounds` is deliberately NOT passed to
# `xgb.train`: doing so makes it append a second, default EarlyStopping callback
# that monitors the last entry of `evals` (the test split), which would let the
# test data decide when training stops.
evals_result = {}
model = xgb.train(
    params,
    dtrain_val,
    num_boost_round=num_boost_round,
    evals=evals,
    evals_result=evals_result,
    callbacks=[EarlyStopping_callback],
    verbose_eval=100  # print metrics every 100 rounds
)

n_rounds = len(evals_result['train']['rmse'])

# `best_iteration` is a 0-based round index, so it indexes the metric history directly.
best_round = model.best_iteration
train_rmse_at_best = evals_result['train']['rmse'][best_round]
val_rmse_at_best = evals_result['val']['rmse'][best_round]
test_rmse_at_best = evals_result['test']['rmse'][best_round]

print(f"\nMejor ronda: {best_round}")
print(f"Mejor rmse en train: {train_rmse_at_best:.6f}")
print(f"Mejor rmse en validación: {val_rmse_at_best:.6f}")
print(f"Mejor rmse en test: {test_rmse_at_best:.6f}")
print(f"Diferencia entre train y test: {test_rmse_at_best - train_rmse_at_best:.6f}")

import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn styling
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)

fig, ax = plt.subplots(figsize=(12, 6))

# Learning curves; the x axis counts boosting rounds starting at 1.
rounds_axis = range(1, n_rounds + 1)
for split_name, legend_label in (('train', 'Train'), ('val', 'Validación'), ('test', 'Test')):
    sns.lineplot(x=rounds_axis, y=evals_result[split_name]['rmse'], label=legend_label, ax=ax)

# `best_round` holds the model's 0-based `best_iteration`: it indexes the metric
# lists directly and sits at position best_round + 1 on the 1-based x axis.
best_round_x = best_round + 1
train_rmse_at_best = evals_result['train']['rmse'][best_round]
test_rmse_at_best = evals_result['test']['rmse'][best_round]

# Vertical marker at the best round
ax.axvline(x=best_round_x, color='r', linestyle='--', label='Best Iteration')

# Annotate the train/test gap halfway between the two curves
ax.text(best_round_x, (test_rmse_at_best + train_rmse_at_best) / 2,
        f"Gap: {test_rmse_at_best - train_rmse_at_best:.6f}",
        color='black', ha='center', va='center', backgroundcolor='white')

# Limit the y axis to the 0 - 0.5 range
ax.set_ylim(0, 0.5)

# Labels and title
ax.set_xlabel('Iterations')
ax.set_ylabel('RMSE')
ax.set_title('XGBoost Learning Curve')

ax.legend(loc='best')

plt.tight_layout()
plt.show()

### Optimizando diferentes objective error functions


In [None]:
import optuna
import xgboost as xgb

# Hold out 20% of the standardised rows as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

# Shared Optuna configuration
def create_study(name, version=None, seed=None):
    '''
    Create, or reopen if it already exists, the SQLite-backed Optuna study.

    The study minimises two objectives. No pruner is configured: Optuna does
    not support pruning for multi-objective studies and the objective functions
    in this notebook never report intermediate values, so a pruner would be
    dead configuration.
    -------
    Parameters:
    - name: str, prefix used for both the study name and the database file.
    - version: suffix appended to the study name and the database file name.
    - seed: optional int forwarded to the TPE sampler so a search can be
      reproduced; the default (None) keeps the sampler unseeded.
    -------
    Returns:
    - optuna.Study stored at ../models/{name}_housing_{version}.db
    '''
    study_id = f'{name}_housing_{version}'
    return optuna.create_study(
        study_name=study_id,
        storage=f'sqlite:///../models/{study_id}.db',
        directions=['minimize', 'minimize'],
        sampler=optuna.samplers.TPESampler(seed=seed),
        load_if_exists=True,
    )

def objective_xgb(trial):
    '''
    Optuna objective: 5-fold cross-validated XGBoost regression on the
    notebook-level training split (`X_train`, `y_train`), with the loss
    function itself included in the search space.
    -------
    Parameters:
    - trial: optuna.Trial used to sample the hyper-parameters.
    -------
    Returns:
    - (mean_rmse, diff_rmse): the lowest mean validation RMSE across boosting
      rounds, and the absolute train/validation RMSE gap at that same round.
    '''
    # Search space. `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform` and samples from the same log-uniform range.
    params_xgb = {
        # NOTE(review): 'reg:linear' is the legacy name of 'reg:squarederror' in
        # XGBoost, so these two choices presumably train the same loss - confirm,
        # and consider a genuinely different objective. The choices are left
        # unchanged here to stay compatible with trials already in storage.
        'objective': trial.suggest_categorical('objective', ['reg:squarederror', 'reg:linear']),
        'device': 'cuda',
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'tree_method': 'auto',
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0, log=True),
        'n_jobs': -1,
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'eval_metric': 'rmse'
    }

    # Training split in DMatrix format.
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=X_train.columns.tolist())

    # Cross-validation with early stopping. Per-round logging is disabled:
    # with many trials it would flood the notebook output.
    result_xgb = xgb.cv(
        params_xgb,
        dtrain,
        num_boost_round=400,
        nfold=5,
        stratified=False,
        early_stopping_rounds=50,
        seed=42,
        verbose_eval=False,
        as_pandas=True
    )

    # Best (lowest) mean validation RMSE, and the train/validation gap measured
    # at that same boosting round rather than between the minima of two
    # possibly different rounds.
    best_row = result_xgb['test-rmse-mean'].idxmin()
    mean_rmse = float(result_xgb.loc[best_row, 'test-rmse-mean'])
    diff_rmse = float(np.abs(mean_rmse - result_xgb.loc[best_row, 'train-rmse-mean']))
    return mean_rmse, diff_rmse

# Create (or reopen) the objective-search study and run 100 trials.
study_xgb = create_study(name='xgb_FS', version='2.5')
study_xgb.optimize(
    objective_xgb,
    n_trials=100,
    show_progress_bar=True,
    gc_after_trial=True,
)


In [None]:
# Study summary. `best_trial`, `best_value` and `best_params` are only available
# for single-objective studies, so those calls stay disabled and the five trials
# with the lowest first objective are shown instead.
print(f'Direcciones de optimización: {study_xgb.directions}')
print(f'Número de ensayos: {len(study_xgb.trials)}')
# print(f'Mejor ensayo: {study_xgb.best_trial}')  # single-objective studies only
# print(f'Mejor valor: {study_xgb.best_value}')  # single-objective studies only
# print(f'Mejor hiperparámetros: {study_xgb.best_params}')  # single-objective studies only
study_xgb_df = study_xgb.trials_dataframe()
study_xgb_df.sort_values(by='values_0', ascending=True).head(5)

In [None]:
def obtain_best_params(study_xgb, weights:np.ndarray) -> tuple:
    '''
    Select one trial of a multi-objective study by the weighted average of its
    objective values and return its hyper-parameters and objective values.
    -------
    Parameters:
    - study_xgb: optuna.Study, the study to read trials from.
    - weights: np.ndarray, one weight per optimised metric.
    -------
    Returns:
    - best_params: dict, hyper-parameters of the selected trial.
    - best_values: objective values of the selected trial.
    -------
    Raises:
    - ValueError: if no trial has a complete set of objective values.
    '''
    study_df = study_xgb.trials_dataframe()
    value_cols = [col for col in study_df.columns if col.startswith('values_')]
    if study_df.empty or not value_cols:
        raise ValueError('The study has no trials with objective values.')

    # Trials without objective values (failed, running) yield NaN and are ignored.
    weighted = pd.Series(
        np.average(study_df[value_cols].to_numpy(dtype=float), weights=weights, axis=1),
        index=study_df.index,
    )
    if weighted.isna().all():
        raise ValueError('The study has no trials with objective values.')

    # Resolve the winner through its trial number and look it up among ALL
    # trials: the weighted minimum is not guaranteed to be on the Pareto front
    # (`best_trials`) when a weight is zero or objective values are tied.
    best_number = int(study_df.loc[weighted.idxmin(), 'number'])
    best_trial = next(trial for trial in study_xgb.trials if trial.number == best_number)
    return best_trial.params, best_trial.values

In [None]:
# Weight the validation RMSE at 0.75 and the train/validation gap at 0.25
# when choosing the trial to train with.
best_params, best_values = obtain_best_params(
    study_xgb,
    weights=np.array([0.75, 0.25]),
)
print(f'Mejores hiperparámetros: {best_params}')
print(f'Mejores valores: Mean rmse Validation: {best_values[0]}, Mean Diff (Val - Train) rmse: {best_values[1]}')

In [None]:
import numpy as np
import xgboost as xgb
from xgboost import callback
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Helper that builds a DMatrix
def create_dmatrix(X, y):
    """Wrap features `X` and labels `y` in an `xgb.DMatrix`, naming the features after `X`'s columns."""
    column_names = X.columns.tolist()
    return xgb.DMatrix(X, label=y, feature_names=column_names)

# Carve a validation fold out of the training split; it is the only data
# allowed to drive early stopping.
X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# DMatrix containers for every split.
dtrain = create_dmatrix(X_train, y_train)  # full training split; not used by the fit below
dtrain_val = create_dmatrix(X_train_val, y_train_val)
dval = create_dmatrix(X_val, y_val)
dtest = create_dmatrix(X_test, y_test)

# Model parameters. The objective is part of this section's search space, so
# 'reg:squarederror' is only a fallback placed BEFORE `**best_params`; listing
# it afterwards would silently discard the objective chosen by the study.
params = {
    'objective': 'reg:squarederror',
    **best_params,
    'eval_metric': ['rmse'],
    'random_state': 42,
    'n_jobs': -1,
}

# Training configuration. The test split is evaluated for monitoring only.
num_boost_round = 1000
early_stopping_rounds = 300
evals = [(dtrain_val, 'train'), (dval, 'val'), (dtest, 'test')]

# Early stopping is bound explicitly to the validation split, never to the test split.
EarlyStopping_callback = callback.EarlyStopping(metric_name='rmse', data_name='val', maximize=False, save_best=True, rounds=early_stopping_rounds, min_delta=0.001)

# Train the model. `early_stopping_rounds` is deliberately NOT passed to
# `xgb.train`: doing so makes it append a second, default EarlyStopping callback
# that monitors the last entry of `evals` (the test split), which would let the
# test data decide when training stops.
evals_result = {}
model = xgb.train(
    params,
    dtrain_val,
    num_boost_round=num_boost_round,
    evals=evals,
    evals_result=evals_result,
    callbacks=[EarlyStopping_callback],
    verbose_eval=100  # print metrics every 100 rounds
)

n_rounds = len(evals_result['train']['rmse'])

# `best_iteration` is a 0-based round index, so it indexes the metric history directly.
best_round = model.best_iteration
train_rmse_at_best = evals_result['train']['rmse'][best_round]
val_rmse_at_best = evals_result['val']['rmse'][best_round]
test_rmse_at_best = evals_result['test']['rmse'][best_round]

print(f"\nMejor ronda: {best_round}")
print(f"Mejor rmse en train: {train_rmse_at_best:.6f}")
print(f"Mejor rmse en validación: {val_rmse_at_best:.6f}")
print(f"Mejor rmse en test: {test_rmse_at_best:.6f}")
print(f"Diferencia entre train y test: {test_rmse_at_best - train_rmse_at_best:.6f}")

import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn styling
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)

fig, ax = plt.subplots(figsize=(12, 6))

# Learning curves; the x axis counts boosting rounds starting at 1.
rounds_axis = range(1, n_rounds + 1)
for split_name, legend_label in (('train', 'Train'), ('val', 'Validación'), ('test', 'Test')):
    sns.lineplot(x=rounds_axis, y=evals_result[split_name]['rmse'], label=legend_label, ax=ax)

# `best_round` holds the model's 0-based `best_iteration`: it indexes the metric
# lists directly and sits at position best_round + 1 on the 1-based x axis.
best_round_x = best_round + 1
train_rmse_at_best = evals_result['train']['rmse'][best_round]
test_rmse_at_best = evals_result['test']['rmse'][best_round]

# Vertical marker at the best round
ax.axvline(x=best_round_x, color='r', linestyle='--', label='Best Iteration')

# Annotate the train/test gap halfway between the two curves
ax.text(best_round_x, (test_rmse_at_best + train_rmse_at_best) / 2,
        f"Gap: {test_rmse_at_best - train_rmse_at_best:.6f}",
        color='black', ha='center', va='center', backgroundcolor='white')

# Limit the y axis to the 0 - 0.5 range
ax.set_ylim(0, 0.5)

# Labels and title
ax.set_xlabel('Iterations')
ax.set_ylabel('RMSE')
ax.set_title('XGBoost Learning Curve')

ax.legend(loc='best')

plt.tight_layout()
plt.show()