# Importación del dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the preprocessed dataset from disk and preview its first five rows.
df_processed = pd.read_csv(filepath_or_buffer="df_nyc_processed.csv")
df_processed.head(n=5)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,distance,pickup_datetime,fare_amount
0,-0.708632,-0.46861,-0.735793,-0.915885,-0.528589,1.7462,-0.373086,-0.025579,0.845593,-0.469043,2015-05-07 19:52:06+00:00,7.5
1,-0.548165,-0.848359,-0.592962,-0.025076,-0.528589,-1.474551,0.20816,0.488126,0.999074,-0.252451,2009-07-17 20:04:56+00:00,7.7
2,-0.862172,-0.378018,0.363217,0.708458,-0.528589,-1.474551,0.498782,-1.566693,1.152555,0.476085,2009-08-24 21:45:00+00:00,12.9
3,-0.012549,1.499369,0.281386,1.717372,1.003365,-1.474551,-0.082463,0.488126,-0.842699,-0.474669,2009-06-26 08:22:21+00:00,5.3
4,1.488769,-0.253732,0.05038,0.333837,2.535319,1.209408,0.498782,-0.025579,0.538631,0.318563,2014-08-28 17:47:00+00:00,16.0


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df_processed.isna().sum()

Unnamed: 0,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,0
dropoff_latitude,0
passenger_count,0
year,0
month,0
weekday,0
hour,0
distance,0


# Partición de datos

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and the pickup timestamp.
X = df_processed.drop(columns=["fare_amount", "pickup_datetime"])
# Target to predict.
y = df_processed["fare_amount"]

# 87% train / 13% test split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.13, random_state=42)

-----

In [None]:
!pip install tensorflow



In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
!pip install xgboost



# Modelo de Ensemble Learning: XGBoost Regressor

- Se utiliza para predecir valores continuos, como precios o ventas.
- Combina árboles de decisión débiles en un único modelo más robusto.

In [None]:
# XGBoost Regressor

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Three XGBoost hyperparameter configurations to benchmark. Each entry is
# (name, n_estimators, max_depth, learning_rate); the seed and the objective
# are shared by all of them.
xgb_configs = [
    {
        "name": label,
        "params": {
            "n_estimators": n_trees,
            "max_depth": depth,
            "learning_rate": eta,
            "random_state": 42,
            "objective": "reg:squarederror",
        },
    }
    for label, n_trees, depth, eta in (
        ("XGBoost-Config-1", 100, 3, 0.01),
        ("XGBoost-Config-2", 200, 6, 0.1),
        ("XGBoost-Config-3", 300, 10, 0.2),
    )
]

# Empty results table, one row per configuration once the benchmark has run.
xgb_results = pd.DataFrame(columns=['Configuration', 'MSE', 'RMSE', 'R2', 'MAE'])


In [None]:
# Train every XGBoost configuration on the training split and score it on the
# hold-out split.
print("--- Entrenamiento y Evaluacio  de las Configuraciones XGBoost ---")

# Metrics are collected as plain dicts and turned into a DataFrame once, after
# the loop. Concatenating onto an empty frame inside the loop raised a pandas
# FutureWarning and appended duplicate rows whenever the cell was re-run.
xgb_records = []
for config in xgb_configs:
    print(f"\n--- Entrenamiento y Evaluacion {config['name']} ---")
    xgb_model = XGBRegressor(**config['params'])
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)

    # Evaluation metrics on the hold-out split.
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    rmse_xgb = np.sqrt(mse_xgb)
    r2_xgb = r2_score(y_test, y_pred_xgb)
    mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

    xgb_records.append({
        'Configuration': config['name'],
        'MSE': mse_xgb,
        'RMSE': rmse_xgb,
        'R2': r2_xgb,
        'MAE': mae_xgb
    })

    # Per-configuration progress report.
    print(f"MSE: {mse_xgb:.4f}")
    print(f"RMSE: {rmse_xgb:.4f}")
    print(f"R2: {r2_xgb:.4f}")
    print(f"MAE: {mae_xgb:.4f}")

# Rebuilt from scratch on every run, so the cell is idempotent.
xgb_results = pd.DataFrame(xgb_records, columns=['Configuration', 'MSE', 'RMSE', 'R2', 'MAE'])

print("\n--- Resultados XGBoost Benchmarking ---")
xgb_results.head()

--- Entrenamiento y Evaluacio  de las Configuraciones XGBoost ---

--- Entrenamiento y Evaluacion XGBoost-Config-1 ---


  xgb_results = pd.concat([xgb_results, pd.DataFrame([{


MSE: 24.8727
RMSE: 4.9872
R2: 0.7082
MAE: 3.1061

--- Entrenamiento y Evaluacion XGBoost-Config-2 ---
MSE: 8.9323
RMSE: 2.9887
R2: 0.8952
MAE: 1.5863

--- Entrenamiento y Evaluacion XGBoost-Config-3 ---
MSE: 9.4064
RMSE: 3.0670
R2: 0.8896
MAE: 1.6011

--- Resultados XGBoost Benchmarking ---


Unnamed: 0,Configuration,MSE,RMSE,R2,MAE
0,XGBoost-Config-1,24.872662,4.98725,0.708203,3.106052
1,XGBoost-Config-2,8.932346,2.988703,0.895209,1.586252
2,XGBoost-Config-3,9.406441,3.066992,0.889647,1.601059


# MLP Regressor con Tensorflow
- Son redes neuronales artificiales con tres o más capas de perceptrones.
- Estas capas son: una capa de entrada, una o más capas ocultas y una capa de salida.
- Los datos fluyen en una sola dirección, es decir, hacia adelante, desde las capas de entrada -> capas ocultas -> capa de salida
- La retropropagación es una técnica en la que el perceptrón multicapa recibe retroalimentación sobre el error en sus resultados y el MLP ajusta sus ponderaciones en consecuencia para realizar predicciones más precisas

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Keras model definitions

def build_keras_model_1(input_dim=None):
    """Build and compile architecture 1: one hidden layer of 100 ReLU units.

    ReLU outputs 0 for negative inputs and the input value itself otherwise.
    The network ends in a single linear unit (regression) and is compiled with
    the Adam optimizer (learning rate 0.001) using MSE as loss and as metric.

    Args:
        input_dim: Number of input features. When omitted, falls back to
            ``X_train.shape[1]`` from the notebook namespace.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    model = Sequential()
    # Declare the input with an explicit Input layer; passing `input_shape`
    # to Dense makes Keras emit a UserWarning on every model build.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1))  # output layer
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
    return model

def build_keras_model_2(input_dim=None):
    """Build and compile architecture 2: two hidden ReLU layers (100 and 50 units).

    The network ends in a single linear unit (regression) and is compiled with
    the Adam optimizer (learning rate 0.01) using MSE as loss and as metric.

    Args:
        input_dim: Number of input features. When omitted, falls back to
            ``X_train.shape[1]`` from the notebook namespace.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    model = Sequential()
    # Declare the input with an explicit Input layer; passing `input_shape`
    # to Dense makes Keras emit a UserWarning on every model build.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1))  # output layer
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mse', metrics=['mse'])
    return model

def build_keras_model_3(input_dim=None):
    """Build and compile architecture 3: three hidden tanh layers (50, 50, 25 units).

    tanh maps its inputs into the range -1 to 1. The network ends in a single
    linear unit (regression) and is compiled with the Adam optimizer
    (learning rate 0.001) using MSE as loss and as metric.

    Args:
        input_dim: Number of input features. When omitted, falls back to
            ``X_train.shape[1]`` from the notebook namespace.

    Returns:
        The compiled ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    model = Sequential()
    # Declare the input with an explicit Input layer; passing `input_shape`
    # to Dense makes Keras emit a UserWarning on every model build.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(50, activation='tanh'))
    model.add(Dense(50, activation='tanh'))
    model.add(Dense(25, activation='tanh'))
    model.add(Dense(1))  # output layer
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
    return model

In [None]:
# Registry of the Keras architectures to benchmark: display name paired with
# the builder function that returns a fresh compiled model.
architectures = [
    {"name": label, "build_fn": builder}
    for label, builder in (
        ("Keras-Arch-1", build_keras_model_1),
        ("Keras-Arch-2", build_keras_model_2),
        ("Keras-Arch-3", build_keras_model_3),
    )
]

# Empty table that will hold the benchmark results.
mlp_results = pd.DataFrame(columns=['Configuration', 'MSE', 'RMSE', 'R2', 'MAE'])

In [None]:
# Train every Keras architecture on the training split and score it on the
# hold-out split.

# Metrics are collected as plain dicts and turned into a DataFrame once, after
# the loop. The previous version concatenated rows keyed 'Architecture' onto a
# frame declared with a 'Configuration' column, which left an all-NaN
# 'Configuration' column, raised a pandas FutureWarning and appended duplicate
# rows whenever the cell was re-run.
mlp_records = []
for arch in architectures:
    print(f"\n--- Entrenando y evaluando {arch['name']} ---")

    # Fresh, untrained model for this architecture.
    model = arch['build_fn']()

    # NOTE(review): validation_data only adds per-epoch val_* logging here (no
    # callback consumes it), but it does show the test split on every epoch —
    # confirm this is intended for the preliminary benchmark.
    history = model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, validation_data=(X_test, y_test))

    # Predict and evaluate; flatten the predictions so the metrics compare
    # 1-D targets against 1-D predictions.
    y_pred = model.predict(X_test, verbose=0).ravel()
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    mlp_records.append({
        'Architecture': arch['name'],
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAE': mae
    })

    # Per-architecture progress report.
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2: {r2:.4f}")
    print(f"MAE: {mae:.4f}")

# Rebuilt from scratch on every run, so the cell is idempotent; the name
# column matches the key read back below.
mlp_results = pd.DataFrame(mlp_records, columns=['Architecture', 'MSE', 'RMSE', 'R2', 'MAE'])

print("\n--- Resultados del Benchmark ---")
print(mlp_results)

# Pick the architecture with the lowest MSE.
best_arch_row = mlp_results.loc[mlp_results['MSE'].idxmin()]
print("\nMejor arquitectura basada en MSE:")
print(f"Arquitectura: {best_arch_row['Architecture']}")
print(f"MSE: {best_arch_row['MSE']:.4f}")


--- Entrenando y evaluando Keras-Arch-1 ---
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 35.7207 - mse: 35.7207 - val_loss: 12.0324 - val_mse: 12.0324
Epoch 2/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 12.4006 - mse: 12.4006 - val_loss: 11.3932 - val_mse: 11.3932
Epoch 3/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 11.8602 - mse: 11.8602 - val_loss: 11.0874 - val_mse: 11.0874
Epoch 4/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 11.3626 - mse: 11.3626 - val_loss: 10.8390 - val_mse: 10.8390
Epoch 5/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 11.2155 - mse: 11.2155 - val_loss: 10.7995 - val_mse: 10.7995
Epoch 6/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 11.0405 - mse: 11.0405 - val_loss: 10.8082 - val_mse: 10.8082
Epoch 7/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━

  mlp_results = pd.concat([mlp_results, pd.DataFrame([{ # Changed results_df to mlp_results
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 16.9190 - mse: 16.9190 - val_loss: 11.1315 - val_mse: 11.1315
Epoch 2/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 12.0041 - mse: 12.0041 - val_loss: 11.3486 - val_mse: 11.3486
Epoch 3/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 11.3767 - mse: 11.3767 - val_loss: 11.5436 - val_mse: 11.5436
Epoch 4/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 10.7932 - mse: 10.7932 - val_loss: 12.5278 - val_mse: 12.5278
Epoch 5/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 11.1715 - mse: 11.1715 - val_loss: 10.6366 - val_mse: 10.6366
Epoch 6/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 11.0666 - mse: 11.0666 - val_loss: 10.5387 - val_mse: 10.5387
Epoch 7/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 51.0123 - mse: 51.0123 - val_loss: 12.6421 - val_mse: 12.6421
Epoch 2/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 12.8850 - mse: 12.8850 - val_loss: 11.9171 - val_mse: 11.9171
Epoch 3/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 11.9933 - mse: 11.9933 - val_loss: 10.9908 - val_mse: 10.9908
Epoch 4/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 11.4590 - mse: 11.4590 - val_loss: 10.6881 - val_mse: 10.6881
Epoch 5/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 10.8033 - mse: 10.8033 - val_loss: 10.3475 - val_mse: 10.3475
Epoch 6/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 10.3569 - mse: 10.3569 - val_loss: 9.9605 - val_mse: 9.9605
Epoch 7/100
[1m5233/5233[0m [32m━━━━━━━━━━━━━━━━━━━

# K-Fold Para XGBoost

## Evalúa el rendimiento de un modelo dividiendo los datos en k subconjuntos (folds)

Aclaración sobre la evaluación con datos de prueba:



Durante la fase de evaluación comparativa de la validación cruzada de K-Fold, los datos de X_train e y_train se dividirán en los pliegues de entrenamiento y validación en cada iteración.

Los modelos se entrenarán en los pliegues de entrenamiento y se evaluarán en los de validación. Las métricas reportadas durante el proceso de K-Fold (como se muestra en la salida de la celda siguiente) son las métricas promedio de estos pliegues de validación para cada configuración del modelo. Este proceso ayuda a seleccionar la mejor arquitectura del modelo y los hiperparámetros.

El conjunto separado de X_test e y_test se reservará y se utilizará solo una vez al final para proporcionar una evaluación imparcial del rendimiento del modelo final elegido con datos no vistos.

In [None]:
from sklearn.model_selection import KFold

# 5-fold splitter, shuffled with a fixed seed; later cells reuse `kf`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Maps each configuration name to its list of per-fold metric dicts.
xgb_fold_metrics = {config['name']: [] for config in xgb_configs}

print("--- Performing K-Fold Cross-Validation for XGBoost Configurations ---")
for config in xgb_configs:
    print(f"\n--- Evaluating {config['name']} with K-Fold ---")
    config_scores = xgb_fold_metrics[config['name']]

    for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train), start=1):
        print(f"  Fold {fold}")
        # Slice the training split into this fold's train / validation parts.
        X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
        X_val_fold, y_val_fold = X_train.iloc[val_index], y_train.iloc[val_index]

        # Fresh model per fold so no state carries over between folds.
        xgb_model_fold = XGBRegressor(**config['params'])
        xgb_model_fold.fit(X_train_fold, y_train_fold)
        y_pred_xgb_fold = xgb_model_fold.predict(X_val_fold)

        mse_xgb_fold = mean_squared_error(y_val_fold, y_pred_xgb_fold)
        config_scores.append({
            'MSE': mse_xgb_fold,
            'RMSE': np.sqrt(mse_xgb_fold),
            'R2': r2_score(y_val_fold, y_pred_xgb_fold),
            'MAE': mean_absolute_error(y_val_fold, y_pred_xgb_fold)
        })

    # Average each metric over the folds of the current configuration.
    avg_mse, avg_rmse, avg_r2, avg_mae = (
        np.mean([scores[metric] for scores in config_scores])
        for metric in ('MSE', 'RMSE', 'R2', 'MAE')
    )

    print(f"\nAverage Metrics for {config['name']}:")
    print(f"  Average MSE: {avg_mse:.4f}")
    print(f"  Average RMSE: {avg_rmse:.4f}")
    print(f"  Average R2: {avg_r2:.4f}")
    print(f"  Average MAE: {avg_mae:.4f}")


--- Performing K-Fold Cross-Validation for XGBoost Configurations ---

--- Evaluating XGBoost-Config-1 with K-Fold ---
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5

Average Metrics for XGBoost-Config-1:
  Average MSE: 24.9208
  Average RMSE: 4.9918
  Average R2: 0.7066
  Average MAE: 3.1062

--- Evaluating XGBoost-Config-2 with K-Fold ---
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5

Average Metrics for XGBoost-Config-2:
  Average MSE: 9.8355
  Average RMSE: 3.1356
  Average R2: 0.8842
  Average MAE: 1.6173

--- Evaluating XGBoost-Config-3 with K-Fold ---
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5

Average Metrics for XGBoost-Config-3:
  Average MSE: 10.4382
  Average RMSE: 3.2305
  Average R2: 0.8771
  Average MAE: 1.6357


# K-Fold para Keras

In [None]:
# Maps each architecture name to its list of per-fold metric dicts.
keras_fold_metrics = {arch['name']: [] for arch in architectures}

print("\n--- Performing K-Fold Cross-Validation for Keras Architectures ---")
for arch in architectures:
    print(f"\n--- Evaluating {arch['name']} with K-Fold ---")
    arch_scores = keras_fold_metrics[arch['name']]

    for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train), start=1):
        print(f"  Fold {fold}")
        # Slice the training split into this fold's train / validation parts.
        X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
        X_val_fold, y_val_fold = X_train.iloc[val_index], y_train.iloc[val_index]

        # Fresh, untrained network for every fold.
        model = arch['build_fn']()

        # Silent training; the fold's validation part is logged per epoch.
        history = model.fit(
            X_train_fold,
            y_train_fold,
            epochs=100,
            batch_size=32,
            verbose=0,
            validation_data=(X_val_fold, y_val_fold),
        )

        # Score the trained network on the validation part of the fold.
        y_pred = model.predict(X_val_fold, verbose=0)
        mse = mean_squared_error(y_val_fold, y_pred)
        arch_scores.append({
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_val_fold, y_pred),
            'MAE': mean_absolute_error(y_val_fold, y_pred)
        })

    # Average each metric over the folds of the current architecture.
    avg_mse, avg_rmse, avg_r2, avg_mae = (
        np.mean([scores[metric] for scores in arch_scores])
        for metric in ('MSE', 'RMSE', 'R2', 'MAE')
    )

    print(f"\nAverage Metrics for {arch['name']}:")
    print(f"  Average MSE: {avg_mse:.4f}")
    print(f"  Average RMSE: {avg_rmse:.4f}")
    print(f"  Average R2: {avg_r2:.4f}")
    print(f"  Average MAE: {avg_mae:.4f}")


--- Performing K-Fold Cross-Validation for Keras Architectures ---

--- Evaluating Keras-Arch-1 with K-Fold ---
  Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Average Metrics for Keras-Arch-1:
  Average MSE: 10.3591
  Average RMSE: 3.2183
  Average R2: 0.8780
  Average MAE: 1.6580

--- Evaluating Keras-Arch-2 with K-Fold ---
  Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Average Metrics for Keras-Arch-2:
  Average MSE: 10.6265
  Average RMSE: 3.2596
  Average R2: 0.8749
  Average MAE: 1.7282

--- Evaluating Keras-Arch-3 with K-Fold ---
  Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Fold 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Average Metrics for Keras-Arch-3:
  Average MSE: 10.3354
  Average RMSE: 3.2145
  Average R2: 0.8783
  Average MAE: 1.6522


# Almacenamiento de todos los resultados

In [None]:
# Collapse the per-fold metrics of every model into one row of averages.
# Rows are collected as dicts and the DataFrame is built once: concatenating
# onto an empty frame inside a loop raised a pandas FutureWarning and
# duplicated the same averaging code for XGBoost and Keras.
metric_names = ['MSE', 'RMSE', 'R2', 'MAE']
avg_records = []

# XGBoost entries first, then Keras, which keeps the original row order and
# therefore the original index labels.
for fold_metrics in (xgb_fold_metrics, keras_fold_metrics):
    for model_name, metrics_list in fold_metrics.items():
        record = {'Model': model_name}
        for metric in metric_names:
            record[metric] = np.mean([m[metric] for m in metrics_list])
        avg_records.append(record)

all_avg_metrics = pd.DataFrame(avg_records, columns=['Model'] + metric_names)

# Sort by average MSE, best (lowest) first.
all_avg_metrics_sorted = all_avg_metrics.sort_values(by='MSE', ascending=True)

print("\n--- Average Performance Metrics Across K-Folds ---")
display(all_avg_metrics_sorted)


--- Average Performance Metrics Across K-Folds ---


  all_avg_metrics = pd.concat([all_avg_metrics, pd.DataFrame([{


Unnamed: 0,Model,MSE,RMSE,R2,MAE
1,XGBoost-Config-2,9.835542,3.135611,0.884206,1.617263
5,Keras-Arch-3,10.335447,3.214515,0.878294,1.652186
3,Keras-Arch-1,10.359127,3.218312,0.878023,1.658039
2,XGBoost-Config-3,10.438228,3.23049,0.877088,1.635676
4,Keras-Arch-2,10.626532,3.259581,0.874854,1.728228
0,XGBoost-Config-1,24.920771,4.991794,0.706606,3.106247


# Implementación de GridSearch para XGBoost Regressor

In [None]:
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

# XGBoost is the model family tuned here because an XGBoost configuration has
# the lowest average K-Fold MSE in `all_avg_metrics_sorted`; see that table
# for the current figures rather than relying on numbers copied into comments.

# Search space: every combination of these values is evaluated.
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.2],
)

# Base estimator; the grid overrides only the three parameters above.
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Reuse the `kf` splitter so the folds match the earlier K-Fold benchmark.
# Scoring is negative MSE, so higher (closer to zero) is better.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=kf,
    verbose=2,
)

# Run the search on the training split only.
print("\n--- Performing GridSearchCV for XGBoost ---")
grid_search_xgb.fit(X_train, y_train)

# Report the winning hyperparameters and their cross-validation score.
print("\nBest hyperparameters found by GridSearchCV:")
print(grid_search_xgb.best_params_)

print("\nBest cross-validation score (negative MSE):")
print(grid_search_xgb.best_score_)


--- Performing GridSearchCV for XGBoost ---
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   2.6s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=   1.6s
[C

El error cuadrático medio (MSE) es el promedio de las diferencias al cuadrado entre el valor predicho y el valor real. Se mide en las unidades del valor objetivo elevadas al cuadrado.

In [None]:
# Hyperparameters selected by the grid search.
best_xgb_params = grid_search_xgb.best_params_

# New regressor with the selected hyperparameters plus the same seed and
# objective used during the search.
best_xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42, **best_xgb_params)

# Fit on the whole training split.
print("\n--- Training the best XGBoost model on the full training data ---")
best_xgb_model.fit(X_train, y_train)
print("Training complete.")

# Score on the hold-out test split.
print("\n--- Evaluating the best XGBoost model on the test set ---")
y_pred_best_xgb = best_xgb_model.predict(X_test)

mse_best_xgb = mean_squared_error(y_test, y_pred_best_xgb)
rmse_best_xgb = np.sqrt(mse_best_xgb)
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)
mae_best_xgb = mean_absolute_error(y_test, y_pred_best_xgb)

# Same four report lines, emitted from a single loop.
for label, value in (
    ("MSE", mse_best_xgb),
    ("RMSE", rmse_best_xgb),
    ("R2", r2_best_xgb),
    ("MAE", mae_best_xgb),
):
    print(f"Test Set {label}: {value:.4f}")


--- Training the best XGBoost model on the full training data ---
Training complete.

--- Evaluating the best XGBoost model on the test set ---
Test Set MSE: 8.8180
Test Set RMSE: 2.9695
Test Set R2: 0.8966
Test Set MAE: 1.5607


In [None]:
print("--- Summary of Benchmarking Results ---")

# 1. Average K-Fold cross-validation results, best (lowest MSE) first.
print("\nAverage K-Fold Cross-Validation Performance of all Models:")
display(all_avg_metrics_sorted)

# 2. Model family of the top-ranked row (the part of the name before the
#    first '-').
best_kfold_row = all_avg_metrics_sorted.iloc[0]
best_model_type_kfold = best_kfold_row['Model'].split('-')[0]
print(f"\nBased on average K-Fold cross-validation results, the best performing model type is: {best_model_type_kfold}")

# 3. Hyperparameters selected by GridSearchCV.
print("\nBest hyperparameters found by GridSearchCV for the best model type (XGBoost):")
print(grid_search_xgb.best_params_)

# 4. Metrics of the tuned model on the hold-out test split.
print("\nPerformance of the best model (XGBoost with optimal hyperparameters) on the separate test set:")
print(f"Test Set MSE: {mse_best_xgb:.4f}")
print(f"Test Set RMSE: {rmse_best_xgb:.4f}")
print(f"Test Set R2: {r2_best_xgb:.4f}")
print(f"Test Set MAE: {mae_best_xgb:.4f}")

# 5. Written summary. The comparison wording is derived from the numbers
#    instead of being hard-coded, so the text cannot contradict the metrics
#    after a re-run with different data or seeds.
print("\n--- Summary and Comparison ---")
print(f"The best performing model based on average K-Fold cross-validation MSE was {best_kfold_row['Model']}.")
print(f"GridSearchCV for XGBoost identified the optimal hyperparameters as: {grid_search_xgb.best_params_}.")
print("When trained with these optimal hyperparameters on the full training data, the XGBoost model achieved the following metrics on the unseen test set:")
print(f"  MSE: {mse_best_xgb:.4f}")
print(f"  RMSE: {rmse_best_xgb:.4f}")
print(f"  R2: {r2_best_xgb:.4f}")
print(f"  MAE: {mae_best_xgb:.4f}")

# Baseline: the best-ranked XGBoost row of the K-Fold table (the table is
# sorted by MSE, so the first match is the best one).
xgb_kfold_rows = all_avg_metrics_sorted[all_avg_metrics_sorted['Model'].str.startswith('XGBoost')]
if not xgb_kfold_rows.empty:
    baseline_row = xgb_kfold_rows.iloc[0]
    cv_mse = float(baseline_row['MSE'])
    cv_r2 = float(baseline_row['R2'])

    if mse_best_xgb < cv_mse:
        mse_relation = "lower than"
    elif mse_best_xgb > cv_mse:
        mse_relation = "higher than"
    else:
        mse_relation = "equal to"

    if r2_best_xgb > cv_r2:
        r2_relation = "higher than"
    elif r2_best_xgb < cv_r2:
        r2_relation = "lower than"
    else:
        r2_relation = "equal to"

    print(f"\nComparing the test set performance to the average K-Fold performance of the best XGBoost configuration ({baseline_row['Model']}), the MSE on the test set ({mse_best_xgb:.4f}) is {mse_relation} the average K-Fold MSE ({cv_mse:.4f}).")
    print(f"The R2 score on the test set ({r2_best_xgb:.4f}) is {r2_relation} the average K-Fold R2 ({cv_r2:.4f}).")

    # Only claim good generalization when both metrics support it.
    if mse_best_xgb <= cv_mse and r2_best_xgb >= cv_r2:
        print("This indicates that the model trained with optimal hyperparameters generalizes well to unseen data and performs at least as well as the average performance observed during cross-validation.")
    else:
        print("The test set metrics are not uniformly better than the cross-validation average, so the generalization of the tuned model should be reviewed.")

--- Summary of Benchmarking Results ---

Average K-Fold Cross-Validation Performance of all Models:


Unnamed: 0,Model,MSE,RMSE,R2,MAE
1,XGBoost-Config-2,9.835542,3.135611,0.884206,1.617263
5,Keras-Arch-3,10.335447,3.214515,0.878294,1.652186
3,Keras-Arch-1,10.359127,3.218312,0.878023,1.658039
2,XGBoost-Config-3,10.438228,3.23049,0.877088,1.635676
4,Keras-Arch-2,10.626532,3.259581,0.874854,1.728228
0,XGBoost-Config-1,24.920771,4.991794,0.706606,3.106247



Based on average K-Fold cross-validation results, the best performing model type is: XGBoost

Best hyperparameters found by GridSearchCV for the best model type (XGBoost):
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}

Performance of the best model (XGBoost with optimal hyperparameters) on the separate test set:
Test Set MSE: 8.8180
Test Set RMSE: 2.9695
Test Set R2: 0.8966
Test Set MAE: 1.5607

--- Summary and Comparison ---
The best performing model based on average K-Fold cross-validation MSE was XGBoost-Config-2.
GridSearchCV for XGBoost identified the optimal hyperparameters as: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}.
When trained with these optimal hyperparameters on the full training data, the XGBoost model achieved the following metrics on the unseen test set:
  MSE: 8.8180
  RMSE: 2.9695
  R2: 0.8966
  MAE: 1.5607

Comparing the test set performance to the average K-Fold performance of the best XGBoost configuration (XGBoost-Config-2), the M