In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

warnings.filterwarnings('ignore')

# Configurar estilo de gráficos
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [2]:
# 1. CARGAR Y PREPARAR LOS DATOS
train_df = pd.read_csv('../data/processed/energy_data_processed.csv')
test_df = pd.read_csv('../data/processed/energy_data_processed_test.csv')

In [3]:
# 2. SEPARAR CARACTERÍSTICAS Y VARIABLE OBJETIVO
y_train = train_df['Energy Consumption']
X_train = train_df.drop('Energy Consumption', axis=1)

y_test = test_df['Energy Consumption']
X_test = test_df.drop('Energy Consumption', axis=1)

print(f"Forma del dataset de entrenamiento: {X_train.shape}")
print(f"Forma del dataset de prueba: {X_test.shape}")
print(f"Variable objetivo: {y_train.name}")

Forma del dataset de entrenamiento: (1000, 9)
Forma del dataset de prueba: (1000, 9)
Variable objetivo: Energy Consumption


In [4]:
# 3. ESCALADO DE CARACTERÍSTICAS (opcional para árboles, pero útil para consistencia)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# 4. MODELOS ENSEMBLE
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse')
}

# Entrenar y evaluar modelos básicos
results = {}
for name, model in models.items():
    print(f"\n=== {name} BÁSICO ===")
    model.fit(X_train_scaled, y_train)
    y_pred_test = model.predict(X_test_scaled)
    
    mse = mean_squared_error(y_test, y_pred_test)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)
    
    results[name] = {'MSE': mse, 'RMSE': rmse, 'R²': r2, 'MAE': mae}

    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")
    print(f"MAE: {mae:.4f}")

# Diagnóstico para XGBoost
# Agregar después del bucle for en la celda 3 (después de entrenar los modelos básicos)
y_pred_train_xgb = models['XGBoost'].predict(X_train_scaled)
mse_train_xgb = mean_squared_error(y_train, y_pred_train_xgb)
r2_train_xgb = r2_score(y_train, y_pred_train_xgb)
print(f"\nXGBoost - Entrenamiento: R² = {r2_train_xgb:.4f}, MSE = {mse_train_xgb:.4f}")
print(f"XGBoost - Prueba: R² = {results['XGBoost']['R²']:.4f}, MSE = {results['XGBoost']['MSE']:.4f}")

# Ejemplo de 5 predicciones vs reales en prueba
y_pred_test_xgb = models['XGBoost'].predict(X_test_scaled)
print("\nEjemplo de 5 predicciones vs reales (Prueba):")
sample_idx = np.random.choice(len(y_test), min(5, len(y_test)), replace=False)
for i in sample_idx:
    print(f"Real: {y_test.iloc[i]:.2f}, Pred: {y_pred_test_xgb[i]:.2f}, Diferencia: {abs(y_test.iloc[i] - y_pred_test_xgb[i]):.2f}")

# Tamaño de datasets
print(f"\nTamaños: Train={len(y_train)}, Test={len(y_test)}")

# Inspección de datos
print("\nPrimeras 5 filas de X_test:")
print(X_test.head())
print("\nPrimeras 5 filas de y_test:")
print(y_test.head())


=== Random Forest BÁSICO ===
MSE: 2097.7550
RMSE: 45.8013
R²: 0.9976
MAE: 36.0567

=== Gradient Boosting BÁSICO ===
MSE: 2536.2069
RMSE: 50.3608
R²: 0.9971
MAE: 39.1547

=== XGBoost BÁSICO ===
MSE: 29.4600
RMSE: 5.4277
R²: 1.0000
MAE: 3.8408

XGBoost - Entrenamiento: R² = 1.0000, MSE = 29.4600
XGBoost - Prueba: R² = 1.0000, MSE = 29.4600

Ejemplo de 5 predicciones vs reales (Prueba):
Real: 3463.09, Pred: 3463.11, Diferencia: 0.02
Real: 5219.66, Pred: 5218.00, Diferencia: 1.66
Real: 3106.77, Pred: 3099.42, Diferencia: 7.35
Real: 4922.82, Pred: 4917.68, Diferencia: 5.14
Real: 4687.67, Pred: 4677.81, Diferencia: 9.86

Tamaños: Train=1000, Test=1000

Primeras 5 filas de X_test:
   Square Footage  Number of Occupants  Appliances Used  Average Temperature  \
0            7063                   76               10                29.84   
1           44372                   66               45                16.72   
2           19255                   37               17                14.30

In [16]:
# Celda adicional: Diagnóstico de fuga de datos y correlaciones
print("\n=== DIAGNÓSTICO ADICIONAL ===")

# 1. Verificar duplicados entre train y test
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
duplicates = pd.concat([X_train_df, X_test_df]).duplicated().sum()
print(f"Filas duplicadas entre train y test: {duplicates}")

# 2. Correlaciones entre características y variable objetivo
correlations = X_train.corrwith(y_train)
print("\nCorrelaciones entre características y 'Energy Consumption':")
print(correlations.sort_values(ascending=False))

# 3. Validación cruzada para XGBoost básico
cv_scores = cross_val_score(models['XGBoost'], X_train_scaled, y_train, cv=5, scoring='r2')
print(f"\nValidación cruzada (5-fold) para XGBoost - R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# 4. Comparar primeras filas de X_train y X_test
print("\nPrimeras 5 filas de X_train:")
print(X_train.head())


=== DIAGNÓSTICO ADICIONAL ===
Filas duplicadas entre train y test: 1000

Correlaciones entre características y 'Energy Consumption':
Square Footage               0.774873
Building Type_Industrial     0.415468
Number of Occupants          0.354485
Appliances Used              0.312792
Day of Week_Weekday          0.004393
Day of Week_Weekend         -0.004393
Building Type_Commercial    -0.027627
Average Temperature         -0.034487
Building Type_Residential   -0.378708
dtype: float64

Validación cruzada (5-fold) para XGBoost - R²: 0.9839 (+/- 0.0042)

Primeras 5 filas de X_train:
   Square Footage  Number of Occupants  Appliances Used  Average Temperature  \
0            7063                   76               10                29.84   
1           44372                   66               45                16.72   
2           19255                   37               17                14.30   
3           13265                   14               41                32.82   
4          

In [6]:
# 5. OPTIMIZACIÓN CON BAYESIAN SEARCH PARA XGBoost (ejemplo, puedes extender a otros)
bayes_search = BayesSearchCV(
    XGBRegressor(random_state=42, eval_metric='rmse'),
    {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'subsample': Real(0.6, 1.0),
        'colsample_bytree': Real(0.6, 1.0)
    },
    n_iter=30,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_scaled, y_train)
print(f"Mejores parámetros para XGBoost: {bayes_search.best_params_}")
print(f"Mejor score CV: {-bayes_search.best_score_:.4f}")


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [7]:
# 6. MODELO OPTIMIZADO (usamos XGBoost optimizado como ejemplo principal)
ensemble_optimized = bayes_search.best_estimator_

# Predicciones para entrenamiento y prueba
y_pred_train_opt = ensemble_optimized.predict(X_train_scaled)
y_pred_test_opt = ensemble_optimized.predict(X_test_scaled)

# Métricas para entrenamiento
mse_train_opt = mean_squared_error(y_train, y_pred_train_opt)
rmse_train_opt = np.sqrt(mse_train_opt)
r2_train_opt = r2_score(y_train, y_pred_train_opt)
mae_train_opt = mean_absolute_error(y_train, y_pred_train_opt)

# Métricas para prueba
mse_test_opt = mean_squared_error(y_test, y_pred_test_opt)
rmse_test_opt = np.sqrt(mse_test_opt)
r2_test_opt = r2_score(y_test, y_pred_test_opt)
mae_test_opt = mean_absolute_error(y_test, y_pred_test_opt)

print(f"\n=== MODELO OPTIMIZADO (ENTRENAMIENTO) ===")
print(f"MSE: {mse_train_opt:.4f}")
print(f"RMSE: {rmse_train_opt:.4f}")
print(f"R²: {r2_train_opt:.4f}")
print(f"MAE: {mae_train_opt:.4f}")

print(f"\n=== MODELO OPTIMIZADO (PRUEBA) ===")
print(f"MSE: {mse_test_opt:.4f}")
print(f"RMSE: {rmse_test_opt:.4f}")
print(f"R²: {r2_test_opt:.4f}")
print(f"MAE: {mae_test_opt:.4f}")

# Validación cruzada
cv_scores = cross_val_score(ensemble_optimized, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"\nCV RMSE: {np.sqrt(-cv_scores.mean()):.4f} (+/- {np.sqrt(cv_scores.std() * 2):.4f})")



=== MODELO OPTIMIZADO (ENTRENAMIENTO) ===
MSE: 1246.6236
RMSE: 35.3076
R²: 0.9986
MAE: 27.7898

=== MODELO OPTIMIZADO (PRUEBA) ===
MSE: 1246.6236
RMSE: 35.3076
R²: 0.9986
MAE: 27.7898

CV RMSE: 67.4988 (+/- 36.0738)


In [8]:
# 7. GUARDAR RESULTADOS EN CSV
train_results = pd.DataFrame({
    'Valores Reales': y_train,
    'Predicciones': y_pred_train_opt,
    'Diferencia': y_train - y_pred_train_opt
})
train_results.to_csv('../data/results/ensemble_predictions_train.csv', index=False)
print("\nPredicciones de entrenamiento guardadas en '../data/results/ensemble_predictions_train.csv'")

test_results = pd.DataFrame({
    'Valores Reales': y_test,
    'Predicciones': y_pred_test_opt,
    'Diferencia': y_test - y_pred_test_opt
})
test_results.to_csv('../data/results/ensemble_predictions_test.csv', index=False)
print("Predicciones de prueba guardadas en '../data/results/ensemble_predictions_test.csv'")


Predicciones de entrenamiento guardadas en '../data/results/ensemble_predictions_train.csv'
Predicciones de prueba guardadas en '../data/results/ensemble_predictions_test.csv'


In [10]:
# 8. VISUALIZACIONES
output_dir = "../data/figures/"

# Visualizaciones para el conjunto de prueba (modelo optimizado)
fig_test, axes_test = plt.subplots(2, 2, figsize=(15, 12))

axes_test[0,0].scatter(y_test, y_pred_test_opt, alpha=0.6)
axes_test[0,0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes_test[0,0].set_xlabel('Valores Reales')
axes_test[0,0].set_ylabel('Predicciones')
axes_test[0,0].set_title(f'Ensemble Optimizado (Prueba) - R² = {r2_test_opt:.4f}')
axes_test[0,0].grid(True, alpha=0.3)

residuos_test = y_test - y_pred_test_opt
axes_test[0,1].scatter(y_pred_test_opt, residuos_test, alpha=0.6)
axes_test[0,1].axhline(y=0, color='r', linestyle='--')
axes_test[0,1].set_xlabel('Predicciones')
axes_test[0,1].set_ylabel('Residuos')
axes_test[0,1].set_title('Gráfico de Residuos (Prueba)')
axes_test[0,1].grid(True, alpha=0.3)

# Comparación de modelos (MSE y R²)
model_names = list(results.keys())
mse_values = [results[m]['MSE'] for m in model_names]
r2_values = [results[m]['R²'] for m in model_names]

x_pos = np.arange(len(model_names))
axes_test[1,0].bar(x_pos, mse_values, alpha=0.7)
axes_test[1,0].set_xlabel('Modelo')
axes_test[1,0].set_ylabel('MSE')
axes_test[1,0].set_title('Comparación MSE por Modelo')
axes_test[1,0].set_xticks(x_pos)
axes_test[1,0].set_xticklabels(model_names, rotation=45)

axes_test[1,1].bar(x_pos, r2_values, alpha=0.7, color='green')
axes_test[1,1].set_xlabel('Modelo')
axes_test[1,1].set_ylabel('R²')
axes_test[1,1].set_title('Comparación R² por Modelo')
axes_test[1,1].set_xticks(x_pos)
axes_test[1,1].set_xticklabels(model_names, rotation=45)

plt.tight_layout()
fig_test.savefig(f"{output_dir}ensemble_comparative_analysis_test.png", dpi=300, bbox_inches='tight')
plt.close(fig_test)
print(f"📊 Gráfico de prueba guardado en: {output_dir}ensemble_comparative_analysis_test.png")

# Visualizaciones para el conjunto de entrenamiento
fig_train, axes_train = plt.subplots(1, 2, figsize=(15, 6))

axes_train[0].scatter(y_train, y_pred_train_opt, alpha=0.6)
axes_train[0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--', lw=2)
axes_train[0].set_xlabel('Valores Reales')
axes_train[0].set_ylabel('Predicciones')
axes_train[0].set_title(f'Ensemble Optimizado (Entrenamiento) - R² = {r2_train_opt:.4f}')
axes_train[0].grid(True, alpha=0.3)

residuos_train = y_train - y_pred_train_opt
axes_train[1].scatter(y_pred_train_opt, residuos_train, alpha=0.6)
axes_train[1].axhline(y=0, color='r', linestyle='--')
axes_train[1].set_xlabel('Predicciones')
axes_train[1].set_ylabel('Residuos')
axes_train[1].set_title('Gráfico de Residuos (Entrenamiento)')
axes_train[1].grid(True, alpha=0.3)

plt.tight_layout()
fig_train.savefig(f"{output_dir}ensemble_comparative_analysis_train.png", dpi=300, bbox_inches='tight')
plt.close(fig_train)
print(f"📊 Gráfico de entrenamiento guardado en: {output_dir}ensemble_comparative_analysis_train.png")

📊 Gráfico de prueba guardado en: ../data/figures/ensemble_comparative_analysis_test.png
📊 Gráfico de entrenamiento guardado en: ../data/figures/ensemble_comparative_analysis_train.png


In [11]:
# 9. TABLA RESUMEN DE RESULTADOS
print("\n=== RESUMEN DE RESULTADOS ===")
resumen = pd.DataFrame(results).T
resumen['Modelo'] = resumen.index
resumen = resumen[['Modelo', 'MSE', 'RMSE', 'R²', 'MAE']]
print(resumen.round(4))


=== RESUMEN DE RESULTADOS ===
                              Modelo        MSE     RMSE      R²      MAE
Random Forest          Random Forest  2097.7550  45.8013  0.9976  36.0567
Gradient Boosting  Gradient Boosting  2536.2069  50.3608  0.9971  39.1547
XGBoost                      XGBoost    29.4600   5.4277  1.0000   3.8408


In [12]:
# 10. GUARDAR EL MODELO
joblib.dump(ensemble_optimized, '../data/results/ensemble_model.pkl')
joblib.dump(scaler, '../data/results/ensemble_scaler.pkl')
print("\nModelo y scaler guardados como 'ensemble_model.pkl' y 'ensemble_scaler.pkl'")


Modelo y scaler guardados como 'ensemble_model.pkl' y 'ensemble_scaler.pkl'


In [13]:
# 11. FUNCIÓN PARA NUEVAS PREDICCIONES
def predecir_nuevos_datos(nuevos_datos, modelo=ensemble_optimized, escalador=scaler):
    """
    Función para hacer predicciones en nuevos datos

    Parameters:
    nuevos_datos: array-like o DataFrame, datos a predecir
    modelo: modelo ensemble entrenado
    escalador: StandardScaler ajustado

    Returns:
    predicciones: array con las predicciones
    """
    if isinstance(nuevos_datos, pd.DataFrame):
        nuevos_datos = nuevos_datos.values
    datos_escalados = escalador.transform(nuevos_datos)
    predicciones = modelo.predict(datos_escalados)
    return predicciones

print("\n¡Modelos ensemble implementados exitosamente!")


¡Modelos ensemble implementados exitosamente!
