In [9]:
# Análisis Exploratorio de Datos - Demanda Energética Cuba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import TimeSeriesSplit

In [10]:
def load_and_prepare_data(file_path):
    """Carga y prepara los datos iniciales"""
    df = pd.read_csv(file_path)
    
    # Convertir fecha a datetime
    df['fecha'] = pd.to_datetime(df['fecha'])
    
    # Crear variables derivadas adicionales
    df['deficit_real'] = df['demanda_maxima'] - df['disponibilidad_total']
    df['margen_seguridad'] = df['disponibilidad_total'] - df['demanda_maxima']
    df['eficiencia_07am'] = df['disponibilidad_07am'] / df['disponibilidad_total'] * 100
    df['utilizacion_07am'] = df['demanda_07am'] / df['disponibilidad_07am'] * 100
    
    # Categorizar días
    df['tipo_dia'] = df['dia_semana'].map({
        0: 'Lunes', 1: 'Martes', 2: 'Miércoles', 3: 'Jueves',
        4: 'Viernes', 5: 'Sábado', 6: 'Domingo'
    })
    
    return df


file_path = '../data/processed/cleaned_energy_data.csv'
df = load_and_prepare_data(file_path)
print(df)


                  fecha   año  mes  dia  dia_semana  es_fin_semana  \
0   2022-12-30 08:29:00  2022   12   30           4              0   
1   2022-12-23 10:33:00  2022   12   23           4              0   
2   2022-12-17 08:56:00  2022   12   17           5              1   
3   2022-12-14 08:42:00  2022   12   14           2              0   
4   2022-12-13 08:46:00  2022   12   13           1              0   
..                  ...   ...  ...  ...         ...            ...   
691 2025-05-05 09:10:00  2025    5    5           0              0   
692 2025-05-03 08:39:00  2025    5    3           5              1   
693 2025-05-02 11:58:00  2025    5    2           4              0   
694 2025-05-01 11:50:00  2025    5    1           3              0   
695 2025-05-04 11:00:00  2025    5    4           6              1   

     demanda_maxima  disponibilidad_total  afectacion_predicha  \
0            2600.0                3109.0                  NaN   
1            2700.0        

In [11]:
# Variables que vas a usar como entrada (features)
features = [
    'disponibilidad_total', 'disponibilidad_07am', 
    'demanda_07am', 'deficit_real', 'margen_seguridad', 
    'eficiencia_07am', 'utilizacion_07am', 'dia_semana'
]

# Variable a predecir
target = 'demanda_maxima'

# X = entrada, y = salida
X = df[features]
y = df[target]


In [17]:
from sklearn.model_selection import TimeSeriesSplit

# Alternativamente, para un simple train-test split:
train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
y_train = y_train[~y_train.isna()]

# Combina X_train e y_train
train = pd.concat([X_train, y_train], axis=1)

# Elimina cualquier fila con NaN en features o target
train = train.dropna()

# Divide de nuevo
X_train = train.drop(columns=target)
y_train = train[target]


test = pd.concat([X_test, y_test], axis=1)
test = test.dropna()
X_test = test.drop(columns=target)
y_test = test[target]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(542, 8) (542,)
(137, 8) (137,)


In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_percentage_error

models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(),
    'Ridge': Ridge()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # MAPE requiere que y_test no tenga ceros
    try:
        mape = mean_absolute_percentage_error(y_test, y_pred)
    except:
        mape = None

    results[name] = {
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape
    }

    print(f"{name} - MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.3f} | MAPE: {mape:.2%}" if mape is not None else
          f"{name} - MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.3f}")



Random Forest - MAE: 101.92 | RMSE: 117.88 | R2: 0.502 | MAPE: 3.13%
Gradient Boosting - MAE: 88.55 | RMSE: 102.88 | R2: 0.620 | MAPE: 2.72%
SVR - MAE: 272.43 | RMSE: 300.75 | R2: -2.244 | MAPE: 8.34%
Ridge - MAE: 0.00 | RMSE: 0.00 | R2: 1.000 | MAPE: 0.00%


In [20]:
from sklearn.model_selection import GridSearchCV

param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    'SVR': {
        'C': [1.0, 10.0],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    },
    'Ridge': {
        'alpha': [0.1, 1.0, 10.0]
    }
}


from sklearn.model_selection import TimeSeriesSplit

cv = TimeSeriesSplit(n_splits=5)  # Mejor para datos temporales

best_models = {}

for name, model in models.items():
    print(f"🔍 Buscando mejores parámetros para: {name}")
    grid = GridSearchCV(model, param_grids[name], cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid.fit(X_train, y_train)
    
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    best_models[name] = {
        'Best Estimator': best_model,
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape
    }

    print(f"{name} ✅ Best params: {grid.best_params_}")
    print(f"{name} - MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.3f} | MAPE: {mape:.2%}")


🔍 Buscando mejores parámetros para: Random Forest
Random Forest ✅ Best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest - MAE: 101.92 | RMSE: 117.88 | R2: 0.502 | MAPE: 3.13%
🔍 Buscando mejores parámetros para: Gradient Boosting
Gradient Boosting ✅ Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Gradient Boosting - MAE: 81.94 | RMSE: 96.11 | R2: 0.669 | MAPE: 2.51%
🔍 Buscando mejores parámetros para: SVR
SVR ✅ Best params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
SVR - MAE: 0.05 | RMSE: 0.05 | R2: 1.000 | MAPE: 0.00%
🔍 Buscando mejores parámetros para: Ridge
Ridge ✅ Best params: {'alpha': 0.1}
Ridge - MAE: 0.00 | RMSE: 0.00 | R2: 1.000 | MAPE: 0.00%
