In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import timedelta
import joblib

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# =============================================
# Step 1. Load all merged Parquet files
# =============================================
def load_all_merged_data(meteo_dir: Path) -> dict:
    """
    Loads all Parquet files from the given directory into a dictionary,
    using the file stem (plant name) as the key.
    Assumes each file contains a UTC-aware DatetimeIndex and a 'generation' column,
    along with meteorological variables.
    """
    plant_dfs = {}
    for file in meteo_dir.glob("*.parquet"):
        df = pd.read_parquet(file)
        # Ensure the index is a UTC-aware DatetimeIndex
        if not isinstance(df.index, pd.DatetimeIndex):
            df["date"] = pd.to_datetime(df["date"], utc=True)
            df = df.set_index("date")
        plant_name = file.stem  # e.g. "parque_eolico_agua_clara"
        plant_dfs[plant_name] = df
    return plant_dfs

# =============================================
# Step 2. Preprocess each DataFrame
#   a. Drop only the initial consecutive days with zero generation.
#   b. Drop the last 26 hours.
# =============================================
def drop_initial_zero_generation_days(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops the very first consecutive days where the total generation is zero.
    Once a day with non-zero generation is encountered, subsequent days are retained.
    """
    df = df.copy()
    df["date_only"] = df.index.normalize()
    unique_dates = sorted(df["date_only"].unique())
    drop_dates = []
    for date in unique_dates:
        daily_sum = df.loc[df["date_only"] == date, "generation"].sum()
        if daily_sum == 0:
            drop_dates.append(date)
        else:
            break  # Stop after the first day with non-zero generation
    df_clean = df[~df["date_only"].isin(drop_dates)].copy()
    df_clean.drop(columns=["date_only"], inplace=True)
    return df_clean

def drop_last_26_hours(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops rows from the DataFrame where the timestamp is within the last 26 hours.
    """
    last_timestamp = df.index.max()
    cutoff = last_timestamp - timedelta(hours=26)
    return df[df.index <= cutoff]

# =============================================
# Step 3. Grid Search Hyperparameter Tuning for Models
# =============================================
def tune_model_grid(model, param_grid, X_train, y_train, cv=3):
    """
    Performs grid search using GridSearchCV on the given model.
    Returns the best estimator.
    """
    grid_cv = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring="r2",
        n_jobs=-1
    )
    grid_cv.fit(X_train, y_train)
    return grid_cv.best_estimator_

def tune_and_evaluate_models(df: pd.DataFrame, plant: str) -> (dict, dict):
    """
    Given a preprocessed DataFrame for a plant, this function:
      - Drops rows with missing values.
      - Splits data into training (first 80%) and testing (last 20%) sets (time-ordered).
      - Uses all numeric meteorological variables (excluding 'generation') as features.
      - Trains three models:
            * Linear Regression (no tuning)
            * Random Forest (with GridSearchCV)
            * Gradient Boosting (with GridSearchCV)
      - Evaluates each model using Mean Squared Error and R².
    
    Returns two dictionaries:
      - performance: mapping model names to metrics.
      - model_objs: mapping model names to the trained model objects.
    """
    df = df.dropna()
    if df.empty:
        print(f"No data available for {plant} after dropping missing values.")
        return {}, {}
    
    X = df.drop(columns=["generation"])
    y = df["generation"]
    X = X.select_dtypes(include=[np.number])
    
    split_index = int(len(df) * 0.8)
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
    
    performance = {}
    model_objs = {}
    
    # Linear Regression (no tuning)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    r2_lr = r2_score(y_test, y_pred_lr)
    performance["LinearRegression"] = {"MSE": mse_lr, "R2": r2_lr}
    model_objs["LinearRegression"] = lr
    print(f"{plant} - LinearRegression: MSE = {mse_lr:.2f}, R2 = {r2_lr:.2f}")
    
    # Random Forest with GridSearchCV
    rf_param_grid = {
        "n_estimators": [50, 100],
        "max_depth": [5, 10],
        "min_samples_split": [2, 4],
        "min_samples_leaf": [1, 2]
    }
    rf = RandomForestRegressor(random_state=42)
    rf_best = tune_model_grid(rf, rf_param_grid, X_train, y_train, cv=3)
    y_pred_rf = rf_best.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    r2_rf = r2_score(y_test, y_pred_rf)
    performance["RandomForest"] = {"MSE": mse_rf, "R2": r2_rf}
    model_objs["RandomForest"] = rf_best
    print(f"{plant} - RandomForest (tuned): MSE = {mse_rf:.2f}, R2 = {r2_rf:.2f}")
    
    # Gradient Boosting with GridSearchCV
    gb_param_grid = {
        "n_estimators": [50, 100],
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 5],
        "min_samples_split": [2, 4],
        "min_samples_leaf": [1, 2]
    }
    gb = GradientBoostingRegressor(random_state=42)
    gb_best = tune_model_grid(gb, gb_param_grid, X_train, y_train, cv=3)
    y_pred_gb = gb_best.predict(X_test)
    mse_gb = mean_squared_error(y_test, y_pred_gb)
    r2_gb = r2_score(y_test, y_pred_gb)
    performance["GradientBoosting"] = {"MSE": mse_gb, "R2": r2_gb}
    model_objs["GradientBoosting"] = gb_best
    print(f"{plant} - GradientBoosting (tuned): MSE = {mse_gb:.2f}, R2 = {r2_gb:.2f}")
    
    return performance, model_objs

def select_and_save_best_model(performance: dict, model_objs: dict, plant: str, model_dir: Path):
    """
    Selects the best model (highest R²) and saves it to disk using joblib.
    The model is saved with a .h5 extension.
    """
    if not performance:
        print(f"No performance metrics for {plant}. Skipping model saving.")
        return None
    best_model_name = max(performance, key=lambda m: performance[m]["R2"])
    best_model = model_objs[best_model_name]
    model_file = model_dir / f"{plant}_best_model.h5"
    joblib.dump(best_model, model_file)
    print(f"Best model for {plant} is {best_model_name} (R2 = {performance[best_model_name]['R2']:.2f}) and has been saved to {model_file}")
    return best_model_name

# =============================================
# Step 4. Main processing: Load, preprocess, model, and save results
# =============================================
def main():
    # Define paths (adjust as needed)
    meteo_dir = Path("../data/interim/meteo_data_with_generation")
    output_data_dir = Path("../data/interim/meteo_data_with_generation/processed_models")
    output_data_dir.mkdir(parents=True, exist_ok=True)
    model_dir = output_data_dir / "models"
    model_dir.mkdir(parents=True, exist_ok=True)
    
    # Load all merged data from Parquet files
    plant_dfs = load_all_merged_data(meteo_dir)
    
    overall_results = {}
    best_models = {}
    
    for plant, df in plant_dfs.items():
        print(f"\nProcessing plant: {plant}")
        # Step 2a: Drop the initial consecutive days with zero generation
        df_clean = drop_initial_zero_generation_days(df)
        # Step 2b: Drop the last 26 hours (incomplete data)
        df_clean = drop_last_26_hours(df_clean)
        # Save cleaned DataFrame (optional)
        clean_path = output_data_dir / f"{plant}_clean.parquet"
        df_clean.to_parquet(clean_path, index=True)
        print(f"Saved cleaned data for {plant} to {clean_path}")
        
        # Step 3: Train and evaluate models using meteorological features to predict generation
        if not df_clean.empty:
            performance, model_objs = tune_and_evaluate_models(df_clean, plant)
            overall_results[plant] = performance
            best_model_name = select_and_save_best_model(performance, model_objs, plant, model_dir)
            best_models[plant] = best_model_name
        else:
            print(f"No data remains for {plant} after cleaning.")
    
    # Save overall performance metrics to a CSV file for later review
    results_df = pd.DataFrame.from_dict({plant: metrics for plant, metrics in overall_results.items()}, orient="index")
    results_csv = output_data_dir / "model_performance_results.csv"
    results_df.to_csv(results_csv)
    print("\nOverall model performance results:")
    print(results_df)
    print(f"Saved performance metrics to {results_csv}")

# Execute the main function (suitable for a Jupyter Notebook)
main()


Processing plant: parque_solar_girasol
Saved cleaned data for parque_solar_girasol to ..\data\interim\meteo_data_with_generation\processed_models\parque_solar_girasol_clean.parquet
parque_solar_girasol - LinearRegression: MSE = 117.56, R2 = 0.88
parque_solar_girasol - RandomForest (tuned): MSE = 99.95, R2 = 0.90
parque_solar_girasol - GradientBoosting (tuned): MSE = 98.48, R2 = 0.90
Best model for parque_solar_girasol is GradientBoosting (R2 = 0.90) and has been saved to ..\data\interim\meteo_data_with_generation\processed_models\models\parque_solar_girasol_best_model.h5

Overall model performance results:
                                                       LinearRegression  \
parque_solar_girasol  {'MSE': 117.56472067441558, 'R2': 0.8802702746...   

                                                           RandomForest  \
parque_solar_girasol  {'MSE': 99.95428011029138, 'R2': 0.89820501905...   

                                                       GradientBoosting  
parque_so

In [1]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
import joblib
import unicodedata

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

# =============================================
# 1. Load all merged Parquet files
# =============================================
def load_all_merged_data(meteo_dir: Path) -> dict:
    """
    Carga todos los archivos Parquet de un directorio en un diccionario,
    usando el nombre del archivo (sin extensión) como clave.
    Cada archivo debe tener un DatetimeIndex UTC y una columna 'generation',
    además de variables meteorológicas.
    """
    plant_dfs = {}
    for file in meteo_dir.glob("*.parquet"):
        df = pd.read_parquet(file)
        if not isinstance(df.index, pd.DatetimeIndex):
            df["date"] = pd.to_datetime(df["date"], utc=True)
            df = df.set_index("date")
        plant_name = file.stem  # ej.: "parque_eolico_agua_clara"
        plant_dfs[plant_name] = df
    return plant_dfs

# =============================================
# 2. Preprocess each DataFrame
#    a. Drop initial consecutive days with zero generation
#    b. Drop the last 26 hours
# =============================================
def drop_initial_zero_generation_days(df: pd.DataFrame) -> pd.DataFrame:
    """
    Elimina los primeros días consecutivos donde la generación total es cero.
    Una vez que se encuentra un día con generación no nula, se conservan los días siguientes.
    """
    df = df.copy()
    df["date_only"] = df.index.normalize()
    unique_dates = sorted(df["date_only"].unique())
    drop_dates = []
    for date in unique_dates:
        daily_sum = df.loc[df["date_only"] == date, "generation"].sum()
        if daily_sum == 0:
            drop_dates.append(date)
        else:
            break
    df_clean = df[~df["date_only"].isin(drop_dates)].copy()
    df_clean.drop(columns=["date_only"], inplace=True)
    return df_clean

def drop_last_26_hours(df: pd.DataFrame) -> pd.DataFrame:
    """
    Elimina filas cuyo timestamp esté dentro de las últimas 26 horas.
    """
    if df.empty:
        return df
    last_timestamp = df.index.max()
    cutoff = last_timestamp - datetime.timedelta(hours=26)
    return df[df.index <= cutoff]

# =============================================
# 2.1: Feature Engineering: Lags y Promedios Móviles
# =============================================
def add_lag_features(df: pd.DataFrame, cols, lags=[1,2,3]) -> pd.DataFrame:
    """
    Crea variables de retardo para las columnas indicadas.
    Ejemplo: para 'temperature_2m' y lag=1, crea 'temperature_2m_lag1'.
    """
    for col in cols:
        for lag in lags:
            df[f"{col}_lag{lag}"] = df[col].shift(lag)
    return df

def add_moving_average_features(df: pd.DataFrame, cols, windows=[3,6]) -> pd.DataFrame:
    """
    Crea variables de promedio móvil para las columnas indicadas.
    Ejemplo: para 'temperature_2m' y ventana=3, crea 'temperature_2m_ma3'.
    """
    for col in cols:
        for w in windows:
            df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
    return df

# =============================================
# 2.2: Scaling function (StandardScaler)
# =============================================
def scale_features(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)
    return X_train_scaled, X_test_scaled, scaler

# =============================================
# 3. Model Tuning (classical models) + LSTM
# =============================================

# Parámetros para grid search de modelos clásicos
RF_PARAM_GRID = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2]
}
GB_PARAM_GRID = {
    "n_estimators": [50, 100],
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 5],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2]
}
XGB_PARAM_GRID = {
    "n_estimators": [50, 100],
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.8, 1.0]
}
LGB_PARAM_GRID = {
    "n_estimators": [50, 100],
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "num_leaves": [31, 50]
}
SVR_PARAM_GRID = {
    "C": [0.1, 1, 10],
    "epsilon": [0.01, 0.1, 1],
    "kernel": ["rbf", "linear"]
}

def grid_search_model(model, param_grid, X_train, y_train, cv=3):
    grid_cv = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring="r2",
        n_jobs=-1
    )
    grid_cv.fit(X_train, y_train)
    return grid_cv.best_estimator_

def create_lstm_sequences(X, y, look_back=24):
    X_seq, y_seq = [], []
    for i in range(look_back, len(X)):
        X_seq.append(X.iloc[i-look_back:i].values)
        y_seq.append(y.iloc[i])
    return np.array(X_seq), np.array(y_seq)

def build_lstm_model(input_shape, units=50, dropout_rate=0.2):
    model = Sequential()
    model.add(LSTM(units, input_shape=input_shape, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mse")
    return model

# Función para crear un ensemble de stacking (con grid search en el meta-modelo)
def stacking_ensemble(X_train_scaled, y_train, X_test_scaled, y_test, model_objs):
    from sklearn.ensemble import StackingRegressor
    # Seleccionamos algunos modelos base (ajusta según tus resultados)
    estimators = []
    for name in ["RandomForest", "GradientBoosting", "XGBoost", "LightGBM", "SVR"]:
        if name in model_objs:
            estimators.append((name, model_objs[name]))
    if not estimators:
        print("No hay modelos base para el ensemble.")
        return None, None, None
    # Meta-modelo: usaremos Ridge y afinaremos su parámetro alpha con grid search
    base_meta = Ridge()
    ensemble = StackingRegressor(
        estimators=estimators,
        final_estimator=base_meta,
        cv=3,
        n_jobs=-1
    )
    # Grid search para afinar el meta-modelo
    param_grid = {
        "final_estimator__alpha": [0.1, 1, 10]
    }
    grid = GridSearchCV(ensemble, param_grid=param_grid, cv=3, scoring="r2", n_jobs=-1)
    grid.fit(X_train_scaled, y_train)
    ensemble_best = grid.best_estimator_
    y_pred_ensemble = ensemble_best.predict(X_test_scaled)
    mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
    r2_ensemble = r2_score(y_test, y_pred_ensemble)
    print(f"Stacking Ensemble: MSE={mse_ensemble:.2f}, R2={r2_ensemble:.2f}")
    return ensemble_best, mse_ensemble, r2_ensemble

def tune_and_evaluate_models(df: pd.DataFrame, plant: str):
    """
    Para el DataFrame dado:
      - Se eliminan filas con valores faltantes.
      - Se separan las características y la variable objetivo.
      - Se divide el dataset en entrenamiento (80%) y prueba (20%) de forma temporal.
      - Se escalán las características (StandardScaler).
      - Se entrenan y afinan varios modelos clásicos (LR, RF, GB, XGB, LGB, SVR) mediante grid search.
      - También se entrena un modelo LSTM simple usando secuencias (con los datos escalados, sin PCA).
      - Además se crea un ensemble stacking que combina los modelos clásicos, afinando el meta-modelo.
      - Retorna métricas (MSE, R²) y los modelos entrenados.
    """
    results = {}
    model_objs = {}

    df = df.dropna()
    if df.empty:
        print(f"No hay datos disponibles para {plant} después de eliminar NaN.")
        return {}, {}

    # Separar características y target
    X = df.drop(columns=["generation"])
    y = df["generation"]
    X = X.select_dtypes(include=[np.number])
    
    split_index = int(len(df) * 0.8)
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
    
    if X_train.empty or X_test.empty:
        print(f"Datos insuficientes para {plant} después de dividir.")
        return {}, {}

    # Escalar características
    X_train_scaled, X_test_scaled, _ = scale_features(X_train, X_test)
    
    # --- 3.1: Linear Regression ---
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    y_pred_lr = lr.predict(X_test_scaled)
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    r2_lr = r2_score(y_test, y_pred_lr)
    results["LinearRegression"] = {"MSE": mse_lr, "R2": r2_lr}
    model_objs["LinearRegression"] = lr
    print(f"{plant} - LinearRegression: MSE={mse_lr:.2f}, R2={r2_lr:.2f}")
    
    # --- 3.2: RandomForest (GridSearchCV) ---
    rf = RandomForestRegressor(random_state=42)
    rf_best = grid_search_model(rf, RF_PARAM_GRID, X_train_scaled, y_train)
    y_pred_rf = rf_best.predict(X_test_scaled)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    r2_rf = r2_score(y_test, y_pred_rf)
    results["RandomForest"] = {"MSE": mse_rf, "R2": r2_rf}
    model_objs["RandomForest"] = rf_best
    print(f"{plant} - RandomForest (tuned): MSE={mse_rf:.2f}, R2={r2_rf:.2f}")
    
    # --- 3.3: GradientBoosting (GridSearchCV) ---
    gb = GradientBoostingRegressor(random_state=42)
    gb_best = grid_search_model(gb, GB_PARAM_GRID, X_train_scaled, y_train)
    y_pred_gb = gb_best.predict(X_test_scaled)
    mse_gb = mean_squared_error(y_test, y_pred_gb)
    r2_gb = r2_score(y_test, y_pred_gb)
    results["GradientBoosting"] = {"MSE": mse_gb, "R2": r2_gb}
    model_objs["GradientBoosting"] = gb_best
    print(f"{plant} - GradientBoosting (tuned): MSE={mse_gb:.2f}, R2={r2_gb:.2f}")
    
    # --- 3.4: XGBoost (GridSearchCV) ---
    xgb = XGBRegressor(random_state=42, objective="reg:squarederror", verbosity=0)
    xgb_best = grid_search_model(xgb, XGB_PARAM_GRID, X_train_scaled, y_train)
    y_pred_xgb = xgb_best.predict(X_test_scaled)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    r2_xgb = r2_score(y_test, y_pred_xgb)
    results["XGBoost"] = {"MSE": mse_xgb, "R2": r2_xgb}
    model_objs["XGBoost"] = xgb_best
    print(f"{plant} - XGBoost (tuned): MSE={mse_xgb:.2f}, R2={r2_xgb:.2f}")
    
    # --- 3.5: LightGBM (GridSearchCV) ---
    lgb = LGBMRegressor(random_state=42)
    lgb_best = grid_search_model(lgb, LGB_PARAM_GRID, X_train_scaled, y_train)
    y_pred_lgb = lgb_best.predict(X_test_scaled)
    mse_lgb = mean_squared_error(y_test, y_pred_lgb)
    r2_lgb = r2_score(y_test, y_pred_lgb)
    results["LightGBM"] = {"MSE": mse_lgb, "R2": r2_lgb}
    model_objs["LightGBM"] = lgb_best
    print(f"{plant} - LightGBM (tuned): MSE={mse_lgb:.2f}, R2={r2_lgb:.2f}")
    
    # --- 3.6: SVR (GridSearchCV) ---
    svr = SVR()
    svr_best = grid_search_model(svr, SVR_PARAM_GRID, X_train_scaled, y_train)
    y_pred_svr = svr_best.predict(X_test_scaled)
    mse_svr = mean_squared_error(y_test, y_pred_svr)
    r2_svr = r2_score(y_test, y_pred_svr)
    results["SVR"] = {"MSE": mse_svr, "R2": r2_svr}
    model_objs["SVR"] = svr_best
    print(f"{plant} - SVR (tuned): MSE={mse_svr:.2f}, R2={r2_svr:.2f}")
    
    # --- 3.7: LSTM (simple) ---
    # Para LSTM se usan los datos escalados originales (sin PCA) para mantener la secuencia temporal.
    X_train_seq, y_train_seq = create_lstm_sequences(X_train_scaled, y_train, look_back=24)
    X_test_seq, y_test_seq = create_lstm_sequences(X_test_scaled, y_test, look_back=24)
    
    if len(X_train_seq) > 0 and len(X_test_seq) > 0:
        input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])
        lstm_model = build_lstm_model(input_shape, units=50, dropout_rate=0.2)
        es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        lstm_model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=32,
                       validation_split=0.2, callbacks=[es], verbose=0)
        y_pred_lstm = lstm_model.predict(X_test_seq).flatten()
        mse_lstm = mean_squared_error(y_test_seq, y_pred_lstm)
        r2_lstm = r2_score(y_test_seq, y_pred_lstm)
        results["LSTM"] = {"MSE": mse_lstm, "R2": r2_lstm}
        model_objs["LSTM"] = lstm_model
        print(f"{plant} - LSTM: MSE={mse_lstm:.2f}, R2={r2_lstm:.2f}")
    else:
        print(f"{plant} - Not enough data for LSTM sequences.")
    
    # --- 3.8: Ensemble Stacking ---
    ensemble_model, mse_ens, r2_ens = stacking_ensemble(X_train_scaled, y_train, X_test_scaled, y_test, model_objs)
    if ensemble_model is not None:
        results["StackingEnsemble"] = {"MSE": mse_ens, "R2": r2_ens}
        model_objs["StackingEnsemble"] = ensemble_model
    
    return results, model_objs

def select_and_save_best_model(performance: dict, model_objs: dict, plant: str, model_dir: Path):
    """
    Selecciona el mejor modelo clásico (excluyendo LSTM y ensemble) en base a R² y lo guarda en disco con extensión .h5.
    """
    if not performance:
        print(f"No hay métricas de desempeño para {plant}. Se omite el guardado del modelo.")
        return None
    
    # Consideramos solo modelos clásicos (excluyendo LSTM y el ensemble)
    classical_models = {m: performance[m] for m in performance if m not in ["LSTM", "StackingEnsemble"]}
    if not classical_models:
        print(f"No hay modelos clásicos para guardar para {plant}.")
        return None
    
    best_model_name = max(classical_models, key=lambda m: classical_models[m]["R2"])
    best_model = model_objs[best_model_name]
    model_file = model_dir / f"{plant}_best_model.h5"
    joblib.dump(best_model, model_file)
    print(f"El mejor modelo clásico para {plant} es {best_model_name} (R2 = {classical_models[best_model_name]['R2']:.2f})")
    print(f"Guardado en {model_file}")
    return best_model_name

# =============================================
# 4. Main processing: Load, preprocess, tune, evaluate, and save results
# =============================================
def main():
    meteo_dir = Path("../data/interim/meteo_data_with_generation")
    output_data_dir = Path("../data/interim/meteo_data_with_generation/processed_models")
    output_data_dir.mkdir(parents=True, exist_ok=True)
    model_dir = output_data_dir / "models"
    model_dir.mkdir(parents=True, exist_ok=True)
    
    # 1. Load all merged data from Parquet files
    plant_dfs = load_all_merged_data(meteo_dir)
    
    overall_results = {}
    best_models = {}
    
    # Parámetros para lags y promedios móviles
    lags = [1, 2, 3]         # Ajusta según tu análisis
    ma_windows = [3, 6]      # Ajusta según tu análisis
    
    for plant, df in plant_dfs.items():
        print(f"\nProcessing plant: {plant}")
        # 2a: Eliminar días iniciales de generación cero
        df_clean = drop_initial_zero_generation_days(df)
        # 2b: Eliminar las últimas 26 horas
        df_clean = drop_last_26_hours(df_clean)
        
        # Agregar variables de retardo y promedios móviles
        numeric_cols = [c for c in df_clean.columns if c != "generation" and np.issubdtype(df_clean[c].dtype, np.number)]
        df_clean = add_lag_features(df_clean, numeric_cols, lags=lags)
        df_clean = add_moving_average_features(df_clean, numeric_cols, windows=ma_windows)
        
        # Quitar filas con NaN generadas por shift/rolling
        df_clean.dropna(inplace=True)
        
        # Guardar datos limpios y aumentados
        clean_path = output_data_dir / f"{plant}_clean.parquet"
        df_clean.to_parquet(clean_path, index=True)
        print(f"Saved cleaned+augmented data for {plant} to {clean_path}")
        
        if df_clean.empty:
            print(f"No data remains for {plant} after feature engineering.")
            continue
        
        # 3. Tune & evaluate models (incluye LSTM y ensemble stacking)
        performance, model_objs = tune_and_evaluate_models(df_clean, plant)
        if not performance:
            print(f"No performance metrics for {plant}. Skipping model saving.")
            continue
        
        overall_results[plant] = performance
        best_model_name = select_and_save_best_model(performance, model_objs, plant, model_dir)
        best_models[plant] = best_model_name
    
    results_df = pd.DataFrame.from_dict({plant: metrics for plant, metrics in overall_results.items()}, orient="index")
    results_csv = output_data_dir / "model_performance_results.csv"
    results_df.to_csv(results_csv)
    print("\nOverall model performance results:")
    print(results_df)
    print(f"Saved performance metrics to {results_csv}")

if __name__ == "__main__":
    main()


Processing plant: parque_solar_girasol


  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f

Saved cleaned+augmented data for parque_solar_girasol to ..\data\interim\meteo_data_with_generation\processed_models\parque_solar_girasol_clean.parquet
parque_solar_girasol - LinearRegression: MSE=101.82, R2=0.90
parque_solar_girasol - RandomForest (tuned): MSE=96.50, R2=0.90
parque_solar_girasol - GradientBoosting (tuned): MSE=94.32, R2=0.90
parque_solar_girasol - XGBoost (tuned): MSE=94.15, R2=0.90


[WinError 2] The system cannot find the file specified
  File "c:\Users\ferna\Documents\Desktop\11 - Masters\00 - Master AI\99 - Proyecto Final\energy-generation-prediction-dashboard\my_environment\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\ferna\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ferna\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\ferna\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34303
[LightGBM] [Info] Number of data points in the train set: 24783, number of used features: 150
[LightGBM] [Info] Start training from score 25.366749
parque_solar_girasol - LightGBM (tuned): MSE=93.85, R2=0.90
parque_solar_girasol - SVR (tuned): MSE=103.74, R2=0.89


  super().__init__(**kwargs)


[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
parque_solar_girasol - LSTM: MSE=109.78, R2=0.89
Stacking Ensemble: MSE=93.40, R2=0.90
El mejor modelo clásico para parque_solar_girasol es LightGBM (R2 = 0.90)
Guardado en ..\data\interim\meteo_data_with_generation\processed_models\models\parque_solar_girasol_best_model.h5

Overall model performance results:
                                                       LinearRegression  \
parque_solar_girasol  {'MSE': 101.82041584145682, 'R2': 0.8963179219...   

                                                           RandomForest  \
parque_solar_girasol  {'MSE': 96.50003689553715, 'R2': 0.90173557749...   

                                                       GradientBoosting  \
parque_solar_girasol  {'MSE': 94.31935660359805, 'R2': 0.90395612887...   

                                                                XGBoost  \
parque_solar_girasol  {'MSE': 94.14829237258422, 'R2': 0.90413032080...   

          

In [2]:
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
import joblib
import unicodedata

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

# =============================================
# 1. Load all merged Parquet files
# =============================================
def load_all_merged_data(meteo_dir: Path) -> dict:
    """
    Loads all Parquet files from the given directory into a dictionary,
    using the file stem (plant name) as the key.
    Each file must have a UTC-aware DatetimeIndex and a 'generation' column,
    plus meteorological variables.
    """
    plant_dfs = {}
    for file in meteo_dir.glob("*.parquet"):
        df = pd.read_parquet(file)
        if not isinstance(df.index, pd.DatetimeIndex):
            df["date"] = pd.to_datetime(df["date"], utc=True)
            df = df.set_index("date")
        plant_name = file.stem  # e.g. "parque_eolico_agua_clara"
        plant_dfs[plant_name] = df
    return plant_dfs

# =============================================
# 2. Preprocess each DataFrame
#    a. Drop initial consecutive days with zero generation
#    b. Drop the last 26 hours
# =============================================
def drop_initial_zero_generation_days(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops the very first consecutive days where total generation is zero.
    Once a day with non-zero generation is encountered, subsequent days remain.
    """
    df = df.copy()
    df["date_only"] = df.index.normalize()
    unique_dates = sorted(df["date_only"].unique())
    drop_dates = []
    for date in unique_dates:
        daily_sum = df.loc[df["date_only"] == date, "generation"].sum()
        if daily_sum == 0:
            drop_dates.append(date)
        else:
            break
    df_clean = df[~df["date_only"].isin(drop_dates)].copy()
    df_clean.drop(columns=["date_only"], inplace=True)
    return df_clean

def drop_last_26_hours(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops rows where the timestamp is within the last 26 hours.
    """
    if df.empty:
        return df
    last_timestamp = df.index.max()
    cutoff = last_timestamp - datetime.timedelta(hours=26)
    return df[df.index <= cutoff]

# =============================================
# 2.1: Additional Feature Engineering Functions
# =============================================
def add_lag_features(df: pd.DataFrame, cols, lags=[1, 2, 3]) -> pd.DataFrame:
    """
    Creates lag features for the given columns.
    Example: for 'temperature_2m' and lag=1, creates 'temperature_2m_lag1'.
    """
    for col in cols:
        for lag in lags:
            df[f"{col}_lag{lag}"] = df[col].shift(lag)
    return df

def add_moving_average_features(df: pd.DataFrame, cols, windows=[3, 6]) -> pd.DataFrame:
    """
    Creates moving average features for the given columns.
    Example: for 'temperature_2m' and window=3, creates 'temperature_2m_ma3'.
    """
    for col in cols:
        for w in windows:
            df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
    return df

def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds cyclical temporal features: hour, day of week, and month (using sine and cosine).
    """
    df = df.copy()
    df["hour"] = df.index.hour
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["day_of_week"] = df.index.dayofweek
    df["dow_sin"] = np.sin(2 * np.pi * df["day_of_week"] / 7)
    df["dow_cos"] = np.cos(2 * np.pi * df["day_of_week"] / 7)
    df["month"] = df.index.month
    df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
    df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
    return df

def add_differencing_feature(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Adds a feature that represents the difference between the current value
    and the previous value of the specified column.
    """
    df = df.copy()
    df[f"{col}_diff"] = df[col].diff()
    return df

# =============================================
# 2.2: Scaling function (StandardScaler)
# =============================================
def scale_features(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)
    return X_train_scaled, X_test_scaled, scaler

# =============================================
# 3. Model Tuning (classical models) + LSTM + Ensemble Stacking
# =============================================

# Parameter grids for classical models
RF_PARAM_GRID = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2]
}
GB_PARAM_GRID = {
    "n_estimators": [50, 100],
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 5],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2]
}
XGB_PARAM_GRID = {
    "n_estimators": [50, 100],
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.8, 1.0]
}
LGB_PARAM_GRID = {
    "n_estimators": [50, 100],
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
    "num_leaves": [31, 50]
}
SVR_PARAM_GRID = {
    "C": [0.1, 1, 10],
    "epsilon": [0.01, 0.1, 1],
    "kernel": ["rbf", "linear"]
}

def grid_search_model(model, param_grid, X_train, y_train, cv=3):
    grid_cv = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring="r2",
        n_jobs=2  # Controlamos la paralelización para usar menos recursos
    )
    grid_cv.fit(X_train, y_train)
    return grid_cv.best_estimator_

def create_lstm_sequences(X, y, look_back=24):
    X_seq, y_seq = [], []
    for i in range(look_back, len(X)):
        X_seq.append(X.iloc[i-look_back:i].values)
        y_seq.append(y.iloc[i])
    return np.array(X_seq), np.array(y_seq)

def build_lstm_model(input_shape, units=50, dropout_rate=0.2):
    model = Sequential()
    model.add(LSTM(units, input_shape=input_shape, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mse")
    return model

def stacking_ensemble(X_train_scaled, y_train, X_test_scaled, y_test, model_objs):
    from sklearn.ensemble import StackingRegressor
    # Seleccionar modelos base a combinar
    estimators = []
    for name in ["RandomForest", "GradientBoosting", "XGBoost", "LightGBM", "SVR"]:
        if name in model_objs:
            estimators.append((name, model_objs[name]))
    if not estimators:
        print("No hay modelos base para el ensemble.")
        return None, None, None
    # Meta-modelo: usamos Ridge y afinamos su parámetro alpha con grid search
    base_meta = Ridge()
    ensemble = StackingRegressor(
        estimators=estimators,
        final_estimator=base_meta,
        cv=3,
        n_jobs=2  # Paralelización controlada
    )
    param_grid = {"final_estimator__alpha": [0.1, 1, 10]}
    grid = GridSearchCV(ensemble, param_grid=param_grid, cv=3, scoring="r2", n_jobs=2)
    grid.fit(X_train_scaled, y_train)
    ensemble_best = grid.best_estimator_
    y_pred_ensemble = ensemble_best.predict(X_test_scaled)
    mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
    r2_ensemble = r2_score(y_test, y_pred_ensemble)
    print(f"Stacking Ensemble: MSE={mse_ensemble:.2f}, R2={r2_ensemble:.2f}")
    return ensemble_best, mse_ensemble, r2_ensemble

def tune_and_evaluate_models(df: pd.DataFrame, plant: str):
    """
    Para el DataFrame dado:
      - Se eliminan filas con valores faltantes.
      - Se agregan nuevas características: temporales cíclicas, lags, promedios móviles y diferenciación.
      - Se separan las características y la variable objetivo.
      - Se divide el dataset en entrenamiento (80%) y prueba (20%) en orden temporal.
      - Se escalán las características usando StandardScaler.
      - Se entrenan y afinan modelos clásicos (LR, RF, GB, XGB, LGB, SVR) mediante grid search.
      - Se entrena un modelo LSTM simple usando secuencias (con los datos escalados originales).
      - Se crea un ensemble stacking que combina los modelos clásicos, afinando el meta-modelo.
      - Retorna métricas (MSE, R²) y los modelos entrenados.
    """
    results = {}
    model_objs = {}

    df = df.dropna()
    if df.empty:
        print(f"No hay datos disponibles para {plant} después de eliminar NaN.")
        return {}, {}

    # Agregar características temporales cíclicas
    df = add_temporal_features(df)
    # Agregar variables de retardo y promedios móviles a todas las columnas numéricas (excepto generation)
    numeric_cols = [c for c in df.columns if c != "generation" and np.issubdtype(df[c].dtype, np.number)]
    df = add_lag_features(df, numeric_cols, lags=[1, 2, 3])
    df = add_moving_average_features(df, numeric_cols, windows=[3, 6])
    # Agregar diferenciación de la generación
    df = add_differencing_feature(df, "generation")
    
    # Eliminar filas con NaN generados por shift, rolling o differencing
    df.dropna(inplace=True)
    
    # Separar características y target
    X = df.drop(columns=["generation"])
    y = df["generation"]
    X = X.select_dtypes(include=[np.number])
    
    split_index = int(len(df) * 0.8)
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
    
    if X_train.empty or X_test.empty:
        print(f"Datos insuficientes para {plant} después de dividir.")
        return {}, {}

    # Escalar características
    X_train_scaled, X_test_scaled, _ = scale_features(X_train, X_test)
    
    # --- 3.1: Linear Regression ---
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    y_pred_lr = lr.predict(X_test_scaled)
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    r2_lr = r2_score(y_test, y_pred_lr)
    results["LinearRegression"] = {"MSE": mse_lr, "R2": r2_lr}
    model_objs["LinearRegression"] = lr
    print(f"{plant} - LinearRegression: MSE={mse_lr:.2f}, R2={r2_lr:.2f}")
    
    # --- 3.2: RandomForest (GridSearchCV) ---
    rf = RandomForestRegressor(random_state=42)
    rf_best = grid_search_model(rf, RF_PARAM_GRID, X_train_scaled, y_train)
    y_pred_rf = rf_best.predict(X_test_scaled)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    r2_rf = r2_score(y_test, y_pred_rf)
    results["RandomForest"] = {"MSE": mse_rf, "R2": r2_rf}
    model_objs["RandomForest"] = rf_best
    print(f"{plant} - RandomForest (tuned): MSE={mse_rf:.2f}, R2={r2_rf:.2f}")
    
    # --- 3.3: GradientBoosting (GridSearchCV) ---
    gb = GradientBoostingRegressor(random_state=42)
    gb_best = grid_search_model(gb, GB_PARAM_GRID, X_train_scaled, y_train)
    y_pred_gb = gb_best.predict(X_test_scaled)
    mse_gb = mean_squared_error(y_test, y_pred_gb)
    r2_gb = r2_score(y_test, y_pred_gb)
    results["GradientBoosting"] = {"MSE": mse_gb, "R2": r2_gb}
    model_objs["GradientBoosting"] = gb_best
    print(f"{plant} - GradientBoosting (tuned): MSE={mse_gb:.2f}, R2={r2_gb:.2f}")
    
    # --- 3.4: XGBoost (GridSearchCV) ---
    xgb = XGBRegressor(random_state=42, objective="reg:squarederror", verbosity=0)
    xgb_best = grid_search_model(xgb, XGB_PARAM_GRID, X_train_scaled, y_train)
    y_pred_xgb = xgb_best.predict(X_test_scaled)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    r2_xgb = r2_score(y_test, y_pred_xgb)
    results["XGBoost"] = {"MSE": mse_xgb, "R2": r2_xgb}
    model_objs["XGBoost"] = xgb_best
    print(f"{plant} - XGBoost (tuned): MSE={mse_xgb:.2f}, R2={r2_xgb:.2f}")
    
    # --- 3.5: LightGBM (GridSearchCV) ---
    lgb = LGBMRegressor(random_state=42)
    lgb_best = grid_search_model(lgb, LGB_PARAM_GRID, X_train_scaled, y_train)
    y_pred_lgb = lgb_best.predict(X_test_scaled)
    mse_lgb = mean_squared_error(y_test, y_pred_lgb)
    r2_lgb = r2_score(y_test, y_pred_lgb)
    results["LightGBM"] = {"MSE": mse_lgb, "R2": r2_lgb}
    model_objs["LightGBM"] = lgb_best
    print(f"{plant} - LightGBM (tuned): MSE={mse_lgb:.2f}, R2={r2_lgb:.2f}")
    
    # --- 3.6: SVR (GridSearchCV) ---
    svr = SVR()
    svr_best = grid_search_model(svr, SVR_PARAM_GRID, X_train_scaled, y_train)
    y_pred_svr = svr_best.predict(X_test_scaled)
    mse_svr = mean_squared_error(y_test, y_pred_svr)
    r2_svr = r2_score(y_test, y_pred_svr)
    results["SVR"] = {"MSE": mse_svr, "R2": r2_svr}
    model_objs["SVR"] = svr_best
    print(f"{plant} - SVR (tuned): MSE={mse_svr:.2f}, R2={r2_svr:.2f}")
    
    # --- 3.7: LSTM (simple) ---
    # Para LSTM se usan los datos escalados originales (sin PCA) para mantener la secuencia temporal.
    X_train_seq, y_train_seq = create_lstm_sequences(X_train_scaled, y_train, look_back=24)
    X_test_seq, y_test_seq = create_lstm_sequences(X_test_scaled, y_test, look_back=24)
    
    if len(X_train_seq) > 0 and len(X_test_seq) > 0:
        input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])
        lstm_model = build_lstm_model(input_shape, units=50, dropout_rate=0.2)
        es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        lstm_model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=32,
                       validation_split=0.2, callbacks=[es], verbose=0)
        y_pred_lstm = lstm_model.predict(X_test_seq).flatten()
        mse_lstm = mean_squared_error(y_test_seq, y_pred_lstm)
        r2_lstm = r2_score(y_test_seq, y_pred_lstm)
        results["LSTM"] = {"MSE": mse_lstm, "R2": r2_lstm}
        model_objs["LSTM"] = lstm_model
        print(f"{plant} - LSTM: MSE={mse_lstm:.2f}, R2={r2_lstm:.2f}")
    else:
        print(f"{plant} - Not enough data for LSTM sequences.")
    
    # --- 3.8: Ensemble Stacking ---
    ensemble_model, mse_ens, r2_ens = stacking_ensemble(X_train_scaled, y_train, X_test_scaled, y_test, model_objs)
    if ensemble_model is not None:
        results["StackingEnsemble"] = {"MSE": mse_ens, "R2": r2_ens}
        model_objs["StackingEnsemble"] = ensemble_model
    
    return results, model_objs

def select_and_save_best_model(performance: dict, model_objs: dict, plant: str, model_dir: Path):
    """
    Selecciona el mejor modelo clásico (excluyendo LSTM y ensemble) en base a R² y lo guarda en disco con extensión .h5.
    """
    if not performance:
        print(f"No hay métricas de desempeño para {plant}. Se omite el guardado del modelo.")
        return None
    
    classical_models = {m: performance[m] for m in performance if m not in ["LSTM", "StackingEnsemble"]}
    if not classical_models:
        print(f"No hay modelos clásicos para guardar para {plant}.")
        return None
    
    best_model_name = max(classical_models, key=lambda m: classical_models[m]["R2"])
    best_model = model_objs[best_model_name]
    model_file = model_dir / f"{plant}_best_model.h5"
    joblib.dump(best_model, model_file)
    print(f"El mejor modelo clásico para {plant} es {best_model_name} (R2 = {classical_models[best_model_name]['R2']:.2f})")
    print(f"Guardado en {model_file}")
    return best_model_name

# =============================================
# 4. Main processing: Load, preprocess, tune, evaluate, and save results
# =============================================
def main():
    meteo_dir = Path("../data/interim/meteo_data_with_generation")
    output_data_dir = Path("../data/interim/meteo_data_with_generation/processed_models")
    output_data_dir.mkdir(parents=True, exist_ok=True)
    model_dir = output_data_dir / "models"
    model_dir.mkdir(parents=True, exist_ok=True)
    
    # 1. Cargar todos los datos fusionados
    plant_dfs = load_all_merged_data(meteo_dir)
    
    overall_results = {}
    best_models = {}
    
    # Parámetros para lags y promedios móviles
    lags = [1, 2, 3]         # Ajusta según tu análisis
    ma_windows = [3, 6]        # Ajusta según tu análisis
    
    for plant, df in plant_dfs.items():
        print(f"\nProcessing plant: {plant}")
        # 2a: Eliminar días iniciales sin generación
        df_clean = drop_initial_zero_generation_days(df)
        # 2b: Eliminar las últimas 26 horas
        df_clean = drop_last_26_hours(df_clean)
        
        # Agregar características temporales cíclicas
        df_clean = add_temporal_features(df_clean)
        # Agregar variables de retardo y promedios móviles
        numeric_cols = [c for c in df_clean.columns if c != "generation" and np.issubdtype(df_clean[c].dtype, np.number)]
        df_clean = add_lag_features(df_clean, numeric_cols, lags=lags)
        df_clean = add_moving_average_features(df_clean, numeric_cols, windows=ma_windows)
        # Agregar la diferenciación de la generación
        df_clean = add_differencing_feature(df_clean, "generation")
        
        # Quitar filas con NaN generadas por shift, rolling o differencing
        df_clean.dropna(inplace=True)
        
        # Guardar datos limpios y aumentados
        clean_path = output_data_dir / f"{plant}_clean.parquet"
        df_clean.to_parquet(clean_path, index=True)
        print(f"Saved cleaned+augmented data for {plant} to {clean_path}")
        
        if df_clean.empty:
            print(f"No data remains for {plant} after feature engineering.")
            continue
        
        # 3: Afinar y evaluar modelos (incluye LSTM y ensemble stacking)
        performance, model_objs = tune_and_evaluate_models(df_clean, plant)
        if not performance:
            print(f"No performance metrics for {plant}. Skipping model saving.")
            continue
        
        overall_results[plant] = performance
        best_model_name = select_and_save_best_model(performance, model_objs, plant, model_dir)
        best_models[plant] = best_model_name
    
    results_df = pd.DataFrame.from_dict({plant: metrics for plant, metrics in overall_results.items()}, orient="index")
    results_csv = output_data_dir / "model_performance_results.csv"
    results_df.to_csv(results_csv)
    print("\nOverall model performance results:")
    print(results_df)
    print(f"Saved performance metrics to {results_csv}")

if __name__ == "__main__":
    main()


Processing plant: parque_solar_girasol


  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods=1).mean()
  df[f"{col}_ma{w}"] = df[col].rolling(window=w, min_periods

Saved cleaned+augmented data for parque_solar_girasol to ..\data\interim\meteo_data_with_generation\processed_models\parque_solar_girasol_clean.parquet


  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{col}_lag{lag}"] = df[col].shift(lag)
  df[f"{co

parque_solar_girasol - LinearRegression: MSE=36.27, R2=0.96
parque_solar_girasol - RandomForest (tuned): MSE=15.51, R2=0.98
parque_solar_girasol - GradientBoosting (tuned): MSE=13.22, R2=0.99
parque_solar_girasol - XGBoost (tuned): MSE=12.79, R2=0.99


[WinError 2] The system cannot find the file specified
  File "c:\Users\ferna\Documents\Desktop\11 - Masters\00 - Master AI\99 - Proyecto Final\energy-generation-prediction-dashboard\my_environment\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\ferna\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ferna\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\ferna\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193789
[LightGBM] [Info] Number of data points in the train set: 24778, number of used features: 1060
[LightGBM] [Info] Start training from score 25.367652
parque_solar_girasol - LightGBM (tuned): MSE=13.62, R2=0.99
parque_solar_girasol - SVR (tuned): MSE=39.25, R2=0.96


  super().__init__(**kwargs)


[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
parque_solar_girasol - LSTM: MSE=72.72, R2=0.93
Stacking Ensemble: MSE=12.93, R2=0.99
El mejor modelo clásico para parque_solar_girasol es XGBoost (R2 = 0.99)
Guardado en ..\data\interim\meteo_data_with_generation\processed_models\models\parque_solar_girasol_best_model.h5

Overall model performance results:
                                                       LinearRegression  \
parque_solar_girasol  {'MSE': 36.267017012809276, 'R2': 0.9630725389...   

                                                           RandomForest  \
parque_solar_girasol  {'MSE': 15.508413443380011, 'R2': 0.9842091690...   

                                                       GradientBoosting  \
parque_solar_girasol  {'MSE': 13.222887949187195, 'R2': 0.9865363153...   

                                                                XGBoost  \
parque_solar_girasol  {'MSE': 12.793333141201018, 'R2': 0.9869736926...   

            