In [1]:
import pandas as pd
from pathlib import Path

# Filepaths (adjust if needed)
GEN_PATH   = Path("../data/interim/post_despacho_transformed.parquet")
METEO_PATH = Path("../data/raw/open_mete_data/parque_eolico_agua_clara.parquet")

# 1️⃣ Load generation data
df_gen = pd.read_parquet(GEN_PATH)
df_gen["timestamp"] = pd.to_datetime(df_gen["timestamp"], utc=True)
df_gen = df_gen.set_index("timestamp")

# The generation column must exactly match your plant name
GEN_COL = "parque eolico agua clara"
if GEN_COL not in df_gen.columns:
    raise KeyError(f"Column '{GEN_COL}' not found. Available: {list(df_gen.columns)}")

df_target = df_gen[[GEN_COL]].rename(columns={GEN_COL: "generation"})

# 2️⃣ Load meteorological data
df_meteo = pd.read_parquet(METEO_PATH)
df_meteo["date"] = pd.to_datetime(df_meteo["date"], utc=True)
df_meteo = df_meteo.set_index("date")

# 3️⃣ Merge on timestamp (inner join keeps only matching hours)
df_merged = df_target.join(df_meteo, how="inner")

# 4️⃣ Reset index for a clean DataFrame
df_model = df_merged.reset_index().rename(columns={"index": "date"})

# Quick check
print("Result shape:", df_model.shape)
display(df_model.head())

Result shape: (53232, 34)


Unnamed: 0,date,generation,temperature_2m,shortwave_radiation,diffuse_radiation,global_tilted_irradiance,shortwave_radiation_instant,diffuse_radiation_instant,global_tilted_irradiance_instant,direct_radiation,...,vapour_pressure_deficit,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,cloud_cover_2m,wind_speed_10m,wind_direction_10m,wind_gusts_10m,vertical_velocity_1000hPa
0,2019-02-22 00:00:00+00:00,0.0,25.032499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.092668,0.0,0.0,3.0,0.0,,19.110542,86.760368,43.199997,
1,2019-02-22 01:00:00+00:00,0.0,24.432499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.954725,2.0,0.0,2.0,0.0,,17.81909,81.869987,35.639999,
2,2019-02-22 02:00:00+00:00,0.0,23.582499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.789165,3.0,0.0,3.0,0.0,,15.876775,86.099579,33.48,
3,2019-02-22 03:00:00+00:00,0.0,22.9825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.75122,5.0,0.0,5.0,0.0,,14.417988,92.86235,29.16,
4,2019-02-22 04:00:00+00:00,0.0,22.332499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.642745,4.0,0.0,4.0,0.0,,13.339445,93.093994,26.639999,


In [2]:
# Drop everything before March 1, 2019
df_model = df_model[df_model["date"] >= "2019-03-02"]

# Remove unwanted columns
df_model = df_model.drop(columns=["visibility", "vertical_velocity_1000hPa", "cloud_cover_2m"])

# Verify result
print(df_model.shape)
df_model.head(10)

(53040, 31)


Unnamed: 0,date,generation,temperature_2m,shortwave_radiation,diffuse_radiation,global_tilted_irradiance,shortwave_radiation_instant,diffuse_radiation_instant,global_tilted_irradiance_instant,direct_radiation,...,surface_pressure,et0_fao_evapotranspiration,vapour_pressure_deficit,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_direction_10m,wind_gusts_10m
192,2019-03-02 00:00:00+00:00,16.6,24.582499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,997.946594,0.040523,0.650935,75.0,3.0,15.0,74.0,15.94601,61.699341,30.239998
193,2019-03-02 01:00:00+00:00,14.4,24.2325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,998.610046,0.034643,0.624192,69.0,6.0,10.0,65.0,14.458382,71.113823,28.799999
194,2019-03-02 02:00:00+00:00,19.8,24.032499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,998.989075,0.032348,0.646754,55.0,2.0,4.0,53.0,13.039754,83.659904,26.280001
195,2019-03-02 03:00:00+00:00,19.8,23.7325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,998.968994,0.034901,0.671926,31.0,1.0,2.0,29.0,13.138765,99.46225,24.48
196,2019-03-02 04:00:00+00:00,19.8,23.282499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,998.841064,0.029729,0.593385,32.0,1.0,4.0,29.0,13.138765,99.46225,24.48
197,2019-03-02 05:00:00+00:00,19.8,22.782499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,998.709778,0.021152,0.487077,67.0,1.0,10.0,64.0,12.661564,104.826523,24.119999
198,2019-03-02 06:00:00+00:00,19.8,22.432499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,997.902283,0.017131,0.428823,59.0,0.0,9.0,55.0,12.641076,109.983192,23.759998
199,2019-03-02 07:00:00+00:00,19.8,21.932499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,997.280457,0.010669,0.347459,23.0,0.0,8.0,17.0,12.245293,114.304543,23.039999
200,2019-03-02 08:00:00+00:00,19.8,21.582499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,996.864807,0.00483,0.29178,6.0,0.0,6.0,0.0,11.119281,119.054512,22.68
201,2019-03-02 09:00:00+00:00,19.8,21.2325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,996.841309,0.002898,0.286325,7.0,0.0,7.0,0.0,10.144082,117.47435,20.519999


In [3]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53040 entries, 192 to 53231
Data columns (total 31 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   date                              53040 non-null  datetime64[ns, UTC]
 1   generation                        53040 non-null  float64            
 2   temperature_2m                    53040 non-null  float32            
 3   shortwave_radiation               53040 non-null  float32            
 4   diffuse_radiation                 53040 non-null  float32            
 5   global_tilted_irradiance          53040 non-null  float32            
 6   shortwave_radiation_instant       53040 non-null  float32            
 7   diffuse_radiation_instant         53040 non-null  float32            
 8   global_tilted_irradiance_instant  53040 non-null  float32            
 9   direct_radiation                  53040 non-null  float32       

In [None]:
import os, numpy as np, pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
import joblib

LOOKUP_CSV = "../data/lookup/central_info.csv"
GEN_PATH   = "../data/interim/post_despacho_transformed.parquet"
WEATHER_DIR= Path("../data/raw/open_mete_data")
OUTPUT_ROOT= Path("data/models")
TIMESTEPS  = 24

df_gen = pd.read_parquet(GEN_PATH)
df_gen["timestamp"] = pd.to_datetime(df_gen["timestamp"], utc=True)
df_gen.set_index("timestamp", inplace=True)
df_lookup = pd.read_csv(LOOKUP_CSV)

for _, plant_row in df_lookup.iterrows():
    plant = plant_row["CENTRAL"]
    start_date = pd.to_datetime(plant_row["FirstAppearance"], dayfirst=True).tz_localize("UTC")
    weather_file = WEATHER_DIR / f"{plant.lower().replace(' ','_')}.parquet"
    if not weather_file.exists():
        print(f"⚠️ Skipping {plant} — no weather data")
        continue

    print(f"\n▶ Processing {plant}")
    out_dir = OUTPUT_ROOT / plant.lower().replace(" ","_"); out_dir.mkdir(exist_ok=True)

    df_weather = pd.read_parquet(weather_file)
    df_weather["date"] = pd.to_datetime(df_weather["date"], utc=True)
    df_weather.set_index("date", inplace=True)
    df_weather = df_weather[df_weather.index >= start_date]
    df_weather.drop(columns=["visibility","vertical_velocity_1000hPa","cloud_cover_2m"], inplace=True)

    df_target = df_gen[[plant]].rename(columns={plant:"generation"}); df_target.index.name="date"
    df_model = df_target.join(df_weather, how="inner").dropna().reset_index()

    X, y = [], []
    feats = df_model.drop(columns=["date","generation"]).values
    targ = df_model["generation"].values
    for i in range(TIMESTEPS, len(df_model)):
        X.append(feats[i-TIMESTEPS:i]); y.append(targ[i])
    X, y = np.array(X), np.array(y)
    split = int(0.8*len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    scaler = MinMaxScaler()
    X_train_flat = X_train.reshape(-1, X_train.shape[-1])
    X_test_flat  = X_test.reshape(-1, X_test.shape[-1])
    scaler.fit(X_train_flat)
    X_train = scaler.transform(X_train_flat).reshape(X_train.shape)
    X_test  = scaler.transform(X_test_flat).reshape(X_test.shape)
    joblib.dump(scaler, out_dir/"scaler.joblib")

    results = []

    # RandomForest
    print("  ▶ Training RandomForest")
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train.reshape(X_train.shape[0], -1), y_train)
    rmse_rf = np.sqrt(mean_squared_error(y_test, rf.predict(X_test.reshape(X_test.shape[0], -1))))
    results.append(("RandomForest", rf, rmse_rf))

    # LightGBM
    print("  ▶ Training LightGBM")
    lgb = LGBMRegressor(n_estimators=200, random_state=42)
    lgb.fit(X_train.reshape(X_train.shape[0], -1), y_train)
    rmse_lgb = np.sqrt(mean_squared_error(y_test, lgb.predict(X_test.reshape(X_test.shape[0], -1))))
    results.append(("LightGBM", lgb, rmse_lgb))

    # LSTM
    print("  ▶ Training LSTM")
    lstm = Sequential([LSTM(64, input_shape=(TIMESTEPS, X_train.shape[2])), Dense(32, activation="relu"), Dense(1)])
    lstm.compile("adam","mse")
    lstm.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.1,
             callbacks=[EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)], verbose=0)
    rmse_lstm = np.sqrt(mean_squared_error(y_test, lstm.predict(X_test).flatten()))
    results.append(("LSTM", lstm, rmse_lstm))

    # Select best
    best_name, best_model, best_rmse = min(results, key=lambda x: x[2])
    print(f"    ✔ Best model: {best_name} (RMSE={best_rmse:.3f})")

    # Save only the winner
    if best_name == "LSTM":
        best_model.save(out_dir/"best_model.h5")
    else:
        joblib.dump(best_model, out_dir/"best_model.joblib")

print("\n✅ All plants done!")


⚠️ Skipping aes andres — no weather data
⚠️ Skipping aguacate 1 — no weather data
⚠️ Skipping aguacate 2 — no weather data
⚠️ Skipping aniana vargas 1 — no weather data
⚠️ Skipping aniana vargas 2 — no weather data
⚠️ Skipping baiguaque 1 — no weather data
⚠️ Skipping baiguaque 2 — no weather data
⚠️ Skipping barahona carbon — no weather data
⚠️ Skipping bersal — no weather data
⚠️ Skipping brazo derecho — no weather data
⚠️ Skipping cepp 1 — no weather data
⚠️ Skipping cepp 2 — no weather data
⚠️ Skipping cespm 1 — no weather data
⚠️ Skipping cespm 2 — no weather data
⚠️ Skipping cespm 3 — no weather data
⚠️ Skipping contra embalse moncion 1 — no weather data
⚠️ Skipping contra embalse moncion 2 — no weather data
⚠️ Skipping domingo rodriguez 1 — no weather data
⚠️ Skipping domingo rodriguez 2 — no weather data
⚠️ Skipping el salto — no weather data
⚠️ Skipping estrella del mar — no weather data
⚠️ Skipping estrella del mar 2 — no weather data
⚠️ Skipping estrella del mar 3 — no weath

  start_date = pd.to_datetime(plant_row["FirstAppearance"], dayfirst=True).tz_localize("UTC")


  ▶ Training RandomForest


In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# Paths
GEN_PATH   = "../data/interim/post_despacho_transformed.parquet"
WEATHER_PARQ= "../data/raw/open_mete_data/parque_eolico_agua_clara.parquet"
OUTPUT_DIR = Path("data/models/parque_eolico_agua_clara")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TIMESTEPS  = 24

# Load generation
df_gen = pd.read_parquet(GEN_PATH)
df_gen["timestamp"] = pd.to_datetime(df_gen["timestamp"], utc=True)
df_target = df_gen.set_index("timestamp")[["parque eolico agua clara"]].rename(columns={"parque eolico agua clara":"generation"})
df_target.index.name = "date"

# Load weather
df_weather = pd.read_parquet(WEATHER_PARQ)
df_weather["date"] = pd.to_datetime(df_weather["date"], utc=True)
df_weather.set_index("date", inplace=True)
df_weather.drop(columns=["visibility","vertical_velocity_1000hPa","cloud_cover_2m"], inplace=True)

print("✔ Data loaded — merging...")
df = df_target.join(df_weather, how="inner").dropna().reset_index()
print(f"✔ Merged shape: {df.shape}")

# Build supervised windows
X, y = [], []
for i in range(TIMESTEPS, len(df)):
    X.append(df.drop(columns=["date","generation"]).values[i-TIMESTEPS:i])
    y.append(df["generation"].iloc[i])
X, y = np.array(X), np.array(y)
split = int(0.8*len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
print(f"✔ Train/Test split: {X_train.shape[0]} / {X_test.shape[0]} samples")

# Scale features
scaler = MinMaxScaler()
X_train_flat = X_train.reshape(-1, X_train.shape[-1])
X_test_flat  = X_test.reshape(-1, X_test.shape[-1])
scaler.fit(X_train_flat)
X_train = scaler.transform(X_train_flat).reshape(X_train.shape)
X_test  = scaler.transform(X_test_flat).reshape(X_test.shape)
joblib.dump(scaler, OUTPUT_DIR/"scaler.joblib")

# Train & evaluate helper
def train_eval(model, name, flatten=False):
    print(f"▶ Training {name}")
    if flatten:
        model.fit(X_train.reshape(len(X_train), -1), y_train)
        preds = model.predict(X_test.reshape(len(X_test), -1))
    else:
        model.compile("adam","mse")
        model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.1,
                  callbacks=[EarlyStopping(monitor="val_loss", patience=5)], verbose=0)
        preds = model.predict(X_test).flatten()
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae  = mean_absolute_error(y_test, preds)
    print(f"  ✔ {name}: RMSE={rmse:.3f}, MAE={mae:.3f}")
    return name, model, rmse

# Train all three
results = []
results.append(train_eval(RandomForestRegressor(n_estimators=100, random_state=42), "RandomForest", flatten=True))
results.append(train_eval(LGBMRegressor(n_estimators=200, random_state=42), "LightGBM", flatten=True))
lstm = Sequential([LSTM(64, input_shape=(TIMESTEPS, X_train.shape[2])), Dense(32, activation="relu"), Dense(1)])
results.append(train_eval(lstm, "LSTM", flatten=False))

# Select best
best_name, best_model, best_rmse = min(results, key=lambda x: x[2])
print(f"\n✅ Best model: {best_name} (RMSE={best_rmse:.3f}) — saving...")

# Save best only
if best_name == "LSTM":
    best_model.save(OUTPUT_DIR/"best_model.h5")
else:
    joblib.dump(best_model, OUTPUT_DIR/"best_model.joblib")

print("🎉 Done!")

✔ Data loaded — merging...
✔ Merged shape: (53232, 31)


MemoryError: Unable to allocate 5.89 MiB for an array with shape (29, 53232) and data type float32