In [27]:
import pandas as pd
import numpy as np

stl_path = "processed_data/stl_energy_production_with_weather.csv"
hp_path = "processed_data/hp_energy_production_with_weather.csv"


def load_with_efficiency(csv_path):
    """Load one production/weather CSV and derive its ``Efficiency`` column.

    The same steps apply to every dataset, so they live in one helper
    instead of being copy-pasted per frame:

    1. read the CSV;
    2. parse ``Date`` to datetime and derive ``Month`` from it;
    3. compute ``Efficiency`` as ``Energy Production (kWh)`` divided by
       ``Water_Flow_m3_s``;
    4. replace +/-inf (a non-zero value divided by a zero flow) with NaN and
       drop every row whose efficiency is NaN.

    Parameters
    ----------
    csv_path : str or file-like
        Anything ``pandas.read_csv`` accepts. The file must contain the
        ``Date``, ``Energy Production (kWh)`` and ``Water_Flow_m3_s`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with ``Month`` and ``Efficiency`` added. Rows with an
        undefined efficiency are removed; the surviving rows keep their
        original index labels (the index is not reset).
    """
    frame = pd.read_csv(csv_path)
    frame["Date"] = pd.to_datetime(frame["Date"])
    frame["Month"] = frame["Date"].dt.month

    # Calculate Efficiency
    ratio = frame["Energy Production (kWh)"] / frame["Water_Flow_m3_s"]
    # x / 0 yields +/-inf and 0 / 0 yields NaN; normalise both to NaN so a
    # single dropna removes every row without a usable efficiency value.
    frame["Efficiency"] = ratio.replace([np.inf, -np.inf], np.nan)

    # Return a new frame instead of mutating in place.
    return frame.dropna(subset=["Efficiency"])


# In a notebook kernel __name__ is "__main__", so this runs exactly as before;
# the guard only keeps the helper importable (e.g. for tests) without the CSVs.
if __name__ == "__main__":
    stl_df = load_with_efficiency(stl_path)
    hp_df = load_with_efficiency(hp_path)

# Feature Engineering
def add_engineered_features(df, date_col="Date"):
    """Return a chronologically ordered copy of ``df`` with derived features.

    ``diff``, ``rolling`` and ``shift`` operate on *row offsets*, so the lag
    features are only meaningful when rows are in time order. The frame is
    therefore stably sorted by ``date_col`` first; for input that is already
    sorted this is a no-op, and rows sharing a date keep their relative order.

    Columns added (in this order):

    - ``WaterFlow_Diff_1d`` / ``WaterFlow_Diff_7d``: flow minus the flow
      1 / 7 rows earlier.
    - ``WaterFlow_3day_avg`` / ``WaterFlow_7day_avg``: rolling mean of the
      flow over the last 3 / 7 rows.
    - ``Temp_Deviation``: ``avgtempC`` minus its mean over the whole frame.
    - ``WaterFlow_Humidity``: flow multiplied by ``humidity``.
    - ``month_sin`` / ``month_cos``: cyclic encoding of ``Month`` with a
      period of 12.
    - ``Normalized_Efficiency``: ``Efficiency`` divided by its mean over the
      whole frame.
    - ``Prev_Day_Efficiency`` / ``Prev_Week_Efficiency``: ``Efficiency``
      1 / 7 rows earlier.

    NOTE(review): offsets are row-based, not calendar-based. The preview
    printed by this cell shows several rows sharing one ``Date``, so "1d" /
    "7d" here mean 1 / 7 rows - confirm whether the data should be
    aggregated to one row per day (and per ``Station``) beforehand.
    NOTE(review): the two means are computed over the full frame; if a
    train/test split happens downstream, confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_col``, ``Water_Flow_m3_s``, ``avgtempC``,
        ``humidity``, ``Month`` and ``Efficiency``. It is not modified.
    date_col : str, default "Date"
        Column used to put the rows in chronological order.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns above; the leading rows of the lag/rolling
        features are NaN. Original index labels are preserved.
    """
    # mergesort is stable: same-date rows are never reshuffled.
    ordered = df.sort_values(date_col, kind="mergesort")

    flow = ordered["Water_Flow_m3_s"]
    efficiency = ordered["Efficiency"]
    month_angle = 2 * np.pi * ordered["Month"] / 12

    return ordered.assign(
        WaterFlow_Diff_1d=flow.diff(1),
        WaterFlow_Diff_7d=flow.diff(7),
        WaterFlow_3day_avg=flow.rolling(3).mean(),
        WaterFlow_7day_avg=flow.rolling(7).mean(),
        Temp_Deviation=ordered["avgtempC"] - ordered["avgtempC"].mean(),
        WaterFlow_Humidity=flow * ordered["humidity"],
        month_sin=np.sin(month_angle),
        month_cos=np.cos(month_angle),
        Normalized_Efficiency=efficiency / efficiency.mean(),
        Prev_Day_Efficiency=efficiency.shift(1),
        Prev_Week_Efficiency=efficiency.shift(7),
    )


# In a notebook kernel __name__ is "__main__", so this runs exactly as before;
# the guard only keeps the helper importable (e.g. for tests) on its own.
if __name__ == "__main__":
    # Drop every row that still has a NaN in any column; this includes the
    # leading rows for which the 7-row lag/rolling features are undefined.
    stl_df = add_engineered_features(stl_df).dropna()
    hp_df = add_engineered_features(hp_df).dropna()

# Sanity check: show the first rows of each engineered frame, then the number
# of rows each frame holds at this point.
print("STL Data Preview:", stl_df.head(), sep="\n")
print("HP Data Preview:", hp_df.head(), sep="\n")

print("STL Data Rows:", stl_df.shape[0])
print("HP Data Rows:", hp_df.shape[0])

# Persist both frames as CSV; index=False keeps the row labels out of the file.
stl_updated_path = "processed_data/stl_energy_production_with_engineered_features.csv"
hp_updated_path = "processed_data/hp_energy_production_with_engineered_features.csv"

stl_df.to_csv(path_or_buf=stl_updated_path, index=False)
hp_df.to_csv(path_or_buf=hp_updated_path, index=False)


STL Data Preview:
         Date   Station  Energy Production (kWh)  Water_Flow_m3_s  avgtempC  \
7  2022-01-04  Amberd 3              1455.962245     18022.752294        -2   
8  2022-01-04  Amberd 3              1455.962245     18147.491347        -2   
9  2022-01-05  Amberd 3               321.229222     18441.472148        -2   
10 2022-01-05  Amberd 3               321.229222     16296.513761        -2   
11 2022-01-05  Amberd 3               321.229222     16646.615149        -2   

    totalprecipMM  humidity  pressureMB  Month  Efficiency  ...  \
7             0.4        56        1020      1    0.080785  ...   
8             0.4        56        1020      1    0.080229  ...   
9             0.0        59        1025      1    0.017419  ...   
10            0.0        59        1025      1    0.019712  ...   
11            0.0        59        1025      1    0.019297  ...   

    WaterFlow_Diff_7d  WaterFlow_3day_avg  WaterFlow_7day_avg  Temp_Deviation  \
7         -433.861103  