In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


stl_path = "../processed_data/stl_energy_production_with_engineered_features.csv"
hp_path = "../processed_data/hp_energy_production_with_engineered_features.csv"


stl_df = pd.read_csv(stl_path)
hp_df = pd.read_csv(hp_path)

original_features = ["Water_Flow_m3_s", "avgtempC", "totalprecipMM", "humidity", "pressureMB"]
engineered_features = original_features + [
    "WaterFlow_Diff_1d", "WaterFlow_Diff_7d",
    "WaterFlow_3day_avg", "WaterFlow_7day_avg",
    "Temp_Deviation", "WaterFlow_Humidity",
    "month_sin", "month_cos",
    "Normalized_Efficiency", "Prev_Day_Efficiency", "Prev_Week_Efficiency"
]
target = "Efficiency"


def train_and_evaluate_model(df, feature_set, model_name):
    X = df[feature_set]
    y = df[target]

    # Train-Test Split 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": PolynomialFeatures(degree=2),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = []

    for model_name, model in models.items():
        if "Polynomial" in model_name:
            poly = PolynomialFeatures(degree=2)
            X_train_poly = poly.fit_transform(X_train)
            X_test_poly = poly.transform(X_test)
            model = LinearRegression()
            model.fit(X_train_poly, y_train)
            y_train_pred = model.predict(X_train_poly)
            y_test_pred = model.predict(X_test_poly)
        else:
            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

        # Evaluate Model
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)
        mae = mean_absolute_error(y_test, y_test_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

        results.append([model_name, r2_train, r2_test, mae, rmse])

    return pd.DataFrame(results, columns=["Model", "R² Train", "R² Test", "MAE", "RMSE"])

# Train & Evaluate on Original Features
stl_results_original = train_and_evaluate_model(stl_df, original_features, "STL Model - Original Features")
hp_results_original = train_and_evaluate_model(hp_df, original_features, "HP Model - Original Features")

# Train & Evaluate on Engineered Features
stl_results_engineered = train_and_evaluate_model(stl_df, engineered_features, "STL Model - Engineered Features")
hp_results_engineered = train_and_evaluate_model(hp_df, engineered_features, "HP Model - Engineered Features")

print("\n STL Model - Original Features")
print(stl_results_original.to_string(index=False))

print("\n HP Model - Original Features")
print(hp_results_original.to_string(index=False))

print("\n STL Model - Engineered Features")
print(stl_results_engineered.to_string(index=False))

print("\n HP Model - Engineered Features")
print(hp_results_engineered.to_string(index=False))


 STL Model - Original Features
                           Model  R² Train   R² Test      MAE     RMSE
               Linear Regression  0.004784 -0.000149 0.195327 6.151465
Polynomial Regression (Degree 2)  0.008164 -0.000022 0.196357 6.151076
                   Decision Tree  0.679205  0.117265 0.187506 5.779117
                   Random Forest  0.642711  0.085215 0.184914 5.883094

 HP Model - Original Features
                           Model  R² Train   R² Test      MAE     RMSE
               Linear Regression  0.003212 -0.000053 0.228711 5.076118
Polynomial Regression (Degree 2)  0.016187  0.000029 0.238992 5.075910
                   Decision Tree  0.505896  0.105657 0.241495 4.800343
                   Random Forest  0.484905  0.068866 0.233817 4.898086

 STL Model - Engineered Features
                           Model  R² Train  R² Test          MAE         RMSE
               Linear Regression  1.000000 1.000000 2.384950e-16 3.809516e-15
Polynomial Regression (Degree 2)  0.9

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge


advanced_models = {
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42, verbosity=0),
    "LightGBM": LGBMRegressor(n_estimators=200, random_state=42),
    "CatBoost": CatBoostRegressor(n_estimators=200, random_state=42, verbose=0),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "Support Vector Regression": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5),
    "Ridge Regression": Ridge()
}

def train_and_evaluate_advanced_models(df, feature_set, dataset_name):
    X = df[feature_set]
    y = df["Efficiency"]  


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = []

    for model_name, model in advanced_models.items():
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Evaluate Model
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)
        mae = mean_absolute_error(y_test, y_test_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

        results.append([model_name, r2_train, r2_test, mae, rmse])

    results_df = pd.DataFrame(results, columns=["Model", "R² Train", "R² Test", "MAE", "RMSE"])

    print(f"\n{dataset_name} Advanced Model Results")
    print(results_df.to_string(index=False))

    return results_df  


stl_results = train_and_evaluate_advanced_models(stl_df, engineered_features, "STL")
hp_results = train_and_evaluate_advanced_models(hp_df, engineered_features, "HP")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2746
[LightGBM] [Info] Number of data points in the train set: 8040, number of used features: 16
[LightGBM] [Info] Start training from score -0.015102

STL Advanced Model Results
                    Model  R² Train  R² Test          MAE     RMSE
                  XGBoost  0.999993 0.117788 1.359856e-01 5.777403
                 LightGBM  0.653523 0.052679 1.452606e-01 5.986799
                 CatBoost  0.999726 0.117427 1.362831e-01 5.778585
        Gradient Boosting  0.999975 0.118079 1.306344e-01 5.776449
Support Vector Regression  0.014339 0.000665 1.965852e-01 6.148960
      K-Nearest Neighbors  0.450712 0.045111 1.865888e-01 6.010668
         Ridge Regression  1.000000 1.000000 5.620185e-07 0.000018
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000470

In [14]:
best_stl_models = stl_results[stl_results["Model"].isin(["XGBoost", "CatBoost", "Gradient Boosting"])]
best_hp_models = hp_results[hp_results["Model"].isin(["XGBoost", "CatBoost", "Gradient Boosting"])]


best_stl_models.to_csv("../processed_data/STL_best_models.csv", index=False)
best_hp_models.to_csv("../processed_data/HP_best_models.csv", index=False)

print("\n Best STL Models Saved:")
print(best_stl_models.to_string(index=False))

print("\n Best HP Models Saved:")
print(best_hp_models.to_string(index=False))


 Best STL Models Saved:
            Model  R² Train  R² Test      MAE     RMSE
          XGBoost  0.999993 0.117788 0.135986 5.777403
         CatBoost  0.999726 0.117427 0.136283 5.778585
Gradient Boosting  0.999975 0.118079 0.130634 5.776449

 Best HP Models Saved:
            Model  R² Train  R² Test      MAE     RMSE
          XGBoost  0.999995 0.118390 0.138161 4.766050
         CatBoost  0.999432 0.117227 0.142247 4.769190
Gradient Boosting  0.999952 0.136466 0.126658 4.716936
