In [12]:
# pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/e2/7b/8c1b410cd0604cee9a167a19f7e1746f5b92ae7d02ad574ab560b73c5a48/xgboost-2.1.1-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 330.3 kB/s eta 0:06:19
   ---------------------------------------- 0.0/124.9 MB 495.5 kB/s eta 0:04:12
   ---------------------------------------- 0.2/124.9 MB 1.3 MB/s eta 0:01:35
   ---------------------------------------- 0.3/124.9 MB 1.9 MB/s eta 0:01:06
   ---------------------------------------- 0.4/124.9 MB 2.2 MB/s eta 0:00:57
   ---------------------------------------- 0.5/124.9 MB 2.2 MB/s eta 0:00:56
   ---------------------------------------- 0.7/124.9 MB 2.6 MB/s eta 0:00:48
   -----------------------

In [13]:
import sys
import os

# add the parent directory ('BNetzA') to the Python path
# (resolved from os.getcwd(), so it depends on the directory the kernel was started in)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np

# NOTE(review): the star imports hide where the helpers used below (load_data,
# prepare_base_data, create_variations, the *_regression functions, variable_frequency)
# are defined — presumably in these two project modules; consider importing them by name.
from models import *
from Data_Preprocessing.data_preprocessing import *

import warnings
# Silence only the warnings whose message starts with the text below.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# Show floats in pandas output with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

# Loading Datasets

In [14]:
# Read the raw dataset from the Excel workbook (path is relative to the notebook folder).
path_to_excel = "../data/EVS4_20140118_dataV9.xlsx"
excel_sheet_name = "DatasetV9"
df = load_data(path_to_excel, sheet_name=excel_sheet_name)

In [15]:
# df_test_list = [df_test, df_test_xlog, df_test_xlog_ylog, df_test_ylog, df_test_agg, df_test_agg_log, df_test_non_agg, df_test_group_agg]
# baseline, xlog, xlog ylog, ylog, agg, agg ylog, non agg, group agg
# missing: agg xlog, agg xlog ylog
# missing: non agg xlog, non agg xlog ylog, non agg ylog
# missing: group agg xlog, group agg xlog ylog, group agg ylog

In [16]:
# One row per data variation: (display name, outcome column, outcome transformation).
# The row order is significant: the modelling loops pair entry i of these lists with
# entry i of the frame lists returned by create_variations.
variation_specs = [
    ("Baseline",             "cTOTEXn",     "None"),
    ("XLog",                 "cTOTEXn",     "None"),
    ("XLog YLog",            "cTOTEXn_log", "log"),
    ("YLog",                 "cTOTEXn_log", "log"),
    ("Aggregates Only",      "cTOTEXn",     "None"),
    ("Aggregates Only YLog", "cTOTEXn_log", "log"),
    ("Disaggregates Only",   "cTOTEXn",     "None"),
    ("Aggregates N1-4 N5-7", "cTOTEXn",     "None"),
]
names = [spec[0] for spec in variation_specs]
outcome_variables = [spec[1] for spec in variation_specs]
outcome_transformation = [spec[2] for spec in variation_specs]

# Seeds for the repeated train/test splits; metrics are averaged over these runs.
random_states = list(range(39, 44))

# Column layout of the per-run result frames: the model label, then the training
# metrics followed by the testing metrics.
metric_names = ("RMSE", "MAE", "MAPE")
columns = ["Model"] + [
    f"{split} {metric}"
    for split in ("Training", "Testing")
    for metric in metric_names
]

# Lasso Regression

In [17]:
# Fit one Lasso model per (random seed, data variation) pair and average the
# evaluation metrics of each variation over the seeds.
lasso_result_frames = []  # per-run metric frames, concatenated once after the loop
lasso_models = []
lasso_vips = []

for random_state in random_states:
    # New train/test split for this seed, expanded into all data variations.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    # Results are appended seed-major / variation-minor; later cells rely on this order.
    for i, (train_variation, test_variation) in enumerate(zip(df_train_list, df_test_list)):
        lasso_result, lasso_model, lasso_vip = lasso_regression(
            train_variation,
            test_variation,
            outcome_variables[i],
            f"Lasso {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        lasso_result_frames.append(lasso_result)
        lasso_models.append(lasso_model)
        lasso_vips.append(lasso_vip)

# Concatenate once: avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty placeholder frame.
if lasso_result_frames:
    lasso_results = pd.concat(lasso_result_frames, axis=0, ignore_index=True)
else:
    lasso_results = pd.DataFrame(columns=columns)

# Assign by column label so the converted columns really become numeric; a
# full-slice .loc assignment would write the values into the existing object columns.
lasso_metric_columns = [col for col in lasso_results.columns if col != 'Model']
lasso_results[lasso_metric_columns] = lasso_results[lasso_metric_columns].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model label, best (lowest) testing MAPE first.
lasso_results_df = lasso_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")
lasso_results_df

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Lasso Aggregates Only YLog,19509173.422,7579512.996,0.124,11268863.23,5562088.28,0.14
6,Lasso XLog YLog,20123969.038,7400527.32,0.122,15132882.688,6881961.558,0.148
1,Lasso Aggregates Only,7583134.194,4640771.03,0.174,17908963.364,7210410.592,0.196
4,Lasso Disaggregates Only,4387758.65,2950621.708,0.146,16269428.382,7617384.236,0.364
3,Lasso Baseline,4451456.616,2969861.236,0.146,16244300.174,7705064.898,0.368
0,Lasso Aggregates N1-4 N5-7,5449122.156,3832855.048,0.178,15520720.84,7637367.868,0.426
7,Lasso YLog,238548674.418,31717017.534,0.322,1519404205.04,349358998.668,0.85
5,Lasso XLog,72088678.586,37785655.446,1.692,61913800.818,36692748.23,2.14


In [50]:
# Collect the variable-importance tables of the "XLog YLog" Lasso models, one per seed.
# lasso_vips is filled seed-major / variation-minor, so the tables of one variation
# sit a fixed stride apart. Offset and stride are derived from the configuration
# instead of hard-coding positions (2, 10, 18, 26, 34 for 5 seeds and 8 variations).
lasso_variations_per_seed = len(lasso_vips) // len(random_states)
lasso_xlog_ylog_vips = lasso_vips[names.index("XLog YLog")::lasso_variations_per_seed]

# Presumably counts in how many runs each variable appears — verify against variable_frequency.
lasso_xlog_ylog_frequency = variable_frequency(lasso_xlog_ylog_vips, "Lasso XLog YLog")

# Keep the variables whose count is greater than 2.
lasso_xlog_ylog_stable = lasso_xlog_ylog_frequency[lasso_xlog_ylog_frequency["Lasso XLog YLog"] > 2]
lasso_xlog_ylog_stable

Unnamed: 0,Variable,Lasso XLog YLog
0,yEnergy.losses.tot,5
1,yInstalledPower.other.tot,5
2,yNet.length.excl.house.tot,5
3,yEnergy.delivered.N1357.sum,5
4,ySubstations.N4,5
5,yArea.other.N7,4
6,ySubstations.own.N4,4
7,yPeakload.max,4
8,yPeakload.injection.N6,4
9,yInstalledPower.KWKG.N7,3


In [54]:
# Collect the variable-importance tables of the "Aggregates Only YLog" Lasso models,
# one per seed. lasso_vips is filled seed-major / variation-minor, so offset and stride
# are derived from the configuration instead of hard-coding positions 5, 13, 21, 29, 37.
lasso_variations_per_seed = len(lasso_vips) // len(random_states)
lasso_agg_log_vips = lasso_vips[names.index("Aggregates Only YLog")::lasso_variations_per_seed]

# Presumably counts in how many runs each variable appears — verify against variable_frequency.
lasso_agg_log_frequency = variable_frequency(lasso_agg_log_vips, "Lasso Agg YLog")

# Show the variables whose count is greater than 2.
lasso_agg_log_frequency[lasso_agg_log_frequency["Lasso Agg YLog"] > 2]

Unnamed: 0,Variable,Lasso Agg YLog
0,yEnergy.losses.tot,5
1,yNet.length.excl.house.tot,5
2,yInstalledPower.other.tot,5
3,yInstalledPower.KWKG.other.tot,5
4,yEnergy.delivered.N1357.sum,5
5,yMeters.over10MWh.RPM.tot,4
6,yInjection.tot,4
7,yEnergy.delivered.net.N67.sum,4
8,yEnergy.delivered.tot,4
9,yMeters.active.tot,4


# Linear Regression with Selected Features from Lasso

In [20]:
# Fit one linear regression (on Lasso-selected features) per (random seed, data
# variation) pair and average the evaluation metrics of each variation over the seeds.
lr_result_frames = []  # per-run metric frames, concatenated once after the loop
lr_models = []

for random_state in random_states:
    # New train/test split for this seed, expanded into all data variations.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    # Models are appended seed-major / variation-minor; later cells index lr_models by position.
    for i, (train_variation, test_variation) in enumerate(zip(df_train_list, df_test_list)):
        # The "Regresion" spelling is kept on purpose: the summary cell selects on this label.
        lr_result, lr_model = lasso_feature_selection_linear_regression(
            train_variation,
            test_variation,
            outcome_variables[i],
            f"Linear Regresion {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        lr_result_frames.append(lr_result)
        lr_models.append(lr_model)

# Concatenate once: avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty placeholder frame.
if lr_result_frames:
    lr_results = pd.concat(lr_result_frames, axis=0, ignore_index=True)
else:
    lr_results = pd.DataFrame(columns=columns)

# Assign by column label so the converted columns really become numeric; a
# full-slice .loc assignment would write the values into the existing object columns.
lr_metric_columns = [col for col in lr_results.columns if col != 'Model']
lr_results[lr_metric_columns] = lr_results[lr_metric_columns].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model label, best (lowest) testing MAPE first.
lr_results_df = lr_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")
lr_results_df

  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)
  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)


Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
6,Linear Regresion XLog YLog,487800379.9220,104332919.2720,0.4420,229702712.92,72940355.5620,0.4320
5,Linear Regresion XLog,63601424.9820,39385620.9120,2.0020,85784974.108,48256660.5380,2.9920
4,Linear Regresion Disaggregates Only,648367409.9080,319848771.4020,17.6160,643618781.918,349764355.9160,14.2760
0,Linear Regresion Aggregates N1-4 N5-7,1242643658.4160,623977878.3020,33.1200,486123863.412,403092698.0300,41.9900
1,Linear Regresion Aggregates Only,1998918536.6360,997825616.5020,55.4820,1603287914.282,1086946275.7420,50.7760
3,Linear Regresion Baseline,1860657042.6320,969593276.4720,49.7220,1076086961.054,778028283.2820,59.2000
2,Linear Regresion Aggregates Only YLog,79428880699346979790821960663359523328972094963...,60304090977933408094616889967667969377799881359...,4711726675320407234252200295739791769600.0000,1.2231958811940117e+43,2735149140704838026703498183813245383475200.0000,5691663269258927241751952602693632.0000
7,Linear Regresion YLog,inf,12823623758464513792783743443605805311372320172...,35605014506246150251062695185839537016538423650...,inf,10142320547350045689240233885590865790529548774...,79202675282793282530381507115535542896856951833...


In [11]:
# Re-aggregate lr_results: coerce the metric columns to numbers, then average per
# model label and sort by testing MAPE (lowest first).
# Assigning by column label (not a full-slice .loc) lets the columns take a numeric dtype.
lr_metric_columns = [col for col in lr_results.columns if col != 'Model']
lr_results[lr_metric_columns] = lr_results[lr_metric_columns].apply(pd.to_numeric, errors='coerce')
lr_results_df = lr_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")
lr_results_df

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Linear Regresion Aggregates Only YLog,15662174.834,6470256.848,0.12,10119945.728,5421080.066,0.138
6,Linear Regresion XLog YLog,15695991.158,6450477.182,0.116,14841106.866,6892647.448,0.15
1,Linear Regresion Aggregates Only,6968612.122,4517576.964,0.184,16672876.648,6854655.028,0.204
4,Linear Regresion Disaggregates Only,3087245.384,2234400.206,0.136,36073805.172,14134950.324,0.37
0,Linear Regresion Aggregates N1-4 N5-7,4026616.516,2995976.878,0.166,17920304.996,7680720.812,0.388
3,Linear Regresion Baseline,3013376.348,2182353.814,0.14,31828459.84,12707307.488,0.394
7,Linear Regresion YLog,35254376.848,11705541.858,0.232,2970443916.666,674762907.35,1.11
5,Linear Regresion XLog,63012231.918,37332414.562,1.82,84939751.722,46865510.812,2.83


In [33]:
# Regression summary of the "Aggregates Only YLog" model from the first seed
# (lr_models is filled seed-major, so the first block of entries follows the order of names).
lr_models[names.index("Aggregates Only YLog")].summary()

0,1,2,3
Dep. Variable:,cTOTEXn_log,R-squared:,-524.572
Model:,OLS,Adj. R-squared:,-585.606
Method:,Least Squares,F-statistic:,-8.595
Date:,"Fri, 16 Aug 2024",Prob (F-statistic):,1.0
Time:,23:14:41,Log-Likelihood:,-833.32
No. Observations:,174,AIC:,1705.0
Df Residuals:,155,BIC:,1765.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,16.9784,2.336,7.267,0.000,12.363,21.593
yConnections.streetlights.sum,0.0071,2.938,0.002,0.998,-5.796,5.810
yMeters.cp.ctrl.tot,0.0100,2.716,0.004,0.997,-5.356,5.376
yMeters.active.tot,0.0354,7.564,0.005,0.996,-14.907,14.978
yMeters.over10MWh.RPM.tot,0.0139,6.251,0.002,0.998,-12.335,12.362
yMeters.others.tot,0.0171,3.146,0.005,0.996,-6.197,6.231
yNet.length.excl.house.tot,0.1183,13.746,0.009,0.993,-27.035,27.271
ySubstations.own.tot,0,13.637,0,1.000,-26.939,26.939
yInstalledPower.renewables.hydro.tot,0.0004,3.245,0.000,1.000,-6.411,6.411

0,1,2,3
Omnibus:,20.357,Durbin-Watson:,2.174
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.12
Skew:,-0.903,Prob(JB):,5.79e-06
Kurtosis:,3.259,Cond. No.,9370.0


In [19]:
# Regression summary of the "Aggregates Only YLog" model from the first seed
# (lr_models is filled seed-major, so the first block of entries follows the order of names).
lr_models[names.index("Aggregates Only YLog")].summary()

0,1,2,3
Dep. Variable:,cTOTEXn_log,R-squared:,0.987
Model:,OLS,Adj. R-squared:,0.985
Method:,Least Squares,F-statistic:,635.3
Date:,"Fri, 16 Aug 2024",Prob (F-statistic):,2.7399999999999998e-135
Time:,17:30:16,Log-Likelihood:,87.058
No. Observations:,174,AIC:,-136.1
Df Residuals:,155,BIC:,-76.09
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,16.9784,0.012,1440.761,0.000,16.955,17.002
yConnections.streetlights.sum,0.0071,0.015,0.481,0.631,-0.022,0.036
yMeters.cp.ctrl.tot,0.0100,0.014,0.728,0.468,-0.017,0.037
yMeters.active.tot,0.0354,0.038,0.929,0.354,-0.040,0.111
yMeters.over10MWh.RPM.tot,0.0139,0.032,0.439,0.661,-0.048,0.076
yMeters.others.tot,0.0171,0.016,1.076,0.284,-0.014,0.048
yNet.length.excl.house.tot,0.1183,0.069,1.706,0.090,-0.019,0.255
ySubstations.own.tot,-0.0082,0.069,-0.119,0.906,-0.144,0.128
yInstalledPower.renewables.hydro.tot,0.0004,0.016,0.025,0.980,-0.032,0.033

0,1,2,3
Omnibus:,1.476,Durbin-Watson:,2.225
Prob(Omnibus):,0.478,Jarque-Bera (JB):,1.496
Skew:,-0.218,Prob(JB):,0.473
Kurtosis:,2.874,Cond. No.,9370.0


In [12]:
# Display the seed-averaged linear-regression metrics, lowest testing MAPE first.
# Assigning by column label (not a full-slice .loc) lets the columns take a numeric dtype.
lr_metric_columns = [col for col in lr_results.columns if col != 'Model']
lr_results[lr_metric_columns] = lr_results[lr_metric_columns].apply(pd.to_numeric, errors='coerce')
lr_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Linear Regresion Aggregates Only YLog,14677973.33,6067041.108,0.114,9699432.586,5298720.8000,0.1400
6,Linear Regresion XLog YLog,15345685.914,6299959.022,0.114,15030208.144,6867786.8940,0.1520
1,Linear Regresion Aggregates Only,6789600.646,4405321.256,0.184,16359456.664,6706496.7620,0.1980
3,Linear Regresion Baseline,2051665.524,1587536.02,0.118,26143451.154,10879766.3880,0.4840
4,Linear Regresion Disaggregates Only,2123288.34,1621571.98,0.118,29745202.13,12349881.1780,0.5460
0,Linear Regresion Aggregates N1-4 N5-7,3323075.776,2508061.032,0.16,16217742.882,8458170.1880,0.6380
5,Linear Regresion XLog,54432064.948,34150405.99,1.696,89508940.924,50923828.3020,2.4240
7,Linear Regresion YLog,26041250.98,9175203.854,0.188,inf,10142320547350045689240233885590865790529548774...,24478134620635746182772844188168143481390764669...


# Random Forest Regression

In [14]:
# Fit one random forest per (random seed, data variation) pair and average the
# evaluation metrics of each variation over the seeds.
rf_result_frames = []  # per-run metric frames, concatenated once after the loop
rf_models = []
rf_vips = []

for random_state in random_states:
    # New train/test split for this seed, expanded into all data variations.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    # Results are appended seed-major / variation-minor; later cells rely on this order.
    for i, (train_variation, test_variation) in enumerate(zip(df_train_list, df_test_list)):
        rf_result, rf_model, rf_vip = random_forest_regression(
            train_variation,
            test_variation,
            outcome_variables[i],
            f"Random Forest {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        rf_result_frames.append(rf_result)
        rf_models.append(rf_model)
        rf_vips.append(rf_vip)

# Concatenate once: avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty placeholder frame.
if rf_result_frames:
    rf_results = pd.concat(rf_result_frames, axis=0, ignore_index=True)
else:
    rf_results = pd.DataFrame(columns=columns)

# Assign by column label so the converted columns really become numeric; a
# full-slice .loc assignment would write the values into the existing object columns.
rf_metric_columns = [col for col in rf_results.columns if col != 'Model']
rf_results[rf_metric_columns] = rf_results[rf_metric_columns].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model label, best (lowest) testing MAPE first.
rf_results_df = rf_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")
rf_results_df

Performing Random Forest...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Evaluating the model...


                    Model Training RMSE Training MAE Training MAPE  \
0  Random Forest Baseline   30248617.33   6199937.38          0.09   

  Testing RMSE Testing MAE Testing MAPE  
0   7621079.89  4469284.64         0.17  
Performing Random Forest...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Evaluating the model...


                Model Training RMSE Training MAE Training MAPE Testing RMSE  \
0  Random Forest XLog   30305327.84   6277681.97          0.09   7810148.75   

  Testing MAE Testing MAPE  
0  4522345.53         0.17  
Performing Random Forest...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Evaluating the model...


                     Model Training RMSE Training MAE Training MAPE  \
0  Random Forest XLog YLog   36440603.78   7026397.02          0.07   

  Testing RMSE Testing MAE Testing MAPE  
0   4776229.17  3354260.27  

In [None]:
# Collect the variable-importance tables of the "Aggregates Only YLog" random forests,
# one per seed. rf_vips is filled seed-major / variation-minor, so offset and stride
# are derived from the configuration instead of hard-coding positions 5, 13, 21, 29, 37.
rf_variations_per_seed = len(rf_vips) // len(random_states)
rf_agg_log_vips = rf_vips[names.index("Aggregates Only YLog")::rf_variations_per_seed]

# Presumably counts in how many runs each variable appears — verify against variable_frequency.
rf_agg_log_frequency = variable_frequency(rf_agg_log_vips, "Random Forest Aggregates Only YLog")

# Keep the variables whose count is greater than 2.
rf_agg_log_stable = rf_agg_log_frequency[rf_agg_log_frequency["Random Forest Aggregates Only YLog"] > 2]
rf_agg_log_stable

In [15]:
# Display the seed-averaged random-forest metrics, lowest testing MAPE first.
# Assigning by column label (not a full-slice .loc) lets the columns take a numeric dtype.
rf_metric_columns = [col for col in rf_results.columns if col != 'Model']
rf_results[rf_metric_columns] = rf_results[rf_metric_columns].apply(pd.to_numeric, errors='coerce')
rf_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
7,Random Forest YLog,36693302.464,7167030.312,0.07,39793393.218,12615968.87,0.182
6,Random Forest XLog YLog,36467920.79,7086270.668,0.07,38545130.344,12294038.122,0.186
2,Random Forest Aggregates Only YLog,37720341.426,7369140.622,0.072,37824967.054,11750373.702,0.192
0,Random Forest Aggregates N1-4 N5-7,29553159.662,6340270.458,0.08,41580339.844,13574560.12,0.222
1,Random Forest Aggregates Only,29774959.314,6523046.758,0.082,38484881.392,12521322.52,0.222
3,Random Forest Baseline,29180181.574,6122334.116,0.08,36600651.98,12138023.852,0.232
5,Random Forest XLog,29336375.242,6246264.006,0.082,37443336.818,12519869.848,0.234
4,Random Forest Disaggregates Only,28743123.856,6035507.31,0.084,37138590.872,12468855.204,0.284


# Decision Tree

In [36]:
# Fit one decision tree per (random seed, data variation) pair and average the
# evaluation metrics of each variation over the seeds.
dt_result_frames = []  # per-run metric frames, concatenated once after the loop
dt_models = []
dt_vips = []

for random_state in random_states:
    # New train/test split for this seed, expanded into all data variations.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    # Results are appended seed-major / variation-minor; later cells rely on this order.
    for i, (train_variation, test_variation) in enumerate(zip(df_train_list, df_test_list)):
        dt_result, dt_model, dt_vip = decision_tree_regression(
            train_variation,
            test_variation,
            outcome_variables[i],
            f"Decision Tree {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        dt_result_frames.append(dt_result)
        dt_models.append(dt_model)
        dt_vips.append(dt_vip)

# Concatenate once: avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty placeholder frame.
if dt_result_frames:
    dt_results = pd.concat(dt_result_frames, axis=0, ignore_index=True)
else:
    dt_results = pd.DataFrame(columns=columns)

# Assign by column label so the converted columns really become numeric; a
# full-slice .loc assignment would write the values into the existing object columns.
dt_metric_columns = [col for col in dt_results.columns if col != 'Model']
dt_results[dt_metric_columns] = dt_results[dt_metric_columns].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model label, best (lowest) testing MAPE first.
dt_results_df = dt_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")
dt_results_df

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
3,Decision Tree Baseline,14732790.008,3109712.33,0.036,30017159.052,11046364.936,0.216
5,Decision Tree XLog,14653850.478,3098652.2,0.034,32852911.88,11860650.16,0.218
4,Decision Tree Disaggregates Only,5082944.614,1144066.488,0.024,30410442.156,10751974.834,0.222
7,Decision Tree YLog,28298985.566,6019625.546,0.056,53822603.478,17095497.86,0.244
2,Decision Tree Aggregates Only YLog,29351417.37,6708681.112,0.064,66000628.93,21318852.934,0.246
1,Decision Tree Aggregates Only,16350463.316,4012843.758,0.06,34796633.54,12521636.364,0.258
0,Decision Tree Aggregates N1-4 N5-7,5296451.136,1293876.296,0.024,34774280.68,12657849.172,0.264
6,Decision Tree XLog YLog,33598548.24,7324825.73,0.062,87157855.342,28943622.348,0.286


In [65]:
# Collect the variable-importance tables of the "Baseline" decision trees, one per seed.
# dt_vips is filled seed-major / variation-minor, so offset and stride are derived
# from the configuration instead of hard-coding positions 0, 8, 16, 24, 32.
dt_variations_per_seed = len(dt_vips) // len(random_states)
dt_baseline_vips = dt_vips[names.index("Baseline")::dt_variations_per_seed]

# Presumably counts in how many runs each variable appears — verify against variable_frequency.
dt_baseline_frequency = variable_frequency(dt_baseline_vips, "Decision Tree Baseline")

# Keep the variables whose count is greater than 2.
dt_baseline_stable = dt_baseline_frequency[dt_baseline_frequency["Decision Tree Baseline"] > 2]
dt_baseline_stable

Unnamed: 0,Variable,Decision Tree Baseline
0,yEnergy.delivered.N7,3
1,yPeakload.injection.max,3


# Cluster-Based Modeling

In [39]:
# Cluster-based modelling: for every seed, fit the two-cluster model on the
# untransformed ("Baseline") and log-outcome ("YLog") variations and average the
# metrics per resulting model label.
cluster_result_frames = []  # per-run metric frames, concatenated once after the loop
cluster_models_c0 = []
cluster_models_c1 = []

outcome_variables_cbm = ["cTOTEXn", "cTOTEXn_log"]
names_cbm = ["Baseline", "YLog"]
outcome_transformation_cbm = ["None", "log"]
# Positions of these variations in the lists returned by create_variations, looked
# up by name rather than hard-coded (evaluates to [0, 3] for the current `names`).
indices = [names.index(name) for name in names_cbm]

for random_state in random_states:
    # New train/test split for this seed, expanded into all data variations.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    for i in range(len(outcome_transformation_cbm)):
        variation_index = indices[i]
        cluster_result, model_c0, model_c1 = cluster_based_modeling(
            df_train_list[variation_index],
            df_test_list[variation_index],
            outcome_variables_cbm[i],
            f"Cluster-Based Modeling {names_cbm[i]}",
            outcome_transformation = outcome_transformation_cbm[i],
            random_state = random_state,
        )
        cluster_result_frames.append(cluster_result)
        cluster_models_c0.append(model_c0)
        cluster_models_c1.append(model_c1)

# Concatenate once: avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty placeholder frame.
if cluster_result_frames:
    cluster_results = pd.concat(cluster_result_frames, axis=0, ignore_index=True)
else:
    cluster_results = pd.DataFrame(columns=columns)

# Assign by column label so the converted columns really become numeric; a
# full-slice .loc assignment would write the values into the existing object columns.
cluster_metric_columns = [col for col in cluster_results.columns if col != 'Model']
cluster_results[cluster_metric_columns] = cluster_results[cluster_metric_columns].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model label, best (lowest) testing MAPE first.
cluster_results_df = cluster_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")
cluster_results_df

found 0 physical cores < 1
  File "C:\Users\ducan\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Cluster-Based Modeling Baseline_Random Forest_...,4780463.04,2728475.16,0.09,13572616.87,5772370.99,0.13
4,Cluster-Based Modeling YLog_Random Forest_Rand...,27483207.17,5697578.64,0.08,139050062.04,34880178.93,0.18
1,Cluster-Based Modeling Baseline_Lasso_Random F...,32420609.7,7188183.67,0.1,9969182.36,4985002.13,0.2
3,Cluster-Based Modeling YLog_Lasso_Random Forest,37159398.2425,7131082.75,0.115,17027769.0375,8001083.31,0.205
0,Cluster-Based Modeling Baseline_Lasso_Lasso,3887549.8,2387052.8933,0.1,13715203.4233,6878530.04,0.2567


Because the best-performing model is very unstable, this approach is no longer feasible for the use case: across 10 different runs, 5 different models emerged.

# Summary Results

## Evaluation Metrics

In [97]:
# Pick the rows of the chosen models from each family's seed-averaged results.
lasso_pick = lasso_results_df["Model"].isin(["Lasso Aggregates Only YLog", "Lasso XLog YLog"])
lr_pick = lr_results_df["Model"].isin(["Linear Regresion XLog YLog"])
dt_pick = dt_results_df["Model"].isin(["Decision Tree Baseline"])

best_lasso = lasso_results_df.loc[lasso_pick]
best_lr = lr_results_df.loc[lr_pick]
# Random forest is currently left out of the comparison:
# best_rf = rf_results_df[rf_results_df["Model"].isin(["Random Forest Aggregates Only YLog"])]
best_dt = dt_results_df.loc[dt_pick]

# Stack the selections and rank by testing MAPE, ties broken by testing RMSE.
combined_best = pd.concat([best_lasso, best_lr, best_dt], axis=0)
combined_best.sort_values(by=["Testing MAPE", "Testing RMSE"])

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Lasso Aggregates Only YLog,19509173.422,7579512.996,0.124,11268863.23,5562088.28,0.14
6,Lasso XLog YLog,20123969.038,7400527.32,0.122,15132882.688,6881961.558,0.148
3,Decision Tree Baseline,14732790.008,3109712.33,0.036,30017159.052,11046364.936,0.216
6,Linear Regresion XLog YLog,487800379.922,104332919.272,0.442,229702712.92,72940355.562,0.432


## Variable Importance

In [91]:
# Outer-join the per-model frequency tables on 'Variable' so that a variable seen by
# any of the models is kept; a model that lacks the variable gets 0 instead of NaN.
frequency_df = (
    lasso_xlog_ylog_frequency
    .merge(lasso_agg_log_frequency, on='Variable', how='outer')
    .merge(dt_baseline_frequency, on='Variable', how='outer')
    .fillna(0)
)

# Row-wise sum over every column except the variable name.
columns_to_sum = frequency_df.columns.difference(['Variable'])
frequency_df['Total'] = frequency_df[columns_to_sum].sum(axis=1)

# Show the 20 variables with the highest total.
frequency_df.sort_values(by="Total", ascending=False).head(20)

Unnamed: 0,Variable,Lasso XLog YLog,Lasso Agg YLog,Decision Tree Baseline,Total
0,yEnergy.losses.tot,5.0,5.0,1.0,11.0
1,yInstalledPower.other.tot,5.0,5.0,0.0,10.0
2,yNet.length.excl.house.tot,5.0,5.0,0.0,10.0
3,yEnergy.delivered.N1357.sum,5.0,5.0,0.0,10.0
15,yInstalledPower.KWKG.other.tot,3.0,5.0,0.0,8.0
18,yEnergy.delivered.tot,2.0,4.0,1.0,7.0
13,yMeters.noncp.ctrl.excl.house.tot,3.0,3.0,0.0,6.0
8,yPeakload.injection.N6,4.0,0.0,2.0,6.0
4,ySubstations.N4,5.0,0.0,1.0,6.0
16,yMeters.over10MWh.RPM.tot,2.0,4.0,0.0,6.0


# Final Best Models with Robust Variables and Interpretation