In [1]:
# Standard-library modules, used only for the path manipulation below.
import sys
import os

# add the parent directory ('BNetzA') to the Python path
# NOTE: the path is resolved from the current working directory, so this only
# points at the intended folder when the notebook is run from its own directory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np

# Project-local modules (presumably found via the sys.path entry added above, so
# these imports have to stay below it).
# NOTE(review): the star imports hide which module provides helpers such as
# load_data, prepare_base_data, create_variations and the modelling functions used
# in later cells - explicit imports would make that visible.
from models import *
from Data_Preprocessing.data_preprocessing import *

# Render float columns with four decimal places in DataFrame output.
pd.set_option('display.float_format', '{:.4f}'.format)

# Loading Datasets

In [3]:
# Read the source workbook once; every modelling cell below re-splits `df`.
path_to_excel = "../data/EVS4_20140118_dataV9.xlsx"
df = load_data(
    path_to_excel,
    sheet_name="DatasetV9",
)

In [4]:
# Order of the data variations used in the modelling loops below:
#   baseline, xlog, xlog ylog, ylog, agg, agg ylog, non agg, group agg
#   i.e. df_test_list = [df_test, df_test_xlog, df_test_xlog_ylog, df_test_ylog, df_test_agg, df_test_agg_log, df_test_non_agg, df_test_group_agg]
# Combinations that are not built yet:
#   - agg:       xlog, xlog ylog
#   - non agg:   xlog, xlog ylog, ylog
#   - group agg: xlog, xlog ylog, ylog

In [5]:
# One row per data variation: (display name, outcome column, outcome transformation).
# Keeping the three attributes side by side makes it harder for the parallel
# lists derived below to drift out of sync.
_variation_specs = [
    ("Baseline",             "cTOTEXn",     "None"),
    ("XLog",                 "cTOTEXn",     "None"),
    ("XLog YLog",            "cTOTEXn_log", "log"),
    ("YLog",                 "cTOTEXn_log", "log"),
    ("Aggregates Only",      "cTOTEXn",     "None"),
    ("Aggregates Only YLog", "cTOTEXn_log", "log"),
    ("Disaggregates Only",   "cTOTEXn",     "None"),
    ("Aggregates N1-4 N5-7", "cTOTEXn",     "None"),
]

# Parallel lists indexed by variation position, as consumed by the modelling loops.
names, outcome_variables, outcome_transformation = (
    list(field) for field in zip(*_variation_specs)
)

# Seeds for the repeated train/test splits.
random_states = list(range(39, 44))

# Column layout of the per-model result frames.
columns = [
    "Model",
    "Training RMSE",
    "Training MAE",
    "Training MAPE",
    "Testing RMSE",
    "Testing MAE",
    "Testing MAPE",
]

# Lasso Regression

In [7]:
# Run Lasso on every data variation for every train/test split.
# Produces:
#   lasso_results - one metrics row per (random_state, variation), in loop order
#   lasso_models  - second return value of lasso_regression for each fit (presumably the fitted model)
#   lasso_vips    - third return value of lasso_regression for each fit (presumably variable importances)
lasso_result_frames = []  # per-fit metric frames, concatenated once after the loop
lasso_models = []
lasso_vips = []

for random_state in random_states:
    # New split for this seed, then derive all data variations from it.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    for i in range(len(df_train_list)):
        lasso_result, lasso_model, lasso_vip = lasso_regression(
            df_train_list[i],
            df_test_list[i],
            outcome_variables[i],
            f"Lasso {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        lasso_result_frames.append(lasso_result)
        lasso_models.append(lasso_model)
        lasso_vips.append(lasso_vip)

# A single concat avoids re-copying the growing frame on every iteration;
# fall back to an empty frame with the expected columns when nothing was fitted.
lasso_results = (
    pd.concat(lasso_result_frames, axis=0, ignore_index=True)
    if lasso_result_frames
    else pd.DataFrame(columns=columns)
)


Performing Lasso regression...
Evaluating the model...


            Model Training RMSE Training MAE Training MAPE Testing RMSE  \
0  Lasso Baseline    2731309.38   2035203.94          0.12   4539891.03   

  Testing MAE Testing MAPE  
0  2903265.44         0.14  
Performing Lasso regression...
Evaluating the model...


        Model Training RMSE Training MAE Training MAPE Testing RMSE  \
0  Lasso XLog   74197991.25  33866556.29          1.16  30472127.44   

   Testing MAE Testing MAPE  
0  23205370.94         1.11  
Performing Lasso regression...
Evaluating the model...


             Model Training RMSE Training MAE Training MAPE Testing RMSE  \
0  Lasso XLog YLog   18936841.70   7113588.16          0.12   5285677.70   

  Testing MAE Testing MAPE  
0  3203273.32         0.13  
Performing Lasso regression...
Evaluating the model...


        Model Training RMSE Training MAE Training MAPE Testing RMSE  \
0  Lasso YLog   40279573.12  13119174.42          0.25   7039002.36   

  Test

In [8]:
# The metric columns are not numeric dtype when they leave the modelling loop, so
# convert them before averaging. Whole-column assignment replaces the column dtype;
# a `.loc[:, mask] = ...` assignment writes into the existing object columns and
# leaves them as object, which also bypasses the configured float display format.
lasso_metric_cols = [col for col in lasso_results.columns if col != 'Model']
lasso_results[lasso_metric_cols] = lasso_results[lasso_metric_cols].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model across the random splits, lowest test MAPE first.
lasso_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Lasso Aggregates Only YLog,18648730.224,7132720.574,0.124,11474313.838,5767560.512,0.144
6,Lasso XLog YLog,19829971.63,7274104.426,0.122,14612299.344,6762032.586,0.148
1,Lasso Aggregates Only,7346120.926,4453508.428,0.17,16685105.938,6693552.026,0.182
3,Lasso Baseline,2575815.954,1942412.242,0.114,14796210.044,7271969.312,0.348
4,Lasso Disaggregates Only,2592068.15,1973005.264,0.12,14503388.336,7203839.952,0.362
0,Lasso Aggregates N1-4 N5-7,3902520.224,2876126.114,0.15,14739039.146,7286246.854,0.394
7,Lasso YLog,40772998.96,12765053.754,0.242,1426286147.124,328380000.228,0.78
5,Lasso XLog,62190832.156,29936685.826,1.132,68745023.608,35368388.984,1.294


In [9]:
# Per-split rows grouped by model, each group ordered by test MAPE (last 50 rows shown).
(
    lasso_results
    .sort_values(by=["Model", "Testing MAPE"])
    .tail(50)
)

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
7,Lasso Aggregates N1-4 N5-7,3793722.6,2883379.53,0.17,4285078.96,3110579.73,0.18
23,Lasso Aggregates N1-4 N5-7,4773444.97,3365306.78,0.15,12057065.41,5665450.57,0.2
31,Lasso Aggregates N1-4 N5-7,4067980.47,2834644.96,0.13,43757160.27,18625648.33,0.2
39,Lasso Aggregates N1-4 N5-7,3422607.29,2633712.5,0.15,5921199.37,4308814.1,0.69
15,Lasso Aggregates N1-4 N5-7,3454845.79,2663586.8,0.15,7674691.72,4720741.54,0.7
28,Lasso Aggregates Only,5008046.47,3591346.19,0.18,49750612.74,16274148.58,0.15
4,Lasso Aggregates Only,8824328.09,4918414.8,0.17,3740575.17,2783264.35,0.18
36,Lasso Aggregates Only,8165661.47,4686459.72,0.16,5060980.36,3203160.84,0.18
12,Lasso Aggregates Only,7071385.25,4477632.61,0.18,11014568.43,5005643.76,0.2
20,Lasso Aggregates Only,7661183.35,4593688.82,0.16,13858792.99,6201542.6,0.2


# Linear Regression with Features Selected by Lasso

In [11]:
# Run Lasso feature selection followed by linear regression on every data
# variation for every train/test split.
# Produces:
#   lr_results - one metrics row per (random_state, variation), in loop order
#   lr_models  - second return value of the modelling helper for each fit (presumably the fitted model)
#   lr_vips    - third return value of the modelling helper for each fit (presumably variable importances)
lr_result_frames = []  # per-fit metric frames, concatenated once after the loop
lr_models = []
lr_vips = []

for random_state in random_states:
    # New split for this seed, then derive all data variations from it.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    for i in range(len(df_train_list)):
        # NOTE(review): the label spelling "Regresion" is kept unchanged because it is
        # the grouping key of the saved results; fix it together with a full re-run.
        lr_result, lr_model, lr_vip = lasso_feature_selection_linear_regression(
            df_train_list[i],
            df_test_list[i],
            outcome_variables[i],
            f"Linear Regresion {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        lr_result_frames.append(lr_result)
        lr_models.append(lr_model)
        lr_vips.append(lr_vip)

# A single concat avoids re-copying the growing frame on every iteration;
# fall back to an empty frame with the expected columns when nothing was fitted.
lr_results = (
    pd.concat(lr_result_frames, axis=0, ignore_index=True)
    if lr_result_frames
    else pd.DataFrame(columns=columns)
)


Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                       Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Baseline    2156198.63   1625396.86          0.12   

  Testing RMSE Testing MAE Testing MAPE  
0   4939449.57  3168336.74         0.16  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog   69097752.47  40319206.55          1.90   

  Testing RMSE  Testing MAE Testing MAPE  
0  43097998.47  32975362.02         1.65  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                        Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog YLog   19583339.44   6984516.07          0.11   

  Testing RMSE Testing MAE Testing MAPE  
0   5835659.44  3378350.17         0.14  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion YLog   21784706.90   9036803.16          0.20   

  Testing RMSE Testing MAE Testing MAPE  
0   7005482.97  5054416.06         0.24  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                              Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Aggregates Only    7701973.67   4779841.00          0.18   

  Testing RMSE Testing MAE Testing MAPE  
0   3961116.20  2866758.84         0.20  
Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates Only YLog   14708561.97   5997091.10   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.11   5782398.57  3393150.52         0.14  




Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                                 Model Training RMSE Training MAE  \
0  Linear Regresion Disaggregates Only    2363283.78   1780816.67   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.13   5873434.01  3552919.05         0.17  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates N1-4 N5-7    3171823.87   2459976.10   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.18   5007325.56  3600087.42         0.23  




Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                       Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Baseline    2159310.02   1638262.47          0.11   

  Testing RMSE Testing MAE Testing MAPE  
0  13930809.34  6909706.50         0.83  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog   62267832.66  38977256.59          1.85   

  Testing RMSE  Testing MAE Testing MAPE  
0  44302911.53  35734466.68         3.98  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                        Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog YLog   17831895.64   7327705.70          0.12   

  Testing RMSE Testing MAE Testing MAPE  
0  13108589.66  5635299.14         0.13  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion YLog   40959928.96  13517965.29          0.24   

  Testing RMSE  Testing MAE Testing MAPE  
0  37985203.28  14214948.67         0.48  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                              Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Aggregates Only    6837271.57   4597763.33          0.20   

  Testing RMSE Testing MAE Testing MAPE  
0  11345358.28  5687601.89         0.25  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates Only YLog   16385723.20   6840895.21   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.12   5462924.65  3114912.45         0.13  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                 Model Training RMSE Training MAE  \
0  Linear Regresion Disaggregates Only    2078860.38   1606461.05   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.11  13113304.20  6628629.52         1.01  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates N1-4 N5-7    2961576.92   2263282.66   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.15   8157243.21  5100436.52         1.21  




Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                       Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Baseline    2066878.73   1614569.82          0.13   

  Testing RMSE Testing MAE Testing MAPE  
0  24029032.52  8420751.01         0.17  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog   19919905.23  15586873.68          0.95   

   Testing RMSE  Testing MAE Testing MAPE  
0  205750408.68  76714231.93         1.44  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                        Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog YLog   11463119.76   5337571.65          0.12   

  Testing RMSE Testing MAE Testing MAPE  
0   5336718.87  3480425.55         0.14  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion YLog   12487751.38   5958234.30          0.15   

  Testing RMSE                                        Testing MAE  \
0          inf  5071160273675022463943788372482968818960774860...   

                                        Testing MAPE  
0  1223906731031787252413448862325007866892471501...  
Performing Lasso regression for feature selection...


  output_errors = np.average((y_true - y_pred) ** 2, axis=0, weights=sample_weight)


Performing Linear regression...
Evaluating the model...


                              Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Aggregates Only    7183408.45   4524839.26          0.17   

  Testing RMSE Testing MAE Testing MAPE  
0   9820787.30  5291213.01         0.22  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates Only YLog   14092445.65   5725000.84   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.11   9887305.26  5268585.82         0.16  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                 Model Training RMSE Training MAE  \
0  Linear Regresion Disaggregates Only    2074097.94   1586384.98   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.12  22775171.92  9181043.23         0.25  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates N1-4 N5-7    3875168.04   2918145.12   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.17  10039308.36  5693078.49         0.25  




Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                       Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Baseline    1793432.24   1460210.22          0.12   

  Testing RMSE  Testing MAE Testing MAPE  
0  81109912.84  30929202.91         0.32  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog   66474023.19  39695234.38          1.77   

  Testing RMSE  Testing MAE Testing MAPE  
0  91300044.20  54864654.47         1.46  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                        Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog YLog   12184616.74   4928062.83          0.10   

  Testing RMSE  Testing MAE Testing MAPE  
0  46697457.79  19104981.10         0.23  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion YLog   40577543.37  10032286.31          0.18   

    Testing RMSE   Testing MAE Testing MAPE  
0  1710650670.65  415968681.41         1.77  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                              Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Aggregates Only    4701951.42   3420709.80          0.19   

  Testing RMSE  Testing MAE Testing MAPE  
0  50946470.24  16196177.49         0.15  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates Only YLog   12690612.55   5099410.60   

  Training MAPE Testing RMSE  Testing MAE Testing MAPE  
0          0.11  22883786.61  11888583.53         0.15  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                 Model Training RMSE Training MAE  \
0  Linear Regresion Disaggregates Only    1879189.25   1476169.30   

  Training MAPE Testing RMSE  Testing MAE Testing MAPE  
0          0.11  99147627.92  36414644.59         0.32  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates N1-4 N5-7    3549126.10   2589880.58   

  Training MAPE Testing RMSE  Testing MAE Testing MAPE  
0          0.14  40004971.18  18959130.64         0.21  




Performing Lasso regression for feature selection...
Performing Linear regression...
Evaluating the model...


                       Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Baseline    2082508.00   1599240.73          0.11   

  Testing RMSE Testing MAE Testing MAPE  
0   6708051.50  4970834.78         0.94  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog   54400811.19  36173458.75          2.01   

  Testing RMSE  Testing MAE Testing MAPE  
0  63093341.74  54330426.41         3.59  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                        Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion XLog YLog   15665457.99   6921938.86          0.12   

  Testing RMSE Testing MAE Testing MAPE  
0   4172614.96  2739878.51         0.12  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                   Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion YLog   14396324.29   7330730.21          0.17   

  Testing RMSE  Testing MAE Testing MAPE  
0  18215467.02  11236919.20         0.53  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                              Model Training RMSE Training MAE Training MAPE  \
0  Linear Regresion Aggregates Only    7523398.12   4703452.89          0.18   

  Testing RMSE Testing MAE Testing MAPE  
0   5723551.30  3490732.58         0.17  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates Only YLog   15512523.28   6672807.79   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.12   4480747.84  2828371.68         0.12  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                 Model Training RMSE Training MAE  \
0  Linear Regresion Disaggregates Only    2221010.35   1658027.90   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.12   7816472.60  5972169.50         0.98  
Performing Lasso regression for feature selection...




Performing Linear regression...
Evaluating the model...


                                   Model Training RMSE Training MAE  \
0  Linear Regresion Aggregates N1-4 N5-7    3057683.95   2309020.70   

  Training MAPE Testing RMSE Testing MAE Testing MAPE  
0          0.16  17879866.10  8938117.87         1.29  




In [12]:
# The metric columns are not numeric dtype when they leave the modelling loop, so
# convert them before averaging. Whole-column assignment replaces the column dtype;
# a `.loc[:, mask] = ...` assignment writes into the existing object columns and
# leaves them as object, which also bypasses the configured float display format.
lr_metric_cols = [col for col in lr_results.columns if col != 'Model']
lr_results[lr_metric_cols] = lr_results[lr_metric_cols].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model across the random splits, lowest test MAPE first.
# A single non-finite test metric in any split makes that model's mean non-finite too.
lr_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")

Unnamed: 0,Model,Training RMSE,Training MAE,Training MAPE,Testing RMSE,Testing MAE,Testing MAPE
2,Linear Regresion Aggregates Only YLog,14677973.33,6067041.108,0.114,9699432.586,5298720.8000,0.1400
6,Linear Regresion XLog YLog,15345685.914,6299959.022,0.114,15030208.144,6867786.8940,0.1520
1,Linear Regresion Aggregates Only,6789600.646,4405321.256,0.184,16359456.664,6706496.7620,0.1980
3,Linear Regresion Baseline,2051665.524,1587536.02,0.118,26143451.154,10879766.3880,0.4840
4,Linear Regresion Disaggregates Only,2123288.34,1621571.98,0.118,29745202.13,12349881.1780,0.5460
0,Linear Regresion Aggregates N1-4 N5-7,3323075.776,2508061.032,0.16,16217742.882,8458170.1880,0.6380
5,Linear Regresion XLog,54432064.948,34150405.99,1.696,89508940.924,50923828.3020,2.4240
7,Linear Regresion YLog,26041250.98,9175203.854,0.188,inf,10142320547350045689240233885590865790529548774...,24478134620635746182772844188168143481390764669...


# Random Forest Regression

In [None]:
# Run random-forest regression on every data variation for every train/test split.
# Produces:
#   rf_results - one metrics row per (random_state, variation), in loop order
#   rf_models  - second return value of random_forest_regression for each fit (presumably the fitted model)
#   rf_vips    - third return value of random_forest_regression for each fit (presumably variable importances)
rf_result_frames = []  # per-fit metric frames, concatenated once after the loop
rf_models = []
rf_vips = []

for random_state in random_states:
    # New split for this seed, then derive all data variations from it.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    for i in range(len(df_train_list)):
        rf_result, rf_model, rf_vip = random_forest_regression(
            df_train_list[i],
            df_test_list[i],
            outcome_variables[i],
            f"Random Forest {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        rf_result_frames.append(rf_result)
        rf_models.append(rf_model)
        rf_vips.append(rf_vip)

# A single concat avoids re-copying the growing frame on every iteration;
# fall back to an empty frame with the expected columns when nothing was fitted.
rf_results = (
    pd.concat(rf_result_frames, axis=0, ignore_index=True)
    if rf_result_frames
    else pd.DataFrame(columns=columns)
)


Performing Random Forest...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Evaluating the model...


                    Model Training RMSE Training MAE Training MAPE  \
0  Random Forest Baseline   30248617.33   6199937.38          0.09   

  Testing RMSE Testing MAE Testing MAPE  
0   7621079.89  4469284.64         0.17  
Performing Random Forest...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Evaluating the model...


                Model Training RMSE Training MAE Training MAPE Testing RMSE  \
0  Random Forest XLog   30305327.84   6277681.97          0.09   7810148.75   

  Testing MAE Testing MAPE  
0  4522345.53         0.17  
Performing Random Forest...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [None]:
# The metric columns are not numeric dtype when they leave the modelling loop, so
# convert them before averaging. Whole-column assignment replaces the column dtype;
# a `.loc[:, mask] = ...` assignment writes into the existing object columns and
# leaves them as object, which also bypasses the configured float display format.
rf_metric_cols = [col for col in rf_results.columns if col != 'Model']
rf_results[rf_metric_cols] = rf_results[rf_metric_cols].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model across the random splits, lowest test MAPE first.
rf_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")

# Decision Tree

In [None]:
# Run decision-tree regression on every data variation for every train/test split.
# Produces:
#   dt_results - one metrics row per (random_state, variation), in loop order
#   dt_models  - second return value of decision_tree_regression for each fit (presumably the fitted model)
#   dt_vips    - third return value of decision_tree_regression for each fit (presumably variable importances)
dt_result_frames = []  # per-fit metric frames, concatenated once after the loop
dt_models = []
dt_vips = []

for random_state in random_states:
    # New split for this seed, then derive all data variations from it.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    for i in range(len(df_train_list)):
        dt_result, dt_model, dt_vip = decision_tree_regression(
            df_train_list[i],
            df_test_list[i],
            outcome_variables[i],
            f"Decision Tree {names[i]}",
            outcome_transformation = outcome_transformation[i],
            random_state = random_state,
        )
        dt_result_frames.append(dt_result)
        dt_models.append(dt_model)
        dt_vips.append(dt_vip)

# A single concat avoids re-copying the growing frame on every iteration;
# fall back to an empty frame with the expected columns when nothing was fitted.
dt_results = (
    pd.concat(dt_result_frames, axis=0, ignore_index=True)
    if dt_result_frames
    else pd.DataFrame(columns=columns)
)


In [None]:
# The metric columns are not numeric dtype when they leave the modelling loop, so
# convert them before averaging. Whole-column assignment replaces the column dtype;
# a `.loc[:, mask] = ...` assignment writes into the existing object columns and
# leaves them as object, which also bypasses the configured float display format.
dt_metric_cols = [col for col in dt_results.columns if col != 'Model']
dt_results[dt_metric_cols] = dt_results[dt_metric_cols].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model across the random splits, lowest test MAPE first.
dt_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")

# Cluster-Based Modeling

In [None]:
# Run cluster-based modelling on the first four data variations
# (Baseline, XLog, XLog YLog, YLog) for every train/test split.
# Produces:
#   cluster_results - one metrics row per (random_state, variation), in loop order
#   cluster_models  - second return value of cluster_based_modeling for each fit (presumably the fitted model)
#   cluster_vips    - third return value of cluster_based_modeling for each fit (presumably variable importances)
cluster_result_frames = []  # per-fit metric frames, concatenated once after the loop
cluster_models = []
cluster_vips = []

# Settings specific to this section. They use their own names so the notebook-wide
# `outcome_transformation` list (one entry per variation) is not overwritten, which
# would break a re-run of the earlier modelling cells.
outcome_variables_cbm = ["cTOTEXn", "cTOTEXn", "cTOTEXn_log", "cTOTEXn_log"]
names_cbm = ["Baseline", "XLog", "XLog YLog", "YLog"]
outcome_transformation_cbm = ["None", "None", "log", "log"]

for random_state in random_states:
    # New split for this seed, then derive all data variations from it.
    df_train, df_test = prepare_base_data(df, random_state = random_state)
    df_train_list, df_test_list = create_variations(df_train, df_test)
    # Only the leading len(names_cbm) variations are modelled here.
    for i in range(len(names_cbm)):
        cluster_result, cluster_model, cluster_vip = cluster_based_modeling(
            df_train_list[i],
            df_test_list[i],
            outcome_variables_cbm[i],
            f"Cluster-Based Modeling {names_cbm[i]}",
            outcome_transformation = outcome_transformation_cbm[i],
            random_state = random_state,
        )
        cluster_result_frames.append(cluster_result)
        cluster_models.append(cluster_model)
        cluster_vips.append(cluster_vip)

# A single concat avoids re-copying the growing frame on every iteration;
# fall back to an empty frame with the expected columns when nothing was fitted.
cluster_results = (
    pd.concat(cluster_result_frames, axis=0, ignore_index=True)
    if cluster_result_frames
    else pd.DataFrame(columns=columns)
)


In [None]:
# The metric columns are not numeric dtype when they leave the modelling loop, so
# convert them before averaging. Whole-column assignment replaces the column dtype;
# a `.loc[:, mask] = ...` assignment writes into the existing object columns and
# leaves them as object, which also bypasses the configured float display format.
cluster_metric_cols = [col for col in cluster_results.columns if col != 'Model']
cluster_results[cluster_metric_cols] = cluster_results[cluster_metric_cols].apply(pd.to_numeric, errors='coerce')

# Mean of every metric per model across the random splits, lowest test MAPE first.
cluster_results.groupby(by = "Model").mean().reset_index().sort_values(by = "Testing MAPE")