In [17]:
import polars as pl
import pandas as pd
from great_tables import GT, md, html, from_column, style, loc
from assignment_3_tools import parquet_to_dict, pickle_to_dict

In [91]:
result_path = '../../Data/GoogleDrive/MLP_Results/'

# Load every result frame, materialise it with .collect(), and strip the
# 'Dataset_Name' column. Frames whose key does not end in 'baseline' also lose
# 'Grid_Variable'; baseline frames keep it.
# (presumably parquet_to_dict returns {name: polars LazyFrame} — confirm)
result_pl = {}
for name, lazy_frame in parquet_to_dict(result_path).items():
    eager = lazy_frame.collect().drop('Dataset_Name')
    result_pl[name] = eager if name.endswith('baseline') else eager.drop('Grid_Variable')

# For the non-baseline frames, flatten each 'Parameters' entry into one string
# by concatenating the str() of its values, then rebuild the frame from the
# plain column dict. Only existing keys are reassigned, so iterating the dict
# while writing to it is safe.
for name in result_pl:
    if not name.endswith('baseline'):
        result_dict = result_pl[name].to_dict(as_series=False)
        result_dict['Parameters'] = [
            "".join(str(setting) for setting in entry.values())
            for entry in result_dict['Parameters']
        ]
        result_pl[name] = pl.DataFrame(result_dict)

# Map each swept hyper-parameter (the text after '-' in a frame key) to the key
# suffix shared by its test/grid frames; key[4:] strips the 4-character
# 'test'/'grid' prefix.
# NOTE(review): two sweeps whose keys end in the same parameter name collapse
# to a single entry here (last one wins) — confirm that is acceptable.
param_name = {key.split('-')[1]:key[4:] for key in result_pl if not key.endswith('baseline')}

prefixes = ['test', 'grid']

# Sweep frame + matching baseline row(s), keyed like the sweep frame.
combined_pl = {}
for param, name in param_name.items():
    for prefix in prefixes:
        alt = f"{prefix}{name}"
        baseline = f"{prefix}_results-baseline"
        # A sweep may exist for only one of test/grid; skip what is missing.
        if alt not in result_pl or baseline not in result_pl:
            continue
        alt_df = result_pl[alt]
        baseline_raw = result_pl[baseline]
        # The baseline 'Parameters' column still holds the full parameter
        # struct. Keep only the swept parameter, stringified the same way the
        # sweep frames were (str() of the value), so both sides share a schema.
        baseline_values = [
            str(entry[param]) for entry in baseline_raw['Parameters'].to_list()
        ]
        baseline_df = (
            baseline_raw
            .with_columns(pl.Series('Parameters', baseline_values, dtype=pl.Utf8))
            # Drops 'Grid_Variable' and aligns column order with the sweep frame.
            .select(alt_df.columns)
        )
        # Baseline rows are appended after the sweep rows.
        combo = pl.concat([alt_df, baseline_df])
        combined_pl[alt] = combo

    


{'alpha': '_results-alpha', 'learning_rate_init': '_results-learning_rate_init', 'max_iter': '_results-max_iter', 'batch_size': '_results-batch_size', 'n_iter_no_change': '_results-n_iter_no_change', 'learning_rate': '_results-learning_rate', 'momentum': '_results-momentum', 'hidden_layer_sizes': '_results_neurons-hidden_layer_sizes', 'activation': '_results-activation', 'solver': '_results-solver'}


In [88]:
# Render the frame stored under 'test_results-baseline' as a great_tables table.
GT(
    result_pl['test_results-baseline']
    .to_pandas()
)

0,1,2,3,4,5
baseline,"{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'hidden_layer_sizes': array([100]), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'solver': 'adam'}",0.7527384144006128,0.8142536877188669,0.7320845349757549,67.24831981658936
Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time


In [32]:
def param_baseline(param_name, result_type):
    """Return the baseline results with 'Parameters' narrowed to one parameter.

    Looks up the baseline frame for ``result_type`` in the notebook-level
    ``result_pl`` dict, converts it to pandas, and replaces each row's
    'Parameters' dict with ``str({param_name: value})``. Rows whose dict has no
    ``param_name`` entry are left unchanged.

    Both key layouts used in this notebook are accepted:
    '<type>_results-baseline' (checked first) and '<type>_results_baseline'.
    An entry without ``to_pandas`` is collected first, and 'Dataset_Name' is
    dropped only when present.

    Args:
        param_name: Name of the hyper-parameter to keep.
        result_type: "test" or "grid".

    Returns:
        pandas.DataFrame of the baseline rows.

    Raises:
        ValueError: If ``result_type`` is neither "test" nor "grid".
        KeyError: If no baseline frame exists for ``result_type``.
    """
    if result_type != "test" and result_type != "grid":
        raise ValueError("result_type needs to be test or grid")

    current_key = f"{result_type}_results-baseline"
    legacy_key = f"{result_type}_results_baseline"
    frame = result_pl[current_key] if current_key in result_pl else result_pl[legacy_key]
    # Lazy frames expose no to_pandas(); materialise them first.
    if not hasattr(frame, "to_pandas"):
        frame = frame.collect()
    baseline = frame.to_pandas().drop(columns='Dataset_Name', errors='ignore')

    def _narrow(params):
        # Keep only the requested parameter; leave the entry alone if absent.
        for key, value in params.items():
            if key == param_name:
                return str({key: value})
        return params

    # Row-wise replacement: every row gets its own value rather than the last
    # matching row's value being stamped onto the whole column.
    baseline['Parameters'] = [_narrow(params) for params in baseline['Parameters']]
    return baseline
    
def format_table(df, result_type):
    """Build a great_tables table of ``df`` sorted by 'Recall', descending.

    Score columns are shown with 2 decimals and 'Fit_Time' with 0 decimals.
    For "test" the score columns are Recall, ROC_AUC and Accuracy; for "grid"
    only Recall.

    Args:
        df: pandas DataFrame holding the columns listed above.
        result_type: "test" or "grid".

    Returns:
        The formatted ``GT`` table.

    Raises:
        ValueError: If ``result_type`` is neither "test" nor "grid".
    """
    # Pick the score columns first so an invalid type fails before df is touched.
    if result_type == "test":
        score_columns = ["Recall","ROC_AUC","Accuracy"]
    elif result_type == "grid":
        score_columns = ["Recall"]
    else:
        raise ValueError("result_type needs to be test or grid")

    ranked = df.sort_values(by='Recall', ascending=False)
    with_scores = GT(ranked).fmt_number(columns=score_columns, decimals=2)
    return with_scores.fmt_number(columns=["Fit_Time"], decimals=0)

In [38]:
# Alpha sweep, test results: sweep rows without the label columns, the matching
# baseline row from param_baseline, then both stacked with a fresh index.
# NOTE(review): expects result_pl to hold lazy frames under underscore-style
# keys; the loader cell in this notebook collects eagerly and uses '-' keys —
# confirm which layout is live before re-running.
alpha_test = (
    result_pl["test_results_alpha"]
    .collect()
    .to_pandas()
    .drop(columns=['Dataset_Name','Grid_Variable'])
)
alpha_test_base = param_baseline("alpha", "test")
alpha_test_df = pd.concat([alpha_test_base, alpha_test], ignore_index=True)

# Same steps for the grid-search results.
alpha_grid = (
    result_pl["grid_results_alpha"]
    .collect()
    .to_pandas()
    .drop(columns=['Dataset_Name','Grid_Variable'])
)
alpha_grid_base = param_baseline("alpha", "grid")
alpha_grid_df = pd.concat([alpha_grid_base, alpha_grid], ignore_index=True)


In [39]:
# Display the alpha test results (baseline row included) as a formatted table.
format_table(df=alpha_test_df, result_type="test")

0,1,2,3,4,5
baseline,{'alpha': 0.0001},0.75,0.81,0.73,67
,{'alpha': 0.0},0.72,0.81,0.75,102
Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time


In [40]:
# Display the alpha grid results (baseline row included) as a formatted table.
format_table(df=alpha_grid_df, result_type="grid")

0,1,2,3
,{'alpha': 0.0},0.83,102
baseline,{'alpha': 0.0001},0.82,67
,{'alpha': 0.25},0.81,77
,{'alpha': 0.5},0.81,50
,{'alpha': 0.75},0.81,36
,{'alpha': 1.0},0.80,36
Grid_Variable,Parameters,Recall,Fit_Time


In [12]:
# Show the frame stored under "test_results_activation" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_activation"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,activation,{'activation': 'logistic'},0.660207,0.793378,0.753255,154.2702


In [25]:
# Show the frame stored under "grid_results_activation" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_activation"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,activation,{'activation': 'logistic'},0.853467,154.2702
1,Under_Sample_1:1_threshold_20,activation,{'activation': 'tanh'},0.844433,185.283824
2,Under_Sample_1:1_threshold_20,activation,{'activation': 'relu'},0.820933,100.009219


In [13]:
# Show the frame stored under "test_results_batch_size" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_batch_size"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 100},0.736729,0.815755,0.74262,150.978772


In [26]:
# Show the frame stored under "grid_results_batch_size" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_batch_size"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 100},0.832282,150.978772
1,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 1000},0.822254,80.844334
2,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 500},0.820736,74.753082
3,Under_Sample_1:1_threshold_20,batch_size,{'batch_size': 1},0.806049,1881.966173


In [14]:
# Show the frame stored under "test_results_iter_no_change" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_iter_no_change"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 100},0.712601,0.808466,0.745294,399.201779


In [27]:
# Show the frame stored under "grid_results_iter_no_change" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_iter_no_change"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 100},0.842093,399.201779
1,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 250},0.842093,400.506371
2,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 500},0.842093,281.170787
3,Under_Sample_1:1_threshold_20,iter_no_change,{'n_iter_no_change': 50},0.841426,356.611591


In [15]:
# Show the frame stored under "test_results_layers" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_layers"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100, 100, 100]}",0.518116,0.760817,0.809887,832.888808


In [28]:
# Show the frame stored under "grid_results_layers" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_layers"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100, 100, 100]}",0.972256,832.888808
1,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100, 100, 10...",0.969954,1083.540529
2,Under_Sample_1:1_threshold_20,layers,"{'hidden_layer_sizes': [100, 100, 100]}",0.954148,601.686462
3,Under_Sample_1:1_threshold_20,layers,{'hidden_layer_sizes': [100]},0.820933,111.931144


In [16]:
# Show the frame stored under "test_results_learning_rate" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_learning_rate"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate,{'learning_rate': 'constant'},0.752738,0.814254,0.732085,96.904531


In [29]:
# Show the frame stored under "grid_results_learning_rate" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_learning_rate"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate,{'learning_rate': 'constant'},0.820933,96.904531
1,Under_Sample_1:1_threshold_20,learning_rate,{'learning_rate': 'invscaling'},0.820933,99.450832
2,Under_Sample_1:1_threshold_20,learning_rate,{'learning_rate': 'adaptive'},0.820933,97.082339


In [17]:
# Show the frame stored under "test_results_learning_rate_init" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_learning_rate_init"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.01},0.767905,0.816309,0.73001,72.824502


In [30]:
# Show the frame stored under "grid_results_learning_rate_init" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_learning_rate_init"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.01},0.85262,72.824502
1,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.0001},0.816108,139.893551
2,Under_Sample_1:1_threshold_20,learning_rate_init,{'learning_rate_init': 0.1},0.801173,39.018317


In [18]:
# Show the frame stored under "test_results_max_iter" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_max_iter"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 100},0.752738,0.814254,0.732085,105.687169


In [31]:
# Show the frame stored under "grid_results_max_iter" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_max_iter"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 100},0.820933,105.687169
1,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 250},0.820933,106.422979
2,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 500},0.820933,106.836757
3,Under_Sample_1:1_threshold_20,max_iter,{'max_iter': 1000},0.820933,77.817537


In [19]:
# Show the frame stored under "test_results_momentum" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_momentum"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.0},0.752738,0.814254,0.732085,108.784148


In [32]:
# Show the frame stored under "grid_results_momentum" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_momentum"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.0},0.820933,108.784148
1,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.25},0.820933,109.241719
2,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.5},0.820933,108.981126
3,Under_Sample_1:1_threshold_20,momentum,{'momentum': 0.75},0.820933,86.033771
4,Under_Sample_1:1_threshold_20,momentum,{'momentum': 1.0},0.820933,76.221695


In [20]:
# Show the frame stored under "test_results_neurons" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_neurons"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 500},0.448487,0.729403,0.79575,379.136686


In [33]:
# Show the frame stored under "grid_results_neurons" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_neurons"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 500},0.928545,379.136686
1,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 250},0.885997,201.331052
2,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 50},0.813431,68.73278
3,Under_Sample_1:1_threshold_20,neurons,{'hidden_layer_sizes': 1},0.795639,13.57265


In [21]:
# Show the frame stored under "test_results_solver" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["test_results_solver"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,ROC_AUC,Accuracy,Fit_Time
0,Under_Sample_1:1_threshold_20,solver,{'solver': 'adam'},0.752738,0.814254,0.732085,76.121285


In [34]:
# Show the frame stored under "grid_results_solver" as a pandas table.
# NOTE(review): expects the pre-loader layout (underscore key, frame still
# needs .collect()) — confirm against the loader cell before re-running.
(
    result_pl["grid_results_solver"]
    .collect()
    .to_pandas()
)

Unnamed: 0,Dataset_Name,Grid_Variable,Parameters,Recall,Fit_Time
0,Under_Sample_1:1_threshold_20,solver,{'solver': 'adam'},0.820933,76.121285
1,Under_Sample_1:1_threshold_20,solver,{'solver': 'sgd'},0.811662,148.127234
