# Run an initial analysis of results and produce aggregated results files

In [1]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from IPython.display import display_html

pd.options.display.max_rows = 400
from metadata_utils import get_metadata, get_tuned_alg_perf, process_metafeatures, compute_feature_corrs

In [2]:
dataset_version = ""

# For choosing metafeatures
filter_families = [
    'general',
    'statistical',
    'info-theory'
]

metadataset_df, metafeatures_df = get_metadata(dataset_version)

metafeatures_processed = process_metafeatures(metafeatures_df, filter_families=filter_families)
metafeatures_df = metafeatures_processed

  metafeatures_processed = metafeatures_df.fillna(metafeatures_df.median())


In [4]:
# keep only binary and classification datasets
print(metadataset_df["target_type"].value_counts())

metadataset_df = metadataset_df.loc[metadataset_df["target_type"].isin(["binary", "classification"]), :]

binary            514200
classification    378610
Name: target_type, dtype: int64


In [5]:
# check that all dataset-alg pairs have results for all 10 folds
print(metadataset_df.groupby(["alg_name", "dataset_name"]).agg({"dataset_fold_id": lambda x: len(set(x))}))

                                                   dataset_fold_id
alg_name    dataset_name                                          
CatBoost    openml__APSFailure__168868                          10
            openml__Amazon_employee_access__34539               10
            openml__Australian__146818                          10
            openml__Bioresponse__9910                           10
            openml__Census-Income__168340                       10
...                                                            ...
rtdl_ResNet openml__vehicle__53                                 10
            openml__visualizing_livestock__3731                 10
            openml__vowel__3022                                 10
            openml__wall-robot-navigation__9960                 10
            openml__wdbc__9946                                  10

[2937 rows x 1 columns]


## Analyze num. results per dataset and alg

In [6]:
# for each alg, for how many datasets are there results?
print(f"for each alg: number of datasets with results (out of {len(metadataset_df['dataset_name'].unique())})")
print(metadataset_df.groupby("alg_name")["dataset_name"].apply(lambda x: len(set(x))).sort_values())

print(f"for each dataset: number of algs with results (out of {len(metadataset_df['alg_name'].unique())})")
metadataset_df.groupby("dataset_name")["alg_name"].apply(lambda x: len(set(x))).sort_values()

for each alg: number of datasets with results (out of 176)
alg_name
rtdl_FTTransformer     37
SAINT                  77
NAM                    79
DeepFM                 90
TabTransformer        124
rtdl_ResNet           127
DANet                 130
rtdl_MLP              135
NODE                  138
SVM                   143
VIME                  163
STG                   164
CatBoost              165
LightGBM              165
KNN                   167
LinearModel           168
TabNet                168
RandomForest          173
XGBoost               174
DecisionTree          175
MLP                   175
Name: dataset_name, dtype: int64
for each dataset: number of algs with results (out of 21)


dataset_name
openml__poker-hand__9890                                   4
openml__Devnagari-Script__167121                           6
openml__covertype__7593                                    7
openml__albert__189356                                     7
openml__helena__168329                                     8
openml__walking-activity__9945                             8
openml__CIFAR_10__167124                                  10
openml__guillermo__168337                                 10
openml__Fashion-MNIST__146825                             10
openml__riccardo__168338                                  10
openml__airlines__189354                                  11
openml__skin-segmentation__9965                           11
openml__jungle_chess_2pcs_raw_endgame_complete__167119    11
openml__robert__168332                                    11
openml__MiniBooNE__168335                                 12
openml__ldpa__9974                                        12
openml__mni

## Remove datasets with few results

In [7]:
print(f"for each dataset: number of algs with results")
alg_counts = metadataset_df.groupby("dataset_name")["alg_name"].agg(lambda x: len(set(x))).sort_values()
print(alg_counts)

for each dataset: number of algs with results
dataset_name
openml__poker-hand__9890                                   4
openml__Devnagari-Script__167121                           6
openml__covertype__7593                                    7
openml__albert__189356                                     7
openml__helena__168329                                     8
openml__walking-activity__9945                             8
openml__CIFAR_10__167124                                  10
openml__guillermo__168337                                 10
openml__Fashion-MNIST__146825                             10
openml__riccardo__168338                                  10
openml__airlines__189354                                  11
openml__skin-segmentation__9965                           11
openml__jungle_chess_2pcs_raw_endgame_complete__167119    11
openml__robert__168332                                    11
openml__MiniBooNE__168335                                 12
openml__ldpa__9974        

In [8]:
keep_datasets = list(alg_counts[alg_counts >= 10].index)
drop_datasets = alg_counts[alg_counts < 10]

print(f"dropping {len(drop_datasets)} datasets:")
print(drop_datasets)

dropping 6 datasets:
dataset_name
openml__poker-hand__9890            4
openml__Devnagari-Script__167121    6
openml__covertype__7593             7
openml__albert__189356              7
openml__helena__168329              8
openml__walking-activity__9945      8
Name: alg_name, dtype: int64


In [9]:
print("for each alg: number of datasets with results")
dataset_counts = metadataset_df.groupby("alg_name")["dataset_name"].agg(lambda x: len(set(x))).sort_values()
print(dataset_counts)

for each alg: number of datasets with results
alg_name
rtdl_FTTransformer     37
SAINT                  77
NAM                    79
DeepFM                 90
TabTransformer        124
rtdl_ResNet           127
DANet                 130
rtdl_MLP              135
NODE                  138
SVM                   143
VIME                  163
STG                   164
CatBoost              165
LightGBM              165
KNN                   167
LinearModel           168
TabNet                168
RandomForest          173
XGBoost               174
DecisionTree          175
MLP                   175
Name: dataset_name, dtype: int64


We will keep all algs, regardless of how many datasets they have results for.

In [10]:
metadataset_df = metadataset_df.loc[metadataset_df["dataset_name"].isin(keep_datasets), :]

print("after removing datasets: number of datasets with results")
dataset_counts = metadataset_df.groupby("alg_name")["dataset_name"].agg(lambda x: len(set(x))).sort_values()
print(dataset_counts)

after removing datasets: number of datasets with results
alg_name
rtdl_FTTransformer     37
SAINT                  77
NAM                    79
DeepFM                 90
TabTransformer        122
rtdl_ResNet           125
DANet                 130
rtdl_MLP              133
NODE                  138
SVM                   143
VIME                  162
CatBoost              162
STG                   163
LightGBM              163
KNN                   163
LinearModel           166
TabNet                166
MLP                   169
RandomForest          169
XGBoost               170
DecisionTree          170
Name: dataset_name, dtype: int64


# Get tuned algorithms for a given metric

Report the average & median test performance, over all folds. Note that each alg is tuned for each fold separately.

In [12]:
metric_list = [
    "Accuracy",
    "F1",
    "Log Loss",
]

obj_type_list = [
    "maximize",
    "maximize",
    "minimize",
]
result_df_dict = {}



In [49]:
# add a copy of each "default" hparam row, to treat this as a separate alg
default_rows = metadataset_df.loc[metadataset_df["hparam_source"] == "default"].copy()
default_rows.loc[:, "alg_name"] = default_rows["alg_name"].apply(lambda x: x + "_default")

# append these to the metadataset
metadataset_df = pd.concat([metadataset_df, default_rows], ignore_index=True)

In [56]:
tuned_result_dfs = {}
for drop_default in [True, False]:
    for i, (metric, objective_type) in enumerate(zip(metric_list, obj_type_list)):

        test_metric_col = metric + "__test"

        if drop_default:
            df = metadataset_df.loc[~metadataset_df["alg_name"].str.contains("_default"), :].copy()
        else:
            df = metadataset_df.copy()

        tuned_alg_perf = get_tuned_alg_perf(df, metric=metric)
        # NOTE: this "tunes" each algorithm for each training fold separately. so each of the 10 folds might use different hparams.
        tuned_result_dfs[metric] = tuned_alg_perf

        ##############################
        ### STEP 1: TREAT EACH FOLD AS SEPARATE DATASET

        # result_col = test_metric_col
        
        # # for each dataset, find the min and max metrics over all tuned algs
        # overall_bounds = tuned_alg_perf.groupby("dataset_fold_id").agg({result_col: ["min", "max"]}).reset_index()

        # # rename the multiindex cols
        # new_cols = []
        # for c in overall_bounds.columns:
        #     if c[1] == "":
        #         new_cols.append(c[0])
        #     else:
        #         new_cols.append("_".join(c))

        # overall_bounds.columns = new_cols

        # tuned_alg_perf = tuned_alg_perf.merge(overall_bounds, on="dataset_fold_id", how="left")

        # # add normalized metric
        # tuned_alg_perf.loc[:, "normalized_" + result_col] = (tuned_alg_perf[result_col] - tuned_alg_perf[result_col + "_min"]) / (tuned_alg_perf[result_col + "_max"] - tuned_alg_perf[result_col + "_min"])

        # # rank all algs for each dataset
        # ascending = False if objective_type == "maximize" else True
        
        # tuned_alg_perf.loc[:, f"{metric}_rank"] = tuned_alg_perf.groupby(["dataset_fold_id"])[result_col].rank(method="min", ascending=ascending).values

        # if i == 0:
        #     fold_tuned_df = tuned_alg_perf.copy()
        # else:
        #     fold_tuned_df = fold_tuned_df.merge(tuned_alg_perf, on=["alg_name", "dataset_fold_id"])

        # fold_result_df_dict[metric] = tuned_alg_perf.copy()

        ##############################
        ### STEP 2: AVERAGE OVER FOLDS

        if i == 0:
            agg_dict = {
                test_metric_col: ["median", "mean"],
                "time__train": ["median", "mean"],
                "dataset_name": ["count"],
            }
        else:
            agg_dict = {
                test_metric_col: ["median", "mean"],
            }

        # aggregate over folds: take the mean & median performance over each fold
        agg_tuned_alg_perf = tuned_alg_perf.groupby(["alg_name", "dataset_name"]).agg(agg_dict).reset_index()

        # rename the multiindex cols
        new_cols = []
        for c in agg_tuned_alg_perf.columns:
            if c[1] == "":
                new_cols.append(c[0])
            else:
                new_cols.append("_".join(c))

        agg_tuned_alg_perf.columns = new_cols


        # define the target metric column, we will use this value for all plots
        result_col = test_metric_col + "_mean"

        # for each dataset, find the min and max metrics over all tuned algs
        overall_bounds = agg_tuned_alg_perf.groupby("dataset_name").agg({result_col: ["min", "max"]}).reset_index()

        # adjust the lower bound to be the metric for a tuned decision tree (this is the "baseline")
        # baseline_metric = agg_tuned_alg_perf.loc[agg_tuned_alg_perf["alg_name"] == "DecisionTree"].groupby("dataset_name").agg({result_col: "max"}).reset_index()

        # baseline_metric.columns = ["dataset_name", "baseline_metric"]
        
        # rename the multiindex cols
        new_cols = []
        for c in overall_bounds.columns:
            if c[1] == "":
                new_cols.append(c[0])
            else:
                new_cols.append("_".join(c))

        overall_bounds.columns = new_cols

        
        agg_tuned_alg_perf = agg_tuned_alg_perf.merge(overall_bounds, on="dataset_name", how="left") #. \
            # merge(baseline_metric, on="dataset_name", how="left")

        # add normalized metric
        agg_tuned_alg_perf.loc[:, "normalized_" + result_col] = (agg_tuned_alg_perf[result_col] - agg_tuned_alg_perf[result_col + "_min"]) / (agg_tuned_alg_perf[result_col + "_max"] - agg_tuned_alg_perf[result_col + "_min"])

        # rank all algs for each dataset
        ascending = False if objective_type == "maximize" else True
        
        # rank according to mean or median performance over all folds
        for agg_method in ["mean", "median"]:
            agg_tuned_alg_perf.loc[:, f"{metric}_rank" + "_" + agg_method] = agg_tuned_alg_perf.groupby(["dataset_name"])[test_metric_col + "_" + agg_method].rank(method="min", ascending=ascending).values

        if i == 0:
            tuned_agg_df = agg_tuned_alg_perf.copy()
        else:
            tuned_agg_df = tuned_agg_df.merge(agg_tuned_alg_perf, on=["alg_name", "dataset_name"])

        # result_df_dict[metric] = agg_tuned_alg_perf.copy()

        if drop_default:
            tuned_agg_df_no_default = tuned_agg_df.copy()
            tuned_agg_df.to_csv("./results/tuned_aggregated_results.csv")
        else:
            tuned_agg_df_with_default = tuned_agg_df.copy()
            tuned_agg_df.to_csv("./results/tuned_aggregated_results_with_default_hparams.csv")



In [63]:
# sanity check..
# result_df_dict["Accuracy"][result_df_dict["Accuracy"]["dataset_name"] == "openml__Amazon_employee_access__34539"]
# tuned_agg_df_with_default[(tuned_agg_df_with_default["dataset_name"] == "openml__Amazon_employee_access__34539") & tuned_agg_df_with_default["alg_name"].str.contains("CatBoost")]

tuned_agg_df_with_default[(tuned_agg_df_with_default["dataset_name"] == "openml__Bioresponse__9910") & tuned_agg_df_with_default["alg_name"].str.contains("CatBoost")]

# fold_tuned_df[fold_tuned_df["dataset_fold_id"] == "openml__APSFailure__168868__fold_1"]
# fold_tuned_df[fold_tuned_df["dataset_fold_id"] == "openml__Amazon_employee_access__34539__fold_1"]

Unnamed: 0,alg_name,dataset_name,Accuracy__test_median,Accuracy__test_mean,time__train_median,time__train_mean,dataset_name_count,Accuracy__test_mean_min,Accuracy__test_mean_max,normalized_Accuracy__test_mean,...,normalized_F1__test_mean,F1_rank_mean,F1_rank_median,Log Loss__test_median,Log Loss__test_mean,Log Loss__test_mean_min,Log Loss__test_mean_max,normalized_Log Loss__test_mean,Log Loss_rank_mean,Log Loss_rank_median
3,CatBoost,openml__Bioresponse__9910,0.79894,0.795521,5.815126,6.748842,10,0.727272,0.796848,0.980938,...,0.980938,2.0,2.0,0.455072,0.456102,0.451718,0.967133,0.008506,4.0,1.0
165,CatBoost_default,openml__Bioresponse__9910,0.786667,0.782994,2.447543,2.595033,10,0.727272,0.796848,0.800881,...,0.800881,8.0,6.0,0.474727,0.477421,0.451718,0.967133,0.049868,6.0,7.0


In [52]:
# write tuned df to file
# tuned_agg_df.to_csv("./results/tuned_aggregated_results.csv")
# fold_tuned_df.to_csv("./results/tuned_fold_results.csv")

## Difference between best neural and best non-neural method

In [16]:
# now tune by algorithm type. first define the type as "neural" or "non-neural"
neural_algs = [
    "MLP",
    "TabNet",
    "VIME",
    "TabTransformer",
    "NODE",
    "STG",
    "NAM",
    "DeepFM",
    "SAINT",
    "DANet",
    "rtdl_MLP",
    "rtdl_ResNet",
    "rtdl_FTTransformer",
]

metadataset_df.loc[:, "alg_type"] = "non-neural"
metadataset_df.loc[metadataset_df["alg_name"].isin(neural_algs), "alg_type"] = "neural"

tuned_df = get_tuned_alg_perf(metadataset_df, metric=metric, group_col="alg_type")



In [17]:
# for each dataset fold, get difference between tuned neural and non-neural method (neural - non-neural)
neural_non_neural_comparison = pd.pivot(tuned_df, index="dataset_fold_id", columns=["alg_type"], values=["Accuracy__test", "F1__test", "MSE__test", "Log Loss__test", "alg_name", "time__train", "time__test"])
print(neural_non_neural_comparison.head())

                                   Accuracy__test             F1__test  \
alg_type                                   neural non-neural    neural   
dataset_fold_id                                                          
openml__APSFailure__168868__fold_0       0.992763   0.995263  0.992763   
openml__APSFailure__168868__fold_1       0.988684   0.992237  0.988684   
openml__APSFailure__168868__fold_2       0.990395   0.993947  0.990395   
openml__APSFailure__168868__fold_3       0.992368   0.995526  0.992368   
openml__APSFailure__168868__fold_4       0.991184   0.995789  0.991184   

                                              MSE__test             \
alg_type                           non-neural    neural non-neural   
dataset_fold_id                                                      
openml__APSFailure__168868__fold_0   0.995263       NaN        NaN   
openml__APSFailure__168868__fold_1   0.992237       NaN        NaN   
openml__APSFailure__168868__fold_2   0.993947       NaN  

In [18]:
# save the differences between neural and non-neural algs

# first rename the multiindex cols
new_cols = []
for c in neural_non_neural_comparison.columns:
    if c[1] == "":
        new_cols.append(c[0])
    else:
        new_cols.append("_".join(c))

neural_non_neural_comparison.columns = new_cols 
neural_non_neural_comparison.to_csv("./results/neural_non_neural_comparison.csv")

# Aggregate results

In [42]:
# ###### STEP 1: treat all folds as separate datasets

# # best, worst, and average performance for each alg, over all datasets
# for metric in metric_list:

#     overall_ranks = fold_tuned_df.groupby("alg_name").agg(
#         {
#             f"{metric}_rank": ["min", "max", "mean", "count"],
#             f"normalized_{metric}__test": "mean",
#         }
#     ).reset_index().sort_values([(f"{metric}_rank", "mean")])

#     # format min/max rank columns to be ints

#     overall_ranks.loc[:, "alg_name"] = overall_ranks.loc[:, "alg_name"].apply(lambda x: "\rot{" + x + "}")
#     overall_ranks.loc[:, (f"{metric}_rank", "min")] = overall_ranks.loc[:, (f"{metric}_rank", "min")].astype(int).astype(str)
#     overall_ranks.loc[:, (f"{metric}_rank", "max")] = overall_ranks.loc[:, (f"{metric}_rank", "max")].astype(int).astype(str)
#     overall_ranks.loc[:, (f"{metric}_rank", "count")] = overall_ranks.loc[:, (f"{metric}_rank", "count")].astype(int).astype(str)

#     overall_ranks.loc[:, (f"{metric}_rank", "mean")] = overall_ranks.loc[:, (f"{metric}_rank", "mean")].round(2).astype(str)
    
#     overall_ranks.loc[:, (f"normalized_{metric}__test", "mean")] = overall_ranks.loc[:,(f"normalized_{metric}__test", "mean")].round(2)

#     print(f"metric: {metric}")
#     final_table = overall_ranks.set_index("alg_name").transpose()
#     print(final_table)

#     # save to csv
#     final_table.to_csv(f"./results/fold_rank_tables_{metric}.csv", index=True)

#     # save to latex
#     final_table.to_latex(f"./results/fold_rank_tables_{metric}.tex", index=True, escape=False)


#     print("\n")

In [59]:
###### STEP 2: aggregate over all folds

# best, worst, and average performance for each alg, over all datasets
for metric in metric_list:

    overall_ranks = tuned_agg_df_no_default.groupby("alg_name").agg(
        {
            f"{metric}_rank_mean": ["min", "max", "mean", "count"],
            f"normalized_{metric}__test_mean": "mean",
        }
    ).reset_index().sort_values([(f"{metric}_rank_mean", "mean")])

    # format min/max rank columns to be ints

    overall_ranks.loc[:, "count"] = overall_ranks.loc[:, (f"{metric}_rank_mean", "count")].astype(int)
    overall_ranks.drop(columns=(f"{metric}_rank_mean", "count"), inplace=True)

    # overall_ranks.loc[:, "alg_name"] = overall_ranks.loc[:, "alg_name"].apply(lambda x: "\rot{" + x + "}")
    overall_ranks.loc[:, (f"{metric}_rank_mean", "min")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "min")].astype(int)
    overall_ranks.loc[:, (f"{metric}_rank_mean", "max")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "max")].astype(int)

    overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")].round(2)
    
    overall_ranks.loc[:, (f"normalized_{metric}__test_mean", "mean")] = overall_ranks.loc[:,(f"normalized_{metric}__test_mean", "mean")].round(2)

    print(f"metric: {metric}")
    final_table = overall_ranks.set_index("alg_name")
    print(final_table)

    # save to csv
    final_table.to_csv(f"./results/rank_tables_{metric}.csv", index=True)

    # save to latex
    final_table.to_latex(f"./results/rank_tables_{metric}.tex", index=True, escape=False)


    print("\n")

metric: Accuracy
                   Accuracy_rank_mean             \
                                  min max   mean   
alg_name                                           
CatBoost                            1  17   4.73   
XGBoost                             1  18   5.19   
rtdl_FTTransformer                  1  14   6.08   
rtdl_ResNet                         1  19   6.34   
LightGBM                            1  19   6.43   
NODE                                1  18   6.93   
SAINT                               1  18   7.16   
RandomForest                        1  18   7.38   
SVM                                 1  18   7.71   
DANet                               1  20   7.93   
rtdl_MLP                            1  18   8.95   
DeepFM                              1  20   9.76   
TabNet                              1  21  10.21   
MLP                                 1  19  10.47   
STG                                 1  21  10.52   
DecisionTree                        1  20  10.5

In [44]:
final_table

Unnamed: 0_level_0,Log Loss_rank_mean,Log Loss_rank_mean,Log Loss_rank_mean,normalized_Log Loss__test_mean,count
Unnamed: 0_level_1,min,max,mean,mean,Unnamed: 5_level_1
alg_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
XGBoost,1,15,3.88,0.03,170
CatBoost,1,13,4.21,0.04,162
LightGBM,1,20,6.1,0.08,163
SAINT,1,17,6.52,0.09,77
DANet,1,20,7.11,0.08,130
rtdl_ResNet,1,17,7.16,0.09,125
SVM,1,18,7.25,0.11,143
rtdl_FTTransformer,1,18,8.22,0.13,37
STG,1,20,8.31,0.14,163
RandomForest,1,19,9.0,0.16,169


## UNDER CONSTRUCTION: spaghetti plot - relative performance over different datasets.

In [18]:
# which datasets to use?

result_df_dict["F1"][(result_df_dict["F1"]["alg_name"] == "CatBoost") & (result_df_dict["F1"]["metric_rank_mean"] < 2)]

KeyError: 'metric_rank_mean'

In [23]:
# openml__diabetes__37 <-- lm does well
# openml__isolet__3481
# openml__haberman__42
# openml__robert__168332

# openml__soybean__41 <-- rf does well
# openml__vowel__3022
# openml__guillermo__168337

# openml__cmc__23 <-- mlp does well
# openml__CIFAR_10__167124
# openml__Fashion-MNIST__146825
# openml__Internet-Advertisements__167125	
# openml__dilbert__168909

# openml__Australian__146818 <-- catboost
# openml__APSFailure__168868
# openml__wdbc__9946
# openml__pc1__3918
# openml__eucalyptus__2079

In [24]:
plot_datasets = [
    "openml__diabetes__37",  # <-- lm does well
    "openml__isolet__3481",
    "openml__haberman__42",
    # "openml__robert__168332", # not enough successful algs
    "openml__soybean__41", #  <-- rf does well
    "openml__vowel__3022",
    # "openml__guillermo__168337", # not enough successful algs
    "openml__cmc__23", # <-- mlp does well
    # "openml__CIFAR_10__167124",  # not enough successful algs
    # "openml__Fashion-MNIST__146825",  # not enough successful algs
    "openml__Internet-Advertisements__167125",	
    "openml__dilbert__168909",
    "openml__Australian__146818",  #<-- catboost
    "openml__APSFailure__168868",
    "openml__wdbc__9946",
    "openml__pc1__3918",
    "openml__eucalyptus__2079",
]

# names to show on the plot
plot_dataset_names = [name[len("openml__"):].split("_")[0] for name in plot_datasets]

In [25]:
# number of results for each dataset
num_alg_per_dataset = result_df_dict["F1"].groupby("dataset_name")["alg_name"].count()
num_alg_per_dataset[num_alg_per_dataset < 10].sort_values()

dataset_name
openml__Devnagari-Script__167121                          5
openml__covertype__7593                                   5
openml__helena__168329                                    5
openml__CIFAR_10__167124                                  6
openml__albert__189356                                    6
openml__guillermo__168337                                 6
openml__Fashion-MNIST__146825                             7
openml__riccardo__168338                                  7
openml__robert__168332                                    7
openml__airlines__189354                                  8
openml__mnist_784__3573                                   8
openml__higgs__146606                                     9
openml__jungle_chess_2pcs_raw_endgame_complete__167119    9
openml__numerai28.6__167120                               9
openml__skin-segmentation__9965                           9
openml__sylvine__168912                                   9
Name: alg_name, dtype: int6

In [26]:
plot_algs = agg_tuned_alg_perf["alg_name"].unique()

# gather data for the spaghetti plot
data = dict()
for i_metric, metric_name in enumerate(metric_list):
    data[metric_name] = dict()
    for alg in plot_algs:
            data[metric_name][alg] = []
            for dataset in plot_datasets:
                vals = result_df_dict[metric_name].loc[(result_df_dict[metric_name]["alg_name"] == alg) & (result_df_dict[metric_name]["dataset_name"] == dataset), f"normalized_{metric_name}__test_mean"].values
                if len(vals) != 1:
                    print(f"there's an issue with {alg}-{dataset}-{metric_name}")
                    print(vals)
                    val = None
                else:
                    val = vals[0]
                data[metric_name][alg].append(val)
        

there's an issue with CatBoost-openml__isolet__3481-Accuracy
[]
there's an issue with LightGBM-openml__dilbert__168909-Accuracy
[]
there's an issue with RandomForest-openml__haberman__42-Accuracy
[]
there's an issue with SVM-openml__soybean__41-Accuracy
[]
there's an issue with CatBoost-openml__isolet__3481-F1
[]
there's an issue with LightGBM-openml__dilbert__168909-F1
[]
there's an issue with RandomForest-openml__haberman__42-F1
[]
there's an issue with SVM-openml__soybean__41-F1
[]
there's an issue with CatBoost-openml__isolet__3481-Log Loss
[]
there's an issue with LightGBM-openml__dilbert__168909-Log Loss
[]
there's an issue with RandomForest-openml__haberman__42-Log Loss
[]
there's an issue with SVM-openml__soybean__41-Log Loss
[]


In [1]:
### plotting kwargs

plot_alg_map = {
    "XGBoost": {
        "name": "XGBoost",
        "plt-kwargs": {"marker":"x", "color":"r", "linestyle":"--"}
    },
    "CatBoost": {
        "name": "CatBoost",
        "plt-kwargs": {"marker":"+", "color":"r", "linestyle":"--"}
    },
    "LightGBM": {
        "name": "LightGBM",
        "plt-kwargs": {"marker":"d", "color":"r", "linestyle":"--"}
    },
    "SVM": {
        "name": "SVM",
        "plt-kwargs": {"marker":"v", "color":"black", "linestyle":"-"}
    },
    "KNN": {
        "name": "KNN",
        "plt-kwargs": {"marker":"^", "color":"black", "linestyle":"-"}
    },
    "DecisionTree": {
        "name": "DecisionTree",
        "plt-kwargs": {"marker":">", "color":"black", "linestyle":"-"}
    },
    "RandomForest": {
        "name": "RandomForest",
        "plt-kwargs": {"marker":"P", "color":"black", "linestyle":"-"}
    },
    "LinearModel": {
        "name": "LinearModel",
        "plt-kwargs": {"marker":"<", "color":"black", "linestyle":"-"}
    },
    "TabNet": {
        "name": "TabNet",
        "plt-kwargs": {"marker":"X", "color":"b", "linestyle":":"}
    },
    "MLP": {
        "name": "MLP",
        "plt-kwargs": {"marker":"o", "color":"b", "linestyle":":"}
    },
    "VIME": {
        "name": "VIME",
        "plt-kwargs": {"marker":"P", "color":"b", "linestyle":":"}
    },
}

plot_algs = plot_alg_map.keys()

In [2]:
import numpy as np
fig, ax = plt.subplots(len(metric_list), 1, sharex=True, figsize=(8, 5))

for i, metric in enumerate(metric_list):
    for alg in plot_algs:    
        ax[i].plot(data[metric][alg], label=alg, markersize=7, **plot_alg_map[alg]["plt-kwargs"])
    ax[i].set_ylabel(metric)

    ax[i].set_xticks(np.arange(len(plot_dataset_names)))
    ax[i].set_xticklabels(plot_dataset_names, rotation=-35, ha='left', rotation_mode='anchor')

plt.tight_layout()
plt.subplots_adjust(hspace=0.08)

plt.legend(loc="upper center", bbox_to_anchor=(0.5, 3.6), ncol=6, fontsize="small")
plt.savefig("./results/performance_spaghetti.pdf", bbox_inches='tight')
plt.show()


NameError: name 'plt' is not defined