# Classification

In [1]:
import pandas as pd
import glob
from analysis.utils import resolve_split_csv_path
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Project root. The original location is kept as the default; set the
# ANM_VERONA_ROOT environment variable to run the notebook from another
# machine/checkout without editing every path below.
PROJECT_ROOT = os.environ.get("ANM_VERONA_ROOT") or "/Users/emmatosato/Documents/PhD/ANM_Verona"

# Paths to the voxel-level (UMAP) classification summaries, one CSV per comparison
dir_class_voxel = os.path.join(PROJECT_ROOT, "output", "voxel", "umap_classification")
path_class_adni_psp = os.path.join(dir_class_voxel, "adni_psp", "summary_all_seeds.csv")
path_class_adni_cbs = os.path.join(dir_class_voxel, "adni_cbs", "summary_all_seeds.csv")
path_class_cbs_psp = os.path.join(dir_class_voxel, "psp_cbs", "summary_all_seeds.csv")

# Paths to the network-level classification summaries, one CSV per comparison
dir_class_net = os.path.join(PROJECT_ROOT, "output", "networks", "classification")
path_class_net_adni_psp = os.path.join(dir_class_net, "adni_psp", "summary_all_seeds.csv")
path_class_net_adni_cbs = os.path.join(dir_class_net, "adni_cbs", "summary_all_seeds.csv")
path_class_net_psp_cbs = os.path.join(dir_class_net, "psp_cbs", "summary_all_seeds.csv")

# Directory holding the train/test split dataframes
dir_split = os.path.join(PROJECT_ROOT, "data", "dataframes", "split")

# Dataframes voxel classification
df_classs_adni_psp = pd.read_csv(path_class_adni_psp)
df_classs_adni_cbs = pd.read_csv(path_class_adni_cbs)
df_classs_cbs_psp = pd.read_csv(path_class_cbs_psp)

# Dataframes network classification
df_classs_net_adni_psp = pd.read_csv(path_class_net_adni_psp)
df_classs_net_adni_cbs = pd.read_csv(path_class_net_adni_cbs)
df_classs_net_cbs_psp = pd.read_csv(path_class_net_psp_cbs)


# Column names selected from the summaries when reporting results:
# cross-validation metrics and hold-out test metrics are kept separate.
cv_metrics = ["cv_accuracy", "cv_precision", "cv_recall", "cv_f1", "cv_auc_roc"]
test_metrics = ["test_accuracy", "test_precision", "test_recall", "test_f1", "test_auc_roc"]

In [2]:
def get_variables(df):
    """Split a classification summary into per-model groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table containing a "model" column.

    Returns
    -------
    pandas GroupBy object keyed on the "model" column.
    """
    return df.groupby("model")

def print_info(df_split, df_class):
    """Print a short description of an experiment.

    Parameters
    ----------
    df_split : pandas.DataFrame
        Split table; its "split" column is compared against 'train' and 'test'.
    df_class : pandas.DataFrame
        Classification summary with "seed" and "model" columns.

    Prints the number and list of distinct seeds and models, followed by the
    number of rows labelled 'train' and 'test' in ``df_split``.
    """
    seed_col = df_class['seed']
    print(f"Number of seeds: {seed_col.nunique()} and seeds: {seed_col.unique()}")

    model_col = df_class['model']
    print(f"Number of models: {model_col.nunique()} and types: {model_col.unique()}\n")

    split_labels = df_split['split']
    print(f"Training set size: {(split_labels == 'train').sum()}")
    print(f"Test set size: {(split_labels == 'test').sum()}")

def _print_metrics_by_model(grouped, group1, group2, header, metrics):
    """Print a header, then display mean/std of `metrics` for every model group.

    Shared implementation behind print_result_cv and print_result_test, which
    differ only in the header line and the metric columns they report.
    """
    print(f"\nGroup {group1} vs {group2}\n")
    print(header)
    for model_name, group in grouped:
        print(f"\nModel: {model_name}")
        # Aggregate over the rows of this model's group, rounded for display
        display(group[metrics].agg(["mean", "std"]).round(3))
    print("\n")


def print_result_cv(grouped, group1, group2):
    """Display mean and std of the cross-validation metrics for each model.

    Parameters
    ----------
    grouped : iterable of (model_name, DataFrame) pairs, e.g. the result of
        ``get_variables``; each frame must contain the ``cv_metrics`` columns.
    group1, group2 : labels of the two compared groups, used in the header only.
    """
    _print_metrics_by_model(
        grouped, group1, group2,
        "=== Cross-validation across seeds ===",
        cv_metrics,
    )


def print_result_test(grouped, group1, group2):
    """Display mean and std of the hold-out test metrics for each model.

    Parameters
    ----------
    grouped : iterable of (model_name, DataFrame) pairs, e.g. the result of
        ``get_variables``; each frame must contain the ``test_metrics`` columns.
    group1, group2 : labels of the two compared groups, used in the header only.
    """
    _print_metrics_by_model(
        grouped, group1, group2,
        "=== Test performance on holdout set across seeds=== ",
        test_metrics,
    )

# Voxel classification

## ADNI vs PSP

In [3]:
# ADNI vs PSP: load the train/test split table and print seeds, models and split sizes
group1, group2 = "ADNI", "PSP"
split_path = resolve_split_csv_path(dir_split, group1, group2)
df_split = pd.read_csv(split_path)

print_info(df_split, df_classs_adni_psp)


Using split file: /Users/emmatosato/Documents/PhD/ANM_Verona/data/dataframes/split/ADNI_PSP_splitted.csv

Number of seeds: 5 and seeds: [   42   123  2023 31415 98765]
Number of models: 4 and types: ['SVM' 'RandomForest' 'GradientBoosting' 'KNN']

Training set size: 105
Test set size: 27


In [4]:
# Per-model mean/std of the hold-out metrics for the voxel ADNI vs PSP summary.
# The labels are passed explicitly rather than through the global group1/group2,
# which every section of this notebook rebinds: the header then always matches
# the data even when cells are executed out of order.
grouped_adni_psp = get_variables(df_classs_adni_psp)
print_result_test(grouped_adni_psp, "ADNI", "PSP")


Group ADNI vs PSP

=== Test performance on holdout set across seeds=== 

Model: GradientBoosting


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.593,0.583,0.575,0.571,0.681
std,0.0,0.0,0.0,0.0,0.0



Model: KNN


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.593,0.591,0.592,0.59,0.594
std,0.0,0.0,0.0,0.0,0.0



Model: RandomForest


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.608,0.602,0.588,0.584,0.657
std,0.033,0.038,0.03,0.028,0.014



Model: SVM


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.519,0.5,0.5,0.494,0.683
std,0.0,0.0,0.0,0.0,0.0






## ADNI vs CBS

In [5]:
# ADNI vs CBS: load the train/test split table and print seeds, models and split sizes
group1, group2 = "ADNI", "CBS"
split_path = resolve_split_csv_path(dir_split, group1, group2)
df_split = pd.read_csv(split_path)

print_info(df_split, df_classs_adni_cbs)


Using split file: /Users/emmatosato/Documents/PhD/ANM_Verona/data/dataframes/split/ADNI_CBS_splitted.csv

Number of seeds: 5 and seeds: [   42   123  2023 31415 98765]
Number of models: 4 and types: ['SVM' 'RandomForest' 'GradientBoosting' 'KNN']

Training set size: 92
Test set size: 24


In [6]:
# Per-model mean/std of the hold-out metrics for the voxel ADNI vs CBS summary.
# The labels are passed explicitly rather than through the global group1/group2,
# which every section of this notebook rebinds: the header then always matches
# the data even when cells are executed out of order.
grouped_adni_cbs = get_variables(df_classs_adni_cbs)
print_result_test(grouped_adni_cbs, "ADNI", "CBS")



Group ADNI vs CBS

=== Test performance on holdout set across seeds=== 

Model: GradientBoosting


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.525,0.458,0.464,0.455,0.341
std,0.023,0.019,0.019,0.016,0.015



Model: KNN


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.708,0.694,0.656,0.661,0.522
std,0.0,0.0,0.0,0.0,0.0



Model: RandomForest


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.658,0.629,0.598,0.597,0.496
std,0.035,0.047,0.036,0.04,0.028



Model: SVM


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.625,0.312,0.5,0.385,0.563
std,0.0,0.0,0.0,0.0,0.0






## PSP vs CBS

In [7]:
# PSP vs CBS: load the train/test split table and print seeds, models and split sizes
group1, group2 = "PSP", "CBS"
split_path = resolve_split_csv_path(dir_split, group1, group2)
df_split = pd.read_csv(split_path)

print_info(df_split, df_classs_cbs_psp)


Using split file: /Users/emmatosato/Documents/PhD/ANM_Verona/data/dataframes/split/PSP_CBS_splitted.csv

Number of seeds: 5 and seeds: [   42   123  2023 31415 98765]
Number of models: 4 and types: ['SVM' 'RandomForest' 'GradientBoosting' 'KNN']

Training set size: 83
Test set size: 21


In [8]:
# Per-model mean/std of the hold-out metrics for the voxel PSP vs CBS summary.
# The labels are passed explicitly rather than through the global group1/group2,
# which every section of this notebook rebinds: the header then always matches
# the data even when cells are executed out of order.
grouped_psp_cbs = get_variables(df_classs_cbs_psp)
print_result_test(grouped_psp_cbs, "PSP", "CBS")


Group PSP vs CBS

=== Test performance on holdout set across seeds=== 

Model: GradientBoosting


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.476,0.5,0.5,0.471,0.58
std,0.0,0.0,0.0,0.0,0.011



Model: KNN


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.571,0.558,0.556,0.555,0.556
std,0.0,0.0,0.0,0.0,0.0



Model: RandomForest


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.457,0.491,0.492,0.44,0.555
std,0.026,0.038,0.031,0.021,0.061



Model: SVM


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.619,0.644,0.639,0.618,0.6
std,0.0,0.0,0.0,0.0,0.149






# Networks Classification

## ADNI vs PSP

In [9]:
# Network dataframe read from networks_noTHR.csv.
# NOTE(review): not referenced by any of the cells visible in this notebook — confirm it is still needed.
df_net_noThr = pd.read_csv(
    '/Users/emmatosato/Documents/PhD/ANM_Verona/data/dataframes/networks/networks_noTHR.csv'
)

In [10]:
# Network classification, ADNI vs PSP: per-model mean/std of the hold-out metrics
group1, group2 = "ADNI", "PSP"

grouped_net_adni_psp = get_variables(df_classs_net_adni_psp)
print_result_test(grouped_net_adni_psp, group1, group2)


Group ADNI vs PSP

=== Test performance on holdout set across seeds=== 

Model: GradientBoosting


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.6,0.593,0.582,0.578,0.648
std,0.031,0.036,0.028,0.026,0.019



Model: KNN


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.519,0.525,0.525,0.519,0.586
std,0.0,0.0,0.0,0.0,0.0



Model: RandomForest


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.615,0.611,0.61,0.609,0.655
std,0.02,0.018,0.017,0.018,0.02



Model: SVM


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.519,0.5,0.5,0.494,0.639
std,0.0,0.0,0.0,0.0,0.0






## ADNI vs CBS

In [11]:
# Network classification, ADNI vs CBS: per-model mean/std of the hold-out metrics
group1, group2 = "ADNI", "CBS"

grouped_net_adni_cbs = get_variables(df_classs_net_adni_cbs)
print_result_test(grouped_net_adni_cbs, group1, group2)



Group ADNI vs CBS

=== Test performance on holdout set across seeds=== 

Model: GradientBoosting


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.583,0.516,0.511,0.496,0.548
std,0.0,0.0,0.0,0.0,0.0



Model: KNN


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.583,0.476,0.489,0.444,0.53
std,0.0,0.0,0.0,0.0,0.0



Model: RandomForest


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.625,0.588,0.558,0.55,0.538
std,0.059,0.09,0.056,0.06,0.023



Model: SVM


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.583,0.516,0.511,0.496,0.496
std,0.0,0.0,0.0,0.0,0.0






## PSP vs CBS

In [12]:
# Network classification, PSP vs CBS: per-model mean/std of the hold-out metrics
group1, group2 = "PSP", "CBS"

grouped_net_cbs_psp = get_variables(df_classs_net_cbs_psp)
print_result_test(grouped_net_cbs_psp, group1, group2)


Group PSP vs CBS

=== Test performance on holdout set across seeds=== 

Model: GradientBoosting


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.752,0.775,0.725,0.729,0.678
std,0.021,0.017,0.025,0.028,0.016



Model: KNN


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.619,0.613,0.583,0.571,0.546
std,0.0,0.0,0.0,0.0,0.0



Model: RandomForest


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.705,0.705,0.683,0.685,0.668
std,0.021,0.021,0.025,0.027,0.015



Model: SVM


Unnamed: 0,test_accuracy,test_precision,test_recall,test_f1,test_auc_roc
mean,0.667,0.659,0.653,0.654,0.677
std,0.0,0.0,0.0,0.0,0.002




