In [1]:
import numpy as np
import pandas as pd
import os

## Combining all spreadsheets together

The objective here is to take the Fazekas and clinscores spreadsheet, the sample diversity features spreadsheet, and the extracted features spreadsheets, and combine them into a single spreadsheet that includes all of the selected thresholds and the Deep, PV, and All regions. Nice.

I intend to do this for all datasets (ADNI, Challenge, CVD, and possibly MSS3). For MSS3, however, I first need to double-check the segmentation performance results to see what has gone wrong, and check that I am using a consistent rater.

We also make these CSVs for each model that has produced predictions: deterministic, ssn_ens, and punet at the moment.

In [171]:
def load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, model_name):
    """Load every spreadsheet for one dataset/model pair and merge them on subject ID.

    Parameters
    ----------
    ds_name : str
        Dataset name. "CVD" selects the multi-file sample-diversity layout below;
        the value is also forwarded to ``merge_dfs``.
    clinscores_path : str
        Path to the clinical-scores CSV.
    sample_div_folder : str
        Folder holding ``f"{model_name}_sample_div_and_metrics.csv"``. Only read
        when ``ds_name`` is not "CVD".
    extracted_features_folder : str
        Folder scanned for extracted-feature CSVs, i.e. files whose name contains
        ``model_name`` and either "_pred_" or "_ent_". For "CVD" it is also
        scanned for the sample-diversity CSVs.
    model_name : str
        Substring identifying the model's files (the notebook uses "ssn_ens",
        "punet" and "deterministic").

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The merged spreadsheet and the clinical-scores frame as loaded.

    Raises
    ------
    FileNotFoundError
        If a required CSV is missing, including the "CVD" case where no
        sample-diversity file matches ``model_name``.
    """
    # resolve paths
    if ds_name == "CVD":
        # NOTE(review): this branch scans extracted_features_folder and ignores
        # sample_div_folder. The visible CVD call passes the same folder for both,
        # so the original behaviour is kept -- confirm which folder is intended.
        # Sorting matters because os.listdir returns names in arbitrary order.
        sample_div_paths = sorted(
            os.path.join(extracted_features_folder, fname)
            for fname in os.listdir(extracted_features_folder)
            if "sample_div_and_metrics" in fname and model_name in fname
        )
        if not sample_div_paths:
            raise FileNotFoundError(
                f"no 'sample_div_and_metrics' file for model '{model_name}' in {extracted_features_folder}"
            )
        # The CVD sample-diversity data is split over several files: stack them row-wise.
        sample_div_df = pd.concat([pd.read_csv(p) for p in sample_div_paths])
    else:
        sample_div_path = os.path.join(sample_div_folder, f"{model_name}_sample_div_and_metrics.csv")
        sample_div_df = pd.read_csv(sample_div_path)

    # Sorted so that the column order of the merged frame is reproducible between runs.
    extracted_features_files = sorted(
        os.path.join(extracted_features_folder, fname)
        for fname in os.listdir(extracted_features_folder)
        if ("_pred_" in fname or "_ent_" in fname) and model_name in fname
    )

    # load dfs
    clinscores_df = pd.read_csv(clinscores_path)
    # Each frame is paired with a label; merge_dfs derives column suffixes from the
    # "_pred_"/"_ent_" paths and leaves the sample-diversity columns unsuffixed.
    remaining_dfs = [(sample_div_df, "sample_div_and_metrics")]
    remaining_dfs += [(pd.read_csv(path), path) for path in extracted_features_files]

    # merge dfs
    merged_df = merge_dfs(clinscores_df, remaining_dfs, ds_name)

    return merged_df, clinscores_df

In [169]:
def merge_dfs(clinscores_df, remaining_dfs, ds_name):
    """Outer-join a list of feature spreadsheets onto the clinical-scores table by "ID".

    Parameters
    ----------
    clinscores_df : pd.DataFrame
        Clinical-scores table. A "Patient ID" column, if present, is renamed to
        "ID". The input frame is not modified.
    remaining_dfs : list of (pd.DataFrame, str)
        Frames to merge, each paired with the path (or label) it came from. When
        the path contains "_pred_" or "_ent_", its last two underscore-separated
        tokens (after dropping a trailing ".csv") are appended to every non-ID
        column name, e.g. ``..._pred_0.5.csv`` gives the suffix ``_pred_0.5``.
    ds_name : str
        Dataset name. For "ADNI300" the first and last underscore-separated tokens
        of each ID are dropped; for "Challenge" only the first is dropped. IDs are
        used as-is for any other value.

    Returns
    -------
    pd.DataFrame
        The outer merge of all frames that could be merged. A frame that lacks an
        "ID" column, or that raises while being processed, is reported on stdout
        and skipped rather than aborting the whole merge.
    """
    combined_df = clinscores_df.rename(columns={"Patient ID": "ID"})

    for df, path in remaining_dfs:
        # Some spreadsheets carry no ID column (the deterministic ADNI300 run printed
        # a sample-diversity sheet holding only 'Unnamed: 0'); skip them explicitly.
        if "ID" not in df.columns:
            print(df.columns)
            print(path)
            continue

        df = df.copy()  # never mutate the caller's frame
        try:
            # Bring the feature-sheet IDs into the form used by the clinscores table.
            if ds_name == "ADNI300":
                df["ID"] = ["_".join(v.split("_")[1:-1]) for v in df["ID"].values]
            elif ds_name == "Challenge":
                df["ID"] = ["_".join(v.split("_")[1:]) for v in df["ID"].values]

            if "_pred_" in path or "_ent_" in path:
                # Strip the extension only when it is really there, so a label
                # without ".csv" does not lose the end of its threshold.
                stem = path[:-4] if path.lower().endswith(".csv") else path
                feature_type = "_".join(stem.split("_")[-2:])
                df = df.rename(columns={col: f"{col}_{feature_type}" for col in df.columns if col != "ID"})

            combined_df = combined_df.merge(df, on="ID", how="outer")
            if len(combined_df) <= 1:
                # Sanity output: an (almost) empty merge usually means the IDs did not line up.
                print(df.columns)
                print(df["ID"].values)
        except Exception as err:
            # Best effort: keep merging the remaining sheets, but say why this one was dropped.
            print(df.columns)
            print(path)
            print(f"skipped: {type(err).__name__}: {err}")

    return combined_df
    

In [149]:
# Inputs for the "Challenge" dataset. The sample-diversity and extracted-feature
# spreadsheets are read from the same folder.
ds_name = "Challenge"
clinscores_path = "/home/s2208943/preprocessed_data/WMHChallenge_InterRaterData/clinscore_data.csv"
sample_div_folder = "/home/s2208943/preprocessed_data/WMHChallenge_InterRaterData/feature_spreadsheets"
extracted_features_folder = "/home/s2208943/preprocessed_data/WMHChallenge_InterRaterData/feature_spreadsheets"

In [150]:
# One merged spreadsheet per model for the Challenge dataset. The clinical-scores
# frame is kept from the first call only; later calls discard it into `_`.
ssnens_challenge_df, clinscores_challenge_df = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "ssn_ens")
punet_challenge_df, _ = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "punet")
deterministic_challenge_df, _ = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "deterministic")

In [151]:
# Inspect the merged ssn_ens spreadsheet for the Challenge dataset.
ssnens_challenge_df

Unnamed: 0,Unnamed: 0_x,ID,WMH_PV,WMH_Deep,Confounds,Protocol_AMS,Protocol_SIN,Protocol_UMC,Total,Unnamed: 0_y,...,pv_sum_ent_0.2,pv_mean_ent_0.2,pv_std_ent_0.2,pv_skew_ent_0.2,pv_kurtosis_ent_0.2,pv_prop_umap_segmented_ent_0.2,pv_prop_umap_within_expanded_seg_ent_0.2,pv_prop_seg_uncertain_ent_0.2,seg_volume_ent_0.2,vent_volume_ent_0.2
0,0.0,Amsterdam_GE3T_100,1.0,1.0,,True,False,False,2.0,40.0,...,2396.150879,0.450658,0.165870,0.036229,1.550232,0.252398,0.673312,0.781141,1718.0,0.0
1,1.0,Amsterdam_GE3T_101,1.0,1.0,,True,False,False,2.0,41.0,...,2721.807861,0.423232,0.161748,0.288452,1.688035,0.190017,0.527445,0.809808,1509.0,0.0
2,2.0,Amsterdam_GE3T_102,1.0,2.0,,True,False,False,3.0,42.0,...,1782.932617,0.423399,0.163722,0.275248,1.647701,0.199715,0.512467,0.867905,969.0,0.0
3,3.0,Amsterdam_GE3T_103,3.0,3.0,,True,False,False,6.0,43.0,...,5711.334473,0.459221,0.161346,-0.012149,1.600073,0.379834,0.819571,0.602859,7836.0,0.0
4,4.0,Amsterdam_GE3T_104,2.0,3.0,"Two Lacunar infarcts RH, one subcortical LH",True,False,False,5.0,44.0,...,8631.498047,0.453121,0.162127,0.029916,1.597624,0.423172,0.798047,0.736568,10944.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,,Singapore_95,,,,,,,,165.0,...,4873.139160,0.452810,0.162141,0.030811,1.605757,0.408010,0.804776,0.708226,6200.0,0.0
171,,Singapore_96,,,,,,,,166.0,...,13067.580078,0.493135,0.159364,-0.327733,1.717308,0.445790,0.841088,0.863019,13688.0,0.0
172,,Singapore_97,,,,,,,,167.0,...,5803.843262,0.477525,0.161286,-0.178109,1.624442,0.410235,0.841451,0.882947,5647.0,0.0
173,,Singapore_98,,,,,,,,168.0,...,2088.524414,0.443895,0.162411,0.114404,1.607789,0.285228,0.689479,0.748049,1794.0,0.0


In [153]:
# Inputs for the "CVD" dataset. The sample-diversity and extracted-feature
# spreadsheets are read from the same folder.
ds_name = "CVD"
clinscores_path = "/home/s2208943/preprocessed_data/Ed_CVD/clinscore_data.csv"
sample_div_folder = "/home/s2208943/preprocessed_data/Ed_CVD/EdData_feature_spreadsheets"
extracted_features_folder = "/home/s2208943/preprocessed_data/Ed_CVD/EdData_feature_spreadsheets"

In [154]:
# One merged spreadsheet per model for the CVD dataset. The clinical-scores
# frame is kept from the first call only; later calls discard it into `_`.
ssnens_cvd_df, clinscores_cvd_df = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "ssn_ens")
punet_cvd_df, _ = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "punet")
deterministic_cvd_df, _ = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "deterministic")

In [155]:
# Inspect the merged ssn_ens spreadsheet for the CVD dataset.
ssnens_cvd_df

Unnamed: 0,Unnamed: 0_x,ID,AGE,PTGENDER,diabetes,hypertension,hyperlipidaemia,SBP,DBP,totalChl,...,pv_sum_pred_0.5,pv_mean_pred_0.5,pv_std_pred_0.5,pv_skew_pred_0.5,pv_kurtosis_pred_0.5,pv_prop_umap_segmented_pred_0.5,pv_prop_umap_within_expanded_seg_pred_0.5,pv_prop_seg_uncertain_pred_0.5,seg_volume_pred_0.5,vent_volume_pred_0.5
0,0,CVD001,76.004929,1.0,0.0,0.0,1.0,128.666667,76.0,3.9,...,8849.757812,0.894819,0.141491,-1.326381,3.464518,1.0,1.0,0.903031,10952,17741
1,1,CVD002,77.144031,0.0,1.0,1.0,1.0,136.000000,64.0,3,...,3404.688232,0.876819,0.148455,-1.130556,2.936551,1.0,1.0,0.952883,4075,7818
2,2,CVD003,92.571194,0.0,0.0,1.0,1.0,137.500000,59.5,,...,17186.210938,0.918803,0.127024,-1.783825,5.057148,1.0,1.0,1.000000,18705,0
3,3,CVD004,53.000000,0.0,0.0,0.0,1.0,125.000000,80.0,3.4,...,505.458588,0.733612,0.142233,0.021770,1.718062,1.0,1.0,1.000000,689,0
4,4,CVD008,50.000000,1.0,0.0,1.0,0.0,196.000000,96.0,6.4,...,13588.746094,0.844546,0.147511,-0.783563,2.332050,1.0,1.0,1.000000,16090,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245,CVD321,64.000000,0.0,0.0,1.0,1.0,144.000000,69.0,4.1,...,6094.219727,0.860401,0.153038,-0.920383,2.452858,1.0,1.0,1.000000,7083,0
246,246,CVD322,50.000000,1.0,0.0,1.0,1.0,123.000000,68.0,5.5,...,207.129761,0.761506,0.157415,-0.074200,1.613757,1.0,1.0,1.000000,272,0
247,247,CVD323,63.000000,0.0,0.0,1.0,1.0,112.000000,70.0,4.8,...,2331.335205,0.845913,0.152088,-0.797204,2.290525,1.0,1.0,1.000000,2756,0
248,248,CVD324,63.000000,0.0,0.0,1.0,1.0,170.000000,88.0,5.1,...,7491.174316,0.863237,0.150114,-0.971057,2.601280,1.0,1.0,1.000000,8678,0


In [164]:
# Inputs for the "ADNI300" dataset. The sample-diversity and extracted-feature
# spreadsheets are read from the same folder.
ds_name = "ADNI300"
clinscores_path = "/home/s2208943/preprocessed_data/ADNI300/clinscore_data.csv"
sample_div_folder = "/home/s2208943/preprocessed_data/ADNI300/ADNI_300_feature_spreadsheets"
extracted_features_folder = "/home/s2208943/preprocessed_data/ADNI300/ADNI_300_feature_spreadsheets"
# NOTE(review): model_name is not read by the ADNI300 calls visible in this
# notebook, which pass the model name as a string literal instead.
model_name = "ssn_ens"

In [165]:
# Merged ssn_ens spreadsheet for ADNI300, plus the clinical-scores frame as loaded.
ssnens_adni_df, clinscores_adni_df = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "ssn_ens")

In [166]:
# Inspect the merged ssn_ens spreadsheet for the ADNI300 dataset.
ssnens_adni_df

Unnamed: 0,Unnamed: 0_x,ID,AGE,Ventricles_bl %,Hippocampus_bl %,WholeBrain_bl %,Entorhinal_bl %,Fusiform_bl %,MidTemp_bl %,BMI,...,pv_sum_pred_0.45,pv_mean_pred_0.45,pv_std_pred_0.45,pv_skew_pred_0.45,pv_kurtosis_pred_0.45,pv_prop_umap_segmented_pred_0.45,pv_prop_umap_within_expanded_seg_pred_0.45,pv_prop_seg_uncertain_pred_0.45,seg_volume_pred_0.45,vent_volume_pred_0.45
0,0,002_S_0729,65.1,1.292191,0.472993,71.867729,0.194479,1.399693,1.333163,22.810281,...,4702.428223,0.840470,0.168454,-0.876268,2.402109,0.949777,0.997855,0.809567,6564.0,9547.0
1,1,002_S_1155,57.8,,,,,,,23.555498,...,659.297668,0.783945,0.171873,-0.316636,1.746508,0.940547,0.994055,0.986284,802.0,16565.0
2,2,002_S_1261,71.1,2.157239,0.418533,67.676511,0.175341,1.133000,1.325454,22.935921,...,651.263184,0.721222,0.173634,0.064047,1.627275,0.870432,0.966777,0.981273,801.0,13517.0
3,3,002_S_1280,70.7,1.426753,0.463987,67.385750,0.256341,1.159829,1.225333,38.625486,...,4627.864258,0.852905,0.163574,-0.983468,2.643159,0.957059,0.998157,0.774266,6707.0,8957.0
4,4,002_S_2010,62.9,1.136949,0.574649,70.819040,0.245683,1.201422,1.461607,42.869691,...,204.131592,0.785122,0.174635,-0.410358,1.744580,0.930769,0.984615,0.681690,355.0,5684.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,294,941_S_4255,72.4,2.076654,0.422826,67.895019,0.225797,1.217249,1.382038,27.317500,...,1913.893433,0.815116,0.170373,-0.629126,2.025826,0.946337,0.995315,0.926219,2399.0,11719.0
295,295,941_S_4292,70.9,1.687682,0.531380,71.164047,0.281520,1.160003,1.405134,27.172606,...,1139.499390,0.785320,0.169501,-0.416349,1.833876,0.932460,0.996554,0.992663,1363.0,9455.0
296,296,941_S_4365,80.3,3.531851,0.439262,65.787697,0.259191,1.187521,1.150789,23.436967,...,2917.532227,0.831443,0.166278,-0.795583,2.281320,0.955543,0.998005,0.950936,3526.0,19698.0
297,297,941_S_4376,76.5,,,,,,,29.354299,...,3472.005859,0.860686,0.159868,-1.093848,2.903336,0.961081,0.998761,0.843743,4595.0,11219.0


In [167]:
# Merged punet spreadsheet for ADNI300 (the clinical-scores frame is discarded), then display it.
punet_adni_df, _ = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "punet")
punet_adni_df

Unnamed: 0,Unnamed: 0_x,ID,AGE,Ventricles_bl %,Hippocampus_bl %,WholeBrain_bl %,Entorhinal_bl %,Fusiform_bl %,MidTemp_bl %,BMI,...,pv_sum_pred_0.25,pv_mean_pred_0.25,pv_std_pred_0.25,pv_skew_pred_0.25,pv_kurtosis_pred_0.25,pv_prop_umap_segmented_pred_0.25,pv_prop_umap_within_expanded_seg_pred_0.25,pv_prop_seg_uncertain_pred_0.25,seg_volume_pred_0.25,vent_volume_pred_0.25
0,0,002_S_0729,65.1,1.292191,0.472993,71.867729,0.194479,1.399693,1.333163,22.810281,...,5961.055664,0.735116,0.252052,-0.538898,1.789233,0.752990,0.969787,0.797857,7653.0,9547.0
1,1,002_S_1155,57.8,,,,,,,23.555498,...,953.651733,0.632395,0.251955,0.041331,1.529704,0.621353,0.897215,0.963992,972.0,16565.0
2,2,002_S_1261,71.1,2.157239,0.418533,67.676511,0.175341,1.133000,1.325454,22.935921,...,821.048950,0.592387,0.240720,0.214201,1.637027,0.570707,0.901154,0.953012,830.0,13517.0
3,3,002_S_1280,70.7,1.426753,0.463987,67.385750,0.256341,1.159829,1.225333,38.625486,...,5998.335938,0.760920,0.246474,-0.703299,2.020667,0.785107,0.985158,0.770639,8031.0,8957.0
4,4,002_S_2010,62.9,1.136949,0.574649,70.819040,0.245683,1.201422,1.461607,42.869691,...,311.336853,0.625174,0.257196,0.037933,1.497694,0.614458,0.863454,0.549372,557.0,5684.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,294,941_S_4255,72.4,2.076654,0.422826,67.895019,0.225797,1.217249,1.382038,27.317500,...,2515.509766,0.718512,0.246919,-0.462717,1.759352,0.742074,0.974293,0.924555,2810.0,11719.0
295,295,941_S_4292,70.9,1.687682,0.531380,71.164047,0.281520,1.160003,1.405134,27.172606,...,1628.611694,0.659357,0.246546,-0.158696,1.585584,0.667611,0.951012,0.985066,1674.0,9455.0
296,296,941_S_4365,80.3,3.531851,0.439262,65.787697,0.259191,1.187521,1.150789,23.436967,...,3836.596191,0.724433,0.252015,-0.506112,1.757527,0.741503,0.968844,0.938351,4185.0,19698.0
297,297,941_S_4376,76.5,,,,,,,29.354299,...,4309.817383,0.773200,0.244041,-0.802650,2.161177,0.794761,0.982598,0.845259,5241.0,11219.0


In [170]:
# Merged deterministic spreadsheet for ADNI300 (the clinical-scores frame is discarded), then display it.
# NOTE(review): the recorded output starts with a printed Index(['Unnamed: 0']) and a
# label, which presumably come from merge_dfs's except branch, i.e. the
# sample-diversity sheet for this model had no "ID" column and was left out of
# the merge -- confirm that this is expected for the deterministic model.
deterministic_adni_df, _ = load_and_merge_dfs(ds_name, clinscores_path, sample_div_folder, extracted_features_folder, "deterministic")
deterministic_adni_df

Index(['Unnamed: 0'], dtype='object')
sample_div


Unnamed: 0.1,Unnamed: 0,ID,AGE,Ventricles_bl %,Hippocampus_bl %,WholeBrain_bl %,Entorhinal_bl %,Fusiform_bl %,MidTemp_bl %,BMI,...,pv_sum_pred_0.45,pv_mean_pred_0.45,pv_std_pred_0.45,pv_skew_pred_0.45,pv_kurtosis_pred_0.45,pv_prop_umap_segmented_pred_0.45,pv_prop_umap_within_expanded_seg_pred_0.45,pv_prop_seg_uncertain_pred_0.45,seg_volume_pred_0.45,vent_volume_pred_0.45
0,0,002_S_0729,65.1,1.292191,0.472993,71.867729,0.194479,1.399693,1.333163,22.810281,...,4697.713379,0.830278,0.168936,-0.782411,2.242169,0.947508,0.997879,0.807380,6640.0,9547.0
1,1,002_S_1155,57.8,,,,,,,23.555498,...,649.364929,0.776752,0.182033,-0.314227,1.654059,0.907895,0.980861,0.974326,779.0,16565.0
2,2,002_S_1261,71.1,2.157239,0.418533,67.676511,0.175341,1.133000,1.325454,22.935921,...,938.658569,0.733900,0.169965,-0.014301,1.687019,0.892885,0.990618,0.993908,1149.0,13517.0
3,3,002_S_1280,70.7,1.426753,0.463987,67.385750,0.256341,1.159829,1.225333,38.625486,...,4543.604492,0.835529,0.168590,-0.833039,2.322553,0.951085,0.998897,0.783399,6602.0,8957.0
4,4,002_S_2010,62.9,1.136949,0.574649,70.819040,0.245683,1.201422,1.461607,42.869691,...,209.682678,0.748867,0.181779,-0.131338,1.588403,0.885714,0.985714,0.654354,379.0,5684.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,294,941_S_4255,72.4,2.076654,0.422826,67.895019,0.225797,1.217249,1.382038,27.317500,...,1984.294922,0.805314,0.169672,-0.578224,1.978104,0.943588,0.997971,0.942440,2467.0,11719.0
295,295,941_S_4292,70.9,1.687682,0.531380,71.164047,0.281520,1.160003,1.405134,27.172606,...,859.095581,0.753593,0.170395,-0.207306,1.715446,0.902632,0.992982,0.974432,1056.0,9455.0
296,296,941_S_4365,80.3,3.531851,0.439262,65.787697,0.259191,1.187521,1.150789,23.436967,...,2601.195557,0.791118,0.169478,-0.525030,1.920868,0.927311,0.997263,0.960315,3175.0,19698.0
297,297,941_S_4376,76.5,,,,,,,29.354299,...,2966.470947,0.827237,0.164124,-0.787257,2.295651,0.953151,0.997211,0.856856,3989.0,11219.0
