## Creating the ADNI clinscore/fazekas spreadsheet

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from trustworthai.journal_run.new_MIA_fazekas_and_QC.creating_fazekas_clinscore_spreadsheets.spreadsheet_helper_funcs import *

### load the raw spreadsheet data

In [2]:
# Directory holding the raw ADNI300 spreadsheets. The trailing slash is kept
# because other cells build paths from this value.
spreadsheet_dir = "/home/s2208943/datasets/ADNI300/"
variables_path = os.path.join(spreadsheet_dir, "ADNI_300_Variables_for_Analysis.xlsx")
variables_df = pd.read_excel(variables_path)

In [3]:
# Inspect the raw variables sheet. The real per-column names ('Patient ID',
# 'DX.bl', 'AGE', ...) sit in row 0 underneath the grouped top-level headings,
# and the last rows hold no patient IDs.
variables_df

Unnamed: 0.1,Unnamed: 0,GENERAL INFO,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,BASELINE BRAIN MEASUREMENTS (% of ICV),Unnamed: 8,Unnamed: 9,...,IMAGE ANALYSIS FINDINGS,Unnamed: 14,Unnamed: 15,MEDICAL HISTORY,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Uncertainty Analysis
0,Patient ID,DX.bl,AGE,PTGENDER,PTEDUCAT,PTRACCAT,APOE4,Ventricles_bl %,Hippocampus_bl %,WholeBrain_bl %,...,BMB Y1 Count,BMB Y2 Count,BMB Y3 Count,VSBPDIA,VSBPSYS,VSPULSE,BMI,CV RISK FACTORS,E-M RISK FACTORS,
1,002_S_0729,2,65.1,2,16,5,1,1.292191,0.472993,71.867729,...,0,1,1,57,124,56,22.810281,1,0,Yes
2,002_S_1155,2,57.8,1,20,5,0,0,0,0,...,0,0,1,72,105,54,23.555498,1,1,Yes
3,002_S_1261,0,71.1,2,16,5,0,2.157239,0.418533,67.676511,...,1,1,1,54,108,56,22.935921,2,0,Yes
4,002_S_1280,0,70.7,2,14,5,1,1.426753,0.463987,67.38575,...,3,2,2,85,133,71,38.625486,2,0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,,,,,,,,,,,...,,,,,,,,,,
304,,,,,,,,,,,...,,,,,,,,,,
305,,,,,,,,,,,...,,,,,,,,,,
306,,,,,,,,,,,...,,,,,,,,177,126,


In [4]:
# Top-level headings as read from the spreadsheet (mostly 'Unnamed: N' placeholders).
variables_df.columns

Index(['Unnamed: 0', 'GENERAL INFO ', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'BASELINE BRAIN MEASUREMENTS (% of ICV)',
       'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
       'IMAGE ANALYSIS FINDINGS ', 'Unnamed: 14', 'Unnamed: 15',
       'MEDICAL HISTORY', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19',
       'Unnamed: 20', 'Unnamed: 21', 'Uncertainty Analysis'],
      dtype='object')

In [5]:
# Fazekas ratings by Maria. Only 290 of the 298 subjects are available here
# because a few images could not be matched; hopefully that is still enough
# information.
ratings_path = os.path.join(spreadsheet_dir, "ADNI_300_Sample_MVH_ratings.xlsx")
ratings_df = pd.read_excel(ratings_path)

  warn(msg)


In [6]:
# Inspect the raw ratings sheet. The rows after the patients hold the rating
# legend (e.g. 'DWMH', '0 = absent') rather than patient data.
ratings_df

Unnamed: 0,Patient ID,Image_year,PVS_BG,PVS_CS,Unnamed: 4,Image_year.1,WMH_PV,WMH_Deep,Total,Obs
0,002_S_0729,2013.0,4.0,4.0,,2011.0,2.0,2.0,4.0,
1,002_S_1155,2012.0,3.0,3.0,,2011.0,1.0,0.0,1.0,
2,002_S_1261,2014.0,1.0,3.0,,2011.0,1.0,1.0,2.0,
3,002_S_1280,2013.0,3.0,3.0,,2011.0,3.0,2.0,5.0,
4,002_S_2010,2013.0,1.0,1.0,,2010.0,1.0,1.0,2.0,
...,...,...,...,...,...,...,...,...,...,...
306,DWMH,,,,,,,,,
307,0 = absent,,,,,,,,,
308,1 = discrete diffuse lesions,,,,,,,,,
309,2 = beginning of confluence of foci,,,,,,,,,


In [9]:
# Pair every top-level heading with the real column name stored in row 0.
[(heading, sub_heading) for heading, sub_heading in variables_df.iloc[0].items()]

[('Unnamed: 0', 'Patient ID'),
 ('GENERAL INFO ', 'DX.bl'),
 ('Unnamed: 2', 'AGE'),
 ('Unnamed: 3', 'PTGENDER'),
 ('Unnamed: 4', 'PTEDUCAT'),
 ('Unnamed: 5', 'PTRACCAT'),
 ('Unnamed: 6', 'APOE4'),
 ('BASELINE BRAIN MEASUREMENTS (% of ICV)', 'Ventricles_bl %'),
 ('Unnamed: 8', 'Hippocampus_bl %'),
 ('Unnamed: 9', 'WholeBrain_bl %'),
 ('Unnamed: 10', 'Entorhinal_bl %'),
 ('Unnamed: 11', 'Fusiform_bl %'),
 ('Unnamed: 12', 'MidTemp_bl %'),
 ('IMAGE ANALYSIS FINDINGS ', 'BMB Y1 Count'),
 ('Unnamed: 14', 'BMB Y2 Count'),
 ('Unnamed: 15', 'BMB Y3 Count'),
 ('MEDICAL HISTORY', 'VSBPDIA'),
 ('Unnamed: 17', 'VSBPSYS'),
 ('Unnamed: 18', 'VSPULSE'),
 ('Unnamed: 19', 'BMI'),
 ('Unnamed: 20', 'CV RISK FACTORS'),
 ('Unnamed: 21', 'E-M RISK FACTORS'),
 ('Uncertainty Analysis', nan)]

In [10]:
# Show the first ratings row as (column, value) pairs.
[(column, value) for column, value in ratings_df.iloc[0].items()]

[('Patient ID', '002_S_0729'),
 ('Image_year', 2013.0),
 ('PVS_BG', 4.0),
 ('PVS_CS', 4.0),
 ('Unnamed: 4', nan),
 ('Image_year.1', 2011.0),
 ('WMH_PV', 2.0),
 ('WMH_Deep', 2.0),
 ('Total', 4.0),
 ('Obs', nan)]

### preprocess the raw data

- Normalization should be applied at training time, so we don't need to do it here. We might still want to one-hot encode here.
- We have also not removed NaNs at this point. This is deliberate: since we are going to try a domain generalization version where we don't have any patient data (to try it on the WMH challenge dataset), it's okay to have some patients with NaNs.

In [28]:
def _clean_patient_ids(frame):
    """Return a copy of `frame` with tidy patient IDs and only real patient rows.

    Every value in the 'Patient ID' column is converted to `str` and stripped of
    '*' and ' ' characters. Rows whose cleaned ID does not contain '_S_' (header
    rows, blank rows, legend rows) are discarded. The input frame is not modified.
    """
    cleaned_ids = [
        str(pid).replace('*', '').replace(' ', '')
        for pid in frame['Patient ID'].values
    ]
    is_patient_row = ['_S_' in pid for pid in cleaned_ids]
    return frame.assign(**{'Patient ID': cleaned_ids}).loc[is_patient_row]


def prepare_ADNI_dfs(
    ratings_df, variables_df,
    selected_columns=[
        'Patient ID', 'AGE', 'Ventricles_bl %', 'Hippocampus_bl %',
        'WholeBrain_bl %', 'Entorhinal_bl %', 'Fusiform_bl %',
        'MidTemp_bl %', 'BMI', 'DX.bl', 'CV RISK FACTORS', 'APOE4',
        'WMH_PV', 'WMH_Deep', 'Total', 'PTGENDER', 'E-M RISK FACTORS',
    ]):
    """Combine the ADNI ratings and clinical-variables sheets into one table.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Ratings sheet with a 'Patient ID' column. Not modified.
    variables_df : pandas.DataFrame
        Clinical-variables sheet whose real column names are stored in row 0
        (underneath the grouped top-level headings). Not modified.
    selected_columns : iterable of column names
        Columns to keep after the join. Each post-processing step below is
        applied only to columns that appear in this selection.

    Returns
    -------
    pandas.DataFrame
        One row per patient row of the ratings sheet (left join on
        'Patient ID'), restricted to `selected_columns`, with
        'DX.bl', 'CV RISK FACTORS' and 'APOE4' passed through `one_hot_encode`,
        zeros in the brain-measurement columns passed through
        `convert_symbol_to_nan`, and 'PTGENDER' shifted from {1, 2} to {0, 1}.

    Notes
    -----
    * Normalization is intentionally not done here; it is applied on the
      training data only.
    * Rows containing NaN are intentionally kept.
    * The joined column index is printed before column selection.
    """
    # Accept any iterable (tuple, Index, ...) and never alias the default list.
    selected_columns = list(selected_columns)

    # Promote the sub-headings stored in row 0 of the variables sheet to be the
    # actual column names. The inverted map mirrors the original behaviour when
    # two columns share a sub-heading (the last one wins).
    # NOTE(review): a column whose row-0 entry is NaN ends up named NaN.
    sub_to_top = {sub: top for (top, sub) in variables_df.iloc[0].items()}
    v_df = variables_df.rename(columns={top: sub for (sub, top) in sub_to_top.items()})

    # Tidy the patient IDs and drop every row that is not a patient
    # (this also removes the promoted header row of the variables sheet).
    r_df = _clean_patient_ids(ratings_df)
    v_df = _clean_patient_ids(v_df)

    # Join on the patient ID explicitly, so an accidental shared column name
    # between the two sheets cannot silently become part of the join key.
    df = pd.merge(r_df, v_df, how='left', on='Patient ID')

    print(df.keys())

    # Drop any column that isn't selected; copy so later assignments never
    # write into a view of the merged frame.
    df = df[selected_columns].copy()

    # One-hot encode the categorical columns that were selected.
    for one_hot_col in ['DX.bl', 'CV RISK FACTORS', 'APOE4']:
        if one_hot_col in selected_columns:
            df = one_hot_encode(df, one_hot_col)

    # Set values of zero to NaN for the selected brain-measurement fields.
    for no_zero_col in ['Ventricles_bl %', 'Hippocampus_bl %', 'WholeBrain_bl %',
                        'Entorhinal_bl %', 'Fusiform_bl %', 'MidTemp_bl %']:
        if no_zero_col in selected_columns:
            df = convert_symbol_to_nan(df, no_zero_col, 0)

    # Change the PTGENDER column to 0, 1 (as opposed to 1, 2).
    if 'PTGENDER' in selected_columns:
        df['PTGENDER'] = df['PTGENDER'] - 1

    return df

In [29]:
# Build the combined table with the default column selection.
combined_df = prepare_ADNI_dfs(ratings_df=ratings_df, variables_df=variables_df)

Index([      'Patient ID',       'Image_year',           'PVS_BG',
                 'PVS_CS',       'Unnamed: 4',     'Image_year.1',
                 'WMH_PV',         'WMH_Deep',            'Total',
                    'Obs',            'DX.bl',              'AGE',
               'PTGENDER',         'PTEDUCAT',         'PTRACCAT',
                  'APOE4',  'Ventricles_bl %', 'Hippocampus_bl %',
        'WholeBrain_bl %',  'Entorhinal_bl %',    'Fusiform_bl %',
           'MidTemp_bl %',     'BMB Y1 Count',     'BMB Y2 Count',
           'BMB Y3 Count',          'VSBPDIA',          'VSBPSYS',
                'VSPULSE',              'BMI',  'CV RISK FACTORS',
       'E-M RISK FACTORS',                nan],
      dtype='object')


In [30]:
# Spot-check the combined record of a single patient, one field per line.
patient_rows = combined_df[combined_df['Patient ID'] == '002_S_0729']
for column_name, column_values in patient_rows.items():
    # Use positional .iloc[0]: a plain [0] is a label lookup and only works
    # when the matched patient happens to sit at index label 0.
    print(column_name, " : ", column_values.iloc[0])

Patient ID  :  002_S_0729
AGE  :  65.1
Ventricles_bl %  :  1.29219139675016
Hippocampus_bl %  :  0.472992859023808
WholeBrain_bl %  :  71.8677292934926
Entorhinal_bl %  :  0.194479297365119
Fusiform_bl %  :  1.39969329429806
MidTemp_bl %  :  1.3331629412767
BMI  :  22.8102805009477
WMH_PV  :  2.0
WMH_Deep  :  2.0
Total  :  4.0
PTGENDER  :  1
E-M RISK FACTORS  :  0
DX.bl_0  :  False
DX.bl_1  :  False
DX.bl_2  :  True
DX.bl_3  :  False
CV RISK FACTORS_0  :  False
CV RISK FACTORS_1  :  True
CV RISK FACTORS_2  :  False
APOE4_0  :  False
APOE4_1  :  True
APOE4_2  :  False


In [32]:
# Persist the combined table (the DataFrame index is written as the first CSV column).
output_csv_path = "/home/s2208943/preprocessed_data/ADNI300/clinscore_data.csv"
combined_df.to_csv(output_csv_path)