# Prognostic (trajectory prediction) performance of baseline ML models

* **Objectives:** 
    1. Predict Trajectory classes based on previously trained models (only MMSE based trajectories) 


* **Timepoints:**
    1. baseline (bl)
    2. baseline + follow-up (var_tp: the second timepoint can come from a varying interval)


* **Input modalities:**
    1. clinical features (CS)
    2. structural features (CT)    
    3. both (CS+CT)


* **Models:** (10 model instances (per fold) are saved for each input combination)
    1. Logistic regression (Lasso)
    2. SVM
    3. Random Forest
    4. ANN
    5. LSN
    

In [1]:
# Basic Imports
import numpy as np
import pandas as pd
from scipy import stats
import pickle
import re
import collections
#import tables as tb
from math import isnan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold

%matplotlib inline
#plt.rcParams['figure.figsize'] = (15, 10)
import warnings; warnings.simplefilter('ignore') #to ignore old sKF iterator warning not used in this notebook



# Naming:
    1. tp_name in ['bl','var_tp']
    2. modality in ['CS','CT','CS_CT']
    3. model in ['RFC','SVC','LR_L1','ANN','LSN']
    
# Order of variables
    1. ['CS(bl,tp)','AGE','APOE4','CT(bl,tp)']
    2. CT ROI order is grabbed from an exp_setup file

In [128]:
# Data dirs
# Root folder of the previously trained models (passed to get_trained_model)
model_dir = '/data/chamal/projects/nikhil/data/Traj_prediction/trained_models/'
# Root folder of the ADB data; the csv tables and civet outputs are read from below it
project_dir = '/data/chamal/projects/nikhil/data/ADB/'
# Folder holding the per-subject AAL thickness .dat files (passed to read_AAL_summary)
adb_civet_dir = '{}/civet/'.format(project_dir)

#To grab order of CT ROIs
# Text file that is parsed into the ROI id / roi / name table
AAL_roi_names = '{}/AAL_ROI_names_number_map.txt'.format(project_dir)
# Pickled experiment setup; the columns of its 'df' entry define the CT column order
sample_exp_setup_path = '{}/Exp_502_ALL_ADNI_traj_MMSE_tp_var_tp_sample_setup_grab_CT_order.pkl'.format(project_dir)


In [149]:
# Create ordered CT ROIs
# The sample experiment setup carries a dataframe whose column order is the
# order in which the CT features must be presented.
exp_setup = pd.read_pickle(sample_exp_setup_path)
df = exp_setup['df']
ct_cols_bl = df.columns[pd.Series(df.columns).str.contains('CT_bl')].tolist()
ct_cols_tp = df.columns[pd.Series(df.columns).str.contains('CT_var_tp')].tolist()

# roi - name map
# The map file is split on single quotes; fields 0, 1 and 3 are kept as id / roi / name.
aal_roi_names = pd.read_csv(AAL_roi_names,delimiter='\'',header=None)[[0,1,3]]
aal_roi_names.columns = ['id','roi','name']
# Suffix the ROI labels so they line up with the baseline CT column names
aal_roi_names['roi'] += '_CT_bl'
aal_roi_names_dict = dict(zip(aal_roi_names['id'].astype(str), aal_roi_names['roi']))
aal_roi_names_dict.update({'0_x': 'background_L', '0_y': 'background_R'})

In [185]:
def get_trained_model(model_dir,tp,modality,model,fold):
    """Load the classifier (and scaler, if any) saved for one fold.

    Parameters
    ----------
    model_dir : str
        Root directory of the saved models.
    tp : str
        Timepoint name used in the reference-model file name.
    modality : str
        Input modality name used in the reference-model file name.
    model : str
        Model key. 'RFC', 'SVC' and 'LR_L1' are read from the reference-model
        pickle. 'ANN'/'ann' and 'LSN'/'lsn' are accepted, but their paths are
        still placeholders.
    fold : int
        Index of the fold whose saved instance is returned.

    Returns
    -------
    scaler
        The fold's entry of ``scaler_list`` for 'SVC'; an empty list otherwise.
    clf
        The fold's saved classifier.

    Raises
    ------
    ValueError
        If ``model`` is not one of the recognised keys.
    """
    if model in ('RFC','SVC','LR_L1'):
        saved_model_path = '{}/reference_models/baseline_models_tp_{}_{}.pkl'.format(model_dir,tp,modality)
    elif model in ('ANN','ann'): # need to modify this (placeholder path, no file name yet)
        saved_model_path = '{}/ann_models/'.format(model_dir)
    elif model in ('LSN','lsn'): # need to modify this (placeholder path, no file name yet)
        saved_model_path = '{}/lsn_models/'.format(model_dir)
    else:
        # Fail here with a clear message instead of hitting an undefined
        # saved_model_path a line later.
        raise ValueError(
            "unknown model: {!r} (expected one of 'RFC', 'SVC', 'LR_L1', 'ANN', 'LSN')".format(model)
        )

    # NOTE: unpickling can execute arbitrary code; only load trusted model files.
    saved_model_data = pd.read_pickle(saved_model_path)

    # grab the scaler and classifier from the pickle
    fold_result = saved_model_data[model]['parallel_result'][fold]
    if model == 'SVC':
        scaler = saved_model_data[model]['scaler_list'][fold]
    else:
        scaler = []
    clf = fold_result['clf']
    feat_imp = fold_result['feat_imp']
    print('required input shape: {}'.format(feat_imp.shape))

    return scaler, clf

def read_AAL_summary(subject_dir,sub_idx):
    """Read one subject's left/right AAL thickness summaries into a single row.

    For each hemisphere the file
    ``ADB_<sub_idx>_AAL_lobe_thickness_tlaplace_30mm_<hemi>.dat`` is read from
    ``subject_dir``. The '#' and 'Label' columns are kept as 'ROI' and
    'value', and rows whose ROI is 'Total' or '0' are dropped.

    Parameters
    ----------
    subject_dir : str
        Directory containing the .dat files (with or without trailing slash).
    sub_idx : str
        Subject identifier as it appears in the file names.

    Returns
    -------
    pandas.DataFrame
        One row (index 'value') with one column per ROI label, left
        hemisphere first, plus a 'sub_idx' column set to 'ADB<sub_idx>'.
    """
    import os  # function-scope import keeps this block self-contained

    # Subject naming: ADB_0226_AAL_lobe_thickness_tlaplace_30mm_left.dat
    file_template = 'ADB_{}_AAL_lobe_thickness_tlaplace_30mm_{}.dat'
    hemi_frames = []
    for hemi in ['left','right']:
        hemi_path = os.path.join(subject_dir, file_template.format(sub_idx,hemi))
        # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
        df = pd.read_csv(hemi_path,header=1,sep=r'\s+')
        df = df[['#','Label']].rename(columns={'#':'ROI','Label':'value'})
        # drop the summary row and the '0' label
        df = df[(df['ROI']!='Total') & (df['ROI']!='0')]
        hemi_frames.append(df)

    # DataFrame.append no longer exists in pandas >= 2.0; concat stacks both hemispheres
    sub_df = pd.concat(hemi_frames)

    sub_df_T = sub_df.set_index('ROI').T
    sub_df_T['sub_idx'] = 'ADB{}'.format(sub_idx)
    return sub_df_T


# Read clinical data
- this is for entire cohort

In [116]:
# Read demo data
# Load the scan-QC table and expose its 'ID' column under the shared key name 'sub_idx'
adb_demo = pd.read_csv('{}/csv/ADB_scanqc_2018-07-05_ST.csv'.format(project_dir)).rename(
    columns={'ID':'sub_idx'}
)

# Read apoe4 data
adb_apoe = pd.read_csv('{}/csv/adb_apoestatus.csv'.format(project_dir))
adb_apoe.rename(columns={'subject':'sub_idx'},inplace=True)
# Keep only rows with a genotype; .copy() so the column assignments below
# act on an independent frame rather than on a filtered selection.
adb_apoe = adb_apoe[~adb_apoe['apoe_genotype'].isnull()].copy()
# Split the genotype string on its first '-' into two parts.
# expand=True replaces tuple-unpacking from the .str accessor, which current
# pandas no longer supports. The reindex guarantees both parts exist even
# when no genotype contains a '-'.
apoe_alleles = adb_apoe['apoe_genotype'].str.split('-', n=1, expand=True).reindex(columns=[0, 1])
adb_apoe['apoe_A'] = apoe_alleles[0]
adb_apoe['apoe_B'] = apoe_alleles[1]
# 1 where the part equals '4', else 0 (missing parts count as 0)
adb_apoe['apoe_A4'] = (adb_apoe['apoe_A']=='4').astype(int)
adb_apoe['apoe_B4'] = (adb_apoe['apoe_B']=='4').astype(int)
# APOE_status is the number of parts equal to '4' (0, 1 or 2)
adb_apoe['APOE_status'] = adb_apoe['apoe_A4'] + adb_apoe['apoe_B4']

# Read MMSE data
# Load the MMSE table and map its column names onto the names used in this notebook
adb_mmse = pd.read_csv('{}/csv/ADB_MMSE_grant_2017-03-24.csv'.format(project_dir)).rename(
    columns={'subject_ID':'sub_idx','total_man':'mmse_calc','total_world_man':'mmse_world'}
)

# Read civet data
- this is per subject

In [148]:
# Subject identifier as it appears in the civet file names
sub_idx = '0124'
# Load the subject's AAL summary row and relabel the ROI columns via the id -> name map
civet_df = read_AAL_summary(adb_civet_dir,sub_idx).rename(columns=aal_roi_names_dict)

# ***tmp*** include dummy IPL columns (civet 2.1 has made them obsolete)
civet_df = civet_df.assign(**{'IPL.L_CT_bl': 0, 'IPL.R_CT_bl': 0})

# reorder based on trained model expectations
civet_df = civet_df.loc[:, ['sub_idx']+ct_cols_bl]
civet_df

ROI,sub_idx,PreCG.L_CT_bl,PreCG.R_CT_bl,SFGdor.L_CT_bl,SFGdor.R_CT_bl,ORBsup.L_CT_bl,ORBsup.R_CT_bl,MFG.L_CT_bl,MFG.R_CT_bl,ORBmid.L_CT_bl,...,STG.L_CT_bl,STG.R_CT_bl,TPOsup.L_CT_bl,TPOsup.R_CT_bl,MTG.L_CT_bl,MTG.R_CT_bl,TPOmid.L_CT_bl,TPOmid.R_CT_bl,ITG.L_CT_bl,ITG.R_CT_bl
value,ADB0124,2.6661,2.7267,2.8307,2.7755,2.5833,2.4283,2.7453,2.8895,2.5981,...,2.7995,2.7519,3.0018,3.0807,2.9008,2.828,3.0054,3.2014,2.8375,2.812


# Merge all input datatypes 

In [188]:
# Inner-join the demo, MMSE, APOE status and civet tables on the subject key
input_data = (
    adb_demo
    .merge(adb_mmse, on='sub_idx', how='inner')
    .merge(adb_apoe[['sub_idx','APOE_status']], on='sub_idx', how='inner')
    .merge(civet_df, on='sub_idx', how='inner')
)
print(input_data.shape)

# Create X matrix (order: ['CS(bl,tp)','AGE','APOE4', CT(bl,tp)])
# TODO # check mmse column with Mallar
X = input_data.loc[:, ['mmse_world','age','APOE_status']+ct_cols_bl].values

(1, 85)


# Try a model

In [187]:
# Which saved model instance to load
tp, modality, model, fold = 'bl', 'CS_CT', 'SVC', 0

scaler, clf = get_trained_model(model_dir,tp,modality,model,fold)

# Only SVC is scaled before prediction (LR_L1 didn't make a difference with a scaler)
if model == 'SVC':
    X = scaler.transform(X)

traj_pred = clf.predict(X)
print('predicted trajectory: {}'.format(traj_pred))

required input shape: (127,)
predicted trajectory: [1]
