# Prognostic (trajectory prediction) performance of baseline ML models

* **Objectives:** 
    1. Predict Trajectory classes based on previously trained models (only MMSE based trajectories) 


* **Timepoints:**
    1. baseline (bl)
    2. baseline + follow-up (var_tp: since second tp can be from varying interval)


* **Input modalities:**
    1. clinical features (CS)
    2. structural features (CT)    
    3. both (CS+CT)


* **Models:** (10 model instances (per fold) are saved for each input combination)
    1. Logistic regression (Lasso)
    2. SVM
    3. Random Forest
    4. ANN
    5. LSN
    

In [1]:
# Basic Imports
import numpy as np
import pandas as pd
from scipy import stats
import pickle
import re
import collections
#import tables as tb
from math import isnan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold

%matplotlib inline
#plt.rcParams['figure.figsize'] = (15, 10)
import warnings; warnings.simplefilter('ignore') #to ignore old sKF iterator warning not used in this notebook



# Naming:
    1. tp_name in ['bl','var_tp']
    2. modality in ['CS','CT','CS_CT']
    3. model in ['RFC','SVC','LR_L1','ANN','LSN']

In [2]:
# Data dirs
# Root of the ADB cohort data (csv/ and civet/ sub-directories are read from here)
project_dir = '/data/chamal/projects/nikhil/data/ADB/'
# Per-subject CIVET outputs live under the project root
adb_civet_dir = '{}/civet/'.format(project_dir)
# Root of the previously trained trajectory-prediction models
model_dir = '/data/chamal/projects/nikhil/data/Traj_prediction/trained_models/'

In [98]:
def get_trained_model(model_dir,tp,modality,model,fold):
    """Load the trained classifier of one fold from a saved-model pickle.

    The pickle path is
    ``{model_dir}/{sub-directory}/baseline_models_tp_{tp}_{modality}.pkl``,
    where the sub-directory depends on the model family.

    Parameters
    ----------
    model_dir : str
        Root directory containing the ``reference_models``, ``ann_models``
        and ``lsn_models`` sub-directories.
    tp : str
        Timepoint name used in the pickle file name (e.g. 'bl', 'var_tp').
    modality : str
        Input modality used in the pickle file name (e.g. 'CS', 'CT', 'CS_CT').
    model : str
        Model name. 'RFC', 'SVC' and 'LR_L1' are read from ``reference_models``;
        'ann'/'ANN' from ``ann_models``; 'lsn'/'LSN' from ``lsn_models``.
        The same string is used as the top-level key into the pickle.
    fold : int
        Index into the pickle's per-fold ``'parallel_result'`` sequence.

    Returns
    -------
    The object stored under ``'clf'`` for the requested model and fold.

    Raises
    ------
    ValueError
        If ``model`` is not one of the known model names.
    """
    # Map the model family to the sub-directory its pickles are stored in.
    if model in ['RFC','SVC','LR_L1']:
        model_subdir = 'reference_models'
    elif model in ['ann','ANN']:
        # Both spellings accepted: the notebook's naming cell documents 'ANN'.
        model_subdir = 'ann_models'
    elif model in ['lsn','LSN']:
        model_subdir = 'lsn_models'
    else:
        # Fail early with a clear message instead of hitting an unbound path below.
        raise ValueError('unknown model: {}'.format(model))

    # Use the `tp` argument (not a notebook-level global) to build the file name.
    saved_model_path = '{}/{}/baseline_models_tp_{}_{}.pkl'.format(model_dir,model_subdir,tp,modality)

    # NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
    saved_model_data = pd.read_pickle(saved_model_path)

    # grab the classifier from the pickle
    fold_result = saved_model_data[model]['parallel_result'][fold]
    clf = fold_result['clf']
    feat_imp = fold_result['feat_imp']
    # The feature-importance shape is reported as the expected input shape.
    print('required input shape: {}'.format(feat_imp.shape))

    return clf

def read_AAL_summary(subject_dir,sub_idx):
    """Read one subject's left/right AAL lobe-thickness files into a single row.

    For each hemisphere the file
    ``ADB_{sub_idx}_AAL_lobe_thickness_tlaplace_30mm_{hemi}.dat`` is read from
    ``subject_dir`` as a whitespace-delimited table whose second line is the
    header. The columns named '#' and 'Label' are kept and renamed to 'ROI'
    and 'value'; rows whose ROI is 'Total' or '0' are dropped.

    Parameters
    ----------
    subject_dir : str
        Directory containing the per-subject ``.dat`` files (with or without
        a trailing separator).
    sub_idx : str
        Subject number as it appears in the file name (e.g. '0226').

    Returns
    -------
    pandas.DataFrame
        One row (index 'value') with one column per ROI, left hemisphere
        first, plus a 'sub_idx' column set to ``'ADB' + sub_idx``.
    """
    import os  # function-scope import: `os` is not among the notebook's imports

    # Subject naming: ADB_0226_AAL_lobe_thickness_tlaplace_30mm_left.dat
    file_pattern = 'ADB_{}_AAL_lobe_thickness_tlaplace_30mm_{}.dat'

    hemi_frames = []
    for hemi in ['left','right']:
        hemi_path = os.path.join(subject_dir, file_pattern.format(sub_idx,hemi))
        # sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True
        df = pd.read_csv(hemi_path,header=1,sep=r'\s+')
        df = df[['#','Label']].rename(columns={'#':'ROI','Label':'value'})
        df = df[df['ROI']!='Total']
        df = df[df['ROI']!='0']
        hemi_frames.append(df)

    # Concatenate once; DataFrame.append is no longer available in pandas >= 2.0
    sub_df = pd.concat(hemi_frames)

    sub_df_T = sub_df.set_index('ROI').T
    sub_df_T['sub_idx'] = 'ADB{}'.format(sub_idx)
    return sub_df_T


# Read clinical data
- this is for entire cohort

In [116]:
# Read demo data
adb_demo = pd.read_csv('{}/csv/ADB_scanqc_2018-07-05_ST.csv'.format(project_dir))
adb_demo = adb_demo.rename(columns={'ID':'sub_idx'})

# Read apoe4 data
adb_apoe = pd.read_csv('{}/csv/adb_apoestatus.csv'.format(project_dir))
adb_apoe = adb_apoe.rename(columns={'subject':'sub_idx'})
# Keep only rows with a genotype; .copy() so the column assignments below
# write to an independent frame rather than a slice of the original.
adb_apoe = adb_apoe[~adb_apoe['apoe_genotype'].isnull()].copy()
# Split the genotype string on the first '-' into its two parts.
# (`n` must be passed by keyword, and the parts are taken by position
# instead of tuple-unpacking the .str accessor.)
apoe_parts = adb_apoe['apoe_genotype'].str.split('-', n=1)
adb_apoe['apoe_A'] = apoe_parts.str[0]
adb_apoe['apoe_B'] = apoe_parts.str[1]
# 1 where the corresponding part equals '4', else 0
adb_apoe['apoe_A4'] = (adb_apoe['apoe_A']=='4').astype(int)
adb_apoe['apoe_B4'] = (adb_apoe['apoe_B']=='4').astype(int)
# APOE_status = number of parts equal to '4' (0, 1 or 2)
adb_apoe['APOE_status'] = adb_apoe['apoe_A4'] + adb_apoe['apoe_B4']

# Read MMSE data
adb_mmse = pd.read_csv('{}/csv/ADB_MMSE_grant_2017-03-24.csv'.format(project_dir))
adb_mmse = adb_mmse.rename(columns={'subject_ID':'sub_idx','total_man':'mmse_calc','total_world_man':'mmse_world'})

# Read civet data
- this is per subject

In [117]:
# Subject number as it appears in the CIVET file names
sub_idx = '0124'
# One-row frame of ROI values for this subject
civet_df = read_AAL_summary(subject_dir=adb_civet_dir, sub_idx=sub_idx)

# Merge all input datatypes 

In [118]:
# Inner-join demographics, APOE status, MMSE and CIVET values on the subject id,
# so only subjects present in every table are kept.
apoe_status = adb_apoe[['sub_idx','APOE_status']]
input_data = (
    adb_demo
    .merge(apoe_status, on='sub_idx', how='inner')
    .merge(adb_mmse, on='sub_idx', how='inner')
    .merge(civet_df, on='sub_idx', how='inner')
)
print(input_data.shape)

# Create X matrix
# TODO
input_data

(1, 83)


Unnamed: 0,sub_idx,age,sex,group,APOE_status,mmse_calc,mmse_world,1,3,5,...,64,66,68,70,80,82,84,86,88,90
0,ADB0124,74,F,AD,2,21.0,23.0,2.6661,2.8307,2.5833,...,2.7062,2.825,2.7066,2.7154,2.556,2.7519,3.0807,2.828,3.2014,2.812


# Try a model

In [51]:
tp = 'bl'
modality = 'CS_CT'
model = 'LR_L1'
fold = 0

# Use model_dir (set in the data-dirs cell); `baseline_dir` is not defined in
# any visible cell, so the original call fails on a fresh kernel.
clf = get_trained_model(model_dir,tp,modality,model,fold)
# NOTE(review): input_data still holds every merged column (83 in the recorded
# output, including non-numeric ones such as sub_idx/sex/group), while the model
# reports a required input shape of (1, 81). The "Create X matrix" TODO has to be
# completed before this prediction is meaningful.
traj_pred = clf.predict(input_data)
traj_pred

required input shape: (1, 81)
