# Classifying with healthy CpGs

- This workbook contains the classification models for healthy vs Alzheimer's vs Huntington's using the HC top 55 CpG sites and the HC linear model residual values

- 3 classifiers:

1. Using top 55 CpGs and age to classify Healthy vs Huntington's vs Alzheimer's
2. Using top 55 CpGs and age to classify Healthy vs Alzheimer's
3. Using linear model residual values and age to classify Healthy vs Alzheimer's

In [1]:
import _pickle as cPickle
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
from xgboost import XGBRegressor
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

### Reading in data

In [2]:
# Load the Huntington's table; the first CSV column is used as the row index.
huntingtons_df=pd.read_csv('../huntingtons/hunt_brain_top_55.csv',index_col=0)  

In [3]:
# Load the Alzheimer's table; the first CSV column is used as the row index.
alzheimers_df=pd.read_csv('../alzheimers/alz_brain_top_55.csv',index_col=0)  

In [4]:
#Keep only samples whose AGE lies in the inclusive range 20-110
# NOTE(review): this comment previously said 20-100 while the code has always
# used 110 as the upper bound - confirm which bound is intended.
# Series.between is inclusive on both ends by default and is False for NaN,
# which matches the original pair of >= / <= filters.
huntingtons_df=huntingtons_df.loc[huntingtons_df['AGE'].between(20,110)]
alzheimers_df=alzheimers_df.loc[alzheimers_df['AGE'].between(20,110)]

In [5]:
# Load the healthy-sample table; the first CSV column is used as the row index.
healthy=pd.read_csv('../healthy/hc_brain_55CpGs.csv',index_col=0)  

In [6]:
#Keep only samples whose age lies in the inclusive range 20-110
# NOTE(review): this comment previously said 20-100 while the code has always
# used 110 as the upper bound - confirm which bound is intended.
# Series.between is inclusive on both ends by default and is False for NaN,
# which matches the original pair of >= / <= filters.
healthy=healthy.loc[healthy['age'].between(20,110)]

In [7]:
#Get any CpGs in healthy dataframe that are not in brain_shared_healthy_unhealthy file and print
#un=list(hc_df.columns)[1:]
#alz=list(alzheimers_df.columns)[1:]

#with open(r"../brain_shared_healthy_unhealthy", "rb") as input_file:
    #brain_shared_healthy_unhealthy = cPickle.load(input_file)
    
#brain_shared_healthy_unhealthy=list(brain_shared_healthy_unhealthy)
#for i in un:
    #if i not in alz:
        #print(i)

In [11]:
# Remove the cg00050873 column from the healthy frame.
# NOTE(review): presumably this CpG is absent from the disease frames (the
# commented-out check above looks for such columns) - confirm.
# Re-running this cell raises KeyError once the column has been dropped.
healthy=healthy.drop(columns=['cg00050873'])   

### Adding status column

Healthy = 0

Alzheimer's = 1

Huntington's = 2

In [12]:
# Tag every row with its class code (0 = healthy, 1 = Alzheimer's,
# 2 = Huntington's); assigning a scalar broadcasts it to all rows.
healthy['status']=0
alzheimers_df['status']=1
huntingtons_df['status']=2

In [15]:
# Use the same 'AGE' column name as the Alzheimer's and Huntington's frames
# so the columns line up when the frames are concatenated.
healthy=healthy.rename(columns={"age": "AGE"})

In [17]:
def _status_age_first(df):
    """Return ``df`` with 'status' and 'AGE' first, other columns in their existing order."""
    # Selecting the remaining columns by name (rather than slicing [1:-1] by
    # position) does not assume that AGE is the first column and status the
    # last, and makes the cell safe to re-run: a positional slice applied to an
    # already-reordered frame would duplicate AGE and drop the last CpG.
    other_cols=[c for c in df.columns if c not in ('status','AGE')]
    return df[['status','AGE']+other_cols]

healthy=_status_age_first(healthy)
alzheimers_df=_status_age_first(alzheimers_df)
huntingtons_df=_status_age_first(huntingtons_df)

In [18]:
# Stack the three labelled frames row-wise into one table for the
# three-class problem, then display it.
frames=[healthy,alzheimers_df,huntingtons_df]
classification_df = pd.concat(frames)
classification_df

Unnamed: 0,status,AGE,cg00807959,cg01066472,cg13806070,cg15907146,cg17104258,cg24441324,cg22454769,cg23606718,...,cg19622662,cg23595055,cg04739123,cg16367511,cg18008766,cg19451698,cg04834794,cg07303143,cg21182694,cg23352942
GSM2139432,0,71.0,0.288,0.331,0.128,0.607,0.106,0.829,0.299,0.158,...,0.088,0.948,0.037,0.217,0.105,0.102,0.124,0.289,0.240,0.475
GSM2139249,0,76.0,0.329,0.378,0.140,0.563,0.101,0.880,0.413,0.153,...,0.095,0.942,0.039,0.161,0.130,0.064,0.084,0.318,0.215,0.466
GSM2139398,0,102.0,0.294,0.339,0.067,0.604,0.188,0.824,0.395,0.121,...,0.154,0.947,0.044,0.124,0.114,0.085,0.063,0.317,0.219,0.432
GSM2139297,0,108.0,0.327,0.528,0.153,0.703,0.054,0.847,0.438,0.262,...,0.053,0.964,0.088,0.261,0.168,0.105,0.192,0.419,0.293,0.559
GSM1069208,0,40.0,0.152,0.452,0.056,0.559,0.250,0.840,0.212,0.085,...,0.232,0.898,0.038,0.083,0.073,0.044,0.023,0.231,0.193,0.602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1871815,2,56.0,0.231,0.244,0.117,0.587,0.175,0.919,0.265,0.140,...,0.077,0.959,0.048,0.112,0.052,0.038,0.025,0.161,0.181,0.425
GSM1871849,2,62.0,0.212,0.433,0.125,0.667,0.044,0.845,0.155,0.147,...,0.062,0.969,0.049,0.141,0.066,0.044,0.063,0.205,0.211,0.420
GSM1871852,2,58.0,0.250,0.449,0.120,0.623,0.212,0.917,0.273,0.130,...,0.159,0.935,0.035,0.141,0.069,0.048,0.043,0.233,0.176,0.496
GSM1871860,2,91.0,0.287,0.657,0.129,0.649,0.017,0.902,0.324,0.176,...,0.055,0.983,0.052,0.154,0.069,0.060,0.057,0.276,0.237,0.384


In [24]:
def mean_impute(data):
    """Fill missing values in each column with that column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose NaN entries should be replaced. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``data`` in which every
        NaN in a numeric column is replaced by the mean of the non-missing
        values of that column. A column that is entirely NaN stays NaN.
    """
    # Means are matched to columns by label, so the result does not depend on
    # column position, on positional indexing into a label-indexed Series, or
    # on row labels being unique. The previous chained assignment
    # (data.loc[i][col] = ...) could write to a temporary copy and leave the
    # NaN in place.
    col_means=data.mean(axis=0,numeric_only=True)
    return data.fillna(col_means)

### Classifying healthy vs Alzheimer's vs Huntington's

- Using top 55 CpGs from HC

In [19]:
def _load_pickled_ids(path):
    """Unpickle and return the object stored at ``path``."""
    # pickle can execute arbitrary code while loading; only use it on trusted,
    # project-generated files.
    with open(path, "rb") as handle:
        return cPickle.load(handle)

#Working / held-out sample IDs for the Alzheimer's data
brain_alz_working_ids = _load_pickled_ids(r"../train_test_ids/brain_alz_working_ids")
brain_alz_held_out_ids = _load_pickled_ids(r"../train_test_ids/brain_alz_held_out_ids")

#Working / held-out sample IDs for the Huntington's data
brain_hunt_working_ids = _load_pickled_ids(r"../train_test_ids/brain_hunt_working_ids")
brain_hunt_held_out_ids = _load_pickled_ids(r"../train_test_ids/brain_hunt_held_out_ids")

#Working / held-out sample IDs for the healthy data
brain_hc_working_ids = _load_pickled_ids(r"../train_test_ids/brain_working_sample_ids")
brain_hc_held_out_ids = _load_pickled_ids(r"../train_test_ids/brain_saved_sample_ids")

In [36]:
#Getting working and heldout dataframes 
# A row belongs to a subset when its ID appears in any of that subset's three
# ID collections. Build one boolean mask per subset and select once, instead
# of dropping rows one at a time (every drop copies the whole frame).
all_id=list(classification_df.index)
in_working=np.array(
    [i in brain_alz_working_ids or i in brain_hc_working_ids or i in brain_hunt_working_ids
     for i in all_id],
    dtype=bool,
)
in_heldout=np.array(
    [i in brain_alz_held_out_ids or i in brain_hc_held_out_ids or i in brain_hunt_held_out_ids
     for i in all_id],
    dtype=bool,
)
# Boolean selection keeps the original row order and returns new frames, so
# neither result aliases classification_df.
classification_df_working=classification_df.loc[in_working]
classification_df_heldout=classification_df.loc[in_heldout]

In [37]:
#X and y split for working and held out data
# Column 0 is 'status' (the class label); every remaining column (AGE plus
# the CpG values) is a feature.
X_hold=classification_df_heldout.iloc[:,1:]
y_hold=classification_df_heldout.iloc[:,0]
# Held-out features are imputed with the held-out set's own column means.
X_hold=mean_impute(X_hold)
# Working features are imputed later, after the train/test split.
X_work=classification_df_working.iloc[:,1:]
y_work=classification_df_working.iloc[:,0]

In [38]:
# Hold out 25% of the working data as a test set (fixed seed for repeatability).
X_tr, X_tst, y_train, y_test = train_test_split(X_work, y_work, test_size=0.25,random_state=4)
# NOTE(review): train and test are each imputed with their own column means;
# consider imputing the test (and held-out) data with the training means so
# all sets are preprocessed the same way.
X_train = mean_impute(X_tr)
X_test = mean_impute(X_tst)
# Logistic regression on AGE + CpG features with a raised iteration limit.
model = LogisticRegression(max_iter=10000)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [39]:
#Define class accuracy function
def class_accuracy(true,pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    true : sequence or pandas.Series
        True labels. Compared by position; any pandas index is ignored.
    pred : sequence or numpy.ndarray
        Predicted labels, in the same order as ``true``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(pred)``.

    Raises
    ------
    ValueError
        If ``true`` and ``pred`` have different lengths.
    ZeroDivisionError
        If both inputs are empty.
    """
    # Convert to plain arrays so the comparison is strictly positional.
    # Indexing a pandas Series with true[i] is label-based and can pick the
    # wrong row (integer index) or fail (non-integer index).
    true_arr=np.asarray(true)
    pred_arr=np.asarray(pred)
    total=len(pred_arr)
    if len(true_arr)!=total:
        raise ValueError(
            "true and pred must have the same length, got %d and %d" % (len(true_arr), total)
        )
    correct=int(np.sum(true_arr==pred_arr))
    return correct/total

In [40]:
# NOTE(review): this import is repeated in later cells; it could live with
# the other imports at the top of the notebook.
from sklearn.metrics import classification_report
# Accuracy on the train and test portions of the working data.
train_acc = class_accuracy(y_train,y_train_pred)
test_acc = class_accuracy(y_test,y_test_pred)
print('Train acc = ',train_acc,' Test acc = ',test_acc)
# Accuracy on the held-out samples, which were not part of the working data.
y_hold_pred = model.predict(X_hold)
valid_acc = class_accuracy(y_hold,y_hold_pred)
print('Heldout data acc = ',valid_acc)
# target_names are listed in status-code order: 0, 1, 2.
print('Classification report test')
print(classification_report(y_test,y_test_pred,target_names=['Healthy','Alzheimers','Huntingtons']))
print('Classification held out data')
print(classification_report(y_hold,y_hold_pred,target_names=['Healthy','Alzheimers','Huntingtons']))

Train acc =  0.6674382716049383  Test acc =  0.7205542725173211
Heldout data acc =  0.6683848797250859
Classification report test
              precision    recall  f1-score   support

     Healthy       0.72      0.82      0.77       237
  Alzheimers       0.74      0.73      0.74       146
 Huntingtons       0.59      0.20      0.30        50

    accuracy                           0.72       433
   macro avg       0.68      0.59      0.60       433
weighted avg       0.71      0.72      0.70       433

Classification held out data
              precision    recall  f1-score   support

     Healthy       0.71      0.74      0.72       319
  Alzheimers       0.62      0.70      0.66       196
 Huntingtons       0.58      0.22      0.32        67

    accuracy                           0.67       582
   macro avg       0.64      0.56      0.57       582
weighted avg       0.66      0.67      0.66       582



### Classifying healthy vs Alzheimer's 

- Using top 55 CpGs from HC

In [41]:
# Two-class table: healthy (status 0) and Alzheimer's (status 1) rows only.
frames2=[healthy,alzheimers_df]
alzheimers_classification_df = pd.concat(frames2)

In [42]:
#Getting working and heldout dataframes 
# A row belongs to a subset when its ID appears in either of that subset's ID
# collections. Build one boolean mask per subset and select once, instead of
# dropping rows one at a time (every drop copies the whole frame).
all_alz_class_id=list(alzheimers_classification_df.index)
alz_in_working=np.array(
    [i in brain_alz_working_ids or i in brain_hc_working_ids for i in all_alz_class_id],
    dtype=bool,
)
alz_in_heldout=np.array(
    [i in brain_alz_held_out_ids or i in brain_hc_held_out_ids for i in all_alz_class_id],
    dtype=bool,
)
# Boolean selection keeps the original row order and returns new frames, so
# neither result aliases alzheimers_classification_df.
alzheimers_classification_df_working=alzheimers_classification_df.loc[alz_in_working]
alzheimers_classification_df_heldout=alzheimers_classification_df.loc[alz_in_heldout]

In [43]:
#X and y split for working and held out data
# Column 0 is 'status' (the class label); the remaining columns are features.
# Note that X_hold, y_hold, X_work and y_work from the three-class section
# are overwritten here.
X_hold=alzheimers_classification_df_heldout.iloc[:,1:]
y_hold=alzheimers_classification_df_heldout.iloc[:,0]
# Held-out features are imputed with the held-out set's own column means.
X_hold=mean_impute(X_hold)
X_work=alzheimers_classification_df_working.iloc[:,1:]
y_work=alzheimers_classification_df_working.iloc[:,0]

In [44]:
# Hold out 25% of the working data as a test set (fixed seed for repeatability).
X_tr, X_tst, y_train, y_test = train_test_split(X_work, y_work, test_size=0.25,random_state=4)
# NOTE(review): train and test are each imputed with their own column means;
# consider imputing the test (and held-out) data with the training means.
X_train = mean_impute(X_tr)
X_test = mean_impute(X_tst)
# Replaces the three-class model stored in `model`.
model = LogisticRegression(max_iter=10000)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [46]:
from sklearn.metrics import classification_report
# Accuracy on the train and test portions of the working data.
train_acc = class_accuracy(y_train,y_train_pred)
test_acc = class_accuracy(y_test,y_test_pred)
print('Train acc = ',train_acc,' Test acc = ',test_acc)
# Accuracy on the held-out samples, which were not part of the working data.
y_hold_pred = model.predict(X_hold)
valid_acc = class_accuracy(y_hold,y_hold_pred)
print('Heldout data acc = ',valid_acc)
# target_names are listed in status-code order: 0, 1.
print('Classification report test')
print(classification_report(y_test,y_test_pred,target_names=['Healthy','Alzheimers']))
print('Classification held out data')
print(classification_report(y_hold,y_hold_pred,target_names=['Healthy','Alzheimers']))

Train acc =  0.7364746945898778  Test acc =  0.7597911227154047
Heldout data acc =  0.7203883495145631
Classification report test
              precision    recall  f1-score   support

     Healthy       0.79      0.82      0.81       231
  Alzheimers       0.71      0.66      0.69       152

    accuracy                           0.76       383
   macro avg       0.75      0.74      0.75       383
weighted avg       0.76      0.76      0.76       383

Classification held out data
              precision    recall  f1-score   support

     Healthy       0.78      0.77      0.77       319
  Alzheimers       0.63      0.64      0.64       196

    accuracy                           0.72       515
   macro avg       0.70      0.71      0.70       515
weighted avg       0.72      0.72      0.72       515



### Classifying healthy vs Alzheimer's with residual values

- Using HC's model residuals to classify healthy vs. Alzheimers

In [48]:
#Adding residual values
#Read in linear model
# pd.read_pickle unpickles the file, so it should only be used on trusted,
# project-generated files.
# NOTE(review): presumably this is the model fitted on the healthy 55-CpG
# data - confirm.
mod_55_brain = pd.read_pickle(r'../models/brain_mod_55')

In [62]:
#X and y split for working and held out data
# Rebuilt from the healthy-vs-Alzheimer's frames: column 0 is 'status' (the
# label) and the remaining columns (AGE + CpGs) are features.
X_hold=alzheimers_classification_df_heldout.iloc[:,1:]
y_hold=alzheimers_classification_df_heldout.iloc[:,0]
X_hold=mean_impute(X_hold)

# Unlike the earlier sections, the working features are imputed before the
# train/test split because the model is applied to all of them below.
X_work=alzheimers_classification_df_working.iloc[:,1:]
X_work=mean_impute(X_work)
y_work=alzheimers_classification_df_working.iloc[:,0]

In [65]:
# Predict from every column except the first (AGE), i.e. from the CpG values.
pred_work = mod_55_brain.predict(X_work.iloc[:,1:])
# Residual = prediction minus AGE. The predictions are reshaped to a single
# row, so res_work is 2-D with one row.
res_work=pred_work.reshape((1,pred_work.shape[0]))-X_work.iloc[:,0].values
# Store the single row as a new per-sample column.
X_work['residuals']=res_work[0]

In [67]:
# Predict from every column except the first (AGE), i.e. from the CpG values.
pred_hold = mod_55_brain.predict(X_hold.iloc[:,1:])
# Residual = prediction minus AGE. The predictions are reshaped to a single
# row, so res_hold is 2-D with one row.
# NOTE(review): the X_hold output displayed below shows residuals of roughly
# 50-70 for every row, i.e. predictions far above AGE. Confirm that the
# columns passed to predict match the features (and their order) the model
# was fitted on.
res_hold=pred_hold.reshape((1,pred_hold.shape[0]))-X_hold.iloc[:,0].values
# Store the single row as a new per-sample column.
X_hold['residuals']=res_hold[0]

In [69]:
# Display the held-out feature frame, which now includes the 'residuals' column.
X_hold

Unnamed: 0,AGE,cg00807959,cg01066472,cg13806070,cg15907146,cg17104258,cg24441324,cg22454769,cg23606718,cg24079702,...,cg23595055,cg04739123,cg16367511,cg18008766,cg19451698,cg04834794,cg07303143,cg21182694,cg23352942,residuals
GSM1443489,40.0,0.152,0.452,0.056,0.559,0.250,0.840,0.212,0.085,0.144,...,0.898,0.038,0.083,0.073,0.044,0.023,0.231000,0.193,0.602,66.617130
GSM1069141,59.0,0.254,0.358,0.118,0.580,0.096,0.794,0.359,0.168,0.267,...,0.981,0.041,0.140,0.057,0.054,0.076,0.319000,0.219,0.485,68.393146
GSM1069172,66.0,0.270,0.382,0.183,0.652,0.072,0.874,0.376,0.176,0.291,...,0.955,0.047,0.136,0.097,0.099,0.033,0.284000,0.232,0.562,64.513223
GSM1443533,66.0,0.270,0.382,0.183,0.652,0.072,0.874,0.376,0.176,0.291,...,0.955,0.047,0.136,0.097,0.099,0.033,0.284000,0.232,0.562,64.513223
GSM1443763,67.0,0.212,0.354,0.116,0.629,0.046,0.875,0.295,0.127,0.269,...,0.962,0.053,0.135,0.081,0.061,0.091,0.221000,0.198,0.475,52.932824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM2809047,76.0,0.316,0.495,0.152,0.670,0.155,0.870,0.347,0.170,0.224,...,0.926,0.072,0.163,0.120,0.061,0.088,0.222000,0.206,0.471,59.924888
GSM2809048,87.0,0.311,0.543,0.162,0.664,0.044,0.886,0.387,0.190,0.285,...,0.941,0.074,0.184,0.108,0.054,0.077,0.257352,0.307,0.417,57.471041
GSM2809053,78.0,0.313,0.622,0.148,0.708,0.144,0.905,0.439,0.159,0.295,...,0.946,0.059,0.119,0.126,0.079,0.064,0.257352,0.164,0.602,60.956629
GSM2809054,76.0,0.285,0.498,0.143,0.621,0.114,0.891,0.372,0.167,0.251,...,0.948,0.084,0.155,0.095,0.099,0.100,0.257352,0.207,0.503,59.363260


In [70]:
# Keep only the first and last columns: AGE and the 'residuals' column that
# was appended above. Not idempotent: the CpG columns are discarded.
X_hold=X_hold.iloc[:,[0,-1]]
X_work=X_work.iloc[:,[0,-1]]


In [63]:
#Working data test/train split

In [71]:
# Hold out 25% of the working data as a test set (fixed seed for repeatability).
X_tr, X_tst, y_train, y_test = train_test_split(X_work, y_work, test_size=0.25,random_state=4)
# X_work was already passed through mean_impute before the residuals were
# computed, so the features here are only AGE and 'residuals'.
X_train = mean_impute(X_tr)
X_test = mean_impute(X_tst)
# NOTE(review): max_iter is 1000 here but 10000 for the two earlier
# classifiers in this notebook - confirm whether the difference is intended.
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [72]:
from sklearn.metrics import classification_report
# Accuracy on the train and test portions of the working data.
train_acc = class_accuracy(y_train,y_train_pred)
test_acc = class_accuracy(y_test,y_test_pred)
print('Train acc = ',train_acc,' Test acc = ',test_acc)
# Accuracy on the held-out samples, which were not part of the working data.
y_hold_pred = model.predict(X_hold)
valid_acc = class_accuracy(y_hold,y_hold_pred)
print('Heldout data acc = ',valid_acc)
# target_names are listed in status-code order: 0, 1.
print('Classification report test')
print(classification_report(y_test,y_test_pred,target_names=['Healthy','Alzheimers']))
print('Classification held out data')
print(classification_report(y_hold,y_hold_pred,target_names=['Healthy','Alzheimers']))

Train acc =  0.7094240837696335  Test acc =  0.7519582245430809
Heldout data acc =  0.6932038834951456
Classification report test
              precision    recall  f1-score   support

     Healthy       0.78      0.82      0.80       231
  Alzheimers       0.71      0.64      0.67       152

    accuracy                           0.75       383
   macro avg       0.74      0.73      0.74       383
weighted avg       0.75      0.75      0.75       383

Classification held out data
              precision    recall  f1-score   support

     Healthy       0.75      0.76      0.75       319
  Alzheimers       0.60      0.59      0.59       196

    accuracy                           0.69       515
   macro avg       0.67      0.67      0.67       515
weighted avg       0.69      0.69      0.69       515

