In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import _pickle as cPickle
import os


plt.rcdefaults()

In [13]:
directory_alz = r'brain/huntingtons/feature_selection'
alz_trials=os.listdir(directory_alz)




In [14]:
importances=[]
for file in alz_trials:
    with open(directory_alz+'/'+file, "rb") as input_file:
        trial = cPickle.load(input_file)
        importances.append(trial)


In [15]:
alz_blood_all = pd.read_csv('brain/huntingtons/hunt_brain_unhealthy_all.csv', index_col=0,low_memory=False)
#alz_blood_all = alz_blood_all.set_index('sample_id')
alz_blood_all

Unnamed: 0,AGE,disease,tissue,cg00050873,cg00212031,cg00213748,cg00214611,cg00455876,cg01707559,cg02004872,...,cg04103088,cg04105342,cg04106633,cg04107037,cg04111054,cg04112173,cg04112590,cg04112626,cg04113154,cg04114386
GSM1871451,83.0,Huntington's disease,brain - caudate nucleus,0.845,0.039,,0.065,0.651,0.066,0.014,...,0.976,0.228,0.711,0.022,,0.994,,0.348,0.994,
GSM1871457,48.0,Huntington's disease,brain - caudate nucleus,0.833,0.021,,,0.685,0.062,0.012,...,0.982,0.196,0.747,0.023,,0.989,,0.318,0.989,0.605
GSM1871498,51.0,Huntington's disease,brain - caudate nucleus,0.905,0.067,,0.046,0.716,0.076,0.016,...,,0.149,0.755,0.021,,0.984,,0.230,0.982,
GSM1871545,65.0,Huntington's disease,brain - caudate nucleus,0.855,0.062,,0.060,0.680,0.084,0.019,...,0.985,0.165,0.794,0.020,,0.973,0.965,0.235,0.988,
GSM1871590,67.0,Huntington's disease,brain - caudate nucleus,,,,,,0.326,0.131,...,,0.158,0.753,0.012,,0.988,,0.238,0.985,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1871815,56.0,Huntington's disease,brain - visual cortex,0.861,0.003,,0.063,0.732,0.090,0.026,...,0.991,0.177,0.778,0.009,,0.991,0.972,0.292,0.887,0.496
GSM1871849,62.0,Huntington's disease,brain - visual cortex,,,,,,0.235,,...,0.998,0.186,0.941,0.024,,0.970,,0.340,0.836,
GSM1871852,58.0,Huntington's disease,brain - visual cortex,0.925,0.041,,0.047,0.617,0.192,0.023,...,,0.144,0.755,0.016,,0.987,,0.303,0.855,
GSM1871860,91.0,Huntington's disease,brain - visual cortex,,,,,,,,...,0.982,0.194,0.955,0.010,,0.989,0.948,0.291,0.892,0.579


In [6]:
def drop_na_cols(df,percent=0.1):
    #Drop columns with >pearcent% NAs
    nas=df.isnull().sum()
    nas=nas[1:]

    for i in nas:
        if i>=len(df.index)*percent:
            try:
                df=df.drop(nas.keys()[i], axis=1)
            except KeyError:
                pass
    return df

In [16]:
alz_blood_all=drop_na_cols(alz_blood_all,percent=0.1)

In [17]:
alz_blood_imp = pd.DataFrame(importances, columns = alz_blood_all.columns[3:])
alz_blood_imp = alz_blood_imp.transpose()

alz_blood_imp['Mean'] = alz_blood_imp.mean(axis=1)
alz_blood_imp['Std'] = alz_blood_imp.std(axis=1)



In [18]:
def importances_sorted_by_mean(df_imp):
    df_imp_sorted = df_imp.sort_values('Mean', ascending=False)
    return df_imp_sorted

In [19]:
alz_blood_imp_sorted = importances_sorted_by_mean(alz_blood_imp)
alz_blood_imp_sorted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Mean,Std
cg07544187,0.072097,0.105069,0.044133,0.091028,0.070648,0.157831,0.052963,0.079503,0.092137,0.078740,0.134943,0.169606,0.178727,0.169201,0.130514,0.064215,0.105339,0.096113,0.105156,0.040952
cg18279094,0.084054,0.047299,0.099697,0.046338,0.081418,0.068600,0.050827,0.024690,0.050640,0.098300,0.042843,0.077405,0.138303,0.031015,0.053586,0.044642,0.074833,0.032452,0.063719,0.028379
cg06645033,0.000000,0.022662,0.137559,0.084723,0.093211,0.057211,0.014459,0.109154,0.000000,0.074769,0.056728,0.096245,0.067487,0.056527,0.008957,0.032800,0.031841,0.125300,0.059424,0.041385
cg13327545,0.024663,0.088249,0.052763,0.051605,0.009679,0.039391,0.112527,0.053962,0.094085,0.040654,0.009301,0.052656,0.024247,0.050992,0.046623,0.033366,0.051208,0.044771,0.048930,0.026282
cg24369989,0.006746,0.001679,0.003389,0.001542,0.027365,0.001145,0.000000,0.000000,0.030203,0.090498,0.083826,0.020700,0.007846,0.014834,0.199444,0.023325,0.130413,0.008250,0.036178,0.053484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg03784994,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
cg03790745,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
cg03790908,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
cg03793270,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [20]:
xgb_alz_blood_cpgs=list(alz_blood_imp_sorted.index)
with open('brain/huntingtons/xgb_hunt_brain_cpgs', 'wb') as fp:
        cPickle.dump(xgb_alz_blood_cpgs, fp)

# Parkinsons

In [16]:
directory_park = r'blood/parkinsons/feature_selection'
park_trials=os.listdir(directory_park)

importances_park=[]
for file in park_trials:
    with open(directory_park+'/'+file, "rb") as input_file:
        trial = cPickle.load(input_file)
        importances_park.append(trial)



In [20]:
park_blood_all = pd.read_csv('blood/parkinsons/park_blood_unhealthy.csv', index_col=0,low_memory=False)
park_blood_all

Unnamed: 0,AGE,cg00050873,cg00212031,cg00213748,cg00214611,cg00455876,cg01707559,cg02004872,cg02011394,cg02050847,...,cg04103088,cg04105342,cg04106633,cg04107037,cg04111054,cg04112173,cg04112590,cg04112626,cg04113154,cg04114386
GSM3035963,78.0,0.925,,,,,0.144,0.026,0.966,0.963,...,0.977,0.061,0.949,0.016,,0.979,,0.056,0.988,
GSM3035809,78.0,0.863,0.118,,0.032,0.840,0.166,0.019,0.966,0.991,...,0.990,0.285,0.907,0.032,,0.974,0.874,0.119,0.992,0.944
GSM3035847,75.0,0.893,0.038,,,0.738,0.118,0.013,0.971,0.994,...,,0.235,0.864,0.012,,0.979,,0.077,0.989,
GSM1870906,71.4,0.897,0.014,,0.035,0.778,0.112,0.024,0.962,0.993,...,0.983,0.161,0.920,0.042,,0.988,,0.104,0.995,0.953
GSM1871363,76.2,,,,,,0.173,,,,...,0.986,0.259,0.962,0.038,,0.967,,0.111,0.987,0.965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM3035602.1,79.0,,,,,,,,,,...,,0.289,0.948,0.049,,0.975,,0.133,0.987,
GSM1343558.1,72.2,0.816,0.036,,0.020,0.793,0.108,0.024,0.974,0.987,...,0.991,0.225,0.847,0.055,,0.981,,0.148,0.969,0.980
GSM1870847.1,76.3,,,,,,0.283,,,,...,0.996,0.237,0.896,0.027,,0.983,0.857,0.091,0.986,0.979
GSM1498556.1,62.0,,,,,,0.216,,,,...,,0.256,0.966,0.038,,0.986,,,0.977,


In [21]:
#Drop columns with >10% NAs
nas=park_blood_all.isnull().sum()
nas=nas[1:]
        
for i in nas:
    if i>=len(park_blood_all.index)/10:
        try:
            park_blood_all=park_blood_all.drop(nas.keys()[i], axis=1)
        except KeyError:
            pass

In [22]:
park_blood_all

Unnamed: 0,AGE,cg00050873,cg00212031,cg00213748,cg00214611,cg00455876,cg01707559,cg02004872,cg02011394,cg02050847,...,cg04103088,cg04105342,cg04106633,cg04107037,cg04111054,cg04112173,cg04112590,cg04112626,cg04113154,cg04114386
GSM3035963,78.0,0.925,,,,,0.144,0.026,0.966,0.963,...,0.977,0.061,0.949,0.016,,0.979,,0.056,0.988,
GSM3035809,78.0,0.863,0.118,,0.032,0.840,0.166,0.019,0.966,0.991,...,0.990,0.285,0.907,0.032,,0.974,0.874,0.119,0.992,0.944
GSM3035847,75.0,0.893,0.038,,,0.738,0.118,0.013,0.971,0.994,...,,0.235,0.864,0.012,,0.979,,0.077,0.989,
GSM1870906,71.4,0.897,0.014,,0.035,0.778,0.112,0.024,0.962,0.993,...,0.983,0.161,0.920,0.042,,0.988,,0.104,0.995,0.953
GSM1871363,76.2,,,,,,0.173,,,,...,0.986,0.259,0.962,0.038,,0.967,,0.111,0.987,0.965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM3035602.1,79.0,,,,,,,,,,...,,0.289,0.948,0.049,,0.975,,0.133,0.987,
GSM1343558.1,72.2,0.816,0.036,,0.020,0.793,0.108,0.024,0.974,0.987,...,0.991,0.225,0.847,0.055,,0.981,,0.148,0.969,0.980
GSM1870847.1,76.3,,,,,,0.283,,,,...,0.996,0.237,0.896,0.027,,0.983,0.857,0.091,0.986,0.979
GSM1498556.1,62.0,,,,,,0.216,,,,...,,0.256,0.966,0.038,,0.986,,,0.977,


In [23]:
park_blood_imp = pd.DataFrame(importances_park, columns = park_blood_all.columns[1:])
park_blood_imp = park_blood_imp.transpose()

park_blood_imp['Mean'] = park_blood_imp.mean(axis=1)
park_blood_imp['Std'] = park_blood_imp.std(axis=1)


park_blood_imp_sorted = importances_sorted_by_mean(park_blood_imp)
park_blood_imp_sorted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,Mean,Std
cg15121420,0.057554,0.011433,0.050849,0.003842,0.049744,0.000000,0.040626,0.020149,0.071459,0.056291,...,0.051791,0.053817,0.000000,0.000000,0.034357,0.042976,0.073343,0.048019,0.032087,0.023984
cg16867657,0.024222,0.029530,0.041523,0.053612,0.049242,0.027534,0.033389,0.025170,0.024637,0.019616,...,0.027220,0.027712,0.042098,0.030473,0.030275,0.041460,0.028116,0.024695,0.031892,0.008853
cg23744638,0.035400,0.048098,0.035529,0.054938,0.045881,0.018572,0.013365,0.024400,0.022097,0.001461,...,0.037608,0.000000,0.000000,0.025567,0.030913,0.074047,0.044374,0.030060,0.030507,0.018772
cg16932827,0.032138,0.035249,0.014327,0.042489,0.022292,0.028108,0.029008,0.036323,0.031612,0.009702,...,0.000000,0.046083,0.032339,0.050397,0.013215,0.041979,0.004538,0.004671,0.026673,0.022188
cg04875128,0.025007,0.048003,0.037954,0.033747,0.010741,0.003726,0.006107,0.000000,0.000000,0.052326,...,0.060958,0.020651,0.036716,0.044192,0.042937,0.000813,0.000000,0.000000,0.026101,0.023540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg07427559,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
cg07428101,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
cg07428323,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
cg07428372,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [24]:
xgb_park_blood_cpgs=list(park_blood_imp_sorted.index)
with open('blood/parkinsons/xgb_park_blood_cpgs', 'wb') as fp:
        cPickle.dump(xgb_park_blood_cpgs, fp)

In [25]:
xgb_park_blood_cpgs

['cg23744638',
 'cg16867657',
 'cg04875128',
 'cg08453194',
 'cg06639320',
 'cg07553761',
 'cg24724428',
 'cg03725309',
 'cg07181952',
 'cg24079702',
 'cg26614073',
 'cg00175403',
 'cg08160331',
 'cg22158769',
 'cg16932827',
 'cg00292135',
 'cg01554474',
 'cg20822990',
 'cg05721773',
 'cg22454769',
 'cg13709487',
 'cg07955995',
 'cg06540876',
 'cg15121420',
 'cg17662369',
 'cg24853956',
 'cg23606718',
 'cg00588614',
 'cg16173186',
 'cg26734350',
 'cg16015712',
 'cg02046143',
 'cg19593767',
 'cg19099050',
 'cg22617819',
 'cg27386529',
 'cg04608933',
 'cg26701785',
 'cg17785250',
 'cg25410668',
 'cg07052627',
 'cg01877778',
 'cg23415489',
 'cg17380661',
 'cg05207611',
 'cg04861640',
 'cg07552803',
 'cg22967516',
 'cg01783650',
 'cg06478173',
 'cg19400520',
 'cg11344352',
 'cg05017994',
 'cg10221746',
 'cg21795074',
 'cg07927379',
 'cg03126058',
 'cg08540945',
 'cg09360654',
 'ch.2.30415474F',
 'cg14608275',
 'cg14674720',
 'cg10806820',
 'cg12258344',
 'cg03382805',
 'cg05898618',
 'cg04

In [38]:
xgb_alz_blood_cpgs

['cg04875128',
 'cg16867657',
 'cg07553761',
 'cg24724428',
 'cg23112821',
 'cg23069444',
 'cg07733620',
 'cg22441973',
 'cg23753502',
 'cg24711224',
 'cg22629170',
 'cg08160331',
 'cg00571041',
 'cg16385335',
 'cg00389276',
 'cg05523500',
 'cg12982090',
 'cg15879104',
 'cg03738707',
 'cg23647968',
 'cg14686949',
 'cg09834444',
 'cg02314846',
 'cg17176676',
 'cg02933228',
 'cg25266135',
 'cg04565649',
 'cg05308819',
 'cg11218561',
 'cg11205173',
 'cg04122324',
 'cg15058210',
 'cg16008966',
 'cg23311929',
 'cg00723017',
 'cg07757497',
 'cg03125090',
 'cg03607117',
 'cg03355241',
 'cg15810747',
 'cg19858371',
 'cg08961047',
 'cg12117942',
 'cg23744638',
 'cg02352281',
 'cg14393034',
 'cg26506507',
 'cg12744634',
 'cg08571918',
 'cg21845957',
 'cg15614268',
 'cg12506980',
 'cg27553988',
 'cg07927379',
 'cg25677394',
 'cg01074797',
 'cg25457007',
 'cg13948074',
 'cg11299854',
 'cg16745091',
 'cg16384957',
 'cg19381766',
 'cg03519967',
 'cg18333339',
 'cg07015494',
 'cg24000259',
 'cg135203

In [48]:
with open(r"blood/healthy/cpgs_XGboost_whole_blood_ranked", "rb") as input_file:
    xgb_hc_blood_cpgs = cPickle.load(input_file)
    
xgb_hc_blood_cpgs=list(xgb_hc_blood_cpgs)

In [52]:
print('Features in top 100 for all 3')
for i in xgb_hc_blood_cpgs[0:100]:    
    if i in xgb_park_blood_cpgs[0:100] and i in xgb_alz_blood_cpgs[0:100]:       
        print('|',i,' Healthy = ',xgb_hc_blood_cpgs.index(i),' Alz = ',xgb_alz_blood_cpgs.index(i),' Park = ',xgb_park_blood_cpgs.index(i),'|')


Features in top 100 for all 3
| cg16867657  Healthy =  1  Alz =  1  Park =  1 |
| cg04875128  Healthy =  2  Alz =  0  Park =  2 |
| cg07553761  Healthy =  4  Alz =  2  Park =  5 |
| cg04208403  Healthy =  7  Alz =  76  Park =  93 |
| cg23744638  Healthy =  20  Alz =  43  Park =  0 |
| cg07927379  Healthy =  21  Alz =  53  Park =  55 |
| cg24724428  Healthy =  23  Alz =  3  Park =  6 |


In [54]:
print('Features in top 100 for Alzheimers')
for i in xgb_alz_blood_cpgs[0:100]:    
    if i in xgb_park_blood_cpgs and i in xgb_hc_blood_cpgs:       
        print('|',i,' Healthy = ',xgb_hc_blood_cpgs.index(i),' Alz = ',xgb_alz_blood_cpgs.index(i),' Park = ',xgb_park_blood_cpgs.index(i),'|')


Features in top 100 for Alzheimers
| cg04875128  Healthy =  2  Alz =  0  Park =  2 |
| cg16867657  Healthy =  1  Alz =  1  Park =  1 |
| cg07553761  Healthy =  4  Alz =  2  Park =  5 |
| cg24724428  Healthy =  23  Alz =  3  Park =  6 |
| cg08160331  Healthy =  1883  Alz =  11  Park =  12 |
| cg02933228  Healthy =  40  Alz =  24  Park =  1352 |
| cg05308819  Healthy =  95  Alz =  27  Park =  15088 |
| cg16008966  Healthy =  111  Alz =  32  Park =  28200 |
| cg03607117  Healthy =  35  Alz =  37  Park =  123 |
| cg23744638  Healthy =  20  Alz =  43  Park =  0 |
| cg21845957  Healthy =  1695  Alz =  49  Park =  171525 |
| cg07927379  Healthy =  21  Alz =  53  Park =  55 |
| cg01074797  Healthy =  63  Alz =  55  Park =  114 |
| cg19381766  Healthy =  775  Alz =  61  Park =  406 |
| cg15728256  Healthy =  1441  Alz =  71  Park =  6978 |
| cg04208403  Healthy =  7  Alz =  76  Park =  93 |
| cg07202479  Healthy =  1406  Alz =  77  Park =  156013 |
| cg25478614  Healthy =  204  Alz =  84  Park 

In [55]:
print('Features in top 100 for Parkinsons')
for i in xgb_park_blood_cpgs[0:100]:    
    if i in xgb_alz_blood_cpgs and i in xgb_hc_blood_cpgs:       
        print('|',i,' Healthy = ',xgb_hc_blood_cpgs.index(i),' Alz = ',xgb_alz_blood_cpgs.index(i),' Park = ',xgb_park_blood_cpgs.index(i),'|')


Features in top 100 for Parkinsons
| cg23744638  Healthy =  20  Alz =  43  Park =  0 |
| cg16867657  Healthy =  1  Alz =  1  Park =  1 |
| cg04875128  Healthy =  2  Alz =  0  Park =  2 |
| cg08453194  Healthy =  19  Alz =  4707  Park =  3 |
| cg06639320  Healthy =  10  Alz =  113  Park =  4 |
| cg07553761  Healthy =  4  Alz =  2  Park =  5 |
| cg24724428  Healthy =  23  Alz =  3  Park =  6 |
| cg03725309  Healthy =  760  Alz =  14724  Park =  7 |
| cg07181952  Healthy =  98  Alz =  128  Park =  8 |
| cg24079702  Healthy =  51  Alz =  12409  Park =  9 |
| cg26614073  Healthy =  86  Alz =  5275  Park =  10 |
| cg08160331  Healthy =  1883  Alz =  11  Park =  12 |
| cg22158769  Healthy =  176  Alz =  728  Park =  13 |
| cg16932827  Healthy =  31  Alz =  90500  Park =  14 |
| cg00292135  Healthy =  5  Alz =  118  Park =  15 |
| cg20822990  Healthy =  142  Alz =  97879  Park =  17 |
| cg22454769  Healthy =  9  Alz =  1045  Park =  19 |
| cg07955995  Healthy =  62  Alz =  9794  Park =  21 |
|

In [59]:
comb=[]
for i in xgb_hc_blood_cpgs[0:100]:    
    if i in xgb_alz_blood_cpgs and i in xgb_park_blood_cpgs and i not in comb:       
        comb.append(i)

for i in xgb_alz_blood_cpgs[0:100]:    
    if i in xgb_park_blood_cpgs and i in xgb_hc_blood_cpgs and i not in comb:       
        comb.append(i)
        
for i in xgb_park_blood_cpgs[0:100]:    
    if i in xgb_alz_blood_cpgs and i in xgb_hc_blood_cpgs and i not in comb:       
        comb.append(i)

In [61]:
for i in comb:
    print('|',i,' Healthy = ',xgb_hc_blood_cpgs.index(i),' Alz = ',xgb_alz_blood_cpgs.index(i),' Park = ',xgb_park_blood_cpgs.index(i),'|')

| cg16867657  Healthy =  1  Alz =  1  Park =  1 |
| cg04875128  Healthy =  2  Alz =  0  Park =  2 |
| cg07553761  Healthy =  4  Alz =  2  Park =  5 |
| cg00292135  Healthy =  5  Alz =  118  Park =  15 |
| cg09809672  Healthy =  6  Alz =  31824  Park =  2279 |
| cg04208403  Healthy =  7  Alz =  76  Park =  93 |
| cg22454769  Healthy =  9  Alz =  1045  Park =  19 |
| cg06639320  Healthy =  10  Alz =  113  Park =  4 |
| cg19344626  Healthy =  13  Alz =  3734  Park =  216 |
| cg02046143  Healthy =  15  Alz =  682  Park =  31 |
| cg23606718  Healthy =  18  Alz =  1395  Park =  26 |
| cg08453194  Healthy =  19  Alz =  4707  Park =  3 |
| cg23744638  Healthy =  20  Alz =  43  Park =  0 |
| cg07927379  Healthy =  21  Alz =  53  Park =  55 |
| cg24724428  Healthy =  23  Alz =  3  Park =  6 |
| cg23078123  Healthy =  28  Alz =  103742  Park =  100635 |
| cg16932827  Healthy =  31  Alz =  90500  Park =  14 |
| cg10221746  Healthy =  33  Alz =  31794  Park =  53 |
| cg18877361  Healthy =  34  Alz 

In [71]:
len(comb)

82

In [72]:
def read_by_cpg(df_filename, cpg_list):
    allcpgs = []
    with open(df_filename) as f:
        for row in f:
            if row[0:3]!='ch.':
                allcpgs.append(row[0:10])
            else:
                allcpgs.append(row[0:14])
    # we want to skip the rows that are not in cpg_list
    skipped = []
    for i in range(1,len(allcpgs)):
        if allcpgs[i] not in cpg_list:
            skipped.append(i)
            
    df = pd.read_csv(df_filename, skiprows=skipped,low_memory=False)
    return df

In [73]:
healthy_df=read_by_cpg('blood/healthy/whole_blood_age_methylation_v1.txt', comb)
healthy_df


Unnamed: 0,sample_id,GSM2334366,GSM989882,GSM989863,GSM1443696,GSM1069241,GSM1572402,GSM1572442,GSM1648928,GSM990558,...,GSM2815698,GSM2814296,GSM2814963,GSM2814230,GSM2814121,GSM2815345,GSM2814886,GSM2815335,GSM2334524,GSM2815609
0,cg01074797,0.688,0.504,0.571,0.440,0.440,0.539,0.599,0.522,0.530,...,0.745,0.751,0.712,0.094,0.783,0.785,0.701,0.730,0.678,0.767
1,cg06784991,0.457,0.316,0.435,0.335,0.335,0.314,0.287,0.275,0.332,...,0.262,0.235,0.224,0.233,0.211,0.199,0.240,0.224,0.190,0.178
2,cg07181952,0.688,0.602,0.673,0.636,0.636,0.553,0.570,0.667,0.597,...,0.770,0.730,0.743,0.768,0.760,0.699,0.673,0.717,0.699,0.690
3,cg07202479,0.513,0.502,0.507,0.564,0.564,0.470,0.471,0.583,0.529,...,0.641,0.581,0.649,0.689,0.599,0.651,0.554,0.612,0.673,0.642
4,cg07418114,0.713,0.664,0.634,0.663,0.663,0.591,0.593,0.660,0.608,...,0.750,0.656,0.723,0.779,0.748,0.702,0.719,0.716,0.794,0.715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,cg03607117,0.194,0.078,0.096,0.180,0.180,0.078,0.053,0.202,0.106,...,0.026,0.030,0.025,0.059,0.038,0.040,0.033,0.021,0.051,0.027
78,cg16932827,0.835,0.659,0.650,0.611,0.611,0.653,0.566,0.753,0.666,...,0.959,0.920,0.967,0.956,0.940,0.886,0.939,0.889,0.951,0.948
79,cg25478614,0.459,0.562,0.523,0.495,0.495,0.565,0.485,0.440,0.392,...,0.333,0.290,0.377,0.373,0.310,0.319,0.357,0.327,0.325,0.334
80,cg26614073,0.287,0.592,0.506,0.422,0.422,0.402,0.426,0.354,0.240,...,0.608,0.575,0.649,,,0.547,0.577,0.617,0.530,0.597


In [85]:
#meta = pd.read_csv('sample_disease.txt', delimiter = '\t')
meta = []

with open(r'sample_age.txt') as f:
    for row in f:
        meta.append(row[0:50])

meta_2 =[]

for i in meta:
    meta_2.append(i.split())

ages=[]
df_id_ages=[]
df_age=[]
for i in range(1,len(meta_2)):
    p=meta_2[i]
    
    v=[]
    if p[0][1]=='G':
        v.append(p[0][1:-1])
        v.append(p[2])
        df_id_ages.append(p[0][1:-1])
        df_age.append(p[2])
        ages.append(v)
    
    
df_ages = pd.DataFrame({'id': df_id_ages,
                   'age': df_age,})

df_ages=df_ages.set_index('id')
df_ages

Unnamed: 0_level_0,age
id,Unnamed: 1_level_1
GSM2139432,71
GSM2139249,76
GSM2139398,102
GSM2139297,108
GSM1069208,40
...,...
GSM1871753,53
GSM1438255,53.1594
GSM1438138,53.3855
GSM1438147,53.477


In [86]:
healthy_df=healthy_df.set_index('sample_id').T
healthy_df['AGE']=np.zeros(len(healthy_df.index))
cols = list(healthy_df.columns.values)
cols=cols[0:-1]
healthy_df = healthy_df[['AGE']+cols]
healthy_df

sample_id,AGE,cg01074797,cg06784991,cg07181952,cg07202479,cg07418114,cg15728256,cg16015712,cg16290275,cg09825030,...,cg25410668,cg06639320,cg14674720,ch.2.30415474F,cg00329615,cg03607117,cg16932827,cg25478614,cg26614073,cg27386529
GSM2334366,0.0,0.688,0.457,0.688,0.513,0.713,0.246,0.165,0.232,0.553,...,0.597,0.594,0.311,0.026,0.357,0.194,0.835,0.459,0.287,0.180
GSM989882,0.0,0.504,0.316,0.602,0.502,0.664,0.315,0.103,0.193,0.525,...,0.699,0.704,0.304,0.014,0.377,0.078,0.659,0.562,0.592,0.529
GSM989863,0.0,0.571,0.435,0.673,0.507,0.634,0.273,0.081,0.148,0.515,...,0.652,0.619,0.257,0.050,0.327,0.096,0.650,0.523,0.506,0.418
GSM1443696,0.0,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,0.441,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
GSM1069241,0.0,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,0.441,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM2815345,0.0,0.785,0.199,0.699,0.651,0.702,0.086,0.047,0.272,0.490,...,0.468,0.292,0.138,0.039,0.598,0.040,0.886,0.319,0.547,0.376
GSM2814886,0.0,0.701,0.240,0.673,0.554,0.719,0.081,0.054,0.307,0.525,...,0.557,0.381,0.194,0.133,0.634,0.033,0.939,0.357,0.577,0.493
GSM2815335,0.0,0.730,0.224,0.717,0.612,0.716,0.092,0.052,0.358,0.500,...,0.513,0.347,0.168,0.052,0.650,0.021,0.889,0.327,0.617,0.464
GSM2334524,0.0,0.678,0.190,0.699,0.673,0.794,0.119,0.096,0.349,0.577,...,0.520,0.338,0.127,0.083,0.731,0.051,0.951,0.325,0.530,0.479


In [89]:
for i in healthy_df.index:
    try:
        healthy_df.loc[i,'AGE']=float(df_ages.loc[i]['age'])
    except KeyError:
        healthy_df.loc[i,'AGE']='NaN'
healthy_df


sample_id,AGE,cg01074797,cg06784991,cg07181952,cg07202479,cg07418114,cg15728256,cg16015712,cg16290275,cg09825030,...,cg25410668,cg06639320,cg14674720,ch.2.30415474F,cg00329615,cg03607117,cg16932827,cg25478614,cg26614073,cg27386529
GSM2334366,94,0.688,0.457,0.688,0.513,0.713,0.246,0.165,0.232,0.553,...,0.597,0.594,0.311,0.026,0.357,0.194,0.835,0.459,0.287,0.180
GSM989882,96,0.504,0.316,0.602,0.502,0.664,0.315,0.103,0.193,0.525,...,0.699,0.704,0.304,0.014,0.377,0.078,0.659,0.562,0.592,0.529
GSM989863,101,0.571,0.435,0.673,0.507,0.634,0.273,0.081,0.148,0.515,...,0.652,0.619,0.257,0.050,0.327,0.096,0.650,0.523,0.506,0.418
GSM1443696,99,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,0.441,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
GSM1069241,99,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,0.441,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM2815345,18,0.785,0.199,0.699,0.651,0.702,0.086,0.047,0.272,0.490,...,0.468,0.292,0.138,0.039,0.598,0.040,0.886,0.319,0.547,0.376
GSM2814886,18,0.701,0.240,0.673,0.554,0.719,0.081,0.054,0.307,0.525,...,0.557,0.381,0.194,0.133,0.634,0.033,0.939,0.357,0.577,0.493
GSM2815335,18,0.730,0.224,0.717,0.612,0.716,0.092,0.052,0.358,0.500,...,0.513,0.347,0.168,0.052,0.650,0.021,0.889,0.327,0.617,0.464
GSM2334524,18,0.678,0.190,0.699,0.673,0.794,0.119,0.096,0.349,0.577,...,0.520,0.338,0.127,0.083,0.731,0.051,0.951,0.325,0.530,0.479


In [90]:
healthy_df['status']=np.zeros(len(healthy_df.index))
cols = list(healthy_df.columns.values)
cols=cols[0:-1]
healthy_df = healthy_df[['status']+cols]
healthy_df

sample_id,status,AGE,cg01074797,cg06784991,cg07181952,cg07202479,cg07418114,cg15728256,cg16015712,cg16290275,...,cg25410668,cg06639320,cg14674720,ch.2.30415474F,cg00329615,cg03607117,cg16932827,cg25478614,cg26614073,cg27386529
GSM2334366,0.0,94,0.688,0.457,0.688,0.513,0.713,0.246,0.165,0.232,...,0.597,0.594,0.311,0.026,0.357,0.194,0.835,0.459,0.287,0.180
GSM989882,0.0,96,0.504,0.316,0.602,0.502,0.664,0.315,0.103,0.193,...,0.699,0.704,0.304,0.014,0.377,0.078,0.659,0.562,0.592,0.529
GSM989863,0.0,101,0.571,0.435,0.673,0.507,0.634,0.273,0.081,0.148,...,0.652,0.619,0.257,0.050,0.327,0.096,0.650,0.523,0.506,0.418
GSM1443696,0.0,99,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
GSM1069241,0.0,99,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM2815345,0.0,18,0.785,0.199,0.699,0.651,0.702,0.086,0.047,0.272,...,0.468,0.292,0.138,0.039,0.598,0.040,0.886,0.319,0.547,0.376
GSM2814886,0.0,18,0.701,0.240,0.673,0.554,0.719,0.081,0.054,0.307,...,0.557,0.381,0.194,0.133,0.634,0.033,0.939,0.357,0.577,0.493
GSM2815335,0.0,18,0.730,0.224,0.717,0.612,0.716,0.092,0.052,0.358,...,0.513,0.347,0.168,0.052,0.650,0.021,0.889,0.327,0.617,0.464
GSM2334524,0.0,18,0.678,0.190,0.699,0.673,0.794,0.119,0.096,0.349,...,0.520,0.338,0.127,0.083,0.731,0.051,0.951,0.325,0.530,0.479


In [94]:
healthy_df

sample_id,status,AGE,cg01074797,cg06784991,cg07181952,cg07202479,cg07418114,cg15728256,cg16015712,cg16290275,...,cg25410668,cg06639320,cg14674720,ch.2.30415474F,cg00329615,cg03607117,cg16932827,cg25478614,cg26614073,cg27386529
GSM2334366,0.0,94,0.688,0.457,0.688,0.513,0.713,0.246,0.165,0.232,...,0.597,0.594,0.311,0.026,0.357,0.194,0.835,0.459,0.287,0.180
GSM989882,0.0,96,0.504,0.316,0.602,0.502,0.664,0.315,0.103,0.193,...,0.699,0.704,0.304,0.014,0.377,0.078,0.659,0.562,0.592,0.529
GSM989863,0.0,101,0.571,0.435,0.673,0.507,0.634,0.273,0.081,0.148,...,0.652,0.619,0.257,0.050,0.327,0.096,0.650,0.523,0.506,0.418
GSM1443696,0.0,99,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
GSM1069241,0.0,99,0.440,0.335,0.636,0.564,0.663,0.329,0.160,0.143,...,0.663,0.579,0.271,0.014,0.385,0.180,0.611,0.495,0.422,0.305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM2815345,0.0,18,0.785,0.199,0.699,0.651,0.702,0.086,0.047,0.272,...,0.468,0.292,0.138,0.039,0.598,0.040,0.886,0.319,0.547,0.376
GSM2814886,0.0,18,0.701,0.240,0.673,0.554,0.719,0.081,0.054,0.307,...,0.557,0.381,0.194,0.133,0.634,0.033,0.939,0.357,0.577,0.493
GSM2815335,0.0,18,0.730,0.224,0.717,0.612,0.716,0.092,0.052,0.358,...,0.513,0.347,0.168,0.052,0.650,0.021,0.889,0.327,0.617,0.464
GSM2334524,0.0,18,0.678,0.190,0.699,0.673,0.794,0.119,0.096,0.349,...,0.520,0.338,0.127,0.083,0.731,0.051,0.951,0.325,0.530,0.479


In [None]:
col=list(alz_blood_all.columns[1:])
for i in col:
    if i not in comb:
        alz_blood_all=alz_blood_all.drop(columns=[i])
        
alz_blood_all       

In [None]:
col_p=list(park_blood_all.columns[1:])
drop_p=[]
for i in col_p:
    if i not in comb:
        drop_p.append(i)

park_blood_all=park_blood_all.drop(columns=drop_p)    
park_blood_all   

### Which features to use:

- Depends on whether the goal is healthy vs unhealthy or by category?

1. Top 100 from each of the 3 groups
2. Features with the greatest discrepensies between healthy and unhealthy
3. 