# Get shared healthy and unhealthy CpGs

- This notebook produces a cut of the Alzheimer's and Huntington's data for the top 100 healthy CpG sites that are also present in the unhealthy cohort
- Healthy CpG list was generated by XGBoost and found in healthy/cpgs_XGboost_brain_ranked
- The file produces brain_shared_healthy_unhealthy: a list of brain CpGs in healthy top 100 present in unhealthy cohort
- This notebook also writes alzheimers/alz_brain_top_55.csv and huntingtons/hunt_brain_top_55.csv (AGE plus the 55 shared CpGs for each cohort)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import seaborn as sns
import _pickle as cPickle
import csv

In [3]:
# Load the pickled healthy-brain CpG ranking (generated by the XGBoost step)
with open(r"healthy/cpgs_XGboost_brain_ranked", "rb") as ranked_file:
    healthy_cpg_rank = cPickle.load(ranked_file)

In [7]:
# Read every line of the disease methylation file and keep only its first
# 10 characters (presumably the CpG ID, e.g. "cg00807959" — TODO confirm
# that the file's header line, if any, is harmless here)
allcpgs = []
with open(r'../disease_methylation.txt') as disease_file:
    allcpgs.extend(line[:10] for line in disease_file)

In [11]:
# Keep the top-100 healthy CpGs (in rank order) that also occur in the disease data.
# Membership is tested against a set built once, instead of scanning the whole
# `allcpgs` list for each of the 100 candidates.
disease_cpg_set = set(allcpgs)
brain_shared_healthy_unhealthy = [
    cpg for cpg in healthy_cpg_rank[0:100] if cpg in disease_cpg_set
]

In [12]:
# Number of top-100 healthy CpGs that were also found in the disease data
len(brain_shared_healthy_unhealthy)

55

In [13]:
# Persist the shared CpG list to disk as a pickle
with open('brain_shared_healthy_unhealthy', 'wb') as out_file:
    cPickle.dump(brain_shared_healthy_unhealthy, out_file)

### Alzheimer's data

In [14]:
# Load the full Alzheimer's table; the first CSV column becomes the row index.
# NOTE: despite the `_55` suffix, this frame is read from the "_all" file and
# is only narrowed to the shared CpGs in the following cell.
alz_brain_55 = pd.read_csv(
    'alzheimers/alz_brain_unhealthy_all.csv',
    index_col=0,
)

In [18]:
# Keep AGE plus the shared CpGs; drop every other column.
# AGE is kept by name rather than by slicing off the first entry of the drop
# list, which only worked while AGE happened to be the first non-shared column.
shared_cpgs = set(brain_shared_healthy_unhealthy)
alz_cols = list(alz_brain_55.columns)
alz_columns_to_drop = [
    col for col in alz_cols if col != 'AGE' and col not in shared_cpgs
]
alz_brain = alz_brain_55.drop(columns=alz_columns_to_drop)

In [19]:
# Write the filtered Alzheimer's table (row index included) to CSV
alz_brain.to_csv(
    'alzheimers/alz_brain_top_55.csv',
    encoding='utf-8',
    index=True,
)

In [20]:
# Display the filtered Alzheimer's frame (AGE plus the shared CpG columns)
alz_brain

Unnamed: 0,AGE,cg00807959,cg01066472,cg13806070,cg15907146,cg17104258,cg24441324,cg22454769,cg23606718,cg24079702,...,cg19622662,cg23595055,cg04739123,cg16367511,cg18008766,cg19451698,cg04834794,cg07303143,cg21182694,cg23352942
GSM1068826,88.0,0.052,0.512,0.113,0.679,0.010,0.846,0.275,0.078,0.198,...,0.023,0.972,0.105,0.044,0.135,0.120,0.029,0.158,0.243,0.407
GSM1068827,92.0,0.103,0.519,0.142,0.532,0.015,0.811,0.275,0.030,0.187,...,0.018,0.959,0.115,0.125,0.172,0.136,0.052,0.106,0.265,0.444
GSM1068829,93.0,0.064,0.620,0.154,0.535,0.024,0.819,0.307,0.065,0.209,...,0.015,0.968,0.053,0.193,0.229,0.161,0.032,,0.329,0.480
GSM1068832,96.0,0.049,0.549,0.133,0.575,0.019,0.871,0.289,0.028,0.189,...,0.009,0.980,0.058,0.290,0.162,0.089,0.024,,0.317,0.422
GSM1068833,86.0,0.081,0.287,0.089,0.568,0.021,0.863,0.165,0.044,0.139,...,0.015,0.981,0.062,0.189,0.116,0.083,0.022,0.107,0.236,0.431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM2809060,80.0,0.364,0.635,0.144,0.692,0.189,0.910,0.407,0.202,0.274,...,0.115,0.950,0.069,0.153,0.111,0.083,0.081,,0.184,0.589
GSM2809061,70.0,0.366,0.520,0.118,0.554,0.394,0.887,0.383,0.221,0.249,...,0.311,0.910,0.075,0.117,0.090,0.058,0.054,,0.214,0.592
GSM2809062,75.0,0.328,0.321,0.124,0.610,0.134,0.892,0.400,0.130,0.253,...,0.101,0.943,0.031,0.161,0.072,0.056,0.064,0.285,0.209,0.610
GSM2809063,79.0,0.356,0.537,0.145,0.668,0.143,0.895,0.390,0.166,0.251,...,0.097,0.935,0.051,0.117,0.110,0.054,0.074,,0.220,0.540


### Huntington's data

In [21]:
# Load the full Huntington's table; the first CSV column becomes the row index.
# NOTE: despite the `_55` suffix, this frame is read from the "_all" file and
# is only narrowed to the shared CpGs in the following cell.
hunt_brain_55 = pd.read_csv(
    'huntingtons/hunt_brain_unhealthy_all.csv',
    index_col=0,
)

In [22]:
# Keep AGE plus the shared CpGs; drop every other column.
# AGE is kept by name rather than by slicing off the first entry of the drop
# list, which only worked while AGE happened to be the first non-shared column.
shared_cpgs = set(brain_shared_healthy_unhealthy)
hunt_cols = list(hunt_brain_55.columns)
hunt_columns_to_drop = [
    col for col in hunt_cols if col != 'AGE' and col not in shared_cpgs
]
hunt_brain = hunt_brain_55.drop(columns=hunt_columns_to_drop)

In [23]:
# Write the filtered Huntington's table (row index included) to CSV
hunt_brain.to_csv(
    'huntingtons/hunt_brain_top_55.csv',
    encoding='utf-8',
    index=True,
)

In [24]:
# Display the filtered Huntington's frame (AGE plus the shared CpG columns)
hunt_brain

Unnamed: 0,AGE,cg00807959,cg01066472,cg13806070,cg15907146,cg17104258,cg24441324,cg22454769,cg23606718,cg24079702,...,cg19622662,cg23595055,cg04739123,cg16367511,cg18008766,cg19451698,cg04834794,cg07303143,cg21182694,cg23352942
GSM1871451,83.0,0.255,0.432,0.147,0.640,0.179,0.926,0.426,0.159,0.308,...,0.147,0.981,0.027,0.166,0.082,0.096,0.078,0.270,0.221,0.571
GSM1871457,48.0,0.257,0.554,0.032,0.648,0.173,0.909,0.311,0.112,0.190,...,0.158,0.968,0.050,0.126,0.047,0.048,0.030,0.204,0.227,0.571
GSM1871498,51.0,0.282,0.248,0.143,0.550,0.244,0.920,0.332,0.168,0.245,...,0.210,0.914,0.022,0.170,0.057,0.047,0.049,0.306,0.239,0.603
GSM1871545,65.0,0.335,0.470,0.114,0.617,0.178,0.888,0.352,0.178,0.243,...,0.146,0.954,0.080,0.177,0.063,0.077,0.051,0.333,0.311,0.597
GSM1871590,67.0,0.316,0.586,0.056,0.595,0.189,0.881,0.402,0.160,0.245,...,0.174,0.964,0.037,0.189,0.051,0.056,0.040,0.241,0.186,0.575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1871815,56.0,0.231,0.244,0.117,0.587,0.175,0.919,0.265,0.140,0.173,...,0.077,0.959,0.048,0.112,0.052,0.038,0.025,0.161,0.181,0.425
GSM1871849,62.0,0.212,0.433,0.125,0.667,0.044,0.845,0.155,0.147,0.215,...,0.062,0.969,0.049,0.141,0.066,0.044,0.063,0.205,0.211,0.420
GSM1871852,58.0,0.250,0.449,0.120,0.623,0.212,0.917,0.273,0.130,0.182,...,0.159,0.935,0.035,0.141,0.069,0.048,0.043,0.233,0.176,0.496
GSM1871860,91.0,0.287,0.657,0.129,0.649,0.017,0.902,0.324,0.176,0.238,...,0.055,0.983,0.052,0.154,0.069,0.060,0.057,0.276,0.237,0.384
