# Loading brain tissue data

- This script loads brain tissue data for Alzheimer's and Huntington's patients from the EWAS `disease_methylation.txt` file, using the metadata file `sample_disease.txt`.
- Both files should be placed in the directory above this script.
- All subsets of the data are written to the `alzheimers/` and `huntingtons/` subdirectories of the directory containing this script.

### This script produces:

alzheimers/alz_brain_unhealthy_all.csv: csv with the unhealthy data for Alzheimer's patients, brain tissue and all CpG sites

huntingtons/hunt_brain_unhealthy_all.csv: csv with the unhealthy data for Huntington's patients, brain tissue and all CpG sites



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import seaborn as sns
import _pickle as cPickle
import csv

### Getting IDs, diseases and tissues

In [2]:
# Location of the full, tab-separated disease methylation table
data_location = '../disease_methylation.txt'

# Read just the first two data rows (nrows=2) with the first column as the
# index, then transpose so that every sample ID becomes a row; the displayed
# result has the columns 'disease' and 'tissue'.
df = pd.read_csv(
    data_location,
    sep='\t',
    index_col=0,
    nrows=2,
)
unhealthy_meta = df.transpose()
unhealthy_meta

sample_id,disease,tissue
GSM1068826,Alzheimer's disease,brain - cerebellum
GSM1068827,Alzheimer's disease,brain - cerebellum
GSM1068829,Alzheimer's disease,brain - cerebellum
GSM1068832,Alzheimer's disease,brain - cerebellum
GSM1068833,Alzheimer's disease,brain - cerebellum
...,...,...
GSM2190544.1,Ulcerative colitis,whole blood
GSM1871195,Ulcerative colitis,whole blood
GSM990041.1,Ulcerative colitis,whole blood
GSM1506396,Ulcerative colitis,whole blood


In [3]:
# Collect the ID of every metadata row that is not marked as 'control'
unhealthy_ids = []
with open(r'../sample_disease.txt') as f:
    # Whitespace-split each line lazily while the file is still open
    tokenised_rows = (line.split() for line in f)
    # The third-from-last token is compared against 'control' and the first
    # token is the sample ID; [1:-1] strips the first and last character of
    # each (presumably surrounding quotes -- confirm against the file).
    unhealthy_ids.extend(
        fields[0][1:-1]
        for fields in tokenised_rows
        if str(fields[-3][1:-1]) != 'control'
    )


In [4]:
# Remove 'sample_id' from list: the first collected entry comes from the
# first line of the metadata file (presumably its header), not from a sample.
# NOTE(review): the slice is unconditional, so running this cell a second
# time without rebuilding unhealthy_ids also discards a genuine sample ID.
unhealthy_ids=unhealthy_ids[1:]

### Getting the brain sample IDs

In [5]:
# Sample IDs taken from any brain tissue, one list per disease cohort
id_alzheimers_brain = []
id_huntingtons_brain = []

# Maps each disease label of interest to the list collecting its samples
brain_ids_by_disease = {
    "Alzheimer's disease": id_alzheimers_brain,
    "Huntington's disease": id_huntingtons_brain,
}

for sample_id in unhealthy_ids:
    # One metadata lookup per sample, reused for both fields
    record = unhealthy_meta.loc[sample_id]
    bucket = brain_ids_by_disease.get(record['disease'])
    # Every tissue label beginning with "brain" qualifies,
    # e.g. "brain - cerebellum"
    if bucket is not None and str(record['tissue']).startswith("brain"):
        bucket.append(sample_id)
        

### Reading in ages

In [6]:
# Build df_ages: a DataFrame indexed by sample ID with one 'age' column,
# read from the metadata file. Ages are kept as text here; they are
# converted to float where they are consumed.
df_id_ages = []
df_age = []

with open(r'../sample_disease.txt') as f:
    # The first line is not a sample record, so it is skipped
    # (the default keeps an empty file from raising StopIteration)
    next(f, None)
    for row in f:
        # Split the whole line instead of a fixed 50-character prefix, so a
        # long ID or second field can never clip the age token. maxsplit=3
        # stops as soon as the three leading fields are isolated.
        fields = row.split(None, 3)
        # Field 0 is the sample ID with its first and last character removed
        # (presumably surrounding quotes); field 2 is the age.
        df_id_ages.append(fields[0][1:-1])
        df_age.append(fields[2])

# [id, age] pairs, still produced for any code that reads `ages`
ages = [[sample_id, age] for sample_id, age in zip(df_id_ages, df_age)]

df_ages = pd.DataFrame({'id': df_id_ages, 'age': df_age}).set_index('id')
df_ages

Unnamed: 0_level_0,age
id,Unnamed: 1_level_1
GSM1068826,88
GSM1068827,92
GSM1068829,93
GSM1068832,96
GSM1068833,86
...,...
GSM2190544.1,0
GSM1871195,74.4
GSM990041.1,69
GSM1506396,34


### Reading in all Alzheimer's brain data

In [None]:
# Load only the 'sample_id' column and the Alzheimer's brain-tissue sample
# columns from the full methylation table
col = ['sample_id', *id_alzheimers_brain]
alz_brain_all = pd.read_csv(
    '../disease_methylation.txt',
    sep='\t',
    usecols=col,
    low_memory=False,
)

In [None]:
### Adding ages from metadata

In [None]:
# Index by the 'sample_id' column, then transpose so each sample is a row
alz_brain_all = alz_brain_all.set_index('sample_id').T

# Append a zero-filled AGE placeholder (one entry per Alzheimer's brain ID)
alz_brain_all['AGE'] = np.zeros(len(id_alzheimers_brain))

# Move AGE to the front: every column except the last (AGE) follows it
cols = alz_brain_all.columns[:-1].tolist()
alz_brain_all = alz_brain_all[['AGE', *cols]]
alz_brain_all


In [None]:
# For each Alzheimer's brain sample, copy its age from df_ages into the AGE
# column. Samples with no entry in df_ages (KeyError) or with an age that
# cannot be parsed as a number (ValueError) are dropped from the dataframe.
ids_without_age = []
for sample_id in id_alzheimers_brain:
    try:
        age = float(df_ages.loc[sample_id]['age'])
    except (KeyError, ValueError):
        ids_without_age.append(sample_id)
        continue
    # A single .loc[row, column] assignment writes into the frame itself;
    # the chained form .loc[row][column] = ... can end up modifying only a
    # temporary copy, leaving AGE at its placeholder value.
    alz_brain_all.loc[sample_id, 'AGE'] = age

# Drop every age-less sample in one call rather than rebuilding the
# (large) frame once per dropped sample
alz_brain_all = alz_brain_all.drop(ids_without_age)


In [None]:
# Save to alzheimers/alz_brain_unhealthy_all.csv, keeping the sample IDs as
# the index column.
import os

# to_csv does not create missing folders, so make sure the target exists
os.makedirs('alzheimers', exist_ok=True)
alz_brain_all.to_csv('alzheimers/alz_brain_unhealthy_all.csv', encoding='utf-8', index=True)

### Reading in all Huntington's brain data

In [None]:
# Load only the 'sample_id' column and the Huntington's brain-tissue sample
# columns from the full methylation table
col_hunt = ['sample_id', *id_huntingtons_brain]
hunt_brain_all = pd.read_csv(
    '../disease_methylation.txt',
    sep='\t',
    usecols=col_hunt,
    low_memory=False,
)
# Index by the 'sample_id' column, then transpose so each sample is a row
hunt_brain_all = hunt_brain_all.set_index('sample_id').T


In [None]:
# Append a zero-filled AGE placeholder (one entry per Huntington's brain ID)
hunt_brain_all['AGE'] = np.zeros(len(id_huntingtons_brain))

# Move AGE to the front: every column except the last (AGE) follows it
cols = hunt_brain_all.columns[:-1].tolist()
hunt_brain_all = hunt_brain_all[['AGE', *cols]]


In [None]:
# For each Huntington's brain sample, copy its age from df_ages into the AGE
# column. Samples with no entry in df_ages (KeyError) or with an age that
# cannot be parsed as a number (ValueError) are dropped from the dataframe.
ids_without_age = []
for sample_id in id_huntingtons_brain:
    try:
        age = float(df_ages.loc[sample_id]['age'])
    except (KeyError, ValueError):
        ids_without_age.append(sample_id)
        continue
    # A single .loc[row, column] assignment writes into the frame itself;
    # the chained form .loc[row][column] = ... can end up modifying only a
    # temporary copy, leaving AGE at its placeholder value.
    hunt_brain_all.loc[sample_id, 'AGE'] = age

# Drop every age-less sample in one call rather than rebuilding the
# (large) frame once per dropped sample
hunt_brain_all = hunt_brain_all.drop(ids_without_age)

In [None]:
# Save to huntingtons/hunt_brain_unhealthy_all.csv, keeping the sample IDs
# as the index column.
import os

# to_csv does not create missing folders, so make sure the target exists
os.makedirs('huntingtons', exist_ok=True)
hunt_brain_all.to_csv('huntingtons/hunt_brain_unhealthy_all.csv', encoding='utf-8', index=True)