# Data Mining Laboratory
Group 4: Damiano Chini, Riccardo Gilmozzi, Gianmarco Piccinno & Alessandro Rizzuto

This notebook performs the preprocessing steps described in the paper, namely:

1) Probes containing missing values are excluded from the analysis. 

2) Probes are mapped to Entrez ID labels if they are available in the associated platform.

3) Values corresponding to raw expression counts or gene expression intensity are log2 transformed (if necessary).

4) Probes mapping to the same Entrez ID label are averaged out. 

5) Probes that cannot be mapped to a unique Entrez ID label are excluded from the analysis, as well as those that cannot be mapped to any Entrez ID label at all.

6) We apply a simple $L_1$ normalization in linear space, imposing that the sum of expression of all genes is constant among samples.

After these steps, each data set or batch is represented by a single expression matrix $X$. Each entry $X_{ij}$ represents the $\log_2$ of the expression intensity of gene $i$ in sample $j$.

In [38]:
import GEOparse
import pandas as pd
import numpy as np
from  functools import *
import re

In [4]:
# GEO series accessions of the four batches, in batch order. Each SOFT family
# file is read from ./data/<accession>_family.soft.gz.
list_data = ["GSE2508", "GSE26637", "GSE27949", "GSE48964"]
# Parse the files in list order so that gse1..gse4 correspond to batches 1..4.
gse1, gse2, gse3, gse4 = [
    GEOparse.get_GEO(filepath="./data/{}_family.soft.gz".format(accession))
    for accession in list_data
]

09-Oct-2017 16:22:19 INFO GEOparse - Parsing ./data/GSE2508_family.soft.gz: 
09-Oct-2017 16:22:19 DEBUG GEOparse - DATABASE: GeoMiame
09-Oct-2017 16:22:19 DEBUG GEOparse - SERIES: GSE2508
09-Oct-2017 16:22:19 DEBUG GEOparse - PLATFORM: GPL91
09-Oct-2017 16:22:20 DEBUG GEOparse - PLATFORM: GPL92
09-Oct-2017 16:22:20 DEBUG GEOparse - PLATFORM: GPL93
09-Oct-2017 16:22:20 DEBUG GEOparse - PLATFORM: GPL94
09-Oct-2017 16:22:20 DEBUG GEOparse - PLATFORM: GPL95
09-Oct-2017 16:22:21 DEBUG GEOparse - PLATFORM: GPL8300
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47224
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47225
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47226
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47227
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47228
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47229
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47230
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: GSM47231
09-Oct-2017 16:22:21 DEBUG GEOparse - SAMPLE: 

# Batch 1 Preprocessing
Batch 1 is composed of five different datasets that use five different Affymetrix platforms; each one represents a technical replicate.

In [7]:
# Platform (GPL) accessions attached to the batch 1 series.
plats_1 = [gpl_id for gpl_id in gse1.gpls]
plats_1

['GPL91', 'GPL92', 'GPL93', 'GPL94', 'GPL95', 'GPL8300']

In [9]:
# Sample (GSM) accessions of batch 1.
samples1 = [gsm_id for gsm_id in gse1.gsms]
# Show the `columns` attribute of the first sample.
first_gsm = gse1.gsms[samples1[0]]
first_gsm.columns

Unnamed: 0,description
ID_REF,
VALUE,'signal' a measure of the abundance of a trans...
ABS_CALL,the call in an absolute analysis that indicate...


In [13]:
# Count the samples of each phenotype group (Lean/Obese x F/M) on every
# batch 1 platform, using the sample titles.
samples1 = gse1.phenotype_data[["platform_id", "title"]]
# Group by a scalar key (not a one-element list) so that get_group() accepts a
# plain platform id on every pandas version.
sample1 = samples1.groupby("platform_id")

# Short label -> text searched for in the sample title.
phenotype_patterns = {"LF": "Lean F", "OF": "Obese F", "LM": "Lean M", "OM": "Obese M"}

d = {}
for platform in plats_1:
    platform_samples = sample1.get_group(platform)
    print("\nPlatform: "+str(platform)+"\n", platform_samples)
    # Count matches title by title instead of on one concatenated string, so a
    # match can never straddle two neighbouring titles.
    counts = {
        label: int(platform_samples["title"].str.count(pattern).sum())
        for label, pattern in phenotype_patterns.items()
    }
    print("LF: ", counts["LF"], "\nOF: ", counts["OF"],
          "\nLM: ", counts["LM"], "\nOM: ", counts["OM"])
    d[platform] = counts

# Rows are phenotype groups, columns are platforms.
df = pd.DataFrame(d); print(df)
# Only the last expression of a cell is rendered, so print the per-phenotype
# totals explicitly and leave the per-platform totals as the cell output.
print(df.sum(axis=1))
df.sum(axis=0)


Platform: GPL91
          platform_id            title
GSM47224       GPL91   Lean F 01 subA
GSM47225       GPL91   Lean F 02 subA
GSM47226       GPL91   Lean F 03 subA
GSM47227       GPL91   Lean F 04 subA
GSM47228       GPL91   Lean F 05 subA
GSM47234       GPL91   Lean M 01 subA
GSM47235       GPL91   Lean M 02 subA
GSM47242       GPL91   Lean M 03 subA
GSM47256       GPL91   Lean M 04 subA
GSM47269       GPL91   Lean M 05 subA
GSM47317       GPL91  Obese F 01 subA
GSM47319       GPL91  Obese F 02 subA
GSM47321       GPL91  Obese F 03 subA
GSM47322       GPL91  Obese F 04 subA
GSM47323       GPL91  Obese F 05 subA
GSM47329       GPL91  Obese M 01 subA
GSM47330       GPL91  Obese M 02 subA
GSM47331       GPL91  Obese M 03 subA
GSM47332       GPL91  Obese M 04 subA

Platform: GPL91
 GSM47224     Lean F 01 subA
GSM47225     Lean F 02 subA
GSM47226     Lean F 03 subA
GSM47227     Lean F 04 subA
GSM47228     Lean F 05 subA
GSM47234     Lean M 01 subA
GSM47235     Lean M 02 subA
GSM47242

GPL8300    20
GPL91      19
GPL92      39
GPL93      39
GPL94      39
GPL95      39
dtype: int64

In [21]:
# One single-column ("title") frame per batch 1 platform; its index lists the
# samples that belong to that platform.
t8300, t91, t92, t93, t94, t95 = (
    sample1.get_group(gpl_id)[["title"]]
    for gpl_id in ("GPL8300", "GPL91", "GPL92", "GPL93", "GPL94", "GPL95")
)

In [22]:
# Pivot the VALUE column of every batch 1 sample once and reuse the result;
# the original rebuilt the same pivot table for each of the six platforms.
values1 = gse1.pivot_samples('VALUE')

# Keep, for every platform, only the columns of the samples run on it.
samples8300 = values1[list(t8300.index)]
samples91 = values1[list(t91.index)]
samples92 = values1[list(t92.index)]
samples93 = values1[list(t93.index)]
samples94 = values1[list(t94.index)]
samples95 = values1[list(t95.index)]

In [23]:
# Attach the Entrez gene ID of every probe using the GPL8300 platform table.
gpl8300_map = gse1.gpls['GPL8300'].table[["ID", "ENTREZ_GENE_ID"]]
samples8300_annotated = (
    samples8300.reset_index()
    .merge(gpl8300_map, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples8300_annotated = samples8300_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_8300 = samples8300_annotated.ENTREZ_GENE_ID.str.contains("///")
samples8300_annotated = samples8300_annotated[~multi_gene_8300]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples8300_annotated = samples8300_annotated.groupby("ENTREZ_GENE_ID").median()
samples8300_annotated.shape

(8592, 20)

In [31]:
# Attach the Entrez gene ID of every probe using the GPL91 platform table.
gpl91_map = gse1.gpls['GPL91'].table[["ID", "ENTREZ_GENE_ID"]]
samples91_annotated = (
    samples91.reset_index()
    .merge(gpl91_map, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples91_annotated = samples91_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_91 = samples91_annotated.ENTREZ_GENE_ID.str.contains("///")
samples91_annotated = samples91_annotated[~multi_gene_91]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples91_annotated = samples91_annotated.groupby("ENTREZ_GENE_ID").median()
samples91_annotated.shape


(8591, 19)

In [25]:
# Attach the Entrez gene ID of every probe using the GPL92 platform table.
gpl92_map = gse1.gpls['GPL92'].table[["ID", "ENTREZ_GENE_ID"]]
samples92_annotated = (
    samples92.reset_index()
    .merge(gpl92_map, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples92_annotated = samples92_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_92 = samples92_annotated.ENTREZ_GENE_ID.str.contains("///")
samples92_annotated = samples92_annotated[~multi_gene_92]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples92_annotated = samples92_annotated.groupby("ENTREZ_GENE_ID").median()
samples92_annotated.shape

(6547, 39)

In [26]:
# Attach the Entrez gene ID of every probe using the GPL93 platform table.
gpl93_map = gse1.gpls['GPL93'].table[["ID", "ENTREZ_GENE_ID"]]
samples93_annotated = (
    samples93.reset_index()
    .merge(gpl93_map, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples93_annotated = samples93_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_93 = samples93_annotated.ENTREZ_GENE_ID.str.contains("///")
samples93_annotated = samples93_annotated[~multi_gene_93]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples93_annotated = samples93_annotated.groupby("ENTREZ_GENE_ID").median()
samples93_annotated.shape

(5229, 39)

In [27]:
# Attach the Entrez gene ID of every probe using the GPL94 platform table.
gpl94_map = gse1.gpls['GPL94'].table[["ID", "ENTREZ_GENE_ID"]]
samples94_annotated = (
    samples94.reset_index()
    .merge(gpl94_map, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples94_annotated = samples94_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_94 = samples94_annotated.ENTREZ_GENE_ID.str.contains("///")
samples94_annotated = samples94_annotated[~multi_gene_94]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples94_annotated = samples94_annotated.groupby("ENTREZ_GENE_ID").median()
samples94_annotated.shape

(3905, 39)

In [28]:
# Attach the Entrez gene ID of every probe using the GPL95 platform table.
gpl95_map = gse1.gpls['GPL95'].table[["ID", "ENTREZ_GENE_ID"]]
samples95_annotated = (
    samples95.reset_index()
    .merge(gpl95_map, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples95_annotated = samples95_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_95 = samples95_annotated.ENTREZ_GENE_ID.str.contains("///")
samples95_annotated = samples95_annotated[~multi_gene_95]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples95_annotated = samples95_annotated.groupby("ENTREZ_GENE_ID").median()
samples95_annotated.head()

Unnamed: 0_level_0,GSM47823,GSM47824,GSM47825,GSM47826,GSM47827,GSM47828,GSM47829,GSM47830,GSM47831,GSM47832,...,GSM47852,GSM47853,GSM47854,GSM47855,GSM47856,GSM47857,GSM47858,GSM47859,GSM47860,GSM47861
ENTREZ_GENE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,410.9,1576.6,456.0,523.2,1597.5,809.1,922.2,1528.2,260.8,629.0,...,258.2,1282.4,1188.5,1861.8,1023.3,872.8,314.8,409.2,1087.5,187.3
100009676,972.1,765.0,931.5,567.3,428.1,3066.2,808.5,2089.9,752.5,622.1,...,1320.4,1423.2,1894.8,1907.5,1646.7,985.2,571.6,3064.4,1578.5,2390.9
10001,907.4,746.9,308.0,481.1,352.5,744.6,697.1,205.3,534.7,804.6,...,394.8,650.8,878.8,1001.7,908.5,429.0,148.1,261.9,571.9,128.4
10003,601.7,350.7,992.6,1495.1,1187.6,972.2,744.9,761.9,252.3,1207.1,...,507.2,220.7,85.1,855.1,200.7,269.0,620.3,734.1,74.3,561.7
10004,4765.8,6540.3,6495.5,6324.6,5901.0,5169.8,4312.7,4194.8,4806.7,6314.8,...,5840.1,5865.8,3593.2,3630.7,5818.2,5956.4,6243.0,7691.1,6141.6,5837.7


In [32]:
# Number of Entrez IDs present in all six gene-level batch 1 tables.
gene_sets_1 = [
    set(frame.index)
    for frame in (
        samples8300_annotated,
        samples91_annotated,
        samples92_annotated,
        samples93_annotated,
        samples94_annotated,
        samples95_annotated,
    )
]
len(set.intersection(*gene_sets_1))

72

In [33]:
# The six gene-level batch 1 tables, one per platform.
s = [
    samples8300_annotated,
    samples91_annotated,
    samples92_annotated,
    samples93_annotated,
    samples94_annotated,
    samples95_annotated,
]

In [39]:
def _outer_join_on_index(left, right):
    """Outer-join two frames on their indexes, keeping rows found in either one."""
    return pd.merge(left, right, left_index=True, right_index=True, how='outer')


# Fold the frames in `s` left to right into a single table.
df_final = reduce(_outer_join_on_index, s)

In [41]:
# Platform and title of every batch 1 sample; the cell displays how many there are.
# (The original line ended in a dangling "-", which is a SyntaxError.)
samples1 = gse1.phenotype_data[["platform_id", "title"]]
len(samples1)


195

In [42]:
# Build the batch 1 sample annotation table from the sample titles.
# Work on an explicit copy: assigning new columns to a slice of `samples1`
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
s = samples1[['title', 'platform_id']].copy()
# Keep only the part of the title before the first match of ' sub*'
# (e.g. "Lean F 01 subA" -> "Lean F 01").
# NOTE(review): the pattern ' sub*' also matches a bare ' su'; it was probably
# meant as a literal ' sub' - confirm before tightening it.
s['title'] = s['title'].apply(lambda x: re.split(' sub*', x)[0])
# The shortened title is "<clinical status> <gender> <replicate>".
title_parts = s['title'].str.split(' ')
s['clinical_status'] = title_parts.str[0]
s['gender'] = title_parts.str[1]
s['replicate'] = title_parts.str[2]
# Samples sharing a shortened title are grouped together.
s_gr = s.groupby('title')

In [44]:
# Average the expression values of all samples that share a title.
# (The original first assigned `dff = s_gr.first()` and overwrote it on the
# next line; that dead assignment is removed.)

# Samples as rows, genes as columns, so the table aligns with `s` on sample id.
dff = df_final.T
# Attach the title to every sample, then drop the annotation columns that
# must not take part in the averaging.
dff = s.merge(dff, left_index=True, right_index=True).drop(
    ['platform_id', 'clinical_status', 'gender', 'replicate'], axis=1
)
# One row per title: the mean of every gene over the samples with that title.
dff_grouped = dff.groupby(['title']).mean()
dff_grouped.shape

#dff_grouped        :expression series               
#s                  :annotation series

(39, 18281)

In [45]:
# Write the batch 1 expression table as a tab-separated file.
# Pass the path so pandas opens the file itself: a handle opened without
# newline='' can end up with doubled carriage returns on Windows.
dff_grouped.to_csv('./output/batch1_exprs.txt', sep='\t')

In [48]:
# Write the batch 1 sample annotation table as a tab-separated file.
# Pass the path so pandas opens the file itself: a handle opened without
# newline='' can end up with doubled carriage returns on Windows.
s.to_csv('./output/batch1_ann.txt', sep='\t')

# Batch 2 Preprocessing

In [50]:
# First platform (GPL) accession listed for the batch 2 series.
batch2_platforms = list(gse2.gpls)
plats_2 = batch2_platforms[0]
plats_2

'GPL570'

In [51]:
# Phenotype columns kept for batch 2: stimulation and resistance status.
pheno2_columns = [
    "characteristics_ch1.2.stimulation",
    "characteristics_ch1.3.resistance status",
]
samples2 = gse2.phenotype_data[pheno2_columns]
# Display the number of batch 2 samples.
len(samples2)

20

In [52]:
# Give the phenotype columns short names.
short_names_2 = {
    'characteristics_ch1.2.stimulation': 'fasting_status',
    'characteristics_ch1.3.resistance status': 'insulin_status',
}
samples2 = samples2.rename(columns=short_names_2)
# Derive the cbmi label from insulin status: 'resistant' -> 'obese',
# every other value -> 'lean'.
samples2['cbmi'] = [
    'obese' if status == 'resistant' else 'lean'
    for status in samples2['insulin_status']
]
samples2.head()

Unnamed: 0,fasting_status,insulin_status,cbmi
GSM655603,fasting,resistant,obese
GSM655604,fasting,resistant,obese
GSM655605,fasting,resistant,obese
GSM655606,fasting,resistant,obese
GSM655607,fasting,resistant,obese


In [58]:
# Load two pickled objects for GSE26637 and preview the second one.
# NOTE(review): presumably precomputed expression ("geno") and phenotype
# ("pheno") tables - confirm where these files come from. Unpickling runs
# arbitrary code, so only load pickles from a trusted source.
geno_pickle_path = "./data/GSE26637_geno.p"
pheno_pickle_path = "./data/GSE26637_pheno.p"
h = pd.read_pickle(geno_pickle_path)
q = pd.read_pickle(pheno_pickle_path)
q.head()

Unnamed: 0,cbmi,insulin_status,fasting_status
GSM655608,lean,sensitive,fasting
GSM655609,lean,sensitive,fasting
GSM655610,lean,sensitive,fasting
GSM655611,lean,sensitive,fasting
GSM655612,lean,sensitive,fasting


In [59]:

# VALUE of every probe for the batch 2 samples (probes as rows, samples as columns).
batch2_gsm_ids = list(samples2.index)
samples570 = gse2.pivot_samples('VALUE')[batch2_gsm_ids]

# Attach the Entrez gene ID of every probe using the GPL570 platform table.
gpl570_map_2 = gse2.gpls['GPL570'].table[["ID", "ENTREZ_GENE_ID"]]
samples570_annotated = (
    samples570.reset_index()
    .merge(gpl570_map_2, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples570_annotated = samples570_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_570 = samples570_annotated.ENTREZ_GENE_ID.str.contains("///")
samples570_annotated = samples570_annotated[~multi_gene_570]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples570_annotated = samples570_annotated.groupby("ENTREZ_GENE_ID").median()
samples570_annotated.shape

(20486, 20)

In [60]:
# Write the batch 2 outputs as tab-separated files. Paths are passed directly
# so pandas opens the files itself: a handle opened without newline='' can end
# up with doubled carriage returns on Windows.

# Expression table, transposed so that samples are rows and genes are columns.
samples570_annotated.T.to_csv('./output/batch2_exprs.txt', sep='\t')

# Sample annotation table.
samples2.to_csv('./output/batch2_ann.txt', sep='\t')

# Batch 3 Preprocessing

In [62]:
# First platform (GPL) accession listed for the batch 3 series.
batch3_platforms = list(gse3.gpls)
plats_3 = batch3_platforms[0]
plats_3

'GPL570'

In [63]:
def _bmi_category(bmi):
    """Map a BMI value (anything float() accepts) to a class label.

    > 30 -> "obese"; (25, 30] -> "overweight"; (18.5, 25] -> "lean";
    anything else (18.5 or below, or NaN) -> "STRANGE".
    """
    value = float(bmi)
    if value > 30:
        return "obese"
    if value > 25:
        return "overweight"
    if value > 18.5:
        return "lean"
    return "STRANGE"


# One-column table with the BMI of every batch 3 sample, renamed to "bmi".
samples3 = pd.DataFrame(gse3.phenotype_data["characteristics_ch1.1.bmi"])
samples3 = samples3.rename(columns={"characteristics_ch1.1.bmi": "bmi"})
# Add the categorical BMI class.
samples3["cbmi"] = samples3["bmi"].apply(_bmi_category)
samples3.shape

(33, 2)

In [64]:
# VALUE of every probe for the batch 3 samples (probes as rows, samples as columns).
batch3_gsm_ids = samples3.index.tolist()
samples3_570 = gse3.pivot_samples('VALUE')[batch3_gsm_ids]
samples3_570.head()

name,GSM691122,GSM691123,GSM691124,GSM691125,GSM691126,GSM691127,GSM691128,GSM691129,GSM691130,GSM691131,...,GSM691145,GSM691146,GSM691147,GSM691148,GSM691149,GSM691150,GSM691151,GSM691152,GSM691153,GSM691154
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007_s_at,7.004583,7.6595,7.430517,7.771238,7.68304,7.879629,7.184948,7.124926,7.498073,7.687856,...,7.767611,7.76975,7.75978,7.925825,7.588576,8.013171,8.301649,8.516437,7.764079,7.935316
1053_at,5.28503,5.253187,5.492071,5.738101,5.643463,5.356738,5.775912,5.640174,5.262772,5.70477,...,5.200273,5.121898,5.65597,5.33495,5.47995,5.342561,5.664976,5.385453,5.530938,5.611537
117_at,5.607604,5.476518,5.753545,5.453142,6.431755,4.901203,6.298175,6.155661,5.073436,5.634631,...,5.401797,5.456835,5.242564,5.484471,4.999941,5.076155,5.034425,5.414035,5.414752,5.142741
121_at,6.942938,7.450646,7.004481,7.251976,6.931031,7.162552,6.570954,7.042676,6.968916,7.077937,...,6.758132,7.080725,7.290405,6.689023,6.793457,6.604402,7.339816,7.106812,6.752074,6.85442
1255_g_at,2.84026,2.948136,2.82735,2.663054,2.610417,2.978128,2.493905,2.745614,2.742668,2.670149,...,2.837239,2.766161,2.756533,2.719723,2.866922,2.704116,2.713577,2.726565,2.589001,2.567418


In [65]:

# Attach the Entrez gene ID of every probe using the GPL570 platform table.
gpl570_map_3 = gse3.gpls['GPL570'].table[["ID", "ENTREZ_GENE_ID"]]
samples3_570_annotated = (
    samples3_570.reset_index()
    .merge(gpl570_map_3, left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Discard probes that have no Entrez ID.
samples3_570_annotated = samples3_570_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Discard probes whose annotation contains "///" (more than one gene assigned).
multi_gene_3_570 = samples3_570_annotated.ENTREZ_GENE_ID.str.contains("///")
samples3_570_annotated = samples3_570_annotated[~multi_gene_3_570]
# Collapse probes sharing an Entrez ID into one row holding the per-sample median.
samples3_570_annotated = samples3_570_annotated.groupby("ENTREZ_GENE_ID").median()
samples3_570_annotated.head()

Unnamed: 0_level_0,GSM691122,GSM691123,GSM691124,GSM691125,GSM691126,GSM691127,GSM691128,GSM691129,GSM691130,GSM691131,...,GSM691145,GSM691146,GSM691147,GSM691148,GSM691149,GSM691150,GSM691151,GSM691152,GSM691153,GSM691154
ENTREZ_GENE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.202563,3.853392,4.005799,3.775287,3.988817,4.074538,3.789106,3.985469,4.202136,3.929663,...,4.035109,4.505537,3.688537,4.021974,3.859197,4.016395,3.853851,3.905869,3.933165,3.731458
10,3.611358,3.900851,3.670345,3.98903,3.699571,4.050322,3.687475,3.788186,4.230811,4.020388,...,4.260082,3.755154,4.273325,3.764177,3.931262,3.856003,3.631455,3.812028,4.10696,4.014231
100,5.526316,5.553305,5.527924,5.899559,5.844783,5.752965,6.008601,5.560185,6.149886,6.078632,...,6.150959,5.858245,6.099553,6.562532,6.097076,5.970264,5.229485,5.441216,6.34484,6.026648
1000,3.781192,3.88183,3.518877,3.740666,3.540397,3.609309,3.587624,3.610829,3.501181,3.365197,...,3.53761,3.925105,3.738994,3.721691,3.315593,3.471609,3.578582,3.718983,3.683929,3.618684
10000,4.521154,4.355984,4.752987,4.277416,4.410266,4.467307,4.573366,4.229275,4.221578,4.362966,...,4.302391,4.348741,4.414892,4.134308,4.327611,4.488344,4.22999,4.791674,4.489651,4.827196


In [67]:
# Write the batch 3 outputs as tab-separated files. Paths are passed directly
# so pandas opens the files itself: a handle opened without newline='' can end
# up with doubled carriage returns on Windows.

# Expression table, transposed so that samples are rows and genes are columns.
samples3_570_annotated.T.to_csv('./output/batch3_exprs.txt', sep='\t')

# Sample annotation table (bmi, cbmi). Batches 1, 2 and 4 each save their
# annotation; batch 3 built `samples3` but never wrote it out.
samples3.to_csv('./output/batch3_ann.txt', sep='\t')

# Batch 4 Preprocessing

In [68]:
# First platform (GPL) accession listed for the batch 4 series.
batch4_platforms = list(gse4.gpls)
plats_4 = batch4_platforms[0]
plats_4

'GPL6244'

In [92]:
# ---------------------------------------------------------------------
# Batch 4 sample annotations
# ---------------------------------------------------------------------

# One-column table built from the source name of every sample, renamed to "cbmi".
samples4 = pd.DataFrame(gse4.phenotype_data["source_name_ch1"])
samples4 = samples4.rename(columns={"source_name_ch1": "cbmi"})
# Keep the second space-separated token of the source name as the label
# (the preview shows values such as "obese" and "non-obese").
samples4["cbmi"] = [source.split(' ')[1] for source in samples4["cbmi"]]
print(samples4.head())
print(len(samples4))
print(samples4.shape)


                 cbmi
GSM1187673      obese
GSM1187674      obese
GSM1187675      obese
GSM1187676  non-obese
GSM1187677  non-obese
6
(6, 1)


In [91]:
             
# Write the batch 4 sample annotation table as a tab-separated file.
# Pass the path so pandas opens the file itself: a handle opened without
# newline='' can end up with doubled carriage returns on Windows.
samples4.to_csv('./output/batch4_ann.txt', sep='\t')

In [172]:
# VALUE of every probe for the batch 4 samples (probes as rows, samples as columns).
samples4_6244 = gse4.pivot_samples('VALUE')[list(samples4.index)]

# Probe -> GB_LIST annotation from the GPL6244 platform table; probes with a
# missing ID or GB_LIST are dropped.
anns_4 = gse4.gpls['GPL6244'].table[["ID", "GB_LIST"]].dropna()
# GB_LIST is comma-separated; split it into a list of entries.
anns_4['refs'] = anns_4['GB_LIST'].apply(lambda x: x.split(','))
# True when the probe lists at most one entry.
anns_4['an_bool'] = anns_4['refs'].apply(lambda refs: len(refs) <= 1)
# Keep only those probes. The original compared the boolean column to True
# and then applied an identity function; both were no-ops and are removed.
anns_4 = anns_4[anns_4['an_bool']]
print(anns_4.head())
anns_4.shape

         ID    GB_LIST         refs  an_bool
6   7896748  NC_001807  [NC_001807]     True
7   7896750  NC_001807  [NC_001807]     True
17  7896859  NR_029639  [NR_029639]     True
18  7896861  NR_029834  [NR_029834]     True
19  7896863  NR_029957  [NR_029957]     True


(2400, 4)

In [173]:
# Build and write the batch 4 expression table.
# The original wrote `anns_4.T` (the probe annotation table) to
# batch4_exprs.txt, so the file held no expression values at all.

# Keep only the probes retained in `anns_4` and label them with GB_LIST.
# NOTE(review): assumes ID_REF in the sample tables and ID in the platform
# table have the same dtype, otherwise the merge matches nothing - confirm.
samples4_6244_annotated = (
    samples4_6244.reset_index()
    .merge(anns_4[["ID", "GB_LIST"]], left_on='ID_REF', right_on="ID")
    .set_index('ID_REF')
    .drop("ID", axis=1)
)
# Collapse probes sharing a GB_LIST entry into one row holding the per-sample
# median, as the other batches do per Entrez ID.
samples4_6244_annotated = samples4_6244_annotated.groupby("GB_LIST").median()

# Samples as rows, like the other batch*_exprs.txt files. Pass the path so
# pandas opens the file itself (a handle opened without newline='' can end up
# with doubled carriage returns on Windows).
samples4_6244_annotated.T.to_csv('./output/batch4_exprs.txt', sep='\t')