In [1]:
import os, gzip, glob, shutil, pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, f_oneway
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import plotly.express as px
import seaborn as sns
from sklearn import decomposition

In [2]:
for gz_fn in glob.glob('../data/*.gz'):
    if not os.path.exists(gz_fn[:-3]):
        with gzip.open(gz_fn, 'rb') as f_in, open(gz_fn[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [3]:
with open('../data/data_processing/GSE8685_family.soft') as f:
    lines=f.readlines()
    
    on_expression_table=False
    on_platform_table=False
    on_sample=False
    
    sample=''
    sample_key={}
    
    probe_key={}    
    expression_table={}
    p_val_table={}
    transcript_table={}
    abs_call_table={}
    
    for line in lines:
        
        line=line.strip()
        
        if line=='!platform_table_begin':
            on_platform_table=True
            continue
            
        elif line=='!platform_table_end':
            on_platform_table=False
            continue
            
        elif line=='!sample_table_begin':
            on_expression_table=True
            continue
            
        elif line=='!sample_table_end':
            on_expression_table=False
            continue
            
        elif line.split(' = ')[0]=='^SAMPLE':
            sample=line.split(' = ')[1]
            expression_table[sample]={}
            transcript_table[sample]={}
            p_val_table[sample]={}
            abs_call_table[sample]={}
            continue
            
        elif line.split(' = ')[0]=='!Sample_title':
            sample_key[sample]=line.split(' = ')[1]
            continue
        
        line=line.strip().split('\t')
        
        if on_platform_table:
            if len(line)<10:
                if line[5]=='':
                    print(line)
                probe_key[line[0]]=line[5]
                continue
                
            else:
                gene_ids=line[10].split(' /// ')

                probe_key[line[0]]=gene_ids
                continue
            
        elif on_expression_table:
            if sample=='':
                print(line)
            if line[0]=='ID_REF':
                continue
            else:
                value=line[1]
                p_val=line[3]
                abs_call=line[2]
                probe_id=line[0]
                gene_names=probe_key[probe_id]
                if type(gene_names)==list:
                    for gene_name in gene_names:
                        if not gene_name in expression_table[sample]:
                            expression_table[sample][gene_name]=[float(value)]
                        else:
                            expression_table[sample][gene_name].append(float(value))
                        
                else:
                    if not gene_names in expression_table[sample]:
                        expression_table[sample][gene_names]=[float(value)]
                    else:
                        expression_table[sample][gene_names].append(float(value))
                p_val_table[sample][probe_id]=float(p_val) 
                abs_call_table[sample][probe_id]= 1 if abs_call=='P' else 0
                transcript_table[sample][probe_id]=float(value)

In [21]:
transcript_df=pd.DataFrame(transcript_table).dropna()
transcript_df.to_csv('../Data/data_processing/Microarray_Data.csv')

In [18]:
sample_key_abb={
    'GSM215347':'Control 1',
    'GSM215348':'Control 2',
    'GSM215349':'Control 3',
    'GSM215350':'Activated with IL2 rep 1',
    'GSM215351':'Activated with IL2 rep 2',
    'GSM215352':'Activated with IL2 rep 3',
    'GSM215353':'Activated with IL15 rep 1',
    'GSM215354':'Activated with IL15 rep 2',
    'GSM215355':'Activated with IL15 rep 3',
    'GSM215356':'Activated with IL21 rep 1',
    'GSM215357':'Activated with IL21 rep 2',
    'GSM215358':'Activated with IL21 rep 3'
}

experimental_groups={
    'Control': ['GSM215347', 'GSM215348', 'GSM215349'],
    'IL2': ['GSM215350', 'GSM215351', 'GSM215352'],
    'IL15': ['GSM215353', 'GSM215354', 'GSM215355'],
    'IL21': ['GSM215356', 'GSM215357', 'GSM215358'],
}

In [19]:
abs_call_df=pd.DataFrame(abs_call_table)
abs_call_df

present_transcripts=set()

for transcript in abs_call_df.index:
    for condition in experimental_groups:
        
        if abs_call_df.loc[transcript, experimental_groups[condition]].sum()==3:
            present_transcripts.add(transcript)
            break

len(present_transcripts)

present_transcripts_df=transcript_df.loc[present_transcripts]
present_transcripts_df.shape

(21794, 12)

In [22]:
present_transcripts_df.to_csv('../data/expression_by_probe.csv')

In [24]:
present_transcripts_df

Unnamed: 0,GSM215347,GSM215348,GSM215349,GSM215350,GSM215351,GSM215352,GSM215353,GSM215354,GSM215355,GSM215356,GSM215357,GSM215358
231311_at,107.3,48.0,103.2,110.1,121.1,98.1,115.2,120.4,96.3,74.4,45.9,70.5
226314_at,130.2,212.1,208.7,220.5,181.6,171.1,222.5,162.6,194.5,189.8,238.3,221.7
223048_at,714.8,675.5,690.4,680.8,715.0,557.3,642.3,663.1,756.9,633.6,840.6,646.2
239143_x_at,161.1,97.0,95.7,198.3,123.4,155.4,222.4,174.3,289.8,155.5,116.2,143.5
201522_x_at,2010.9,2298.4,2128.5,2126.2,2191.3,2056.6,2067.4,1914.6,2046.1,2225.5,2352.3,2273.1
...,...,...,...,...,...,...,...,...,...,...,...,...
225443_at,664.5,715.8,784.5,798.8,837.2,879.5,837.9,1072.7,924.6,799.2,797.1,748.6
241632_x_at,184.8,151.7,93.8,115.6,127.7,139.7,110.0,92.1,88.7,101.8,57.0,145.8
242481_at,30.0,32.6,27.5,25.7,20.1,26.0,27.3,36.3,27.4,36.4,35.0,31.2
209870_s_at,437.7,650.1,472.9,359.4,304.4,339.9,356.0,363.2,316.5,453.9,478.4,405.2


In [27]:
for probe in probe_key:
    if type(probe_key[probe]) != list:
        probe_key[probe]=[probe_key[probe]]

In [30]:
pickle.dump(probe_key, open('../Data/probe_key.p', 'wb'))