In [1]:
import pandas as pd
import numpy as np
import ppbbatch as pp
import re

# Configure logging: the root logger reports at INFO level, while the
# "GEOparse" logger is restricted to WARNING and above.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()  # no name given, so this is the root logger
logging.getLogger("GEOparse").setLevel(logging.WARNING)

In [2]:
# GEO series to load; the position in this list is the batch index used by
# the rest of the notebook.
batch_ids = ['GSE56649', 'GSE15573', 'GSE19821', 'GSE4588']

batches_gse = []
for batch_id in batch_ids:
    # Load one series and keep it, reporting progress after each one.
    batches_gse.append(pp.load_dataset(batch_id))
    print('loaded', batch_id)

- Loading from ./GSE56649_family.soft.gz


  gpls[entry_name] = parse_GPL(data_group, entry_name)


loaded GSE56649
- Loading from ./GSE15573_family.soft.gz
loaded GSE15573
- Loading from ./GSE19821_family.soft.gz
loaded GSE19821
- Loading from ./GSE4588_family.soft.gz
loaded GSE4588


In [3]:
# One entry per batch, in batch order:
#   batches_indexes[i]    -> labels of the samples kept from batch i
#   batches_dataframes[i] -> the tables of those same samples, in the same order
batches_indexes = []
batches_dataframes = []

# Per-sample metadata, keyed by GSM name (gender and status of each kept sample).
metadata = {}

def get_sample_status(characteristics_str, all_diseased=False):
    """Derive a sample's status code from its characteristics text.

    Matching is done on the lower-cased text.

    Args:
        characteristics_str: Free-text characteristics of one sample.
        all_diseased: When true, every sample that is not excluded is
            reported as diseased, whatever its text says.

    Returns:
        None if the sample must be dropped (text mentions 'erythematosus'
        or '1 months'), 'N' for a control/healthy sample, 'D' otherwise.
    """
    text = characteristics_str.lower()

    # Exclusion is checked first, so it wins even when all_diseased is set.
    exclusion_markers = ('erythematosus', '1 months')
    if any(marker in text for marker in exclusion_markers):
        return None

    if all_diseased:
        return 'D'

    normal_markers = ('control', 'healthy')
    is_normal = any(marker in text for marker in normal_markers)
    return 'N' if is_normal else 'D'
        
# One flag per batch, in the same order as batches_gse. True means every
# retained sample of that batch is labelled diseased ('D') regardless of its
# characteristics text (passed as all_diseased to get_sample_status).
batch_status_list = [False, False, True, False]

for i, batch in enumerate(batches_gse):

    batch_status = batch_status_list[i]
    indexes = []
    dataframes = []

    for gsm_name, gsm in batch.gsms.items():
        # Copy the channel-1 list before extending it. Using `+=` directly on
        # the list held in gsm.metadata would append the channel-2 entries to
        # gsm.metadata['characteristics_ch1'] itself, corrupting the loaded
        # dataset and growing it further on every re-run of this cell.
        characteristics_parts = list(gsm.metadata['characteristics_ch1'])
        if 'characteristics_ch2' in gsm.metadata:
            characteristics_parts.extend(gsm.metadata['characteristics_ch2'])

        characteristics = ' '.join(characteristics_parts)

        # 'U' (unknown) unless the text carries an explicit gender field.
        gender = 'U'
        if 'gender: female' in characteristics or 'gender: F' in characteristics:
            gender = 'F'
        elif 'gender: male' in characteristics or 'gender: M' in characteristics:
            gender = 'M'

        sample_status = get_sample_status(characteristics, batch_status)
        if sample_status is None:
            # get_sample_status marked this sample for removal: skip it.
            continue

        # Label format: <GSM name>_<gender><status>, e.g. 'GSM101870_UN'.
        sample_label = gsm_name + '_' + gender + sample_status
        metadata[gsm_name] = {'gender': gender, 'status': sample_status}
        dataframes.append(gsm.table)
        indexes.append(sample_label)

    batches_indexes.append(indexes)
    batches_dataframes.append(dataframes)

In [4]:
# Name of the Entrez gene ID column on each batch's platform, in batch order.
platform_gene_columns = [
    'ENTREZ_GENE_ID',
    'Entrez_Gene_ID',
    'CompositeSequence BioSequence Database Entry [Geneid (Locusid)]',
    'ENTREZ_GENE_ID',
]

# First pass: one platform mapper per batch, from the platform 'ID' column
# to that platform's gene ID column.
platform_mappers = [
    pp.create_mapper_from_platform(batches_gse[idx], 'ID', gene_column)
    for idx, gene_column in enumerate(platform_gene_columns)
]

# Second pass: convert every mapper to a DataFrame with uniform column names.
mappers = [
    pp.mapper_to_pandas_df(platform_mapper, 'ID_REF', 'ENTREZ_GENE_ID')
    for platform_mapper in platform_mappers
]

# Keep the individual names available alongside the lists.
mapper_1, mapper_2, mapper_3, mapper_4 = platform_mappers
mapper_1_df, mapper_2_df, mapper_3_df, mapper_4_df = mappers

In [5]:
# Per-batch flag forwarded to pp.filter_and_normalize as its isLog2 argument.
isLog2 = [False, False, True, False]

results_df = []
for batch_pos in range(len(batches_gse)):
    sample_tables = batches_dataframes[batch_pos]
    batch_mapper = mappers[batch_pos]
    sample_labels = batches_indexes[batch_pos]

    # Every retained table must have exactly one label.
    assert len(sample_tables) == len(sample_labels)

    results_df.append(
        pp.filter_and_normalize(
            sample_tables,
            batch_mapper,
            'ID_REF',
            sample_labels,
            isLog2=isLog2[batch_pos],
        )
    )

In [6]:
# Preview the first rows of the processed table for the fourth batch (GSE4588).
results_df[3].head()

Unnamed: 0,GSM101870_UN,GSM102703_UD,GSM101884_UD,GSM101869_UN,GSM101880_UD,GSM101965_UN,GSM101967_UN,GSM101969_UN,GSM101871_UN,GSM102704_UD,...,GSM101885_UD,GSM101882_UD,GSM101874_UN,GSM101968_UN,GSM101886_UD,GSM101872_UN,GSM102706_UD,GSM102707_UD,GSM101873_UN,GSM101881_UD
1,-16.059255,-17.16229,-16.476293,-17.709576,-16.961417,-16.986022,-15.469228,-16.413532,-15.829882,-16.745273,...,-16.358624,-17.502083,-16.769452,-16.46488,-17.176201,-16.500727,-15.986577,-18.124171,-16.572612,-16.530265
10,-21.773501,-21.845987,-21.872042,-20.634169,-21.868308,-21.293451,-20.979286,-20.809117,-21.147797,-22.0871,...,-20.483319,-21.329902,-20.869589,-20.913341,-20.928274,-20.571777,-20.635669,-21.423731,-21.582979,-20.926013
100,-14.087876,-15.359755,-14.4395,-13.966203,-13.576446,-15.480097,-15.684227,-16.284062,-14.090106,-14.038765,...,-13.820354,-14.127288,-14.54226,-16.030698,-13.222368,-14.6929,-16.307199,-16.48936,-14.138123,-14.232178
1000,-19.803875,-18.919987,-17.885631,-18.488189,-19.961417,-20.472421,-18.310907,-18.294544,-17.66667,-18.475666,...,-17.649492,-19.95139,-19.943589,-18.468556,-18.227834,-19.206127,-17.627496,-18.848046,-19.294734,-20.547501
10000,-15.803875,-15.764903,-15.012108,-16.005043,-15.569492,-15.766011,-16.189209,-15.075467,-15.373742,-16.28343,...,-15.85512,-15.850314,-15.149076,-15.207626,-15.952199,-15.874695,-15.785462,-15.475942,-15.526386,-15.640082


In [7]:
# Destination of each batch's processed table, in batch order.
table_paths = [
    'data/GSE56649_table.pkl',
    'data/GSE15573_table.pkl',
    'data/GSE19821_table.pkl',
    'data/GSE4588_table.pkl',
]

for batch_pos, table_path in enumerate(table_paths):
    results_df[batch_pos].to_pickle(table_path)

In [8]:
import pickle

# Persist the per-sample metadata dictionary next to the batch tables.
with open('data/metadata.pkl', 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)