In [1]:
import os
import re

import numpy as np
import pandas as pd

import ppbbatch as pp

# Set logging
import logging
# Configure the root logger at INFO and keep a handle to it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Raise the threshold of the "GEOparse" logger to WARNING.
logging.getLogger(name="GEOparse").setLevel(level=logging.WARNING)

In [2]:
# GEO series to load. Later cells index `batches_gse` by position, so the
# order of this list matters.
gse_ids = ['GSE62117', 'GSE64567', 'GSE33526', 'GSE78958']

batches_gse = list()
for batch_id in gse_ids:
    batch_gse = pp.load_dataset(batch_id)
    print('loaded', batch_id)
    batches_gse.append(batch_gse)

- Loading from ./GSE62117_family.soft.gz






loaded GSE62117
- Loading from ./GSE64567_family.soft.gz
loaded GSE64567
- Loading from ./GSE33526_family.soft.gz






loaded GSE33526
- Loading from ./GSE78958_family.soft.gz
loaded GSE78958


In [3]:
# One entry per batch, in `batches_gse` order; each entry is the list of
# sample labels built for that batch.
batches_indexes = list()

# Per-sample metadata, keyed by GSM name.
metadata = dict()

def getObeseOrLeanLabel(bmi_str):
    """Classify a BMI description as lean, overweight, obese or unknown.

    Args:
        bmi_str: Text holding either one of the known bucket tokens
            ('25-29.99', '30+', 'unk', '<25') or a numeric BMI value.

    Returns:
        A ``(label, value)`` tuple. For bucket tokens ``value`` is the token
        itself (a string); otherwise it is the first number found in
        ``bmi_str`` converted to ``float``.

    Raises:
        IndexError: If ``bmi_str`` has no bucket token and no number.
    """
    # Checked in this exact order; the first token found in the text wins.
    bucket_labels = (
        ('25-29.99', 'overweight'),
        ('30+', 'obese'),
        ('unk', 'unk'),
        ('<25', 'lean'),
    )
    for token, label in bucket_labels:
        if token in bmi_str:
            return label, token

    # No bucket token: fall back to the first numeric literal in the text.
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", bmi_str)
    bmi = float(numbers[0])
    if bmi >= 30:
        return 'obese', bmi
    return ('lean' if bmi < 25 else 'overweight'), bmi
    
# Build, for every batch, the list of sample labels "<GSM name>_<BMI letter><gender letter>"
# and record the per-sample gender / BMI information in `metadata`.
for batch in batches_gse:
    indexes = []
    for gsm_name, gsm in batch.gsms.items():
        characteristics = gsm.metadata['characteristics_ch1']
        if 'characteristics_ch2' in gsm.metadata:
            # Build a NEW object instead of using `+=`: on a list, `+=` extends
            # gsm.metadata['characteristics_ch1'] in place, so every re-run of
            # this cell would append the ch2 entries to the loaded dataset again.
            characteristics = characteristics + gsm.metadata['characteristics_ch2']

        # All lookups below work on the string form of the characteristics.
        characteristics_text = str(characteristics)

        # 'U' (unknown) unless an explicit gender entry is present.
        gender = 'U'
        if 'gender: female' in characteristics_text or 'gender: F' in characteristics_text:
            gender = 'F'
        elif 'gender: male' in characteristics_text or 'gender: M' in characteristics_text:
            gender = 'M'

        # Take the text from "bmi" up to the next single quote in the string form.
        # Raises IndexError when the sample has no "bmi" entry at all.
        bmi_str = re.findall('bmi[^\']*', characteristics_text.lower())[0]
        bmi_str = bmi_str.replace(' (kg/m2)', '')
        bmi_label, bmi_value = getObeseOrLeanLabel(bmi_str)
        # NOTE(review): 'overweight' and 'obese' both shorten to 'O' here, so the
        # short label cannot tell them apart; only `bmi_value` keeps the
        # distinction. Left unchanged because the labels end up in saved tables.
        bmi_label_short = bmi_label[0].upper()

        sample_label = gsm_name + '_' + bmi_label_short + gender
        metadata[gsm_name] = {'gender': gender, 'bmi_label': bmi_label_short, 'bmi': bmi_value}
        indexes.append(sample_label)

    batches_indexes.append(indexes)

In [4]:
# Name of the gene column on each batch's platform, in `batches_gse` order.
gene_id_columns = ['GENE', 'Entrez_Gene_ID', 'GENE', 'ENTREZ_GENE_ID']

# One mapper per batch, built from the platform's 'ID' column and its gene column.
platform_mappers = [
    pp.create_mapper_from_platform(batches_gse[position], 'ID', gene_column)
    for position, gene_column in enumerate(gene_id_columns)
]
mapper_1, mapper_2, mapper_3, mapper_4 = platform_mappers

# Convert every mapper with the same 'ID_REF' / 'ENTREZ_GENE_ID' arguments.
mappers = [
    pp.mapper_to_pandas_df(platform_mapper, 'ID_REF', 'ENTREZ_GENE_ID')
    for platform_mapper in platform_mappers
]
mapper_1_df, mapper_2_df, mapper_3_df, mapper_4_df = mappers

In [5]:
# Per-batch value forwarded as `isLog2` to pp.filter_and_normalize,
# aligned by position with `batches_gse`.
isLog2 = [False, False, True, True]

results_df = list()
for i, batch in enumerate(batches_gse):
    # Pick the mapper and sample labels that belong to this batch.
    mapper, batch_indexes = mappers[i], batches_indexes[i]
    dataframes = pp.extract_dataframes(batch)
    filtered_df = pp.filter_and_normalize(
        dataframes,
        mapper,
        'ID_REF',
        batch_indexes,
        isLog2=isLog2[i],
    )
    results_df.append(filtered_df)

In [6]:
# Preview the first rows of the table built for the fourth batch (GSE78958).
results_df[3].head()

Unnamed: 0_level_0,GSM2082309_OU,GSM2082137_LU,GSM2082288_OU,GSM2082430_LU,GSM2082352_LU,GSM2082452_LU,GSM2082232_OU,GSM2082122_LU,GSM2082273_LU,GSM2082100_LU,...,GSM2082454_OU,GSM2082136_LU,GSM2082415_OU,GSM2082414_LU,GSM2082115_OU,GSM2082354_OU,GSM2082254_OU,GSM2082103_UU,GSM2082331_OU,GSM2082456_UU
ENTREZ_GENE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,-17.485207,-15.265371,-16.54229,-17.592852,-17.492677,-17.284715,-17.706616,-17.112536,-17.365168,-17.055606,...,-17.492196,-16.687889,-17.191476,-16.362673,-16.71327,-16.407849,-17.306658,-15.61637,-17.06486,-16.595442
100,-14.806216,-15.441234,-15.264528,-14.835158,-14.62193,-15.857838,-14.829319,-14.279805,-14.848493,-14.925592,...,-14.38347,-14.868336,-14.287737,-14.621416,-15.759792,-14.805317,-14.604102,-15.63589,-15.703869,-15.216451
1000,-16.134476,-15.152324,-16.100634,-15.249574,-14.228156,-16.957138,-16.692373,-14.773797,-15.235785,-15.444889,...,-17.21845,-16.613555,-16.651223,-15.180054,-15.917871,-16.626918,-15.766871,-16.105876,-16.159137,-15.057308
10000,-14.8344,-13.205308,-14.471264,-14.090553,-15.164141,-15.909644,-15.106608,-14.090525,-14.701185,-14.416511,...,-16.591182,-14.446372,-14.53255,-13.491817,-14.89911,-13.913453,-13.622456,-14.877586,-15.070286,-13.617644
10001,-16.411896,-16.288927,-16.167137,-15.237443,-15.026887,-14.617355,-15.701839,-17.074856,-16.15871,-16.357151,...,-13.854726,-17.792856,-16.062189,-15.336831,-16.68588,-15.735242,-16.153518,-16.380995,-15.652743,-15.658944


In [7]:
# Save each batch's table as data/<GSE id>_table.pkl.
# The output directory is created first so the cell also runs on a fresh
# checkout where 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)

# IDs listed in the same order the batches were loaded and processed in.
for table_idx, gse_id in enumerate(['GSE62117', 'GSE64567', 'GSE33526', 'GSE78958']):
    results_df[table_idx].to_pickle('data/' + gse_id + '_table.pkl')

In [9]:
import pickle
# Serialize the per-sample metadata dict next to the batch tables.
with open('data/metadata.pkl', mode='wb') as metadata_file:
    pickle.dump(metadata, metadata_file)