In [1]:
import pandas as pd, numpy as np

import GEOparse, pyhgnc, pickle, copy

In [13]:
def get_gene_fc_dict(gene_fc_dataframe_, sample_name, duplicate_handler=np.mean, hgnc=True, hgnc_path="", query=None):
    """Collapse one sample column of a gene-indexed table into a ``{gene: value}`` dict.

    Rows whose value in ``sample_name`` is NaN are dropped. Rows that share the
    same key (gene symbol, or HGNC id when ``hgnc`` is true) are merged by
    calling ``duplicate_handler`` on the list of their values.

    Parameters
    ----------
    gene_fc_dataframe_ : pandas.DataFrame
        Table indexed by gene symbol; it is not modified.
    sample_name : str
        Name of the column to extract.
    duplicate_handler : callable
        Receives a list of scalar values for one key and returns the merged
        value (default ``np.mean``).
    hgnc : bool
        When true, gene symbols are translated to ``"HGNC:<identifier>"`` keys
        and symbols without a translation are skipped.
    hgnc_path : str
        Path of the pickled symbol -> HGNC id cache. When empty, the cache is
        read from ``datasets/geo_to_hgnc_ids`` and a newly built cache is not
        written to disk.
    query : object, optional
        Object exposing ``hgnc(symbol=...)`` that returns a sequence of hits
        with an ``identifier`` attribute. Only used when the cache file does
        not exist; in that case it must be provided, otherwise the first
        lookup fails with ``AttributeError``.

    Returns
    -------
    dict
        Mapping of key to the merged value for ``sample_name``.
    """
    # Selecting a column and dropping NaNs already produces a new object, so
    # the caller's frame is left untouched without deep-copying all of it.
    sample_values = gene_fc_dataframe_[sample_name].dropna()

    cache_path = hgnc_path or "datasets/geo_to_hgnc_ids"
    symbol_to_hgnc = dict()
    build_cache = False
    if hgnc:
        try:
            # SECURITY: pickle can execute arbitrary code while loading; only
            # point this at cache files produced by this notebook.
            with open(cache_path, "rb") as cache_file:
                symbol_to_hgnc = pickle.load(cache_file)
        except FileNotFoundError:
            build_cache = True

    values_by_gene = dict()
    # Iterating the Series yields scalar values, so any reducer over a list of
    # numbers (max, min, np.median, ...) works as duplicate_handler.
    for gene_symbol, gene_fc in sample_values.items():
        gene_key = gene_symbol
        if hgnc:
            try:
                if build_cache and gene_symbol not in symbol_to_hgnc:
                    symbol_to_hgnc[gene_symbol] = "HGNC:{}".format(query.hgnc(symbol=gene_symbol)[0].identifier)
                # When an existing cache was loaded, symbols missing from it
                # are skipped rather than looked up again.
                gene_key = symbol_to_hgnc[gene_symbol]
            except (KeyError, IndexError):
                continue
        values_by_gene.setdefault(gene_key, []).append(gene_fc)

    gene_fc_dict = {gene: duplicate_handler(values) for gene, values in values_by_gene.items()}

    # Only persist a cache that was actually built in this call; a call with
    # hgnc=False must never overwrite the cache with an empty mapping.
    if build_cache and hgnc_path:
        with open(hgnc_path, "wb") as f:
            pickle.dump(symbol_to_hgnc, f)
    print("Dict for {} addedd successfully with {} samples".format(sample_name, len(gene_fc_dict)))
    return gene_fc_dict

In [11]:
def parse_database(geo_database_name, index_column="IDENTIFIER", hgnc_path=""):
    """Load a GEO dataset and build one gene -> value dict per sample column.

    The dataset is fetched with ``GEOparse.get_GEO``, its table is indexed by
    ``index_column``, and only columns whose name contains ``"GSM"`` are kept.
    A single ``pyhgnc.query()`` object is shared by all per-sample calls to
    ``get_gene_fc_dict``; ``hgnc_path`` is forwarded to each of them.

    Returns a list of dicts, one per kept column, in column order.
    """
    dataset = GEOparse.get_GEO(geo=geo_database_name)
    indexed_table = dataset.table.set_index(index_column)

    sample_columns = [name for name in indexed_table.columns if "GSM" in name]
    samples_df = indexed_table[sample_columns]

    hgnc_query = pyhgnc.query()
    per_sample_dicts = []
    for sample in samples_df.columns:
        per_sample_dicts.append(
            get_gene_fc_dict(samples_df, sample, query=hgnc_query, hgnc_path=hgnc_path)
        )
    return per_sample_dicts
    

In [14]:
# Parse GEO dataset GDS5819 into one gene -> value dict per GSM sample column,
# passing datasets/geo_to_hgnc_ids as the symbol -> HGNC id cache location.
gene_data = parse_database(
    "GDS5819",
    hgnc_path="datasets/geo_to_hgnc_ids",
)

21-Dec-2017 21:24:43 INFO GEOparse - File already exist: using local version.
21-Dec-2017 21:24:43 INFO GEOparse - Parsing ./GDS5819.soft.gz: 
21-Dec-2017 21:24:43 DEBUG GEOparse - DATABASE: Geo
21-Dec-2017 21:24:43 DEBUG GEOparse - DATASET: GDS5819
21-Dec-2017 21:24:43 DEBUG GEOparse - SUBSET: GDS5819_1
21-Dec-2017 21:24:43 DEBUG GEOparse - SUBSET: GDS5819_2
21-Dec-2017 21:24:43 DEBUG GEOparse - SUBSET: GDS5819_3
21-Dec-2017 21:24:43 DEBUG GEOparse - SUBSET: GDS5819_4
21-Dec-2017 21:24:43 DEBUG GEOparse - DATASET: GDS5819


Dict for GSM1599177 addedd successfully with 15949 samples
Dict for GSM1599178 addedd successfully with 15979 samples
Dict for GSM1599179 addedd successfully with 16008 samples
Dict for GSM1599180 addedd successfully with 15545 samples
Dict for GSM1599181 addedd successfully with 15670 samples
Dict for GSM1599182 addedd successfully with 15706 samples
Dict for GSM1599183 addedd successfully with 15736 samples
Dict for GSM1599184 addedd successfully with 15460 samples
Dict for GSM1599185 addedd successfully with 15598 samples
Dict for GSM1599186 addedd successfully with 15568 samples
Dict for GSM1599187 addedd successfully with 15560 samples
Dict for GSM1599188 addedd successfully with 15593 samples
Dict for GSM1599189 addedd successfully with 15572 samples


In [16]:
# Persist the per-sample dicts. The context manager closes the file even if
# pickling fails, so the data is flushed and no handle stays open in the kernel.
with open("datasets/gene_data", "wb") as gene_data_file:
    pickle.dump(gene_data, gene_data_file)