In [None]:
import os, sys
import GEOparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


In [None]:
# GEO series to process and the folder its files are stored in.
accession_id = 'GSE97810'
# NOTE: machine-specific absolute path -- edit this when running elsewhere.
data_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/RA/data'
directory = os.path.join(data_dir, accession_id)

# Create the output layout: <directory>/exprs, <directory>/pData, <directory>/annot.
# makedirs(exist_ok=True) also creates missing parents and is a no-op for
# folders that already exist, so a partially created layout gets completed
# instead of being skipped, and unrelated errors are no longer swallowed.
for subdir in ("exprs", "pData", "annot"):
    os.makedirs(os.path.join(directory, subdir), exist_ok=True)


In [None]:
# Load the series SOFT file: reuse the local copy inside `directory` when it
# is already there, otherwise ask GEOparse to fetch it into `directory`.
gz_name = os.path.join(directory, '{}_family.soft.gz'.format(accession_id))
if os.path.exists(gz_name):
    gse = GEOparse.get_GEO(filepath=gz_name, silent=True)
else:
    print("downloading")
    gse = GEOparse.get_GEO(geo=accession_id, destdir=directory, silent=True)


In [None]:
# Get expression data and metadata matrices
#
# Walks every sample (GSM) of `gse` whose metadata type is 'RNA' and builds:
#   exprs    -- DataFrame indexed by ID_REF with one VALUE column per GSM
#   gsmNames -- ids of the GSMs that contributed a column, in column order
#   metadata -- {field name: {gsm_name: value}}
# It then writes exprs/<acc>_exprs.csv, pData/<acc>_pData.csv,
# annot/<acc>_<gpl>_gpl.csv and GSE_boxplots.pdf under `directory`.


def _save_boxplot(pdf, per_sample_values, title):
    """Draw one box per entry of `per_sample_values` and append it to `pdf`.

    The figure is always closed, even when drawing or saving fails, so a
    failed plot does not leave an open figure behind.
    """
    fig, ax = plt.subplots()
    try:
        ax.boxplot(per_sample_values, showfliers=False)
        ax.set_title(title)
        pdf.savefig(fig)
    finally:
        plt.close(fig)


sample_series = []
gsmNames = []
metadata = {}
for gsm_name, gsm in gse.gsms.items():
    if gsm.metadata['type'][0] != 'RNA':
        continue

    # Expression data: samples with an empty table contribute no column.
    if len(gsm.table) > 0:
        # set_index returns a new frame, so the sample's own table is not modified.
        sample_series.append(gsm.table.set_index('ID_REF')['VALUE'])
        gsmNames.append(gsm_name)

    # Metadata
    for key, value in gsm.metadata.items():
        is_characteristics = key in ('characteristics_ch1', 'characteristics_ch2')
        non_empty = [i for i in value if i != '']
        # `value and ...` guards against an empty list before indexing value[0].
        if is_characteristics and (len(non_empty) > 1 or (value and ': ' in value[0])):
            for entry in value:
                # Split on the first colon only so values that themselves
                # contain ':' (times, ratios, ...) stay attached to their label.
                splitUp = [i.strip() for i in entry.split(':', 1)]
                if len(splitUp) == 2:
                    metadata.setdefault(splitUp[0], {})[gsm_name] = splitUp[1]
                else:
                    # No "label: value" structure -- file it under the raw key.
                    metadata.setdefault(key, {})[gsm_name] = splitUp[0]
        else:
            # The field is registered even when nothing is stored for this
            # sample; only single-valued fields are recorded.
            field = metadata.setdefault(key, {})
            if len(value) == 1:
                field[gsm_name] = ' '.join([j.replace(',', ' ') for j in value])

# Assemble the expression matrix in a single concat (columns named by GSM id).
if sample_series:
    exprs = pd.concat(sample_series, axis=1, keys=gsmNames)
else:
    print('No RNA samples with expression values found in ' + accession_id)
    exprs = pd.DataFrame()

# Make sure the output folders exist before writing into them.
for subdir in ('exprs', 'pData', 'annot'):
    os.makedirs(os.path.join(directory, subdir), exist_ok=True)

# Write expression data matrix to file
exprs.to_csv(os.path.join(directory, 'exprs', '{}_exprs.csv'.format(accession_id)))

# Write metadata matrix to file: one row per field, one column per GSM,
# 'NA' where a sample has no value. Going through to_csv quotes values that
# contain commas instead of letting them shift the columns.
pdata_rows = [
    [key] + [metadata[key].get(gsm_name, 'NA') for gsm_name in gsmNames]
    for key in metadata
]
pd.DataFrame(pdata_rows, columns=['Metadata'] + gsmNames).to_csv(
    os.path.join(directory, 'pData', '{}_pData.csv'.format(accession_id)), index=False)

# Plot boxplot of expression data
with PdfPages(os.path.join(directory, 'GSE_boxplots.pdf')) as pdf:
    # Hand boxplot one 1-D array per sample so there is exactly one box per
    # GSM; with a 2-D table the row/column interpretation is left to matplotlib.
    numeric = exprs.apply(pd.to_numeric, errors='coerce')
    per_sample = [numeric[name].dropna().to_numpy() for name in numeric.columns]
    if per_sample:
        _save_boxplot(pdf, per_sample, accession_id)

        # log2 is only defined for positive values; drop the rest rather than
        # feeding NaN/-inf into the box statistics.
        positive = [values[values > 0] for values in per_sample]
        if any(len(values) for values in positive):
            try:
                _save_boxplot(pdf, [np.log2(values) for values in positive],
                              'log2(' + accession_id + ')')
            except Exception as err:
                # Best effort: the log2 page is optional, but say why it is missing.
                print('Skipping log2 boxplot: {}'.format(err))
        else:
            print('Skipping log2 boxplot: no positive expression values')

# Write out platform information
for gpl_name, gpl in gse.gpls.items():
    gpl.table.to_csv(
        os.path.join(directory, 'annot', '{}_{}_gpl.csv'.format(accession_id, gpl_name)))
