<a href="https://polly.elucidata.io/manage/workspaces?action=open_polly_notebook&amp;source=github&amp;path=ElucidataInc%2Fpolly-python%2Fblob%2Fmain%2Fmerge-multiple-gct-files.ipynb&amp;kernel=elucidata%2FPython+3&amp;machine=small" target="_parent"><img alt="Open in Polly" src="https://elucidatainc.github.io/PublicAssets/open_polly.svg"/></a>


## Install polly-python

In [1]:
!sudo pip3 install polly-python 

Looking in indexes: https://pypi.org/simple, http://54.245.179.143:80/
Collecting polly-python
  Downloading https://files.pythonhosted.org/packages/9a/4b/fc1433a5f8f214d2a6ef54d593c0302db7b85b9c5fbc3f316418c5fb8669/polly_python-0.0.6-py3-none-any.whl
Collecting requests==2.25.1 (from polly-python)
[?25l  Downloading https://files.pythonhosted.org/packages/29/c1/24814557f1d22c56d50280771a17307e6bf87b70727d975fd6b2ce6b014a/requests-2.25.1-py2.py3-none-any.whl (61kB)
[K     |████████████████████████████████| 61kB 8.2MB/s  eta 0:00:01
[?25hCollecting python-dateutil==2.8.1 (from polly-python)
[?25l  Downloading https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl (227kB)
[K     |████████████████████████████████| 235kB 18.0MB/s eta 0:00:01
Collecting six==1.16.0 (from polly-python)
  Downloading https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbc

## Import libraries

In [2]:
from polly.omixatlas import OmixAtlas
import os
import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
from cmapPy.pandasGEXpress import GCToo
from cmapPy.pandasGEXpress import write_gct
from multiprocessing import Pool

## Authentication with Polly

In [3]:
# Create the OmixAtlas client from the value of the POLLY_REFRESH_TOKEN environment
# variable; a KeyError on this line means the variable is not set.
# The helper download_dataset() below reads this module-level `omix_atlas` object.
omix_atlas = OmixAtlas(os.environ['POLLY_REFRESH_TOKEN'])

## Define helper functions

In [4]:
def download_dataset(args):
    """
    Downloads a single dataset with given repo_id and dataset_id

    The download URL is requested through the module-level ``omix_atlas`` client and
    the file is fetched with ``wget`` into the current working directory as
    ``<dataset_id>.gct``.

    Parameters
    ----------
    args : sequence
        ``(repo_id, dataset_id)`` packed into a single argument so the function can be
        handed directly to ``Pool.map``.

    Raises
    ------
    Exception
        If no download URL is returned, if ``wget`` cannot be started, or if it exits
        with a non-zero status.
    """
    # Local import so this helper does not depend on an edit to the imports cell.
    import subprocess

    repo_id = args[0]
    dataset_id = args[1]
    file_name = f"{dataset_id}.gct"
    data = omix_atlas.download_data(repo_id, dataset_id)
    url = data.get('data').get('attributes').get('download_url')
    if not url:
        raise Exception(f"Download not successful: no download URL returned for {dataset_id}")

    # Pass the arguments as a list (no shell involved) so quotes or other shell
    # metacharacters in the URL or file name cannot break or alter the command.
    try:
        status = subprocess.run(["wget", "-O", file_name, url]).returncode
    except OSError as err:
        # e.g. wget is not installed; os.system() reported this as a non-zero status
        raise Exception("Download not successful") from err

    if status == 0:
        print("Downloaded data successfully")
    else:
        raise Exception("Download not successful")
        
def download_datasets(repo_id, dataset_ids):
    """
    Download multiple datasets at a time

    Each dataset is fetched by ``download_dataset`` in a worker process of a
    ``multiprocessing.Pool``.

    Parameters
    ----------
    repo_id
        Repository identifier passed unchanged to every download.
    dataset_ids : iterable
        Dataset identifiers to download; one download job is created per entry.

    Raises
    ------
    Exception
        Re-raised from ``download_dataset`` when any single download fails.
    """
    jobs = [(repo_id, dataset_id) for dataset_id in dataset_ids]
    if not jobs:
        # Nothing to download: avoid spawning worker processes for no work.
        return

    # The context manager shuts the worker processes down once map() has returned,
    # so re-running the cell does not accumulate idle processes.
    with Pool() as pool:
        pool.map(download_dataset, jobs)
    
def rename_samples(dataset_id, gct_object):
    """
    Prefix every sample name (cid) of the gct object with ``<dataset_id>_``.

    Both the columns of ``data_df`` and the index of ``col_metadata_df`` are
    relabelled in place; the same (mutated) object is returned.
    """
    def _with_prefix(labels):
        # Build the new labels as "<dataset_id>_<old label>"
        return [dataset_id + '_' + label for label in labels]

    expression_df = gct_object.data_df
    expression_df.columns = _with_prefix(expression_df.columns)

    sample_metadata_df = gct_object.col_metadata_df
    sample_metadata_df.index = _with_prefix(sample_metadata_df.index)

    return gct_object
    
def merge_gcts(dataset_ids):
    """
    Merge the input gcts together to create one gct object. Only the features(genes) common to all the gcts are retained.

    For every dataset id the file ``<dataset_id>.gct`` is read from the current working
    directory, its sample names are prefixed with the dataset id, and a ``dataset_id``
    column is added to its column metadata.

    Parameters
    ----------
    dataset_ids : iterable of str
        Dataset identifiers to merge. A single id is accepted; repeated ids are
        processed once.

    Returns
    -------
    GCToo
        Merged object whose data matrix is the index-wise inner join of all inputs,
        whose column metadata is the row-wise concatenation of all inputs, and whose
        row metadata is taken from the first dataset for the retained features.

    Raises
    ------
    ValueError
        If ``dataset_ids`` is empty.
    """
    # Drop repeated ids (keeping first-seen order) so that no gct object has its
    # samples prefixed twice.
    dataset_ids = list(dict.fromkeys(dataset_ids))
    if not dataset_ids:
        raise ValueError("dataset_ids must contain at least one dataset id")

    # Parse each gct, rename its samples and tag its column metadata with the dataset id
    gct_objects = {}
    for dataset_id in dataset_ids:
        gct_object = rename_samples(dataset_id, parse(dataset_id + '.gct'))
        gct_object.col_metadata_df['dataset_id'] = dataset_id
        gct_objects[dataset_id] = gct_object

    first_gct = gct_objects[dataset_ids[0]]

    # Successive index-on-index merges; pd.merge defaults to an inner join, so only
    # the features present in every dataset survive.
    merged_data_df = first_gct.data_df
    for dataset_id in dataset_ids[1:]:
        merged_data_df = pd.merge(merged_data_df,
                                  gct_objects[dataset_id].data_df,
                                  left_index=True,
                                  right_index=True)

    # Stack the sample metadata of all datasets in one call instead of growing it in a loop
    merged_col_metadata_df = pd.concat(
        [gct_objects[dataset_id].col_metadata_df for dataset_id in dataset_ids],
        axis=0)

    # Row (feature) metadata comes from the first dataset, restricted to retained features
    merged_row_metadata_df = first_gct.row_metadata_df.loc[merged_data_df.index]

    merged_gct_object = GCToo.GCToo(data_df=merged_data_df,
                                    row_metadata_df=merged_row_metadata_df,
                                    col_metadata_df=merged_col_metadata_df,
                                    make_multiindex=True)
    return merged_gct_object

## Define the list of datasets to be merged

In [5]:
# Repository to download from (9 = GEO)
repo_id = 9
# Datasets to download and merge, in merge order
dataset_ids = [
    'GSE90683_GPL16791',
    'GSE91377_GPL16791',
    'GSE139442_GPL18573',
]

## Download the gct files for the datasets

In [6]:
# Fetch every dataset into the current working directory as <dataset_id>.gct
download_datasets(repo_id, dataset_ids)

Downloaded data successfully
Downloaded data successfully
Downloaded data successfully


## Merge the downloaded gct files to create a single GCT object

In [7]:
# Reads the <dataset_id>.gct files written by the download step and combines them
merged_gct = merge_gcts(dataset_ids)

In [8]:
# Shape is (features, samples); sample columns carry the dataset id as a prefix
print(merged_gct.data_df.shape)
merged_gct.data_df.head()

(11926, 48)


Unnamed: 0_level_0,GSE90683_GPL16791_GSM2664378,GSE90683_GPL16791_GSM2664379,GSE90683_GPL16791_GSM2664380,GSE90683_GPL16791_GSM2664381,GSE90683_GPL16791_GSM2664382,GSE90683_GPL16791_GSM2664383,GSE90683_GPL16791_GSM2664384,GSE90683_GPL16791_GSM2664385,GSE90683_GPL16791_GSM2664386,GSE90683_GPL16791_GSM2664387,...,GSE91377_GPL16791_GSM2422098,GSE91377_GPL16791_GSM2422102,GSE139442_GPL18573_GSM4141730,GSE139442_GPL18573_GSM4141731,GSE139442_GPL18573_GSM4141732,GSE139442_GPL18573_GSM4141733,GSE139442_GPL18573_GSM4141734,GSE139442_GPL18573_GSM4141735,GSE139442_GPL18573_GSM4141736,GSE139442_GPL18573_GSM4141737
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADAM7,2.3311,2.3236,2.3391,2.3311,2.4108,2.3472,2.3391,2.374,2.3311,2.3971,...,4.7972,4.6822,-2.03,-1.79,-1.88,-1.59,-4.7,-2.67,-3.17,-3.15
AWAT2,2.3311,2.3236,2.3391,2.3311,2.4108,2.3472,2.3391,2.374,2.3311,2.3971,...,4.7972,4.6822,-2.03,-1.79,-1.88,-1.59,-4.7,-2.67,-3.17,-3.15
CSNK1A1L,2.3311,2.3236,2.3391,2.3311,2.4108,2.3472,2.3391,2.374,2.3311,2.3971,...,7.519,4.8857,-2.03,-1.79,-1.88,-1.59,-4.21,-2.67,-3.17,-3.15
DRD1,2.3311,2.3236,2.3391,2.3311,2.4108,2.3472,2.3391,2.374,2.3311,2.3971,...,5.4966,4.6822,-2.03,-1.79,-1.88,-1.59,-4.7,-2.67,-1.66,-3.15
FCRL6,2.3311,2.3236,2.3391,2.3311,2.4108,2.3472,2.3391,2.374,2.3311,2.3971,...,4.7972,4.7976,-2.03,-1.79,-1.88,-1.59,-5.33,-2.67,-3.17,-1.64


In [9]:
# One row per sample; the columns include the `dataset_id` column added by merge_gcts
print(merged_gct.col_metadata_df.shape)
merged_gct.col_metadata_df.head()

(48, 72)


Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,characteristics_ch1,...,kw_curated_gender,curated_is_control,curated_cohort_id,curated_cohort_name,dataset_id,contact_email,contact_phone,contact_laboratory,contact_state,condition.ch1
GSE90683_GPL16791_GSM2664378,"SJNB6, RNA-seq",GSM2664378,Public on Jul 01 2017,Jun 12 2017,May 15 2019,SRA,1,SJNB6,Homo sapiens,cell type: Neuroblastoma cell line,...,none,1,19,,GSE90683_GPL16791,,,,,
GSE90683_GPL16791_GSM2664379,"SJNB8, RNA-seq",GSM2664379,Public on Jul 01 2017,Jun 12 2017,May 15 2019,SRA,1,SJNB8,Homo sapiens,cell type: Neuroblastoma cell line,...,none,1,19,,GSE90683_GPL16791,,,,,
GSE90683_GPL16791_GSM2664380,"SK-N-AS, RNA-seq",GSM2664380,Public on Jul 01 2017,Jun 12 2017,May 15 2019,SRA,1,SK-N-AS,Homo sapiens,cell type: Neuroblastoma cell line,...,none,0,20,,GSE90683_GPL16791,,,,,
GSE90683_GPL16791_GSM2664381,"CLB-CAR, RNA-seq",GSM2664381,Public on Jul 01 2017,Jun 12 2017,May 15 2019,SRA,1,CLB-CAR,Homo sapiens,cell type: Neuroblastoma cell line,...,none,0,24,,GSE90683_GPL16791,,,,,
GSE90683_GPL16791_GSM2664382,"CLB-PE, RNA-seq",GSM2664382,Public on Jul 01 2017,Jun 12 2017,May 15 2019,SRA,1,CLB-PE,Homo sapiens,cell type: Neuroblastoma cell line,...,none,0,31,,GSE90683_GPL16791,,,,,


In [10]:
# Number of samples each source dataset contributed to the merged object
merged_gct.col_metadata_df['dataset_id'].value_counts()

GSE90683_GPL16791     36
GSE139442_GPL18573     8
GSE91377_GPL16791      4
Name: dataset_id, dtype: int64

## Write the merged gct object to file

In [11]:
# Save the merged object as merged_data.gct (path is relative to the working directory)
write_gct.write(merged_gct, "merged_data.gct")