In [None]:
import xenaPython as xena
import numpy as np
import pandas as pd
import pickle

In [None]:
# URL of the Xena hub that every dataset in this notebook is fetched from.
host = "https://gdc.xenahubs.net"

# Single source of truth for the cohorts handled by this notebook:
# (TCGA project code, cancer-type label). The three lists below are derived
# from it so that they always stay index-aligned -- the functions further down
# pair dataset ids and cancer types purely by position.
TCGA_COHORTS = [
    ("THCA", "Thyroid-Cancer"),
    ("STAD", "Stomach-Cancer"),
    ("OV", "Ovarian-Cancer"),
    ("PRAD", "Prostate-Cancer"),
]

# ids for the gene expressions (htseq_counts datasets)
dataset_ids_gene = ["TCGA-{}.htseq_counts.tsv".format(code) for code, _ in TCGA_COHORTS]

# ids for the mirna datasets (called "stem" throughout this notebook)
dataset_ids_stem = ["TCGA-{}.mirna.tsv".format(code) for code, _ in TCGA_COHORTS]

# cancer-type labels, in the same order as the two id lists above
cancer_types = [label for _, label in TCGA_COHORTS]

In [None]:
def retrieve_full_datasets(dataset_ids):
    """
    Downloads every requested dataset from the hub stored in the
    module-level ``host`` and returns them as pandas DataFrames.

    Parameters:
    dataset_ids (list): Id names of the datasets as found on Xena website

    Returns:
    list: One DataFrame per id, in the order of ``dataset_ids``; the sample
          names are the index and the feature names are the columns
    """

    def _fetch_as_frame(dataset_id):
        # labels for the rows (samples) and columns (features) of the frame;
        # None is passed as the sample limit -- presumably "no limit", confirm
        # against the xenaPython documentation
        sample_ids = xena.dataset_samples(host, dataset_id, None)
        feature_ids = xena.dataset_field(host, dataset_id)

        # fetch everything, then transpose into the "sample X features" layout
        values = np.array(
            xena.dataset_fetch(host, dataset_id, sample_ids, feature_ids)
        ).T
        print(values.shape)

        return pd.DataFrame(data=values, index=sample_ids, columns=feature_ids)

    return [_fetch_as_frame(dataset_id) for dataset_id in dataset_ids]

In [None]:
def save_panda_datasets(datasets_pandas, dataset_ids):
    """
    Writes every dataset to a CSV file in the current directory for further use

    The file name is the matching dataset id with its last four characters
    (the ".tsv" suffix of the ids used in this notebook) replaced by ".csv".

    Parameters:
    datasets_pandas (list): Datasets in pandas format
    dataset_ids (list): Datasets ids as in Xena website, in the same order
                        as ``datasets_pandas``
    """

    for position, frame in enumerate(datasets_pandas):
        # drop the 4-character extension of the id to get the base file name
        stem = dataset_ids[position][:-4]
        csv_path = "".join([stem, ".csv"])

        # default to_csv settings: the frame index is written as the first column
        frame.to_csv(csv_path)
        print("saved ", stem)

In [None]:
# Download the gene expression datasets, write them to CSV in the working
# directory, then drop the in-memory copies.
# Kept as separate statements on purpose: if saving fails, the downloaded
# frames are still bound to `datasets_pandas_gene` and can be saved again
# without fetching them a second time.
datasets_pandas_gene = retrieve_full_datasets(dataset_ids_gene)
save_panda_datasets(datasets_pandas_gene, dataset_ids_gene)
del datasets_pandas_gene

In [None]:
# Same download -> save -> release sequence for the datasets listed in
# `dataset_ids_stem`.
# Kept as separate statements on purpose: if saving fails, the downloaded
# frames are still bound to `datasets_pandas_stem` and can be saved again
# without fetching them a second time.
datasets_pandas_stem = retrieve_full_datasets(dataset_ids_stem)
save_panda_datasets(datasets_pandas_stem, dataset_ids_stem)
del datasets_pandas_stem

In [None]:
def add_cancer_type_column(dataset_ids, cancer_types):

    """
    Adds a 'Cancer-Type' column to each saved dataset and overwrites the old
    CSV version in the working dir

    The CSV file of a dataset is its id with the last four characters (the
    ".tsv" suffix of the ids used in this notebook) replaced by ".csv".
    Datasets and cancer types are paired by position; surplus cancer types
    are ignored.

    Parameters:
    dataset_ids (list): Datasets ids as on Xena website
    cancer_types (list): cancer types as simple names, in the same order as
                         ``dataset_ids``

    Raises:
    IndexError: if there are fewer cancer types than dataset ids. Raised
                before any file is read or written.
    """

    # The files are overwritten in place, so a missing cancer type must be
    # detected up front: failing half-way through the loop would leave some
    # CSVs already rewritten and others not.
    if len(cancer_types) < len(dataset_ids):
        raise IndexError(
            "cancer_types has {} entries but dataset_ids has {}".format(
                len(cancer_types), len(dataset_ids)
            )
        )

    for dataset_id, cancer_type in zip(dataset_ids, cancer_types):
        filename = dataset_id[:-4] + ".csv"

        # No index_col: the first CSV column is loaded as an ordinary column
        # and written back unchanged by to_csv(index=False) below.
        dataset = pd.read_csv(filename)

        # a scalar assignment gives every row of this dataset the same label
        dataset['Cancer-Type'] = cancer_type
        dataset.to_csv(filename, index=False)
        print("saved ", filename)

In [None]:
# Label the saved CSVs of both dataset groups with their cancer type;
# `cancer_types` is paired with each id list by position.
add_cancer_type_column(dataset_ids=dataset_ids_gene, cancer_types=cancer_types)
add_cancer_type_column(dataset_ids=dataset_ids_stem, cancer_types=cancer_types)

In [None]:
def merging_datasets(dataset_ids, name):
    """
    Concatenates the per-dataset CSV files saved in the working dir and
    saves the result as a single CSV

    The CSV file of a dataset is its id with the last four characters (the
    ".tsv" suffix of the ids used in this notebook) replaced by ".csv".

    Parameters:
    dataset_ids (list): Datasets ids as on Xena website
    name (string): The file name of the final merged dataset
    """
    frames = []
    for dataset_id in dataset_ids:
        frame = pd.read_csv("{}.csv".format(dataset_id[:-4]))
        print(frame.shape)
        frames.append(frame)

    # rows of all datasets are stacked in the order of dataset_ids
    combined = pd.concat(frames)
    print(combined.shape)

    # the row index is not written to the output file
    combined.to_csv(name, index=False)
    print("saved ", name)

In [None]:
# Build one merged CSV per dataset group from the per-dataset CSV files.
merging_datasets(dataset_ids=dataset_ids_gene, name="merged-gene-expr.csv")
merging_datasets(dataset_ids=dataset_ids_stem, name="merged-stem-expr.csv")