In [None]:
# A combined cohort of TCGA, TARGET and GTEx samples              # Unifying cancer and normal RNA sequencing data from different sources
# https://xenabrowser.net/datapages/?cohort=TCGA%20TARGET%20GTEx  # https://www.nature.com/articles/sdata201861

In [None]:
# Remember when the notebook started so later cells can report elapsed time.
import time
start = time.time()

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
import warnings; warnings.filterwarnings("ignore") 
from tqdm import tqdm
import pandas as pd
import scipy.stats

In [None]:
# Locations of the three inputs read by this notebook.

# genenames.org custom download; the two requested columns are later renamed
# to GeneSymbol and EnsemblGeneID.
genes_link = (
    "https://www.genenames.org/cgi-bin/download/custom"
    "?col=gd_app_sym&col=md_ensembl_id"
    "&status=Approved&status=Entry%20Withdrawn"
    "&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"
)

# Per-sample phenotype table of the combined cohort.
phenotype_link = (
    "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/"
    "TcgaTargetGTEX_phenotype.txt.gz"
)

# Expression matrix that is streamed chunk by chunk further down.
# Alternatives kept for reference:
#metadata_link    = "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/TcgaTargetGtex_expected_count.json"
#data_link       = "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/TcgaTargetGtex_expected_count.gz"
#data_link       = "./data/TcgaTargetGtex_rsem_gene_tpm.gz"
data_link = (
    "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/"
    "TcgaTargetGtex_rsem_gene_tpm.gz"
)

In [None]:
# Load the gene-symbol table and the per-sample phenotype table.
#
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0. As before, malformed rows are
# skipped (with a warning) instead of aborting the read.
#metadata = pd.read_csv(metadata_link, compression="gzip", header=0, sep="\t", quotechar='"', on_bad_lines="warn")
genes_df  = pd.read_csv(genes_link, header=0, sep="\t", quotechar='"', on_bad_lines="warn", encoding="latin-1")
phenotype = pd.read_csv(phenotype_link, compression="gzip", header=0, sep="\t", quotechar='"', on_bad_lines="warn", encoding="latin-1")

In [None]:
# Give the gene table the column names the rest of the notebook relies on,
# then show its size and two random rows.
columns = {
    "Ensembl ID(supplied by Ensembl)": "EnsemblGeneID",
    "Approved symbol": "GeneSymbol",
}
genes_df.rename(columns=columns, inplace=True)
print(len(genes_df))
genes_df.sample(2)

In [None]:
# print(len(phenotype)); phenotype.sample(3)

In [None]:
# List the phenotype column names as they arrive from the download.
list(phenotype.columns)

In [None]:
# Map the raw phenotype headers onto the CamelCase names used below.
columns = {
    "sample": "SampleID",
    "detailed_category": "DetailedCategory",
    "_primary_site": "PrimarySite",
    "_sample_type": "SampleType",
    "_study": "Study",
    "_gender": "Gender",
}
phenotype.rename(columns=columns, inplace=True)

In [None]:
# Keep only the samples whose SampleType is one of the selected values.
SelectedSampleTypes = [
    "Primary Tumor",
    "Normal Tissue",
    "Primary Solid Tumor",
]
phenotype = phenotype[phenotype["SampleType"].isin(SelectedSampleTypes)]

In [None]:
# Row count after filtering, plus two random rows as a sanity check.
print(len(phenotype))
phenotype.sample(2)

In [None]:
def function(row):
    """Collapse a phenotype row's ``SampleType`` into a coarse class label.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide a ``"SampleType"`` entry.

    Returns
    -------
    str
        ``"Tumor"`` for "Primary Tumor" and "Primary Solid Tumor",
        ``"Healthy"`` for "Solid Tissue Normal" and "Normal Tissue",
        and the literal string ``"NaN"`` for anything else.
    """
    # "Primary Solid Tumor" is one of the sample types this notebook selects
    # (see SelectedSampleTypes); it previously fell through to "NaN", so those
    # samples were kept but counted as neither Tumor nor Healthy.
    labels = {
        "Primary Tumor": "Tumor",
        "Primary Solid Tumor": "Tumor",
        "Solid Tissue Normal": "Healthy",
        "Normal Tissue": "Healthy",
    }
    # The fallback stays the *string* "NaN" (not a float NaN) so existing
    # consumers of SampleType2 see the same value for unknown sample types.
    return labels.get(row["SampleType"], "NaN")

In [None]:
# Derive the coarse Tumor/Healthy label for every sample.
# `assign` builds a new frame instead of setting a column on the filtered
# slice produced above, which avoids pandas' SettingWithCopyWarning.
phenotype = phenotype.assign(SampleType2=phenotype.apply(function, axis=1))

In [None]:
# Row count and two random rows, now including the SampleType2 column.
print(len(phenotype))
phenotype.sample(2)

In [None]:
# Distinct primary sites in order of first appearance; missing values
# (which stringify as 'nan') are left out.
PrimarySite = [
    site
    for site in phenotype["PrimarySite"].unique().tolist()
    if str(site) != 'nan'
]
print(PrimarySite)

In [None]:
#PrimarySite = ["Brain","Breast","Spleen","Testis"]; print(PrimarySite)
#phenotype = phenotype[phenotype["PrimarySite"].isin(PrimarySite)]

In [None]:
#columns=["SampleID", "SampleType", "SampleType2", "Study", "PrimarySite", "DetailedCategory"]
#phenotype = phenotype[columns].copy()
# Final check of the phenotype table before it is trimmed to the join columns.
print(len(phenotype))
phenotype.sample(2)

In [None]:
# Slim copy of the phenotype table holding only the columns that are joined
# onto the expression data.
columns = ["SampleID", "SampleType2", "PrimarySite"]
phenotype2 = phenotype[columns].copy()
phenotype2.sample(2)

In [None]:
# chunk_size: rows read per chunk of the expression file.
# chunk_no:   running chunk counter, incremented by the processing loop.
chunk_size, chunk_no = 50, 1

In [None]:
# Column layouts of the two result tables.
cols1 = [
    'GeneSymbol', 'PrimarySite',
    'Avg_Health', 'Avg_Tumor', 'Avg_D',
    'MWstatistic', 'pvalue',
]
cols2 = ['GeneSymbol', 'PrimarySite', 'Error']

# Both tables start empty and are filled by the processing loop.
stats_df = pd.DataFrame(columns=cols1)
error_df = pd.DataFrame(columns=cols2)

In [None]:
%%time
# Stream the expression matrix in chunks of `chunk_size` rows and, for every
# gene / primary-site pair, compare tumor and healthy TPM values with a
# Mann-Whitney U test.
#
# Result rows are collected in plain lists and converted to DataFrames once,
# in the `finally` clause, instead of calling DataFrame.append for every row:
# append copies the whole frame on each call (quadratic cost) and was removed
# in pandas 2.0. The `finally` keeps everything computed so far if the run is
# interrupted.
stats_rows = []
error_rows = []
try:
    for chunk in tqdm(pd.read_csv(data_link, compression='gzip', sep='\t', chunksize = chunk_size), colour="green"):
        print("\x1b[31m","Time:", f'{time.time()-start:.0f}', "seconds.","\x1b[0m")

        # The "sample" column carries the gene identifier. Only its first 15
        # characters are kept before joining on genes_df.EnsemblGeneID
        # (presumably this drops a version suffix -- TODO confirm).
        chunk = chunk.rename(columns={"sample":"EnsemblGeneID"})
        chunk["EnsemblGeneID"] = chunk["EnsemblGeneID"].str.slice(0,15)
        expr_wide = pd.merge(chunk, genes_df, on = "EnsemblGeneID", how="left").drop(columns=["EnsemblGeneID"])
        cleanedgenes = [x for x in expr_wide["GeneSymbol"].unique().tolist() if str(x) != 'nan']

        # Wide (one column per sample) -> long (one row per gene/sample), then
        # attach each sample's primary site and Tumor/Healthy label. The inner
        # join drops samples that are not in phenotype2.
        expr_long = pd.melt(expr_wide, id_vars=["GeneSymbol"], var_name = "SampleID", value_name ="TPM")
        expr_long = expr_long[expr_long["GeneSymbol"].notna()]
        annotated = pd.merge(expr_long, phenotype2, on="SampleID", how="inner")
        annotated = annotated[['GeneSymbol', 'PrimarySite', 'SampleType2', 'TPM']]
        print("Number of PrimarySites", "\x1b[31m", str(len(annotated.PrimarySite.unique())),"\x1b[0m")

        for gene in cleanedgenes:
            print("\x1b[31m",gene, "\x1b[0m")
            gene_df = annotated[annotated["GeneSymbol"] == gene]
            # A loop-local name is used so the notebook-level `PrimarySite`
            # list is not overwritten by the sites of the last gene processed.
            gene_sites = [x for x in gene_df["PrimarySite"].unique().tolist() if str(x) != 'nan']

            for site in gene_sites:
                print(site)
                site_df = gene_df[gene_df["PrimarySite"] == site]
                tumor_tpm = site_df.loc[site_df["SampleType2"] == "Tumor", "TPM"]
                healt_tpm = site_df.loc[site_df["SampleType2"] == "Healthy", "TPM"]
                Avg_Tumor = tumor_tpm.mean()
                Avg_Healt = healt_tpm.mean()
                Avg_D = Avg_Tumor - Avg_Healt

                try:
                    u, p = scipy.stats.mannwhitneyu(tumor_tpm.tolist(), healt_tpm.tolist())
                    stats_rows.append([gene, site, f"{Avg_Healt:.3f}", f"{Avg_Tumor:.3f}", f"{Avg_D:.3f}", u, f"{p:.3e}"])
                except ValueError as r:
                    # Log the failure and carry on with the next site. The
                    # message text is stored so the log holds plain strings.
                    error_rows.append([gene, site, str(r)])

        # Development cap: stop once the counter reaches 10. With chunk_no
        # starting at 1 (as set earlier in the notebook) that is after the
        # ninth chunk.
        chunk_no += 1
        if chunk_no == 10:
            print("\n  *  *  *  *  *  *  BREAK  *  *  *  *  *  *  \n")
            break
finally:
    # Fold the collected rows into the result tables in one step each.
    if stats_rows:
        stats_df = pd.concat([stats_df, pd.DataFrame(stats_rows, columns = cols1)], ignore_index = True)
    if error_rows:
        error_df = pd.concat([error_df, pd.DataFrame(error_rows, columns = cols2)], ignore_index = True)

print("\n  *  *  *  *  *  *  THE END  *  *  *  *  *  *  \n")

In [None]:
# Write both result tables as tab-separated files without the index column.
for frame, path in ((stats_df, "TTG_stats.csv"), (error_df, "TTG_error_log.csv")):
    frame.to_csv(path, sep="\t", index=False)

In [None]:
# Report the total wall-clock time since the first cell, in red.
elapsed_seconds = f'{time.time()-start:.0f}'
print("\x1b[31m","'DataExploration_v005' script run time:", elapsed_seconds, "seconds.","\x1b[0m")

In [None]:
=   =   =   =   S  T  O  P   =   =   =   = 