In [1]:
# cBioPortal_TCGA_MutationLandscape_v01
# Alberto Bejarano (2024)
# Record the wall-clock start so the last cell can report total run time.
import time
start = time.time()

# Print the launch timestamp (e.g. "June 11, 2024 00:12:39") for provenance.
from datetime import datetime
print(datetime.now().strftime("%B %d, %Y %H:%M:%S"))

June 11, 2024 00:12:39


In [2]:
# https://pybioportal.readthedocs.io/en/latest/index.html
# https://github.com/Matteo-Valerio/pyBioPortal/tree/master/examples
# https://github.com/LLCampos/pybioportal
# https://pypi.org/project/pybioportal/
# https://www.biostars.org/p/106127/
# https://docs.cbioportal.org/user-guide/faq/#what-are-mrna-and-microrna-z-scores
# https://www.ebi.ac.uk/gxa/FAQ.html

In [3]:
#!pip install pybioportal
#!conda install -c matteo.valerio pybioportal
#!pip install autograd

In [4]:
#%config InlineBackend.figure_format = "retina"
#%matplotlib inline

In [None]:
from pybioportal import server_running_status as srs
from pybioportal import molecular_profiles as mf
from pybioportal import molecular_data as md
from pybioportal import mutations as mts
from pybioportal import samples as sp
#          -          -          -          -          -          -          -          -          -          -          -  
import pandas as pd
# Notebook-wide pandas display settings.
# 'display.precision' used to be set twice (2, then 3); only the final value
# ever applied, so it is set once here.
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 3)
# Explicit float formatter: floats render with three decimals.
pd.options.display.float_format = '{:.3f}'.format

#        -         -        -        -         -        -         -        -        -         -
import warnings; warnings.filterwarnings("ignore")
#from scipy.stats import mannwhitneyu, normaltest
#from statannotations.Annotator import Annotator
#from statannot import add_stat_annotation
import matplotlib.pyplot as plt
import matplotlib.transforms
import autograd.numpy as np
import seaborn as sns

In [None]:
# Query the server status through pybioportal before issuing data requests;
# being the last expression, the returned status is shown as the cell output.
srs.get_server_status()

In [None]:
# Candidate gene panels kept side by side. Previously these were successive
# reassignments of GenesOfInterest, so only the last one (['TP53']) ever took
# effect; the alternatives are now named and exactly one is selected below.
_GENE_PANELS = {
    "checkpoint": ["PDCD1", "CD274", "CTLA4"],
    # NOTE(review): presumably numeric gene IDs for the checkpoint panel; the
    # downstream filter matches on "Approved symbol", so these would match nothing.
    "checkpoint_numeric_ids": ["5133", "29126", "1493"],
    "drivers": ['EGFR', 'ERBB2', 'PTEN', 'CCND1', 'TP53', 'CDKN1B', 'KRAS', 'NRAS', 'HRAS', 'BRAF', 'PIK3CA', 'TTN'],
    "tp53_only": ['TP53'],
}

# Active panel (approved symbols); change the key to switch panels.
GenesOfInterest = list(_GENE_PANELS["tp53_only"])

In [None]:
# Build the symbol <-> Entrez ID lookup for the genes of interest from the
# tab-separated HGNC table; malformed lines are reported as warnings.
genes_df = (
    pd.read_csv("./metadata/HGNC_table.txt", header=0, sep='\t', quotechar='"', on_bad_lines="warn")
    # Rows without an NCBI Gene ID cannot be queried by Entrez ID, so drop them
    # before casting the ID column to int.
    .dropna(subset=['NCBI Gene ID'])
    .astype({"NCBI Gene ID": int})
    # Keep only the requested symbols and the two columns used downstream.
    .loc[lambda hgnc: hgnc["Approved symbol"].isin(GenesOfInterest), ['Approved symbol', 'NCBI Gene ID']]
    .rename({'NCBI Gene ID':'entrezGeneId', 'Approved symbol':'ApprovedSymbol'}, axis=1)
)
genes_df.sample()

In [None]:
# Sorted, de-duplicated approved symbols found in the HGNC subset.
symbol_values = genes_df['ApprovedSymbol'].astype(str).unique().tolist()
genes = sorted(symbol_values)
print(genes)

In [None]:
# Entrez IDs as strings, de-duplicated; note the sort is lexical, not numeric.
entrez_values = genes_df['entrezGeneId'].astype(str).unique().tolist()
NCBIgenes = sorted(entrez_values)
print(NCBIgenes)

In [None]:
# Fetch the molecular-profile table via pybioportal; later cells filter it on
# its 'molecularProfileId' column. Three random rows are shown as a preview.
df = mf.get_all_molecular_profiles()
df.sample(3)

In [None]:
# Profile IDs whose name contains '_tcga_'.
all_profile_ids = df.molecularProfileId.unique().tolist()
molProfile00 = [profile_id for profile_id in all_profile_ids if '_tcga_' in profile_id]
# Preview only: the first 100 matches are taken and then sorted for display.
print('\033[33m', sorted(molProfile00[:100]), '\033[0m')

In [None]:
# Substrings that a molecularProfileId must ALL contain to count as a mutation
# profile (they are AND-ed together when df_mut is built).
mutProfile = ['_tcga_mutations'] # '_mutations', '_tcga_', 

In [None]:
# Substring that selects the expression profiles to keep.
molProfiles = "_tcga_rna_seq_v2_mrna"

# Substrings that disqualify a profile ID; they are joined with '|' into a
# single pattern when the expression profiles are filtered.
mol_filterout = [
    "_median_Zscores",
    "_median_all_sample_Zscores",
    "_gistic",
    "_linear_CNA",
    "_mutations",
    "_methylation_hm450",
    "_pan_can_atlas_",
    "rppa",
    "log2CNA",
    "structural_variants",
    "protein_quantification",
    "U133",
    "methylation_hm27",
    "tcga_mrna",
]

In [None]:
profile_ids = df['molecularProfileId']

# Expression profiles: match the wanted substring and none of the excluded ones.
is_excluded = profile_ids.str.contains('|'.join(mol_filterout))
is_wanted = profile_ids.str.contains(molProfiles)
df_exp = df[~is_excluded & is_wanted]

# Mutation profiles: the ID must contain every substring listed in mutProfile.
substring_masks = [profile_ids.str.contains(fragment) for fragment in mutProfile]
condition = substring_masks[0]
for extra_mask in substring_masks[1:]:
    condition = condition & extra_mask
df_mut = df[condition]

In [None]:
# NOTE(review): this rebinds molProfiles from the substring pattern to the list
# of matching profile IDs, so the filter cell above cannot be re-run afterwards
# without first re-running the cell that defines the pattern.
molProfiles = df_exp.molecularProfileId.unique().tolist()
print(len(molProfiles))
print('\033[33m', sorted(molProfiles), '\033[0m')

In [None]:
# Distinct mutation profile IDs, with their count and a sorted listing.
mut_profiles = df_mut.molecularProfileId.unique().tolist()
print(len(mut_profiles))
print('\033[33m', sorted(mut_profiles), '\033[0m')

In [None]:
%%time
# Fetch mutation records for the selected genes across the mutation profiles,
# then keep only the four columns used downstream.
mutation_records = mts.fetch_muts_in_multiple_mol_profs(entrez_gene_ids=NCBIgenes, molecular_profile_ids=mut_profiles)
mut_data = mutation_records[['sampleId', 'entrezGeneId', 'studyId', 'mutationType']].copy()
print('\033[31m', len(mut_data), '\033[0m')

# Distinct mutation types present in the fetched records.
mutationTypes = mut_data.mutationType.unique().tolist()
print('\033[33m', mutationTypes, '\033[0m')
mut_data.sample(3)

In [None]:
%%time
# Fetch samples, passing the expression profile IDs as sample-list IDs.
# NOTE(review): assumes each profile ID is also a valid sample-list ID - confirm.
sample_records = sp.fetch_samples(sample_list_ids=molProfiles)
sampl_data = sample_records[['sampleId', 'studyId', 'sampleType']].copy()

# Distinct sample types, then the number of sample rows.
sampleTypes = sampl_data.sampleType.unique().tolist()
print('\033[33m', sampleTypes, '\033[0m')
print('\033[31m', len(sampl_data), '\033[0m')
sampl_data.sample(3)

In [None]:
# Sample rows per sample type, with missing values counted as their own bucket.
sampl_data.sampleType.value_counts(dropna=False)

In [None]:
# Build one row per (sample, gene) and attach any mutation calls to it.
#
# The previous merge joined on 'sampleId' only, which split 'studyId' into
# studyId_x / studyId_y, never attached 'ApprovedSymbol', and left unmutated
# samples with a missing mutationType, so the cells below that select
# ['ApprovedSymbol', 'studyId', 'impact'] could not run.

# Sample x gene grid: every sample gets a row for every gene of interest, so a
# sample without a call for a gene is kept and labelled instead of being lost.
sample_gene_grid = sampl_data.merge(genes_df, how='cross')

# Align the key dtype with genes_df, whose entrezGeneId column was cast to int.
# NOTE(review): assumes the fetched entrezGeneId values are int-convertible.
mutation_calls = mut_data[['sampleId', 'studyId', 'entrezGeneId', 'mutationType']].astype({'entrezGeneId': int})

# Left join keeps every grid row; a sample with several calls in the same gene
# yields one row per call.
data = sample_gene_grid.merge(mutation_calls, on=['sampleId', 'studyId', 'entrezGeneId'], how='left')
data['mutationType'] = data['mutationType'].fillna('No_Mutation')
data['studyId'] = data['studyId'].str.replace('_tcga', '', regex=False)
data = data[['sampleId', 'studyId', 'sampleType', 'ApprovedSymbol', 'mutationType']].copy()
print('\033[31m', len(data), '\033[0m')
# Cap the preview size so a small result does not make .sample() raise.
data.sample(min(10, len(data)))

In [None]:
# Distinct sample types in the merged table.
sampleTypes = data.sampleType.unique().tolist()
print('\033[33m', sampleTypes, '\033[0m')

In [None]:
# Distinct mutation types in the merged table (overrides the earlier list).
mutationTypes = data.mutationType.unique().tolist()
print('\033[33m', mutationTypes, '\033[0m')

In [None]:
# Rows per sample type in the merged table, missing values included.
data.sampleType.value_counts(dropna=False)

In [None]:
# Mutation type -> coarse impact class used for grouping and colouring.
impact_mapping = {
    'No_Mutation':            'No_impact',

    'Translation_Start_Site': 'Severe',
    'Nonsense_Mutation':      'Severe',
    'Frame_Shift_Del':        'Severe',
    'Frame_Shift_Ins':        'Severe',

    'Missense_Mutation':      'Moderate',
    'Splice_Region':          'Moderate',
    'Splice_Site':            'Moderate',

    'In_Frame_Del':           'Mild',
    'In_Frame_Ins':           'Mild',

}

# A missing mutationType means no call was joined for that row; treat it as
# 'No_Mutation' so it maps to 'No_impact' rather than to NaN (without this the
# 'No_Mutation' key above is never reached when the column still holds NaN).
# Types absent from the mapping still come out as NaN.
# NOTE(review): 'Nonstop_Mutation' appears in mutationType_palette but has no
# entry here - decide which impact class it belongs to.
data['impact'] = data['mutationType'].fillna('No_Mutation').map(impact_mapping)

In [None]:
# Rows per mutation type, missing values included.
data.mutationType.value_counts(dropna=False)

In [None]:
# Rows per impact class; a NaN bucket here flags mutation types that
# impact_mapping does not cover.
data.impact.value_counts(dropna=False)

In [None]:
# Distinct impact classes present after mapping.
impactTypes = data.impact.unique().tolist()
print('\033[33m', impactTypes, '\033[0m')

In [None]:
# Random five-row preview of the merged table, including the impact column.
data.sample(5)

In [None]:
# Count rows per (gene, study, impact class).
data_df2 = data.loc[:, ["ApprovedSymbol", "studyId", "impact"]].copy()
impact_groups = data_df2.groupby(["ApprovedSymbol", "studyId", "impact"])["impact"]
count_df01 = (
    impact_groups.count()
    .reset_index(name="IMPACT_count")
    # Genes ascending, studies descending, then impact class and count.
    .sort_values(['ApprovedSymbol', 'studyId', 'impact', 'IMPACT_count'], ascending=[True, False, True, False])
)
count_df01.head(10)

In [None]:
# Stacked bars: for each gene, the share of rows per impact class in each study.
#
# Reads data_df2 (ApprovedSymbol / studyId / impact) from the cell above. The
# earlier version referenced an undefined `data_df` with columns and category
# labels that do not exist in this notebook, and drew the bars twice on the
# same axes.
impact_order  = ['Severe', 'Moderate', 'Mild', 'No_impact']     # bottom -> top of each bar
impact_colors = ['#ff4747', '#ffb96e', '#77ab5c', '#afc1c4']    # same order as impact_order

for gene in GenesOfInterest:
    # Rows with a missing study or impact cannot be tabulated.
    gene_rows = data_df2.loc[data_df2.ApprovedSymbol == gene, ["studyId", "impact"]].dropna()
    if gene_rows.empty:
        print(f"No rows with an impact class for {gene}; skipping plot.")
        continue
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    # Row-normalised crosstab: each study's classes sum to 1. reindex fixes the
    # column order and adds a zero column for any class absent from this gene.
    norm_df = pd.crosstab(gene_rows.studyId, gene_rows.impact, normalize="index").rename_axis("test", axis=1)
    norm_df = norm_df.reindex(columns=impact_order, fill_value=0.0)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    fig, ax  = plt.subplots(figsize=(25/2.54, 10/2.54), dpi=150)
    norm_df.plot(ax=ax, kind='bar', stacked=True, width=0.95, color=impact_colors)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    ax.set_title(gene, fontsize=14, color="#7E7E7E", pad=10)
    ax.set_xlabel("", fontsize=0); ax.set_ylabel("Proportion of Samples per Variant Effect", fontsize=10, color="#7E7E7E", labelpad=8)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -    
    ax.set_ylim(-.01, 1.01); ax.set_yscale('linear'); sns.despine(left=True, bottom=True)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -        
    plt.xticks(fontsize=8, color="#7E7E7E", rotation=40, ha="center")
    ax.tick_params(axis='x', which='major', direction='in', colors="#7E7E7E", pad=2, length=2)
    plt.yticks(fontsize=8, color="#7E7E7E", rotation=0, ha="center")
    ax.tick_params(axis='y', which='major', direction='in', colors="#7E7E7E", pad=10, length=3)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    # Faint horizontal guides every 0.2.
    kws_lines = {"ls":"--", 'color':'#7E7E7E', "alpha":0.3, "linewidth":.5}
    for guide_y in (0.2, 0.4, 0.6, 0.8, 1.0):
        ax.axhline(y=guide_y, **kws_lines)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -   
    ax.set_facecolor('#f2e8df'); fig.patch.set_facecolor('#f2e8df')
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    # One legend, placed outside the axes, so the colours can be read. The
    # earlier code built a legend and immediately replaced it with an empty one.
    ax.legend(fontsize=6, bbox_to_anchor=(1.02,1), loc=2, borderaxespad=0.1)
    plt.tight_layout()
    #plt.savefig("tcga.png", format="png", bbox_inches="tight", dpi=300)
    # Show, then close, so figures do not accumulate across genes.
    plt.show(); plt.close(fig)

In [None]:
# Hue ordering for later plots, taken from whichever cell last assigned
# mutationTypes.
# NOTE(review): these are mutation-type labels; verify they match the hue
# column of any plot that consumes this list.
hue_order = mutationTypes

In [None]:
# Explicit stop for "Run All", replacing a bare undefined identifier that
# halted execution here with an unexplained NameError.
# NOTE(review): presumably a deliberate stop; delete this cell once the cells
# below are ready to run end to end.
raise RuntimeError("Stop: the cells below this point are work in progress.")

In [None]:
data = pd.merge(data, sampl_data, on='sampleId', how='left')
data = data[['sampleId', 'studyId', 'sampleType', 'ApprovedSymbol', 'mutationType', 'value']].copy()
data['studyId'] = data['studyId'].str.replace('_tcga', '')

In [None]:
%%time
exp_data = md.fetch_molecular_data(entrez_gene_ids=NCBIgenes, molecular_profile_ids=molProfiles)
exp_data = exp_data[['sampleId', 'entrezGeneId', 'studyId', 'value']].copy()
exp_data.sample(3)

In [None]:
# Rows per mutation type (missing values are not counted here).
data.mutationType.value_counts()

In [None]:
# Rows per sample type (missing values are not counted here).
data.sampleType.value_counts()

In [None]:
# log2(value + 1) on the expression column only. The earlier version applied
# the transform to every numeric column, which would also rescale any numeric
# identifier column left in the frame.
# pd.to_numeric raises if 'value' holds non-numeric entries instead of letting
# them pass through untransformed.
data_log2 = data.assign(value=np.log2(pd.to_numeric(data['value']) + 1))

In [None]:
# Random ten-row preview of the log2-transformed table.
data_log2.sample(10)

In [None]:
# Genes actually present in the transformed table (replaces the earlier list).
genes = data_log2.ApprovedSymbol.unique().tolist()
print(genes)

In [None]:
# Colour per mutation type; several types intentionally share a colour.
mutationType_palette = {
    'No_Mutation':            '#00FF00',
    'Frame_Shift_Del':        '#e76f51',
    'Frame_Shift_Ins':        '#f4a261',
    'In_Frame_Del':           '#e76f51',
    'In_Frame_Ins':           '#e76f51',
    'Missense_Mutation':      '#06d6a0',
    'Nonsense_Mutation':      '#06d6a0',
    'Nonstop_Mutation':       '#598392',
    'Splice_Site':            '#598392',
    'Splice_Region':          '#598392',
    'Translation_Start_Site': '#e76f51',
}

In [None]:
# Colour per impact class. 'Moderate' and 'Mild' currently share one colour.
Impact_palette = {
    'No_impact': '#7E7E7E',
    'Severe':    '#FF0000',
    'Moderate':  '#598392',
    'Mild':      '#598392',
}

In [None]:
# One figure per gene: log2 expression per study, with points and boxes split
# by mutation impact class.
#
# Changes from the earlier version:
#   * the loop no longer assigns to the global `data` (it overwrote the merged
#     table with the last gene's subset);
#   * hue is 'impact', so the palette and hue order are the impact ones rather
#     than the mutation-type ones, whose labels could never match;
#   * the boxplot now uses the same palette as the points;
#   * the legend is shown (de-duplicated) instead of being blanked;
#   * each figure is closed after display.
impact_hue_order = list(Impact_palette)   # dict order: No_impact, Severe, Moderate, Mild

# Derive impact from mutationType here so the plot works whether or not
# data_log2 still carries an 'impact' column; a missing type counts as no mutation.
expr_plot_df = data_log2.assign(
    impact=data_log2['mutationType'].fillna('No_Mutation').map(impact_mapping)
)

for gene in sorted(genes):
    gene_df = expr_plot_df.loc[expr_plot_df.ApprovedSymbol == gene]
    # Only the plotted columns need to be complete.
    gene_df = gene_df.dropna(subset=['studyId', 'value', 'impact'])
    if gene_df.empty:
        print(f"No plottable rows for {gene}; skipping.")
        continue
    fig, ax  = plt.subplots(figsize=(25/2.54, 10/2.54), dpi=150)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -       
    # Studies are ordered left to right by decreasing median expression.
    median_df  = gene_df.groupby(['studyId'])['value'].median().sort_values(ascending=False).reset_index()
    indication_list = median_df.studyId.unique().tolist()
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -
    x='studyId'; y='value'; hue="impact"
    shared_kws = {'order':indication_list, 'hue_order':impact_hue_order, 'palette':Impact_palette, 'dodge':True}
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          - 
    str_kws = {'s':5, 'jitter':.45, 'alpha':0.5, 'linewidth':0.2,'edgecolor':'#1E1E1E', **shared_kws}
    box_kws = {'linewidth':0.5, 'width':0.75, 'showcaps':True, 'whis':1.5, 'fliersize':0, 'saturation':0.20, **shared_kws}
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    medianprops=dict(linestyle='-', linewidth=1, color="firebrick", alpha=0.5); capprops=dict(linestyle="-", alpha=.5, linewidth=.3);
    boxprops=dict(linestyle="-", alpha=.3, linewidth=.3); whiskerprops=dict(linestyle="--", linewidth=.3, color="#7E7E7E");
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -    
    sns.stripplot(data=gene_df, x=x, y=y, hue=hue, **str_kws, zorder=0, ax=ax)
    sns.boxplot(data=gene_df,   x=x, y=y, hue=hue, medianprops=medianprops,
                capprops=capprops, boxprops=boxprops, whiskerprops=whiskerprops, **box_kws, zorder=1, ax=ax)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -   
    ax.set_title(gene, fontsize=14, color="#7E7E7E", pad=10)
    ax.set_xlabel("", fontsize=0); ax.set_ylabel("Gene Expression log2[TPM+1]", fontsize=6, color="#7E7E7E", labelpad=5)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -    
    ax.set_ylim(-.9, 15); ax.set_yscale('linear'); sns.despine(left=True, bottom=True)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -        
    ax.tick_params(axis="x", colors="#cfcfcf", pad=0, length=2); ax.tick_params(axis="y", colors="#cfcfcf", pad=0, length=3)
    plt.xticks(fontsize=10, color="#7E7E7E", rotation=60, ha="center"); ax.tick_params(axis='x', which='major', pad=5)
    plt.yticks(fontsize=10, color="#7E7E7E", rotation=0, ha="center"); ax.tick_params(axis='y', which='major', pad=8)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -  
    # Reference bands.
    # NOTE(review): 3.46 / 6.67 / 9.97 look like log2(10+1), log2(100+1) and
    # log2(1000+1); confirm the intended expression cut-offs.
    kws_lines = {"ls":"--", "alpha":0.3, "linewidth":.5}
    ax.axhline(y=3.46, color="g", **kws_lines); ax.axhline(y=6.67, color="orange", **kws_lines); ax.axhline(y=9.97, color="red", **kws_lines)
    kws_box   = {'lw':.2, 'alpha':0.04, 'edgecolor':None}
    ax.axhspan(-1, 3.46, color='green', **kws_box); ax.axhspan(3.46, 6.67, color='orange', **kws_box); ax.axhspan(6.67, 15, color='red', **kws_box)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -       
    #annot_kws1 = {'pairs':pairs, 'data':gene_df, 'x':x, 'y':y, 'order':indication_list, 'hue':hue, 'hue_order':impact_hue_order}
    #annot_params = {"test":"Mann-Whitney", "line_width":0.5, "color":"#7E7E7E", "text_format":"star", 'loc':'inside'}
    #try:
    #    ax, test_results = Annotator(ax, **annot_kws1).reset_configuration().configure(**annot_params, verbose=0).apply_and_annotate()
    #except Exception as inst:
    #    print(inst) 
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -   
    # The strip and box layers can each register the same hue labels; keep one
    # legend entry per label.
    handles, labels = ax.get_legend_handles_labels()
    legend_entries = dict(zip(labels, handles))
    ax.legend(list(legend_entries.values()), list(legend_entries.keys()), fontsize=6, bbox_to_anchor=(1.02,1), loc=2, borderaxespad=0.1)
    #          -          -          -          -          -          -          -          -          -          -          -          -          -          -       
    plt.tight_layout()
    #plt.savefig("tcga.png", format="png", bbox_inches="tight", dpi=300)
    # Show, then close, so figures do not accumulate across genes.
    plt.show(); plt.close(fig)

In [None]:
#df_exp = df[~df['molecularProfileId'].str.contains('|'.join(filterout_01))]
#df_exp = df_exp[df_exp['molecularProfileId'].str.contains(molPrfile_01)]
#          -          -          -          -          -          -          -          -          -          -          - 
# Rebuild the mutation-profile subset: keep profiles whose ID contains every
# substring in mutProfile. The earlier version referenced `molPrfile_02`, which
# is not defined anywhere in this notebook; the identical filter earlier in the
# notebook uses mutProfile.
condition = df['molecularProfileId'].str.contains(mutProfile[0])
for substring in mutProfile[1:]:
    condition = condition & df['molecularProfileId'].str.contains(substring)
df_mut = df[condition]

In [None]:
# Distinct mutation profile IDs from df_mut, with their count and a sorted listing.
molecular_profiles02 = df_mut.molecularProfileId.unique().tolist()
print(len(molecular_profiles02))
print('\033[33m', sorted(molecular_profiles02), '\033[0m')

In [None]:
#molecular_profiles02 = ['acc_tcga_mutations']

In [None]:
#mutationStatuss = mut_data.mutationStatus.unique().tolist(); print('\033[33m', sorted(mutationStatuss), '\033[0m')

In [None]:
# Total wall-clock time since `start` was recorded in the first cell.
elapsed_seconds = time.time() - start
print("'cBioPortal_TCGA_MutationLandscape_v01' script run time:", f'{elapsed_seconds:.0f}', "seconds.")

# 