# Visualizing Ras Classification Scores by Oncogenicity Status

The Pathway Working Group from the TCGA PanCancer Atlas curated variants in Ras pathway genes by their expert-predicted oncogenicity status (either oncogenic or unconfirmed). Here, we output two sets of files:

1. A dataframe of Ras classifier scores for each amino acid mutation, annotated with its oncogenicity status
2. Strip plots of Ras classifier scores by oncogenicity status

In [1]:
import os
import csv
import numpy as np
import pandas as pd
from decimal import Decimal
from scipy.stats import ttest_ind

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline
plt.style.use('seaborn-notebook')

In [3]:
# White-grid theme for every figure in the notebook
sns.set(style="whitegrid")

# "paper" context with explicit font sizes for text, titles, labels and ticks
sns.set_context(
    "paper",
    rc={
        "font.size": 14,
        "axes.titlesize": 15,
        "axes.labelsize": 20,
        'xtick.labelsize': 14,
        'ytick.labelsize': 14,
    },
)

In [4]:
# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it gives the same result on every run
np.random.seed(123)

In [5]:
def assign_curation(aa_row, df='ras_variant', variant_df=None):
    """
    Determines if the specific amino acid mutation is cataloged as Oncogenic
    in the Ras pathway or not. To be used as a call to pandas.DataFrame().apply()

    Arguments:
      aa_row - a row that has gene and mutation information; the gene is read
               from the `Hugo_Symbol` field
      df - the type of dataframe called, can either be `ras_variant` or `sample`
           depending on the type of dataframe to assign curation. For
           `ras_variant` the mutation is the row label (`aa_row.name`); for
           `sample` it is read from the `HGVSp` field
      variant_df - curated variant table indexed by gene with a `mutation`
                   column. Defaults to the notebook-level `ras_variant_df`

    Output:
      Oncogenicity Status for the given amino acid mutation: 'Not Ras' when the
      gene is absent from the curated table, 'Oncogenic' when the mutation is
      listed for that gene, and 'Unconfirmed' otherwise

    Raises:
      ValueError if `df` is neither `ras_variant` nor `sample`
    """
    if df == 'ras_variant':
        aa = aa_row.name
    elif df == 'sample':
        aa = aa_row['HGVSp']
    else:
        # Previously an unknown value left `aa` unbound and failed later with
        # an opaque UnboundLocalError; fail early with a clear message instead
        raise ValueError(
            "df must be either 'ras_variant' or 'sample', got {!r}".format(df))

    # Fall back to the global curation table only when none is supplied
    if variant_df is None:
        variant_df = ras_variant_df

    gene = aa_row['Hugo_Symbol']
    ras_sub = variant_df[variant_df.index == gene]
    if len(ras_sub) == 0:
        return 'Not Ras'

    if aa in ras_sub['mutation'].tolist():
        return 'Oncogenic'
    return 'Unconfirmed'

In [6]:
# Input files all sit in the same tables directory
ras_tables_dir = os.path.join('..', 'classifiers', 'RAS', 'tables')
aa_mutation_scores_file = os.path.join(ras_tables_dir,
                                       'amino_acid_mutation_scores.tsv')
ras_variant_curation_file = os.path.join(ras_tables_dir,
                                         'Ras_pathway_variant_oncogenicity_data.tsv')
sample_mutation_scores_file = os.path.join(ras_tables_dir,
                                           'mutation_classification_scores.tsv')

# Output file, written to the same tables directory
aa_out_file = os.path.join(ras_tables_dir, 'RAS_oncogenecity_predictions.tsv')

In [7]:
# Read the tab-separated Ras variant curation table, using its first column
# as the index, then preview the first two rows
ras_variant_df = pd.read_table(
    ras_variant_curation_file,
    index_col=0,
)
ras_variant_df.head(n=2)

Unnamed: 0_level_0,mutation,gene_type,MutSig,hotspots,copy_number_fusion,GISTIC_amp,GISTIC_del,McCormick
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ALK,fusion gene,OG,1,some hotspots & known mutations,fusion gene,,;,yes
ALK,p.Asp1203Tyr,OG,1,some hotspots & known mutations,fusion gene,,;,yes


In [8]:
# Read the per-mutation scores, indexed by the first column
aa_scores = pd.read_table(aa_mutation_scores_file, index_col=0)

# Label every amino acid mutation with its oncogenicity designation
aa_designation = aa_scores.apply(assign_curation, axis=1)
aa_onco_df = aa_scores.assign(designation=aa_designation)

# Save the annotated table as tab-separated text
aa_onco_df.to_csv(aa_out_file, sep='\t')

aa_onco_df.head(n=2)

Unnamed: 0_level_0,Variant_Classification,Hugo_Symbol,Mean,SD,count,low_CI,high_CI,designation
HGVSp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
p.Val600Glu,Missense_Mutation,BRAF,0.379832,0.222155,453,0.360466,0.401139,Oncogenic
p.Gly12Asp,Missense_Mutation,KRAS,0.81998,0.126311,166,0.800135,0.838195,Oncogenic


In [9]:
# Read the per-sample mutation scores, indexed by the first column
mut_scores_df = pd.read_table(sample_mutation_scores_file, index_col=0)

# Keep only missense and nonsense mutations
kept_classes = ['Missense_Mutation', 'Nonsense_Mutation']
is_kept_class = mut_scores_df['Variant_Classification'].isin(kept_classes)
mut_scores_df = mut_scores_df.loc[is_kept_class]

# Restrict to genes that appear in the curated variant table
in_curated_genes = mut_scores_df['Hugo_Symbol'].isin(ras_variant_df.index.tolist())
ras_pathway_scores_df = mut_scores_df.loc[in_curated_genes]

# Attach the curation status of each row's HGVSp mutation
sample_curation = ras_pathway_scores_df.apply(
    lambda sample_row: assign_curation(sample_row, df='sample'), axis=1)
ras_pathway_scores_df = ras_pathway_scores_df.assign(curation=sample_curation)
ras_pathway_scores_df.head(n=2)

Unnamed: 0_level_0,log10_mut,total_status,weight,NRAS,HRAS,KRAS,HRAS_gain,KRAS_gain,NRAS_gain,PATIENT_BARCODE,...,SUBTYPE,hypermutated,include,ID.1,Tumor_Sample_Barcode,Hugo_Symbol,HGVSc,HGVSp,Variant_Classification,curation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-2485-01,1.748188,0.0,0.467431,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-02-2485,...,IDHwt,0.0,0.0,TCGA-02-2485-01,TCGA-02-2485-01A-01D-1494-08,EGFR,c.866C>A,p.Ala289Asp,Missense_Mutation,Oncogenic
TCGA-04-1362-01,1.838849,0.0,0.48437,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-04-1362,...,Not_Applicable,0.0,1.0,TCGA-04-1362-01,TCGA-04-1362-01A-01W-0492-08,SOS1,c.3691C>G,p.Leu1231Val,Missense_Mutation,Unconfirmed


In [10]:
# Rows whose own mutation is curated as Oncogenic
is_oncogenic_row = ras_pathway_scores_df['curation'] == 'Oncogenic'
oncogenic_sample_df = ras_pathway_scores_df.loc[is_oncogenic_row]

# The remaining set is chosen by index label, not by row: any row sharing an
# index label with an Oncogenic row is left out, even if that row itself is
# not Oncogenic
shares_oncogenic_label = ras_pathway_scores_df.index.isin(oncogenic_sample_df.index)
unconfirmed_sample_df = ras_pathway_scores_df.loc[~shares_oncogenic_label]

# Stack both groups back together, Oncogenic rows first
filtered_df = pd.concat([oncogenic_sample_df, unconfirmed_sample_df])

In [11]:
# Generate and Save plots
plt.rcParams['figure.figsize']=(3.5, 4)
t_test_results = []
x1, x2 = 0, 1

# Make sure the figure directory exists; without it every savefig call fails
fig_dir = os.path.join('..', 'figures', 'variants')
os.makedirs(fig_dir, exist_ok=True)

# Pin tick positions together with their labels. Setting labels alone attaches
# them to whatever ticks matplotlib auto-selects, which mislabels the axis
# whenever the automatic ticks are not exactly 0, 0.2, ..., 1.2.
y_ticks = [0, 0.2, 0.4, 0.6, 0.8, 1, 1.2]
y_tick_labels = [0, 0.2, 0.4, 0.6, 0.8, 1, '']

# Iterate genes in sorted order so the t-test table is written in a
# reproducible order (set iteration order of strings varies between runs)
genes = sorted(mut_scores_df['Hugo_Symbol'].dropna().unique(), key=str)

for gene in genes:
    gene_mutation_score_df = filtered_df[filtered_df['Hugo_Symbol'] == gene]
    gene_mutation_score_df = gene_mutation_score_df.dropna(axis=0, subset=['weight'])
    fig_name = os.path.join(fig_dir, 'variant_prediction_{}.pdf'.format(gene))

    fig = None
    try:
        # perform an independent t-test for prediction scores by oncogenicity
        # (Welch's test, since equal variances are not assumed)
        oncogenic_scores = (
            gene_mutation_score_df.loc[
                gene_mutation_score_df['curation'] == 'Oncogenic', 'weight']
            )
        unconfirmed_scores = (
            gene_mutation_score_df.loc[
                gene_mutation_score_df['curation'] == 'Unconfirmed', 'weight']
            )

        t_results = ttest_ind(a = oncogenic_scores,
                              b = unconfirmed_scores, equal_var = False)
        add_result = [gene, t_results.pvalue, t_results.statistic]
        t_test_results.append(add_result)

        # Nothing to draw for genes without any scored sample
        if gene_mutation_score_df.empty:
            continue

        # Setup p value annotation: bracket sits just above the highest point
        max_val = gene_mutation_score_df['weight'].max()
        y, h = max_val + 0.06, 0.05

        # Plot on an explicit figure so it can always be closed afterwards
        fig, ax = plt.subplots()
        sns.stripplot(x='curation', y='weight', data=gene_mutation_score_df,
                      palette = {'Oncogenic': "seagreen", 'Unconfirmed': 'goldenrod'},
                      jitter=0.35, size=3.25, alpha=0.65, ax=ax)
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(y_tick_labels)
        # Set limits after the ticks: set_yticks may widen the view limits
        ax.set_ylim(0, max_val + 0.2)
        ax.set_ylabel('Ras Classifier Score')
        ax.set_xlabel(gene)
        ax.axhline(0.5, color='grey', linestyle='dashed', linewidth=2)

        # Only display t-test bars if there are two classes of data
        if len(oncogenic_scores) != 0 and len(unconfirmed_scores) != 0:
            ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c='black')
            ax.text(.5, y+h, "{:.2E}".format(Decimal(t_results.pvalue)),
                    ha='center', va='bottom', color="black")

        fig.tight_layout()
        fig.savefig(fig_name)

    except Exception as error:
        # Keep the best-effort behaviour (one bad gene must not stop the run)
        # but report the failure instead of silently discarding it
        print('Skipping {}: {}'.format(gene, error))

    finally:
        # Always release the figure so a failed plot cannot leak into the next
        if fig is not None:
            plt.close(fig)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [12]:
# Write out full t-test results
t_test_file = os.path.join('..', 'results', 'ras_t_test_oncogenicity.tsv')
# newline='' hands line-ending control to the csv module; without it the
# writer's '\r\n' terminator is translated again on Windows, producing
# blank rows between records
with open(t_test_file, 'w', newline='') as csvfile:
    onco_writer = csv.writer(csvfile, delimiter='\t')
    onco_writer.writerow(['gene', 'p_value', 't_statistic'])
    # Each entry is one [gene, p value, t statistic] row
    onco_writer.writerows(t_test_results)