In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from scipy.stats import poisson
from scipy.stats import fisher_exact

In [61]:
# Change the working directory to project-2 so the relative paths below resolve.
# os.path.basename splits on the platform's own separator; splitting the path
# on '/' never matches on Windows, which made this guard always true there.
# NOTE(review): the three-level hop assumes the notebook lives three
# directories below project-2 -- confirm if the notebook is moved.
if os.path.basename(os.getcwd()) != 'project-2':
    os.chdir('../../../')

# set paths (all relative to the project-2 root)
METADATA = 'data/raw/TCGA.BRCA.metadata.txt'
MUTATIONS = 'data/processed/TCGA.BRCA.mutations.qc1.txt'
DNDS = 'results/tables/dnds_simple_results.tsv'
CDS_LENGTHS = 'data/processed/gencode.v23lift37.pc_transcripts.transcripts_in_TCGA_MAF.cds_lengths.tsv'
path_intogen = "data/raw/IntOGen-DriverGenes_TCGA_WXS_BRCA.tsv"

# load every table; all inputs are read as tab-separated text
df_mut = pd.read_csv(MUTATIONS, sep='\t')
df_meta = pd.read_csv(METADATA, sep='\t')
df_dnds = pd.read_csv(DNDS, sep='\t')
df_cds = pd.read_csv(CDS_LENGTHS, sep='\t')
df_intogen = pd.read_csv(path_intogen, sep="\t")

In [62]:
# List the column labels available in the dN/dS results table.
df_dnds.columns

Index(['synonymous', 'Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del',
       'In_Frame_Ins', 'Missense_Mutation', 'Nonsense_Mutation',
       'Nonstop_Mutation', 'Translation_Start_Site', 'CDS_length',
       'Hugo_Symbol', 'synonymous_opportunity', 'nonsynonymous_opportunity',
       'observed_nonsynonymous', 'Indels', 'NS_SNV', 'dS', 'dN', 'dN/dS',
       'poisson_pval', 'fisher_odds', 'fisher_pval', 'chi2', 'chi2_pval'],
      dtype='object')

In [63]:
# Preview the first five rows of the IntOGen driver-gene table.
df_intogen.head(n=5)

Unnamed: 0,Symbol,Mutations,Samples,Samples (%)
0,HRAS,5,5,0.5
1,KLF4,5,4,0.4
2,KRAS,6,6,0.59
3,JAK2,6,6,0.59
4,SMAD4,6,5,0.5


In [64]:
# Restrict the dN/dS results to genes whose symbol appears in the IntOGen table,
# keeping only the columns needed for the comparison, ordered by synonymous count.
intogen_columns = ['Hugo_Symbol', 'synonymous', 'observed_nonsynonymous', 'fisher_pval', 'dN/dS']
in_intogen = df_dnds['Hugo_Symbol'].isin(df_intogen['Symbol'])
intersect = df_dnds.loc[in_intogen, intogen_columns].sort_values(by='synonymous')

# Apply the 0.05 cutoff to the UNROUNDED p-values. Filtering after rounding to
# two decimals would turn a p-value such as 0.047 into 0.05 and wrongly drop it.
is_significant = intersect['fisher_pval'] < 0.05

# Round only for display; a gene just under the cutoff can therefore show 0.05.
intersect['fisher_pval'] = intersect['fisher_pval'].round(2)
significant_intogen = intersect[is_significant]

In [68]:
# Collect every gene whose Fisher p-value is below 0.05, indexed by gene symbol.
passes_cutoff = df_dnds['fisher_pval'] < 0.05
hits = df_dnds[passes_cutoff].set_index('Hugo_Symbol')

# Keep the two reported columns, highest dN/dS first.
hits = hits[['dN/dS', 'fisher_pval']].sort_values(by='dN/dS', ascending=False)

# Rows with a missing value in either column are removed; dN/dS is shown with 2 decimals.
hits = hits.dropna().round({'dN/dS': 2})

# Write p-values as scientific-notation strings (three decimals) in the saved table.
hits['fisher_pval'] = hits['fisher_pval'].map(lambda pval: format(pval, '.3e'))
hits.to_csv('results/tables/dnds_significant.csv')

In [43]:
# Display the IntOGen genes that fall below the 0.05 Fisher p-value cutoff.
significant_intogen

Unnamed: 0,Hugo_Symbol,synonymous,observed_nonsynonymous,fisher_pval,dN/dS
111,AKT1,0,24,0.01,
304,BRCA2,0,18,0.03,
2679,TBX3,0,22,0.0,
2178,PTEN,0,42,0.0,
2413,SF3B1,0,15,0.04,
1529,MAP2K4,1,25,0.03,5.933658
204,ARID1A,1,21,0.04,5.530878
973,FOXA1,1,25,0.02,6.653073
1009,GATA3,2,75,0.0,10.145395
2346,RUNX1,2,31,0.01,4.379868


In [46]:
# Rank the IntOGen genes by total mutation count (nonsynonymous + synonymous),
# stored in a 'sum' column, with the most-mutated genes first.
mutation_total = intersect['observed_nonsynonymous'] + intersect['synonymous']
intersect = intersect.assign(sum=mutation_total).sort_values(by='sum', ascending=False)
intersect

Unnamed: 0,Hugo_Symbol,synonymous,observed_nonsynonymous,fisher_pval,dN/dS,sum
2035,PIK3CA,6,291,0.0,10.86727,297
2770,TP53,2,246,0.0,32.042587,248
1530,MAP3K1,3,89,0.0,7.620841,92
438,CDH1,3,88,0.0,7.693293,91
1009,GATA3,2,75,0.0,10.145395,77
1390,KMT2C,5,71,0.0,3.50663,76
2178,PTEN,0,42,0.0,,42
1718,NCOR1,3,30,0.07,2.545662,33
2346,RUNX1,2,31,0.01,4.379868,33
923,FAT3,6,22,0.67,0.921933,28
