We will perform dNdS using probabilities for single base substitutions (single nucleotide context)

Do we have patients with 2 mutations in a single gene?

General steps:
1. Get the probabilities for all 6 types of single base substitutions
   - C>A
   - C>G
   - C>T
   - T>A
   - T>C
   - T>G
2. 

In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import poisson
import os

import re
from collections import defaultdict
import pandas as pd
from collections import defaultdict
from itertools import product

# change working directory to project-2
if os.getcwd().split('/')[-1] != 'project-2':
    os.chdir('../../../')

In [68]:
# set paths
METADATA = 'data/raw/TCGA.BRCA.metadata.txt'
MUTATIONS = 'data/processed/TCGA.BRCA.mutations.qc1.txt'
CDS_LENGTHS = 'data/processed/gencode.v23lift37.pc_transcripts.transcripts_in_TCGA_MAF.cds_lengths.tsv'
dnds_opportunities = 'data/processed/dnds_opportunities.tsv'

df_opportunities = pd.read_csv(dnds_opportunities, sep='\t')
df_mut = pd.read_csv(MUTATIONS, sep='\t')
df_meta = pd.read_csv(METADATA, sep='\t')
df_cds = pd.read_csv(CDS_LENGTHS, sep='\t')

In [69]:
# 2. Define categories and order
syn_col = 'synonymous'
non_syn_classes = [
    'Frame_Shift_Del', 
    'Frame_Shift_Ins', 
    'In_Frame_Del', 
    'In_Frame_Ins',  
    'Missense_Mutation', 
    'Nonsense_Mutation', 
    'Nonstop_Mutation',
    'Translation_Start_Site'
]
all_classes = [syn_col] + non_syn_classes

# 3. Group and pivot to get counts per gene per class
mutation_counts = df_mut.groupby(['Hugo_Symbol', 'mutation_class']).size()
counts_df = mutation_counts.unstack(fill_value=0)
# 4. Ensure all classes exist
for col in all_classes:
    if col not in counts_df.columns:
        counts_df[col] = 0

# 5. Reorder columns
counts_df = counts_df[all_classes]

counts_df['non-synonymous'] = counts_df[non_syn_classes].sum(axis=1)

# 6. Filter genes with at least 5 total mutations))
counts_df = counts_df[counts_df.sum(axis=1) >= 5]

# 7. Normalize by CDS length
counts_df = counts_df.join(df_opportunities['CDS_length'], how='left')
df = pd.merge(counts_df, df_opportunities[['Hugo_Symbol','synonymous_opportunity', 'nonsynonymous_opportunity']], how='inner', left_index=True, right_on='Hugo_Symbol')


In [70]:
# adjust nonsynonymous opportunities to include indels
df['Indels'] = df['Frame_Shift_Del'] + df['Frame_Shift_Ins'] + df['In_Frame_Del'] + df['In_Frame_Ins']
df['NS_SNV'] = df['Missense_Mutation'] + df['Nonsense_Mutation'] + df['Nonstop_Mutation']
df['nonsynonymous_opportunity'] = df['nonsynonymous_opportunity'] * (1 + ((df['Indels'] + .5)/(df['NS_SNV'] + .5)).mean())

df = df.set_index('Hugo_Symbol')


In [71]:
df.head()

Unnamed: 0_level_0,synonymous,Frame_Shift_Del,Frame_Shift_Ins,In_Frame_Del,In_Frame_Ins,Missense_Mutation,Nonsense_Mutation,Nonstop_Mutation,Translation_Start_Site,non-synonymous,CDS_length,synonymous_opportunity,nonsynonymous_opportunity,Indels,NS_SNV
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
A1CF,1,0,0,0,0,2,0,0,0,2,,1242,5003.480225,0,2
A2M,1,0,0,0,0,4,2,0,0,6,,3082,12399.823469,0,6
A2ML1,4,0,0,0,0,6,1,0,0,7,,3006,12273.307072,0,7
AADAC,0,0,0,0,0,3,1,0,0,4,,805,3400.128186,0,4
AADACL4,1,0,0,0,0,2,0,0,0,2,,855,3426.89127,0,2


In [72]:


# Mutation count table: SBS_count[ref][alt]
SBS_count = defaultdict(lambda: defaultdict(int))

def normalize_sbs(ref, alt):
    # Normalize purine ref bases to pyrimidine (G→C, A→T; and likewise alt)
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    if ref in ['G', 'A']:
        ref = complement[ref]
        alt = complement[alt]
    return ref, alt

for hgvsc in df_mut['HGVSc']:
    m = re.search(r'c\.\d+([ACGT])>([ACGT])', str(hgvsc))
    if not m:
        continue
    ref, alt = m.group(1), m.group(2)
    norm_ref, norm_alt = normalize_sbs(ref, alt)
    SBS_count[norm_ref][norm_alt] += 1

records = []
for ref in ['C', 'T']:
    for alt in ['A', 'G', 'T'] if ref == 'C' else ['A', 'C', 'G']:
        count = SBS_count[ref][alt]
        records.append({'ref': ref, 'alt': alt, 'mutation': f"{ref}>{alt}", 'count': count})

sbs_df = pd.DataFrame(records)

# Count total number of 'C' and 'T' bases across all CDSs
base_counts = defaultdict(int)

for seq in df_cds['CDS sequence']:
    seq = str(seq).upper()
    base_counts['C'] += seq.count('C')
    base_counts['T'] += seq.count('T')

# Multiply by 2 * number of individuals (diploid genomes)
# change num individuals to number of unique patient ids
num_individuals = df_mut['patient_id'].nunique()
multiplier = 2 * num_individuals

# Assign opportunities for each of the 6 SBS types
opportunities = {
    'C>A': base_counts['C'] * multiplier,
    'C>G': base_counts['C'] * multiplier,
    'C>T': base_counts['C'] * multiplier,
    'T>A': base_counts['T'] * multiplier,
    'T>C': base_counts['T'] * multiplier,
    'T>G': base_counts['T'] * multiplier
}

sbs_df['opportunities'] = sbs_df['mutation'].map(opportunities)
sbs_df['rate'] = sbs_df['count'] / sbs_df['opportunities']

# DNA codon table
codon_table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',

    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',

    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',

    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
}

mutation_rate_map = sbs_df.set_index('mutation')['rate'].to_dict()

codon_opportunity = {}

for codon in codon_table:
    syn = 0.0
    nonsyn = 0.0
    ref_aa = codon_table[codon]

    for pos in range(3):
        ref_base = codon[pos]
        for alt_base in "ACGT":
            if alt_base == ref_base:
                continue
            mut_codon = codon[:pos] + alt_base + codon[pos+1:]
            if mut_codon not in codon_table:
                continue
            alt_aa = codon_table[mut_codon]

            norm_ref, norm_alt = normalize_sbs(ref_base, alt_base)
            key = f"{norm_ref}>{norm_alt}"
            rate = mutation_rate_map.get(key, 0.0)

            if ref_aa == alt_aa:
                syn += rate
            else:
                nonsyn += rate

    codon_opportunity[codon] = (syn, nonsyn)

results = []

for idx, row in df_cds.iterrows():
    gene = row['Hugo_Symbol']
    seq = row['CDS sequence'].upper()

    if gene not in df.index:
        continue  # skip genes without observed mutations

    O_NS = df.loc[gene, 'non-synonymous']
    O_S = df.loc[gene, 'synonymous']

    E_S, E_NS = 0.0, 0.0
    for i in range(0, len(seq) - 2, 3):
        codon = seq[i:i+3]
        if codon not in codon_opportunity:
            continue
        syn, nonsyn = codon_opportunity[codon]
        E_S += syn
        E_NS += nonsyn

    E_S = E_S * 2 * num_individuals
    E_NS = E_NS * 2 * num_individuals

    λ = O_S / E_S if E_S > 0 else np.nan
    expected_NS_scaled = λ * E_NS if λ is not None else None
    dnds = O_NS / expected_NS_scaled if expected_NS_scaled else np.nan

    results.append({
        'Hugo_Symbol': gene,
        'synonymous': O_S,
        'observed_nonsynonymous': O_NS,
        'synonymous_opportunity': E_S,
        'nonsynonymous_opportunity': E_NS,
        'λ': λ,
        'dN/dS': dnds
    })

df_dnds = pd.DataFrame(results)


In [73]:
df_dnds.sort_values(by='dN/dS', ascending=False, inplace=True)
df_dnds.head(10)

Unnamed: 0,Hugo_Symbol,synonymous,observed_nonsynonymous,synonymous_opportunity,nonsynonymous_opportunity,λ,dN/dS
5368,TP53,2,246,1.239781,3.097287,1.613188,49.234399
5064,GATA3,2,75,1.755706,3.53089,1.139143,18.646561
1736,PIK3CA,6,291,2.157271,7.328571,2.781292,14.276673
4884,FOXA1,1,25,1.887725,3.85209,0.529738,12.251301
2443,CDH1,3,88,2.675803,6.475851,1.121159,12.120449
831,MAP3K1,3,89,3.749497,11.402825,0.800107,9.755046
5337,MAP2K4,1,25,0.953652,2.888429,1.048601,8.25407
304,ARID1A,1,21,7.259698,18.720393,0.137747,8.143721
36,LRP2,1,24,11.323221,34.319247,0.088314,7.91851
4802,RUNX1,2,31,1.801039,3.840226,1.11047,7.269389


In [74]:
def poisson_pval(x):
    """P-value for observed nonsynonymous mutation count of the gene. Under null, expected should be equal to the observed synonymous mutations times the ratio of nonsynonymous to synonymous opportunities."""
    return 1 - poisson.cdf(x['observed_nonsynonymous']-1, x['synonymous']*x['nonsynonymous_opportunity']/x['synonymous_opportunity'])

df_dnds['poisson_pval'] = df_dnds.apply(lambda x: poisson_pval(x), axis=1)

In [83]:
from src.utils.eval  import evalAccuracy, compareRankings

# load the IntOGen ranking TSV
path_intogen = "data/raw/IntOGen-DriverGenes_TCGA_WXS_BRCA.tsv"
df_intogen = pd.read_csv(path_intogen, sep="\t")

# build a dict mapping gene names to their IntOGen relevance.
#RELEVANCE IS SAMPLES%
baseline_ranks = dict(zip(df_intogen["Symbol"], (df_intogen["Samples (%)"] * 0.01)))

df_dn_ds = df_dnds
#df_dn_df = df_dn_ds['Hugo_Symbol'].dropna()
#dn_ds_ranks = dict(zip(df_dn_ds["Hugo_Symbol"], df_dn_ds["dN/dS"]))

dn_ds_ranks = dict(zip(df_dn_ds["Hugo_Symbol"], 1- df_dn_ds["poisson_pval"]))
#dn_ds_ranks = dict(zip(df_dn_ds["Hugo_Symbol"], df_dn_ds["Hugo_Symbol"]))

# calculate accuracy metrics
dcg, bpref, accuracy = evalAccuracy(dn_ds_ranks, baseline_ranks)

df_gene_ranks = compareRankings(dn_ds_ranks, baseline_ranks)

print(f"Using poisson_pval, DCG: {dcg}, Bpref: {bpref}, Accuracy: {accuracy}")

Using poisson_pval, DCG: 0.5202934952448164, Bpref: 0.17296786389413987, Accuracy: 0.17391304347826086


In [None]:
# load the IntOGen ranking TSV
path_intogen = "data/raw/IntOGen-DriverGenes_TCGA_WXS_BRCA.tsv"
df_intogen = pd.read_csv(path_intogen, sep="\t")

# build a dict mapping gene names to their IntOGen relevance.
#RELEVANCE IS SAMPLES%
baseline_ranks = dict(zip(df_intogen["Symbol"], (df_intogen["Samples (%)"] * 0.01)))

df_dn_ds = df_dnds
df_dn_df = df_dn_ds[df_dn_ds['Hugo_Symbol'].notna()]

dn_ds_ranks = dict(zip(df_dn_ds["Hugo_Symbol"], df_dn_ds["dN/dS"]))

# calculate accuracy metrics
dcg, bpref, accuracy = evalAccuracy(dn_ds_ranks, baseline_ranks)

df_gene_ranks = compareRankings(dn_ds_ranks, baseline_ranks)

print(f"Using dN/dS, DCG: {dcg}, Bpref: {bpref}, Accuracy: {accuracy}")

Using dN/dS, DCG: 0.5335676792849361, Bpref: 0.24196597353497168, Accuracy: 0.26086956521739135
