In [30]:
import pandas as pd
import os

In [49]:
import math

def evalAccuracy(calculated_ranks, baseline_ranks):
    """
    Score a computed gene ranking against a baseline gene set.

    Genes are ordered by descending ``calculated_ranks`` score. Every key of
    ``baseline_ranks`` is treated as a relevant (driver) gene; every other
    gene in the ranking is treated as non-relevant.

    Args:
        calculated_ranks: dict mapping gene -> score; higher scores rank first.
        baseline_ranks: dict mapping gene -> relevance value, used as the
            exponent of the DCG gain.

    Returns:
        Tuple ``(dcg, bpref, accuracy)``:
          - dcg: sum over ranked baseline genes of
            (2^relevance - 1) / log2(position + 1), with 1-based positions.
            The value is NOT normalised, so it is only bounded by 1 when the
            relevance values happen to make it so.
          - bpref: mean over the R baseline genes of
            1 - min(non-baseline genes ranked above, R) / R. Baseline genes
            absent from the ranking contribute 0. 0.0 when R == 0.
          - accuracy: fraction of the R baseline genes that appear among the
            top R ranked genes. 0.0 when R == 0.
    """
    relevant = set(baseline_ranks.keys())
    n_relevant = len(relevant)
    ordering = sorted(calculated_ranks, key=calculated_ranks.get, reverse=True)

    # Single pass over the ranking: both metrics only look at baseline genes
    # and at how many genes precede them.
    dcg = 0.0
    bpref_total = 0.0
    hits_so_far = 0
    for position, gene in enumerate(ordering):
        if gene not in relevant:
            continue
        # position is 0-based, hence log2(position + 2) for the 1-based rank + 1
        dcg += ((2 ** baseline_ranks[gene]) - 1) / math.log2(position + 2)
        # everything ranked above that was not a baseline gene
        misses_above = position - hits_so_far
        # the penalty is capped at R, so one term never drops below 0
        bpref_total += 1 - min(misses_above, n_relevant) / n_relevant
        hits_so_far += 1

    if n_relevant > 0:
        bpref = bpref_total / n_relevant
        # baseline genes that did not make it into the top-R slice
        absent = len(relevant - set(ordering[:n_relevant]))
        accuracy = 1 - (absent / n_relevant)
    else:
        bpref = 0.0
        accuracy = 0.0

    return dcg, bpref, accuracy

In [36]:
# Load the IntOGen driver-gene table (tab-separated, path relative to the working directory).
path_intogen = "IntOGen-DriverGenes_TCGA_WXS_BRCA.tsv"
df_intogen = pd.read_csv(path_intogen, sep="\t")
# Map each gene Symbol to its "Samples (%)" value scaled by 0.01 (percentage -> fraction).
# The stored value is a score, not an ordinal rank: a larger fraction means the gene sorts higher.
intogen_ranks = dict(zip(df_intogen["Symbol"], (df_intogen["Samples (%)" ]* 0.01)))

In [35]:
# Bare expression: renders the current gene -> score mapping as the cell output.
intogen_ranks

{'HRAS': 0.005,
 'KLF4': 0.004,
 'KRAS': 0.0059,
 'JAK2': 0.0059,
 'SMAD4': 0.005,
 'ZXDB': 0.0079,
 'ACVR1B': 0.0099,
 'CDKN1B': 0.0089,
 'EIF4A2': 0.0099,
 'ABL2': 0.0099,
 'TBL1XR1': 0.0079,
 'NCOR2': 0.010900000000000002,
 'FGFR2': 0.011899999999999999,
 'EP300': 0.0149,
 'CASP8': 0.0149,
 'H3C2': 0.0139,
 'FBXW7': 0.0129,
 'RGS7': 0.011899999999999999,
 'AFDN': 0.0129,
 'KDM6A': 0.011899999999999999,
 'MED23': 0.0139,
 'ERBB3': 0.0188,
 'SF3B1': 0.0198,
 'CTCF': 0.021800000000000003,
 'CBFB': 0.021800000000000003,
 'BRCA2': 0.0178,
 'RB1': 0.022799999999999997,
 'AKT1': 0.0278,
 'BRCA1': 0.023799999999999998,
 'TBX3': 0.022799999999999997,
 'PIK3R1': 0.023799999999999998,
 'ERBB2': 0.0258,
 'FOXA1': 0.029700000000000004,
 'ARID1A': 0.028700000000000003,
 'FAT3': 0.0406,
 'NF1': 0.030699999999999998,
 'MAP2K4': 0.0327,
 'RUNX1': 0.0357,
 'NCOR1': 0.0396,
 'PTEN': 0.0426,
 'KMT2C': 0.0842,
 'GATA3': 0.09609999999999999,
 'CDH1': 0.09609999999999999,
 'MAP3K1': 0.07830000000000001,
 

In [32]:
# NOTE(review): this cell reuses the IntOGen variable names (path_intogen, df_intogen,
# intogen_ranks) for data read from dndscv.csv. Running it overwrites whatever the
# IntOGen-loading cell stored, so the contents of intogen_ranks depend on execution order.
path_intogen = "dndscv.csv"
df_intogen = pd.read_csv(path_intogen)
# Map each gene_name to its qallsubs_cv value. Despite the variable name this is not an
# IntOGen rank (presumably a dNdScv q-value where smaller = more significant — TODO confirm).
intogen_ranks = dict(zip(df_intogen["gene_name"], (df_intogen["qallsubs_cv"])))

In [33]:
# Bare expression: renders the current contents of intogen_ranks as the cell output.
intogen_ranks

{'TP53': 0.0,
 'PIK3CA': 0.0,
 'AKT1': 0.0,
 'PTEN': 0.0,
 'CDH1': 0.0,
 'MAP2K4': 3.72e-13,
 'KMT2C': 1.27e-12,
 'MAP3K1': 2.02e-10,
 'ARID1A': 9.6e-09,
 'FOXA1': 5.54e-08,
 'CASP8': 1.94e-06,
 'NCOR1': 3.43e-06,
 'RB1': 5.66e-05,
 'RUNX1': 6.53e-05,
 'ERBB2': 0.000111958,
 'CBFB': 0.000168944,
 'NF1': 0.000242789,
 'GATA3': 0.000245433,
 'HIST1H3B': 0.000248234,
 'SMTNL2': 0.002979976,
 'RIC8A': 0.009786689,
 'HIST1H2BC': 0.043589625,
 'CRYGC': 0.096614609,
 'MED23': 0.096614609,
 'SPRR2B': 0.10594949,
 'BRCA1': 0.113043489,
 'CTCF': 0.191401707,
 'SF3B1': 0.287622078,
 'LGALS2': 0.466891662,
 'HAUS5': 0.527051364,
 'SPEN': 0.539472115,
 'DNAJC16': 0.549483789,
 'KRAS': 0.549483789,
 'RAB6A': 0.551659191,
 'HIST1H4J': 0.570535024,
 'RASIP1': 0.66191875,
 'CCDC69': 0.664619241,
 'EGFR': 0.664619241,
 'PCDHB7': 0.664619241,
 'TRAF3': 0.664619241,
 'FAM131B': 0.697321818,
 'MUC19': 0.697321818,
 'RASA2': 0.697321818,
 'NLRP2': 0.840134399,
 'AC006486.9': 0.878874632,
 'MYF6': 0.87887463

In [21]:
# Load the per-gene dN/dS table (tab-separated, path relative to the working directory).
# NOTE(review): the In [42] cell rebinds path_dn_ds, df_dn_ds and dn_ds_ranks from
# dndscv.csv, so the values built here only survive if that cell is not run afterwards.
path_dn_ds = "TCGA.BRCA.dN_dS.tsv"
df_dn_ds = pd.read_csv(path_dn_ds, sep="\t")
# Map each Hugo_Symbol to the value in its "dN/dS" column.
dn_ds_ranks = dict(zip(df_dn_ds["Hugo_Symbol"], df_dn_ds["dN/dS"]))

In [42]:
# Rebuild dn_ds_ranks from dndscv.csv (replaces any earlier binding of these names).
path_dn_ds = "dndscv.csv"
df_dn_ds = pd.read_csv(path_dn_ds)
# Score each gene as 1 - qallsubs_cv so that a smaller qallsubs_cv gives a larger score;
# evalAccuracy orders genes by descending score, so this puts the smallest values first.
dn_ds_ranks = dict(zip(df_dn_ds["gene_name"], (1 - df_dn_ds["qallsubs_cv"])))

In [43]:
# Bare expression: renders the current gene -> score mapping as the cell output.
dn_ds_ranks

{'TP53': 1.0,
 'PIK3CA': 1.0,
 'AKT1': 1.0,
 'PTEN': 1.0,
 'CDH1': 1.0,
 'MAP2K4': 0.999999999999628,
 'KMT2C': 0.99999999999873,
 'MAP3K1': 0.999999999798,
 'ARID1A': 0.9999999904,
 'FOXA1': 0.9999999446,
 'CASP8': 0.99999806,
 'NCOR1': 0.99999657,
 'RB1': 0.9999434,
 'RUNX1': 0.9999347,
 'ERBB2': 0.999888042,
 'CBFB': 0.999831056,
 'NF1': 0.999757211,
 'GATA3': 0.999754567,
 'HIST1H3B': 0.999751766,
 'SMTNL2': 0.997020024,
 'RIC8A': 0.990213311,
 'HIST1H2BC': 0.956410375,
 'CRYGC': 0.903385391,
 'MED23': 0.903385391,
 'SPRR2B': 0.89405051,
 'BRCA1': 0.886956511,
 'CTCF': 0.808598293,
 'SF3B1': 0.7123779219999999,
 'LGALS2': 0.533108338,
 'HAUS5': 0.47294863600000003,
 'SPEN': 0.46052788499999997,
 'DNAJC16': 0.45051621100000006,
 'KRAS': 0.45051621100000006,
 'RAB6A': 0.44834080899999995,
 'HIST1H4J': 0.429464976,
 'RASIP1': 0.33808125,
 'CCDC69': 0.335380759,
 'EGFR': 0.335380759,
 'PCDHB7': 0.335380759,
 'TRAF3': 0.335380759,
 'FAM131B': 0.30267818199999996,
 'MUC19': 0.30267818199

In [44]:
# Order both score tables from highest to lowest score; later cells reuse these lists.
sorted_intogen = sorted(intogen_ranks.items(), key=lambda pair: pair[1], reverse=True)
sorted_dn_ds = sorted(dn_ds_ranks.items(), key=lambda pair: pair[1], reverse=True)

# Print the ten highest-scoring genes of each table, scores shown with two decimals.
for heading, ranking in (
    ("Top 10 IntOGen genes:", sorted_intogen),
    ("\nTop 10 dN/dS genes:", sorted_dn_ds),
):
    print(heading)
    for gene, score in ranking[:10]:
        print(f"{gene}: {score:.2f}")

Top 10 IntOGen genes:
PIK3CA: 0.33
TP53: 0.30
GATA3: 0.10
CDH1: 0.10
KMT2C: 0.08
MAP3K1: 0.08
PTEN: 0.04
FAT3: 0.04
NCOR1: 0.04
RUNX1: 0.04

Top 10 dN/dS genes:
TP53: 1.00
PIK3CA: 1.00
AKT1: 1.00
PTEN: 1.00
CDH1: 1.00
MAP2K4: 1.00
KMT2C: 1.00
MAP3K1: 1.00
ARID1A: 1.00
FOXA1: 1.00


In [45]:
# create a lookup of gene → its rank in the dn/ds list
dn_ds_rank_map = {gene: i+1 for i, (gene, _) in enumerate(sorted_dn_ds)}

# build a mapping of intogen genes to (intogen_rank, dn_ds_rank)
mapping_intogen_to_dn_ds = {
    gene: (i+1, dn_ds_rank_map.get(gene))
    for i, (gene, _) in enumerate(sorted_intogen)
}

# display as DataFrame for clarity
df_gene_ranks = pd.DataFrame([
    {"Gene": gene, "IntOGen_Rank": ranks[0], "dN_dS_Rank": ranks[1]}
    for gene, ranks in mapping_intogen_to_dn_ds.items()
])

df_gene_ranks


Unnamed: 0,Gene,IntOGen_Rank,dN_dS_Rank
0,PIK3CA,1,2.0
1,TP53,2,1.0
2,GATA3,3,18.0
3,CDH1,4,5.0
4,KMT2C,5,7.0
5,MAP3K1,6,8.0
6,PTEN,7,4.0
7,FAT3,8,8318.0
8,NCOR1,9,12.0
9,RUNX1,10,14.0


In [46]:

# Reorder the comparison table by dN/dS position so the genes ranked best by dN/dS come first
# (rows with a missing dN_dS_Rank are placed last by sort_values' default na_position).
# Bind the sorted copy to the same name instead of using inplace=True: the frame object an
# earlier cell displayed is no longer mutated behind its output, and re-running this cell
# stays idempotent.
df_gene_ranks = df_gene_ranks.sort_values(by="dN_dS_Rank", ascending=True)
df_gene_ranks

Unnamed: 0,Gene,IntOGen_Rank,dN_dS_Rank
1,TP53,2,1.0
0,PIK3CA,1,2.0
14,AKT1,15,3.0
6,PTEN,7,4.0
3,CDH1,4,5.0
10,MAP2K4,11,6.0
4,KMT2C,5,7.0
5,MAP3K1,6,8.0
13,ARID1A,14,9.0
12,FOXA1,13,10.0


In [50]:
# Score the dn_ds_ranks ordering against the intogen_ranks baseline gene set.
dcg, bpref, accuracy  = evalAccuracy(dn_ds_ranks, intogen_ranks)

print(f"DCG: {dcg}")            # position-discounted gain: baseline genes near the top contribute most
print(f"Bpref: {bpref}")        # penalises baseline genes that have non-baseline genes ranked above them
print(f"Accuracy: {accuracy}")  # share of baseline genes found in the top R genes (R = number of baseline genes)

DCG: 0.5817306754249287
Bpref: 0.4886578449905482
Accuracy: 0.5
