In [7]:
import pandas as pd
import os

# A notebook kernel does not define ``__file__``, so the dataset path is
# resolved against the kernel's current working directory (normally the folder
# that contains this notebook). Start the kernel elsewhere and this path moves.
notebook_dir = os.getcwd()

# The expression table lives in a "datasets" folder three levels up.
path_expression = os.path.join(notebook_dir, "..", "..", "..", "datasets", "TCGA.BRCA.expression.txt")

# Tab-separated text file; ``sep`` matches the other read_csv calls in this notebook.
df_exp = pd.read_csv(path_expression, sep="\t")

In [None]:
import math

def evalAccuracy(Rankings, intogen_relevance=None,
                 path_intogen="IntOGen-DriverGenes_TCGA_WXS_BRCA.tsv"):
    """
    Score a gene ranking against the IntOGen driver-gene list.

    Genes are ordered by descending score, then two metrics are computed:
      - DCG: discounted cumulative gain, the sum over ranked driver genes of
        (2**relevance - 1) / log2(position + 1), with positions starting at 1.
      - Bpref: binary preference, where every IntOGen gene is treated as
        relevant and every other ranked gene as non-relevant.

    Parameters
    ----------
    Rankings : dict
        Maps gene symbol -> score; a higher score means a better rank.
    intogen_relevance : dict, optional
        Maps driver-gene symbol -> graded relevance. When omitted, it is
        built from the IntOGen TSV at ``path_intogen``.
    path_intogen : str, optional
        Tab-separated IntOGen file with "Symbol", "Mutations" and
        "Samples (%)" columns. Only read when ``intogen_relevance`` is None.

    Returns
    -------
    tuple of (float, float)
        ``(dcg, bpref)``. ``bpref`` is 0.0 when there are no driver genes.
    """
    if intogen_relevance is None:
        df_intogen = pd.read_csv(path_intogen, sep="\t")
        # Relevance = #mutations * share of samples. "Samples (%)" is a
        # percentage, so it is scaled by 0.01, the same definition used by the
        # standalone IntOGen cell of this notebook. Unscaled, the product
        # exceeds 1024 for the top genes and 2**relevance overflows a float.
        relevance = df_intogen["Mutations"] * df_intogen["Samples (%)"] * 0.01
        intogen_relevance = dict(zip(df_intogen["Symbol"], relevance))

    n_drivers = len(intogen_relevance)
    sorted_genes = sorted(Rankings, key=Rankings.get, reverse=True)

    dcg = 0.0
    bp_sum = 0.0
    rel_seen = 0
    # One pass serves both metrics: each needs only the 0-based position of
    # every driver gene in the ranking.
    for i, gene in enumerate(sorted_genes):
        if gene not in intogen_relevance:
            continue

        # DCG term; position is i + 1, hence log2(i + 2).
        dcg += (2 ** intogen_relevance[gene] - 1) / math.log2(i + 2)

        # Bpref term: non-relevant genes ranked above this one, capped at the
        # number of relevant genes.
        irrel_before = i - rel_seen
        bp_sum += 1 - min(irrel_before, n_drivers) / n_drivers
        rel_seen += 1

    bpref = bp_sum / n_drivers if n_drivers > 0 else 0.0
    return float(dcg), bpref

In [22]:
# IntOGen driver-gene table (tab-separated), read from the working directory.
path_intogen = "IntOGen-DriverGenes_TCGA_WXS_BRCA.tsv"
df_intogen = pd.read_csv(path_intogen, sep="\t")

# Gene symbol -> relevance, where relevance is the mutation count multiplied by
# the sample percentage and scaled by 0.01. A repeated symbol keeps its last row.
intogen_ranks = {
    symbol: relevance
    for symbol, relevance in zip(
        df_intogen["Symbol"],
        df_intogen["Mutations"] * df_intogen["Samples (%)"] * 0.01,
    )
}

In [23]:
# Display the gene symbol -> relevance dict built in the previous cell.
intogen_ranks

{'HRAS': 0.025,
 'KLF4': 0.02,
 'KRAS': 0.0354,
 'JAK2': 0.0354,
 'SMAD4': 0.03,
 'ZXDB': 0.0632,
 'ACVR1B': 0.10890000000000001,
 'CDKN1B': 0.09790000000000001,
 'EIF4A2': 0.11879999999999999,
 'ABL2': 0.11879999999999999,
 'TBL1XR1': 0.1027,
 'NCOR2': 0.1526,
 'FGFR2': 0.1785,
 'EP300': 0.2384,
 'CASP8': 0.2384,
 'H3C2': 0.2224,
 'FBXW7': 0.2064,
 'RGS7': 0.19039999999999999,
 'AFDN': 0.2193,
 'KDM6A': 0.21419999999999997,
 'MED23': 0.26409999999999995,
 'ERBB3': 0.37599999999999995,
 'SF3B1': 0.4158,
 'CTCF': 0.545,
 'CBFB': 0.5668000000000001,
 'BRCA2': 0.46280000000000004,
 'RB1': 0.6155999999999999,
 'AKT1': 0.7783999999999999,
 'BRCA1': 0.6664,
 'TBX3': 0.6384,
 'PIK3R1': 0.6901999999999999,
 'ERBB2': 0.8256,
 'FOXA1': 1.0692000000000002,
 'ARID1A': 1.1480000000000001,
 'FAT3': 1.7051999999999998,
 'NF1': 1.3200999999999998,
 'MAP2K4': 1.4388,
 'RUNX1': 1.6065,
 'NCOR1': 1.98,
 'PTEN': 2.3856,
 'KMT2C': 9.1778,
 'GATA3': 12.012500000000001,
 'CDH1': 12.2047,
 'MAP3K1': 10.022400