Per-gene exposure tables for MutPred2

In [1]:
# Common imports and constants
import os

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools, notation, data
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Workspace bucket root; raises KeyError if WORKSPACE_BUCKET is not set.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Output location for the per-gene parquet files written at the end of the notebook.
RESULT_DIR = f'{BUCKET}/classes_2025-10-15'
# Upper bounds applied to the VAT before matching. Both are currently np.inf,
# so no finite SpliceAI score or allele frequency exceeds them; the trailing
# commented-out numbers (0.2, 0.01) are the alternative finite cutoffs.
SPLICE_AI_FILTER_THRESHOLD = np.inf#0.2
AF_FILTER_THRESHOLD = np.inf#0.01

In [2]:
import hail as hl
hl.init()
# Read the MatrixTable location from the environment. Indexing os.environ
# (as done for WORKSPACE_BUCKET) fails immediately with a KeyError naming the
# missing variable, instead of passing None on to hl.read_matrix_table.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]
# Bare expression: display the resolved path as the cell output.
wgs_mt_path

In [3]:
# Load WGS
# Open the Hail MatrixTable stored at wgs_mt_path; it is passed to
# hailtools.get_exposure_package for every variant class further down.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
# Print the MatrixTable schema for reference.
wgs_mt.describe()

In [4]:
# Stream the MutPred2 table in 100k-row chunks and keep only the rows whose
# gene_symbol is one of the project genes, so the full table is never held
# in memory at once.
mutpred_reader = pd.read_table(
    f'{BUCKET}/mutpred2/IGVFFI7749UFOK.tsv.gz',
    chunksize=100000
)
mutpred_df = pd.concat(
    [
        chunk.loc[chunk['gene_symbol'].isin(data.project_genes)]
        for chunk in tqdm(mutpred_reader)
    ]
)

In [5]:
# Split each Substitution string (single letter, digits, single letter) into
# reference residue, position and alternate residue. Rows that do not match
# the pattern get NaN in all three columns.
aa_parts = mutpred_df['Substitution'].str.extract(r'^([A-Z])(\d+)([A-Z])$')
aa_parts.columns = ['aa_ref', 'aa_pos', 'aa_alt']

# Translate the one-letter residue codes through notation.AA_NAMES; the
# position is kept exactly as extracted (a string).
mutpred_df = mutpred_df.assign(
    aa_ref=aa_parts['aa_ref'].map(notation.AA_NAMES),
    aa_pos=aa_parts['aa_pos'],
    aa_alt=aa_parts['aa_alt'].map(notation.AA_NAMES)
)

In [4]:
# Gene-specific calibration thresholds; the first CSV column becomes the
# index, so rows are looked up by gene.
gene_thresholds_path = f'{BUCKET}/calibrations/pillarg_MP2_gene_specific_thresh.csv'
mp2_gene_thresholds_df = pd.read_csv(gene_thresholds_path, index_col=0)

# Translate each of our classification labels ("<inequality> <sign><points>")
# into the matching column name of the calibration table together with the
# comparison callable to apply for that direction of evidence.
gs_threshold_map = dict(
    (
        f'{ineq_char} {point_sign}{point_value}',
        (f'{column_prefix}_{strength_name.title()}', compare_op),
    )
    for ineq_char, point_sign, column_prefix, _, compare_op, _ in notation.DOE_TABLE
    for point_value, strength_name in notation.SOE_TABLE
)

In [7]:
# Load the ClinVar bins table from the workspace bucket; it is handed to
# hailtools.get_exposure_package for every variant class in the loop below.
clinvar_bins_df = pd.read_csv(f'{BUCKET}/clinvar/clinvar-bins.csv.gz')
# Bare expression: display the table as the cell output.
clinvar_bins_df

In [None]:
# Use the VAT to match AA changes to variants
#
# For every gene in the MutPred2 table this cell:
#   1. builds one boolean row mask per score class (calibrated thresholds from
#      Bergquist et al., plus gene-specific thresholds when the gene is listed
#      in mp2_gene_thresholds_df),
#   2. loads and filters the gene's VAT and parses its amino-acid change,
#   3. joins each class to the VAT on (aa_ref, aa_pos, aa_alt) and hands the
#      matching variants to hailtools.get_exposure_package,
#   4. writes the collected exposure / ClinVar / AF tables to RESULT_DIR.
# Genes whose exposures file already exists are skipped.

# Join keys shared by mutpred_df and the parsed VAT.
AA_KEY_COLS = ['aa_ref', 'aa_pos', 'aa_alt']
# Variant coordinate columns passed on to hailtools.get_exposure_package.
VARIANT_COLS = ['contig', 'position', 'ref_allele', 'alt_allele']
# VAT columns compared against SPLICE_AI_FILTER_THRESHOLD.
SPLICE_AI_COLS = [
    'splice_ai_acceptor_gain_score',
    'splice_ai_acceptor_loss_score',
    'splice_ai_donor_gain_score',
    'splice_ai_donor_loss_score',
]

GENOME_WIDE_CLASSIFIER = 'Calibrated (Bergquist et al. 10.1016/j.gim.2025.101402)'
GENE_SPECIFIC_CLASSIFIER = 'Calibrated (gene-specific)'

for gene, per_gene_df in tqdm(mutpred_df.groupby('gene_symbol')):

    exposures_file = f'{RESULT_DIR}/exposures/mutpred2_{gene}.parquet'
    clinvar_file = f'{RESULT_DIR}/clinvar_maps/mutpred2_{gene}.parquet'
    af_file = f'{RESULT_DIR}/af_maps/mutpred2_{gene}.parquet'

    # Resume support: a gene with an existing exposures file is not recomputed.
    if gctools.blob_exists(exposures_file):
        continue

    gene_result_dfs = []
    clinvar_classes_dfs = []
    joint_af_map = {}

    mp2_scores = per_gene_df['MutPred2 score']

    # Every entry is (classifier label, classification label, row mask over
    # per_gene_df). Built as a list so the progress bar knows its length.
    # Calibration thresholds from Bergquist et al. 10.1016/j.gim.2025.101402
    class_specs = [
        (GENOME_WIDE_CLASSIFIER, classification, mask)
        for classification, mask in (
            (f'{LEQ_CHAR} -4', mp2_scores <= 0.010), # Benign Strong
            (f'{LEQ_CHAR} -3', mp2_scores <= 0.031), # Benign Moderate+
            (f'{LEQ_CHAR} -2', mp2_scores <= 0.197), # Benign Moderate
            (f'{LEQ_CHAR} -1', mp2_scores <= 0.391), # Benign Supporting
            (f'{GEQ_CHAR} +1', mp2_scores >= 0.737), # Pathogenic Supporting
            (f'{GEQ_CHAR} +2', mp2_scores >= 0.829), # Pathogenic Moderate
            (f'{GEQ_CHAR} +3', mp2_scores >= 0.895), # Pathogenic Moderate+
            (f'{GEQ_CHAR} +4', mp2_scores >= 0.932)  # Pathogenic Strong
        )
    ]

    # Gene-specific MutPred2 thresholds, appended after the genome-wide ones
    # so the processing order is unchanged.
    if gene in mp2_gene_thresholds_df.index:
        for classification, (label, compare) in gs_threshold_map.items():
            threshold = mp2_gene_thresholds_df.loc[gene, label]
            # No threshold for this gene/strength combination.
            if np.isnan(threshold):
                continue
            class_specs.append(
                (GENE_SPECIFIC_CLASSIFIER, classification, compare(mp2_scores, threshold))
            )

    gene_vat = pd.read_table(f'{BUCKET}/aux_data/{gene}_vat.tsv')

    # Filter SpliceAI score and AF.
    # A missing gnomAD AF is treated as passing: NaN <= x is always False, so
    # without the isna() term such variants would be dropped even with the
    # threshold at np.inf. Missing SpliceAI scores already pass, because
    # NaN > x is False as well.
    af_passes = gene_vat.gnomad_all_af.isna() | (gene_vat.gnomad_all_af <= AF_FILTER_THRESHOLD)
    splice_flagged = (gene_vat[SPLICE_AI_COLS] > SPLICE_AI_FILTER_THRESHOLD).any(axis=1)
    # .copy() so the column assignment below acts on an independent frame
    # rather than on a slice of the table that was read (SettingWithCopyWarning).
    gene_vat = gene_vat[af_passes & ~splice_flagged].copy()

    # Match on AA change
    gene_vat[AA_KEY_COLS] = gene_vat.aa_change.str.extract(
        r'p\.\(([A-Z][a-z][a-z])(\d+)([A-Z][a-z][a-z])\)'
    )
    # pandas merge pairs NaN keys with NaN keys, so VAT rows whose aa_change
    # did not parse must not take part in the join.
    gene_vat = gene_vat.dropna(subset=AA_KEY_COLS)

    for classifier, classification, mask in tqdm(class_specs):

        # NOTE(review): the join may yield the same variant more than once if
        # either table has several rows per AA change - confirm that
        # get_exposure_package tolerates duplicates.
        variant_df = gene_vat.merge(
            per_gene_df[mask],
            on=AA_KEY_COLS,
            how='inner'
        )[VARIANT_COLS]

        if variant_df.empty:
            continue

        exposure_df, af_map, clinvar_df = hailtools.get_exposure_package(
            variant_df,
            wgs_mt,
            clinvar_bins_df,
            contig_col='contig',
            pos_col='position',
            ref_col='ref_allele',
            alt_col='alt_allele',
            metadata_dict={
                'Dataset': 'MutPred2',
                'Gene': gene,
                'Classifier': classifier,
                'Classification': classification,
                'SpliceAI filter max': SPLICE_AI_FILTER_THRESHOLD,
                'AF filter max': AF_FILTER_THRESHOLD
            }
        )

        clinvar_classes_dfs.append(clinvar_df)
        joint_af_map.update(af_map)
        gene_result_dfs.append(exposure_df)

    # Persist whatever was collected; a gene with no matching variants writes
    # nothing and is therefore retried on the next run.
    if clinvar_classes_dfs:
        pd.concat(clinvar_classes_dfs, ignore_index=True).to_parquet(clinvar_file, index=False)
    if joint_af_map:
        pd.Series(joint_af_map).to_frame(name='AF').to_parquet(af_file)
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_parquet(exposures_file, index=False)