Per-gene exposure tables for predictors

In [1]:
# Common imports and constants
import os

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools, notation, data
from cvfgaou import carrier_annotation as ca
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Workspace bucket location, read from the environment; os.environ[...] raises
# KeyError when WORKSPACE_BUCKET is not set, so the notebook stops here.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Dated folder inside the bucket; passed to CarrierAnnotator as output_location.
RESULT_DIR = f'{BUCKET}/classes_2026-02-02'
# Both filter thresholds are set to infinity here.
# NOTE(review): the trailing 0.2 / 0.01 look like the finite cutoffs that were
# used before the filters were switched off — confirm. Neither constant is
# referenced elsewhere in the visible cells of this notebook.
SPLICE_AI_FILTER_THRESHOLD = np.inf#0.2
AF_FILTER_THRESHOLD = np.inf#0.01

In [2]:
import hail as hl

# Resolve the MatrixTable path before starting Hail. os.environ[...] raises
# KeyError right here when the variable is missing (the same way BUCKET is
# read), instead of handing None to hl.read_matrix_table in the next cell.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]

hl.init()

# Show the resolved path as the cell output.
wgs_mt_path

In [3]:
# Load WGS
# Open the Hail MatrixTable stored at wgs_mt_path (set in the previous cell).
wgs_mt = hl.read_matrix_table(wgs_mt_path)
# Print the MatrixTable's field schema so the available fields are visible.
wgs_mt.describe()

In [4]:
# Load tables and filter down to the genes of interest
def _read_calibration_table(filename):
    """Read one calibration CSV from the bucket, keeping only genes of interest.

    The file is streamed in chunks of 1,000,000 rows; from each chunk only the
    rows whose 'gene_symbol' is in data.gene_phenotypes are retained, and the
    retained pieces are concatenated into a single DataFrame.
    """
    chunk_reader = pd.read_csv(
        f'{BUCKET}/chen_et_al/{filename}', chunksize=1000000
    )
    kept_chunks = []
    for chunk in tqdm(chunk_reader):
        in_gene_set = chunk['gene_symbol'].isin(data.gene_phenotypes)
        kept_chunks.append(chunk[in_gene_set])
    return pd.concat(kept_chunks)


# Predictor name -> calibration table file under {BUCKET}/chen_et_al/
calibration_files = {
    'MutPred2': 'modified_newMP2_calibration_table_20260122.csv.gz',
    'AlphaMissense': 'newAM_calibration_table_20260116.csv.gz',
    'REVEL': 'newREVEL_calibration_table_20260116.csv.gz',
}

predictor_dfs = {
    predictor: _read_calibration_table(filename)
    for predictor, filename in calibration_files.items()
}

In [5]:
# Convert the position and points columns of every predictor table to numeric
# values, downcasting to the smallest integer dtype pandas considers possible.
for table in predictor_dfs.values():
    for column in ('POS', 'points'):
        table[column] = pd.to_numeric(table[column], downcast='integer')

In [6]:
# Display the filtered MutPred2 calibration table for visual inspection.
predictor_dfs['MutPred2']

In [7]:
# Display the filtered AlphaMissense calibration table for visual inspection.
predictor_dfs['AlphaMissense']

In [8]:
# Display the filtered REVEL calibration table for visual inspection.
predictor_dfs['REVEL']

In [9]:
# Fixed score cutoffs per predictor, keyed by the same predictor names used in
# predictor_dfs. Each entry is a 3-tuple:
#   (label, comparison, cutoff)
# where the label is the point value prefixed with LEQ_CHAR / GEQ_CHAR, the
# comparison is pd.Series.le for benign (negative) levels and pd.Series.ge for
# pathogenic (positive) levels, and the cutoff is the score bound.
# NOTE(review): how these tuples are consumed is defined by
# ca.TableScoresVariantGrouper (fixed_thresholds=...), which is not visible
# here — confirm whether list order matters; MutPred2 and AlphaMissense run
# from benign to pathogenic while REVEL runs from pathogenic to benign.
threshold_lists = { # Bergquist et al. thresholds 10.1016/j.gim.2025.101402
    'MutPred2': [
        (f'{LEQ_CHAR} -4', pd.Series.le, 0.010), # Benign Strong
        (f'{LEQ_CHAR} -3', pd.Series.le, 0.031), # Benign Moderate+
        (f'{LEQ_CHAR} -2', pd.Series.le, 0.197), # Benign Moderate
        (f'{LEQ_CHAR} -1', pd.Series.le, 0.391), # Benign Supporting
        (f'{GEQ_CHAR} +1', pd.Series.ge, 0.737), # Pathogenic Supporting
        (f'{GEQ_CHAR} +2', pd.Series.ge, 0.829), # Pathogenic Moderate
        (f'{GEQ_CHAR} +3', pd.Series.ge, 0.895), # Pathogenic Moderate+
        (f'{GEQ_CHAR} +4', pd.Series.ge, 0.932)  # Pathogenic Strong
    ],
    # AlphaMissense has no Benign Strong (-4) entry in this table.
    'AlphaMissense': [
        (f'{LEQ_CHAR} -3', pd.Series.le, 0.070), # Benign Moderate+
        (f'{LEQ_CHAR} -2', pd.Series.le, 0.099), # Benign Moderate
        (f'{LEQ_CHAR} -1', pd.Series.le, 0.169), # Benign Supporting
        (f'{GEQ_CHAR} +1', pd.Series.ge, 0.792), # Pathogenic Supporting
        (f'{GEQ_CHAR} +2', pd.Series.ge, 0.906), # Pathogenic Moderate
        (f'{GEQ_CHAR} +3', pd.Series.ge, 0.972), # Pathogenic Moderate+
        (f'{GEQ_CHAR} +4', pd.Series.ge, 0.990)  # Pathogenic Strong
    ],
    'REVEL': [
        (f'{GEQ_CHAR} +4', pd.Series.ge, 0.932), # Pathogenic Strong
        (f'{GEQ_CHAR} +3', pd.Series.ge, 0.879), # Pathogenic Moderate+
        (f'{GEQ_CHAR} +2', pd.Series.ge, 0.773), # Pathogenic Moderate
        (f'{GEQ_CHAR} +1', pd.Series.ge, 0.644), # Pathogenic Supporting
        (f'{LEQ_CHAR} -1', pd.Series.le, 0.290), # Benign Supporting
        (f'{LEQ_CHAR} -2', pd.Series.le, 0.183), # Benign Moderate
        (f'{LEQ_CHAR} -3', pd.Series.le, 0.052), # Benign Moderate+
        (f'{LEQ_CHAR} -4', pd.Series.le, 0.016) # Benign Strong
    ]
}

In [10]:
# Calibration-table column that holds each variant-coordinate field. A fresh
# copy is handed to every grouper so each one receives its own dict object.
variant_columns = {
    'contig': '#CHROM',
    'position': 'POS',
    'ref_allele': 'REF',
    'alt_allele': 'ALT'
}

# One score-based grouper per predictor: the predictor's whole table, grouped
# on 'VEP_score' using the fixed cutoffs in threshold_lists.
score_groupers = [
    ca.TableScoresVariantGrouper(
        table,
        score_col='VEP_score',
        gene_col='gene_symbol',
        classifier_name='gene-aggregated calibration',
        dataset=name,
        col_mapping=dict(variant_columns),
        fixed_thresholds=threshold_lists[name]
    )
    for name, table in predictor_dfs.items()
]

# (classifier label, value of the 'calibration_approach' column) pairs.
calibration_labels = [
    ('domain-aggregated calibration', 'domain_aggregate'),
    ('single-gene calibration', 'single_gene')
]

# One points-based grouper per (calibration approach, predictor), built from
# the rows of the predictor table that match that calibration approach.
points_groupers = [
    ca.TablePointsVariantGrouper(
        table[table['calibration_approach'] == approach],
        points_col='points',
        gene_col='gene_symbol',
        classifier_name=label,
        dataset=name,
        col_mapping=dict(variant_columns)
    )
    for label, approach in calibration_labels
    for name, table in predictor_dfs.items()
]

# Score-based groupers first, then the points-based ones.
variant_groupers = score_groupers + points_groupers

In [11]:
def _load_gene_vat(gene):
    """Read the tab-separated VAT file for `gene` from the bucket's aux_data folder."""
    return pd.read_table(f'{BUCKET}/aux_data/{gene}_vat.tsv')


# ClinVar bins table (dated snapshot) read from the workspace bucket.
clinvar_bins = pd.read_csv(f'{BUCKET}/clinvar/clinvar-bins_2026-02-02.csv.gz')

# Assemble the annotator from the WGS MatrixTable, the ClinVar bins, the
# per-gene VAT loader and the variant groupers; tqdm is passed in as the
# progress tracker and RESULT_DIR as the output location.
annotator = ca.CarrierAnnotator(
    output_location=RESULT_DIR,
    wgs_mt=wgs_mt,
    clinvar_bins_df=clinvar_bins,
    vat_loader=_load_gene_vat,
    variant_groupers=variant_groupers,
    progress_tracker=tqdm
)

In [12]:
# Run the annotator's exposure-package build.
# NOTE(review): results presumably land under RESULT_DIR (passed above as
# output_location) — confirm against CarrierAnnotator.
annotator.build_exposure_package()