Here we build carrier tables for MAVEs scored by a combination of Dan's calibrations and VEP methods.

Calculation of combined points provided by Malvika Tejura.

In [1]:
# Common imports and constants
import os
import json

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools, notation, data
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Workspace bucket prefix used for every input and output path below.
# Indexing os.environ (rather than .get) raises KeyError if the variable is unset.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Version tag for the input data; the per-gene loop writes it into the
# 'Data Version' metadata field of each exposure package.
DATAFRAME_VERSION = '17796333' # For bookkeeping
# Prefix under which this notebook writes its per-gene parquet outputs.
RESULTS_DIR = f'{BUCKET}/combined_classes_2025-12-18'

In [2]:
# Load the precomputed points dataframe (gzip-compressed CSV) from the bucket.
# Later cells read its 'Gene' column, the variant coordinate columns
# ('Chrom', 'hg38_start', 'ref_allele', 'alt_allele') and the
# 'total_points_*' score columns.

points_df = pd.read_csv(f'{BUCKET}/precomputed/Dan_fxn_calibrations_new_2025_2018_clust_calib_121425.csv.gz')

In [3]:
# Display the available column names (useful for picking a points column).
points_df.columns.to_list()

In [4]:
import hail as hl

# Location of the matrix table that is opened with hl.read_matrix_table in the
# next cell. The variable is looked up by indexing os.environ, matching how
# WORKSPACE_BUCKET is read, so that an unset variable raises KeyError here
# instead of handing None to Hail and failing later with a less obvious error.
# The lookup happens before hl.init() so a misconfigured environment is
# reported before the Hail runtime is started.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]

# Start the Hail runtime with its default settings.
hl.init()

# Echo the resolved path as the cell output.
wgs_mt_path

In [5]:
# Open the matrix table at wgs_mt_path and print its schema.
# wgs_mt is later passed to hailtools.get_exposure_package for every
# gene/threshold combination.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
wgs_mt.describe()

In [6]:
# Keep only the rows whose 'Gene' value is a member of data.gene_phenotypes.
# NOTE(review): membership is tested with Series.isin, so this presumably
# matches against the keys if gene_phenotypes is a dict — confirm.
working_df = points_df[points_df['Gene'].isin(data.gene_phenotypes)]

In [7]:
# Gene symbols present in working_df, with the combined 'CALM1_2_3' label
# replaced by the three individual CALM genes. The individual CALM symbols
# are added unconditionally, whether or not the combined label occurs.
genes = (
    set(working_df['Gene'].unique()) - {'CALM1_2_3'}
) | {'CALM1', 'CALM2', 'CALM3'}

In [8]:
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

In [9]:
# Map signed integer point thresholds to display labels.
# Positive thresholds are labelled "<GEQ_CHAR> +N" and negative ones
# "<LEQ_CHAR> -N"; the Series is ordered from the most positive threshold
# down to the most negative.
evidence_strength_series = pd.Series(
    {
        **{magnitude: f'{GEQ_CHAR} +{magnitude}' for magnitude in (8, 4, 3, 2, 1)},
        **{-magnitude: f'{LEQ_CHAR} -{magnitude}' for magnitude in (8, 4, 3, 2, 1)},
    }
).sort_index(ascending=False)
# Descriptive labels that were used as an alternative mapping (kept for reference):
#    +8: "Pathogenic very strong"
#    +4: "Pathogenic strong"
#    +2: "Pathogenic moderate"
#    +1: "Pathogenic supporting"
#    -1: "Benign supporting"
#    -2: "Benign moderate"
#    -4: "Benign strong"
#    -8: "Benign very strong"

In [10]:
# Classification label -> (comparison method, threshold) for every integer
# threshold from -16 to +16, in ascending threshold order:
#   negative thresholds select scores <= threshold,
#   zero selects scores == 0,
#   positive thresholds select scores >= threshold.
# The comparison is stored as an unbound pd.Series method plus its threshold
# instead of a lambda: a lambda created inside a comprehension would look up
# the loop variable when called and so every entry would see the final value.

point_groups = {
    **{
        f'{LEQ_CHAR} {threshold:+d}': (pd.Series.le, threshold)
        for threshold in range(-16, 0)
    },
    '0': (pd.Series.eq, 0),
    **{
        f'{GEQ_CHAR} {threshold:+d}': (pd.Series.ge, threshold)
        for threshold in range(1, 17)
    },
}

In [11]:
# Load the ClinVar bins table; it is passed unchanged to
# hailtools.get_exposure_package for every gene/threshold combination.
clinvar_bins_df = pd.read_csv(f'{BUCKET}/clinvar/clinvar-bins.csv.gz')

In [14]:
# Scratch subset: rows for a single gene, used to try out the per-variant
# collapse before running the full loop.
test_gene_df = working_df[working_df.Gene == 'BRCA2']

In [20]:
# Points column used for the scratch collapse of test_gene_df.
test_classifier = 'total_points_dan_18_25_YP_REVEL'

In [34]:
# Scratch check of the per-variant collapse: for each distinct
# (Chrom, hg38_start, ref_allele, alt_allele) keep the score with the largest
# absolute value (the first such row on ties), then normalise column dtypes.
test_points_df = (
    test_gene_df
    .groupby(['Chrom', 'hg38_start', 'ref_allele', 'alt_allele'])[test_classifier]
    .apply(lambda scores: scores.iloc[scores.abs().argmax()])
    .reset_index(name='points')
    .astype({
        'Chrom': str,
        'hg38_start': int,
        'ref_allele': str,
        'alt_allele': str,
        'points': float,
    })
)

In [None]:
def _strongest_points_per_variant(gene_df, classifier):
    """Collapse ``gene_df`` to one row per variant with its strongest score.

    Rows are grouped by ``(Chrom, hg38_start, ref_allele, alt_allele)`` and,
    within each group, the ``classifier`` value with the largest absolute
    value is kept (the first such row on ties).

    Parameters:
        gene_df: Dataframe holding the four variant key columns and the
            ``classifier`` column.
        classifier: Name of the points column to collapse.

    Returns:
        A dataframe with columns ``Chrom``, ``hg38_start``, ``ref_allele``,
        ``alt_allele``, ``points`` (float) and ``Chromosome`` (``'chr'`` +
        ``Chrom``), or ``None`` when no row has a score for ``classifier``.
    """
    # Unscored rows can never satisfy a threshold comparison, so dropping them
    # does not change any downstream selection. It also keeps argmax away from
    # all-NaN groups, where pandas currently returns -1 with a deprecation
    # warning and is slated to raise instead.
    scored_df = gene_df.dropna(subset=[classifier])
    if scored_df.empty:
        return None

    return scored_df.groupby(
        ['Chrom', 'hg38_start', 'ref_allele', 'alt_allele']
    )[classifier].apply(
        lambda s: s.iloc[s.abs().argmax()]
    ).to_frame(name='points').reset_index().astype({
        'Chrom': str,
        'hg38_start': int,
        'ref_allele': str,
        'alt_allele': str,
        'points': float
    }).assign(Chromosome=lambda df: 'chr' + df['Chrom'])


# Build and persist the exposure, ClinVar and allele-frequency tables per gene.
# NOTE(review): grouping uses the raw 'Gene' label, so a combined label such as
# 'CALM1_2_3' is processed (and named in the output files) as one gene; the
# `genes` set built earlier is not consulted here — confirm this is intended.
for gene, gene_df in tqdm(working_df.groupby('Gene')):

    exposures_file = f'{RESULTS_DIR}/exposures/combined_{gene}.parquet'
    clinvar_file = f'{RESULTS_DIR}/clinvar_maps/combined_{gene}.parquet'
    af_file = f'{RESULTS_DIR}/af_maps/combined_{gene}.parquet'

    # Resume support: a gene is skipped only when all three outputs exist.
    # NOTE(review): each output is written only when it has content, so a gene
    # that yields nothing for one of them is recomputed on every run.
    if all(gctools.blob_exists(f) for f in (exposures_file, clinvar_file, af_file)):
        print(f'Files for {gene} exist, skipping.')
        continue

    clinvar_classes_dfs = []
    joint_af_map = {}
    gene_result_dfs = []

    # In case we run the same analysis for different points columns. Those become our classifiers.
    for classifier in ['total_points_dan_18_25_best_predictor']:
        # Deliberately not named `points_df`: that name holds the full input
        # table from which `working_df` is derived, and rebinding it here
        # would break re-running the earlier cells.
        variant_points_df = _strongest_points_per_variant(gene_df, classifier)
        if variant_points_df is None:
            continue

        for classification, (compare, threshold) in point_groups.items():

            # Variants whose collapsed score meets this threshold.
            variant_df = variant_points_df.loc[
                compare(variant_points_df['points'], threshold),
                ['Chromosome', 'hg38_start', 'ref_allele', 'alt_allele'],
            ]

            if variant_df.empty:
                continue

            exposure_df, af_map, clinvar_df = hailtools.get_exposure_package(
                variant_df,
                wgs_mt,
                clinvar_bins_df,
                contig_col='Chromosome',
                pos_col='hg38_start',
                ref_col='ref_allele',
                alt_col='alt_allele',
                metadata_dict={
                    'Dataset': 'Combined points',
                    'Gene': gene,
                    'Classifier': classifier,
                    'Classification': classification,
                    'Data Version': DATAFRAME_VERSION
                }
            )

            clinvar_classes_dfs.append(clinvar_df)
            # Later thresholds overwrite earlier entries that share a key.
            joint_af_map.update(af_map)
            gene_result_dfs.append(exposure_df)

    # Write each output only when there is something to write.
    if clinvar_classes_dfs:
        pd.concat(clinvar_classes_dfs, ignore_index=True).to_parquet(clinvar_file)
    if joint_af_map:
        pd.Series(joint_af_map).to_frame(name='AF').to_parquet(af_file)
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_parquet(exposures_file)
