Here we build exposure tables for MAVEs calibrated by Dan's methods

In [15]:
# Common imports and constants
import os
import json

import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools, data
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Root of the workspace bucket. Indexing os.environ raises KeyError when the
# variable is not set, so a misconfigured environment fails in this cell.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Two filters are intentionally NOT defined here:
#   SPLICE_AI_FILTER_MAX = 0.2  - Splice AI filtering should happen at the
#                                 cohort level, not at the variant level.
#   AF_FILTER_MAX = 0.01        - we don't filter for rare variants in
#                                 functional data.

# Label of the calibration release; it is embedded in the output file names
# and in the 'Classifier' metadata written by the exposure loop.
CALIBRATION_VERSION = '2025-12-08'
# Bucket folder holding the calibration JSON files for that release.
CALIBRATION_DIR = '/'.join((BUCKET, 'calibrations', 'calibrations_12_08_25'))
# Version of the CVFG dataframe; recorded as 'Data Version' in the outputs.
DATAFRAME_VERSION = '17796333'
# Root folder under which the result parquet files are written.
RESULTS_DIR = '/'.join((BUCKET, 'classes_2025-12-19'))

In [2]:
# Start Hail and look up where the WGS exome-split MatrixTable lives.
import hail as hl
hl.init()
# Index os.environ directly (the same way WORKSPACE_BUCKET is read) so that a
# missing variable raises KeyError in this cell, instead of os.getenv quietly
# returning None and the failure only surfacing when the table is read.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]
# Display the resolved path as the cell output.
wgs_mt_path

In [3]:
# Load wgs
# Open the MatrixTable stored at wgs_mt_path (resolved from the
# WGS_EXOME_SPLIT_HAIL_PATH environment variable) and show its schema.
# wgs_mt is later passed to hailtools.get_exposure_package for every
# dataset/gene/threshold combination.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
wgs_mt.describe()

In [2]:
# List the contents of the cvfg_17796333 folder in the workspace bucket, to
# find the exact name of the CVFG data file loaded below.
!gsutil ls $WORKSPACE_BUCKET/cvfg_17796333

In [3]:
# Load CVFG dataframe
# The folder name is built from DATAFRAME_VERSION so that the 'Data Version'
# stamped into the exposure metadata always names the data that was actually
# read; previously the version was hard-coded here a second time.
cvfg_df = pd.read_csv(
    f'{BUCKET}/cvfg_{DATAFRAME_VERSION}/final_pillar_data_with_clinvar_18_25_gnomad_wREVEL_wAM_wspliceAI_wMutpred2_wtrainvar_gold_standards_expanded_111225.csv.gz',
    # Pin the identifier-like columns to str. The numeric columns below are
    # left to pandas' inference and converted where they are used
    # (presumably because they contain missing or non-numeric entries - TODO confirm).
    dtype={
        'Dataset': str,
        'Gene': str,
        'Chrom': str,
        #'hg38_start': int,
        #'hg38_end': int,
        'ref_allele': str,
        'alt_allele': str,
        #'auth_reported_score': float,
        'auth_reported_func_class': str
    }
)
cvfg_df

In [6]:
# Show every column name of the CVFG dataframe as a plain Python list.
cvfg_df.columns.tolist()

In [7]:
# Filter the dataframe down to the rows used for the exposure tables:
#   * the gene must be one of the keys/entries of data.gene_phenotypes
#     (limit to our genes), and
#   * the Flag column must not be '*' (drop flagged variants, e.g. mapping
#     errors). Rows with a missing Flag are kept, as before.
working_df = cvfg_df.loc[
    cvfg_df['Gene'].isin(data.gene_phenotypes)
    & cvfg_df['Flag'].ne('*')
]

In [9]:
# ClinVar bins table from the workspace bucket; it is handed unchanged to
# hailtools.get_exposure_package for every dataset/gene/threshold.
clinvar_bins_df = pd.read_csv(
    f'{BUCKET}/clinvar/clinvar-bins.csv.gz',
)

In [13]:
# List what is stored under the bucket's calibrations/ prefix, to see which
# calibration releases are available for CALIBRATION_DIR.
!gsutil ls $WORKSPACE_BUCKET/calibrations

In [38]:
# Save exposure for each study in its own file for simplicity
#
# For every calibration JSON found under CALIBRATION_DIR this cell:
#   1. turns the per-point score ranges into one inclusive threshold per
#      evidence-point level,
#   2. for each threshold and gene, selects the variants whose author-reported
#      score passes the threshold and passes them to
#      hailtools.get_exposure_package together with the WGS MatrixTable and
#      the ClinVar bins,
#   3. writes the ClinVar map, the AF map and the exposure table as parquet
#      files under RESULTS_DIR.
# The exposures file is written last and is also what the skip check looks
# for, so re-running the cell only redoes datasets that did not finish.
for blob in tqdm(gctools.list_blobs(CALIBRATION_DIR, return_uris=False)):
    # Anything that is not a JSON blob is not a calibration.
    if not blob.path.endswith('.json'):
        continue

    calibration = json.loads(blob.download_as_text())
    dataset = calibration['dataset']

    # One output file per result type, all sharing the same file name.
    exposures_file, clinvar_file, af_file = (
        f'{RESULTS_DIR}/{result_type}/calibrated_{CALIBRATION_VERSION}_{dataset}.parquet'
        for result_type in ('exposures', 'clinvar_maps', 'af_maps')
    )

    if gctools.blob_exists(exposures_file):
        print(f'Skipping {exposures_file} because it exists.')
        continue

    dataset_df = working_df[working_df['Dataset'] == dataset]

    if dataset_df.empty:
        print(f'{dataset} not found in dataframe')
        continue

    # Directionality of thresholds. The explicit comparison with True is
    # deliberate: a missing key or any value other than True counts as
    # "not flipped".
    increasing = calibration.get('scoreset_flipped') == True
    if increasing:
        print(f'{dataset} was labeled as increasing')

    # Thresholds are provided as a dictionary {points (as str): list of ranges};
    # only the first range of each entry is used.
    # (assumes each range is a [low, high] pair - TODO confirm against the JSON)
    thresholds_dict = {}
    for point_str, value_range in calibration['point_ranges'].items():

        # Point levels without any range carry no threshold.
        if not value_range:
            continue

        points = int(point_str)

        # In decreasing scoresets, benign evidence (negative points) uses the
        # bottom of the interval as threshold and includes every variant
        # scoring greater than or equal to it; pathogenic evidence uses the
        # top of the interval and includes every variant scoring less than or
        # equal to it. Increasing scoresets do the opposite.
        threshold_is_bottom = (points < 0) ^ increasing

        thresholds_dict[points] = {
            'Classification': f'{GEQ_CHAR if points > 0 else LEQ_CHAR} {points:+d}',
            'Threshold': value_range[0][0 if threshold_is_bottom else 1],
            'Comparison': pd.Series.ge if threshold_is_bottom else pd.Series.le
        }

    # Can't work without thresholds
    if not thresholds_dict:
        print(f'No thresholds for {dataset}.')
        continue

    # One row per point level, ordered from most negative to most positive.
    thresholds_df = pd.DataFrame.from_dict(thresholds_dict, orient='index').sort_index(ascending=True)

    # Check that thresholds are increasing (decreasing) along the point scale:
    if increasing:
        assert thresholds_df['Threshold'].is_monotonic_increasing, f'Thresholds for {dataset} are expected to be increasing: {thresholds_df}'
    else:
        assert thresholds_df['Threshold'].is_monotonic_decreasing, f'Thresholds for {dataset} are expected to be decreasing: {thresholds_df}'

    gene_result_dfs = []
    clinvar_classes_dfs = []
    joint_af_map = {}

    # Column order of thresholds_df is Classification, Threshold, Comparison.
    for classification, threshold, compare in tqdm(
        thresholds_df.itertuples(index=False),
        total=len(thresholds_df),
        desc=dataset,
        leave=False
    ):

        for gene, gene_df in dataset_df.groupby('Gene'):

            # Variants of this gene whose score passes the threshold. Rows
            # with a missing score never pass the comparison, and rows with a
            # missing coordinate or allele are dropped before the position is
            # cast to int.
            passes_threshold = compare(gene_df['auth_reported_score'].astype(float), threshold)
            variant_df = (
                gene_df.loc[passes_threshold, ['Chrom', 'hg38_start', 'ref_allele', 'alt_allele']]
                .dropna(how='any')
                .assign(Chromosome=lambda df: 'chr' + df['Chrom'])
                .astype({'hg38_start': int})
            )

            if variant_df.empty:
                continue

            # No 'SpliceAI filter max' entry is recorded: this notebook does
            # not apply a variant-level SpliceAI filter, and the constant the
            # entry used to read (SPLICE_AI_FILTER_MAX) is not defined, so
            # any gene not entirely marked splice_measure == 'Yes' raised a
            # NameError here.
            # NOTE(review): confirm no downstream reader expects that column.
            exposure_df, af_map, clinvar_df = hailtools.get_exposure_package(
                variant_df,
                wgs_mt,
                clinvar_bins_df,
                contig_col='Chromosome',
                pos_col='hg38_start',
                ref_col='ref_allele',
                alt_col='alt_allele',
                metadata_dict={
                    'Dataset': dataset,
                    'Gene': gene,
                    'Classifier': f'Calibrated ({CALIBRATION_VERSION})',
                    'Classification': classification,
                    'Data Version': DATAFRAME_VERSION
                }
            )

            clinvar_classes_dfs.append(clinvar_df)
            joint_af_map.update(af_map)
            gene_result_dfs.append(exposure_df)

    # Write the auxiliary maps first and the exposures file last, because the
    # existence of the exposures file is what marks a dataset as done.
    if clinvar_classes_dfs:
        pd.concat(clinvar_classes_dfs, ignore_index=True).to_parquet(clinvar_file, index=False)
    if joint_af_map:
        pd.Series(joint_af_map).to_frame(name='AF').to_parquet(af_file)
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_parquet(exposures_file, index=False)
        

In [31]:
# List the calibration files of the calibrations_11_06_25 release.
# NOTE(review): CALIBRATION_DIR points at calibrations_12_08_25, so this cell
# inspects a different release than the one processed above - confirm whether
# it is stale.
!gsutil ls $WORKSPACE_BUCKET/calibrations/calibrations_11_06_25

In [35]:
# Print one calibration JSON (TP53_Boettcher_2019) to inspect its structure.
# NOTE(review): this reads from calibrations_11_06_25, not from the release
# CALIBRATION_DIR points at (calibrations_12_08_25) - confirm whether it is stale.
!gsutil cat $WORKSPACE_BUCKET/calibrations/calibrations_11_06_25/TP53_Boettcher_2019.json