Here we use the table provided by Malvika Tejura to extract specific variant classes.

In [1]:
# Common imports and constants
import os
import json

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools, notation, data
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Bucket prefix read from the WORKSPACE_BUCKET environment variable; a missing
# variable raises KeyError right here.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Passed as 'Data Version' in the metadata handed to
# hailtools.get_exposure_package in the main loop.
DATAFRAME_VERSION = '17796333' # For bookkeeping
# Prefix under which the exposures/, clinvar_maps/ and af_maps/ parquet files
# are written.
RESULTS_DIR = f'{BUCKET}/classes_2025-12-19'
# When False, a dataset whose three output files already exist is skipped.
OVERWRITE = False

In [2]:
# Read the precomputed calibration table from the workspace bucket. The four
# label-like columns are forced to str so pandas does not infer another dtype
# for them (e.g. numeric chromosome names).
points_df = pd.read_csv(
    f'{BUCKET}/precomputed/Dan_fxn_calibrations_new_2025_2018_clust_calib_121425.csv.gz',
    dtype=dict.fromkeys(('Gene', 'Dataset', 'consequence', 'Chrom'), str),
)

In [3]:
# Show every column name of the loaded table as this cell's output.
points_df.columns.tolist()

In [4]:
# Keep only rows whose gene appears in data.gene_phenotypes and whose Flag is
# not '*'. Rows with a missing Flag are kept, because NaN compares unequal.
working_df = points_df.loc[
    points_df['Gene'].isin(data.gene_phenotypes) & points_df['Flag'].ne('*')
]

In [5]:
# One configuration dict per (dataset, classifier) combination to evaluate.
#   'Dataset' - dataset name the configuration applies to
#   'Col'     - column of the points table holding the score / class
#   'Points'  - True when that column is numeric points, False when categorical
# An optional 'Stop gain only' / 'All but stop gain' key restricts the variants
# by consequence in the main loop.
score_sets = [
    # Points-based classifier for the TP53_Fayer_2021_meta dataset.
    {
        'Dataset': 'TP53_Fayer_2021_meta',
        'Col': 'OP_points_18_25',
        'Points': True,
    },
    # Categorical classifier for every dataset left after filtering.
    *(
        {
            'Dataset': name,
            'Col': 'StandardizedClass',
            'Points': False,
        }
        for name in working_df.Dataset.drop_duplicates()
    ),
    # Datasets named '*_unpublished' get two extra configurations each, one per
    # stop-gain restriction; the restriction name itself is the flag key.
    *(
        {
            'Dataset': name,
            'Col': 'StandardizedClass',
            'Points': False,
            restriction: True,
        }
        for name in working_df.Dataset[
            working_df.Dataset.str.endswith('_unpublished')
        ].drop_duplicates()
        for restriction in ('Stop gain only', 'All but stop gain')
    ),
]

In [6]:
# Start Hail and look up the path of the MatrixTable that is read in the next
# cell.
import hail as hl

hl.init()
# The lookup yields None when the environment variable is unset.
wgs_mt_path = os.environ.get("WGS_EXOME_SPLIT_HAIL_PATH")
wgs_mt_path  # echoed as the cell's output

In [7]:
# Load wgs
# Open the Hail MatrixTable stored at wgs_mt_path and print a description of
# its fields for reference.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
wgs_mt.describe()

In [8]:
# Map a human-readable points label to the (comparison, threshold) pair that
# selects variants in that group:
#   negative thresholds -> score <= threshold, label "<LEQ_CHAR> -n"
#   zero                -> score == 0,         label "0"
#   positive thresholds -> score >= threshold, label "<GEQ_CHAR> +n"
# Insertion order runs from -16 up to +16, and the main loop processes the
# groups in that order.
point_groups = {
    **{
        f'{LEQ_CHAR} {points:+d}': (pd.Series.le, points)
        for points in range(-16, 0)
    },
    '0': (pd.Series.eq, 0),
    **{
        f'{GEQ_CHAR} {points:+d}': (pd.Series.ge, points)
        for points in range(1, 17)
    },
}

In [9]:
# ClinVar bins table read from the workspace bucket; it is passed unchanged to
# hailtools.get_exposure_package for every variant class in the main loop.
clinvar_bins_df = pd.read_csv(f'{BUCKET}/clinvar/clinvar-bins.csv.gz')

In [12]:
# Columns that identify a variant in the frames handed to hailtools.
_VARIANT_COLS = ['Chromosome', 'hg38_start', 'ref_allele', 'alt_allele']


def _prepare_score_set(dataset_df, score_set):
    """Build the classifier label and cleaned variant table for one score set.

    Args:
        dataset_df: Rows of ``working_df`` for a single (gene, dataset) pair.
        score_set: One entry of ``score_sets``. ``'Col'`` names the score
            column, ``'Points'`` says whether it is numeric, and an optional
            ``'Stop gain only'`` / ``'All but stop gain'`` key restricts the
            rows by their ``consequence`` value.

    Returns:
        ``(classifier, variants)``. ``classifier`` is the column name plus any
        stop-gain suffix. ``variants`` holds the variant key columns, the
        score renamed to ``score``, and a ``Chromosome`` column equal to
        ``'chr' + Chrom``; rows missing any selected value are dropped.
    """
    classifier = score_set['Col']
    selected = dataset_df

    if score_set.get('Stop gain only'):
        classifier += ' (stop gain only)'
        selected = selected[
            selected['consequence'].str.contains('stop_gained', na=False)
        ]

    if score_set.get('All but stop gain'):
        classifier += ' (without stop gain)'
        # na=False makes a missing consequence count as "not stop gain", so
        # those rows are kept without a separate isna() test.
        selected = selected[
            ~selected['consequence'].str.contains('stop_gained', na=False)
        ]

    variants = selected[
        ['Chrom', 'hg38_start', 'ref_allele', 'alt_allele', score_set['Col']]
    ].rename(
        columns={score_set['Col']: 'score'}
    ).dropna().astype({
        'Chrom': str,
        'hg38_start': int,
        'ref_allele': str,
        'alt_allele': str,
        'score': float if score_set['Points'] else str
    }).assign(Chromosome=lambda df: 'chr' + df['Chrom'])

    return classifier, variants


def _split_by_classification(variants, is_points):
    """Return ``{classification: variant key frame}`` for one score set.

    Points score sets are split with the comparisons in ``point_groups``;
    categorical ones are split into the 'NORMAL' and 'ABNORMAL' classes.
    """
    if is_points:
        masks = {
            classification: compare(variants.score, threshold)
            for classification, (compare, threshold) in point_groups.items()
        }
    else:
        masks = {
            classification: variants.score == classification
            for classification in ('NORMAL', 'ABNORMAL')
        }
    return {
        classification: variants[_VARIANT_COLS][mask]
        for classification, mask in masks.items()
    }


# For every (gene, dataset) pair: classify its variants under each applicable
# score set, fetch the exposure package for each non-empty class, and write the
# combined results as three parquet files.
for (gene, dataset), dataset_df in tqdm(working_df.groupby(['Gene', 'Dataset'])):

    # NOTE(review): output names depend on the dataset only, while the loop
    # iterates (gene, dataset) pairs. If one dataset ever covers several genes,
    # later genes would be skipped or would overwrite earlier output - confirm
    # that datasets are single-gene.
    exposures_file = f'{RESULTS_DIR}/exposures/{dataset}.parquet'
    clinvar_file = f'{RESULTS_DIR}/clinvar_maps/{dataset}.parquet'
    af_file = f'{RESULTS_DIR}/af_maps/{dataset}.parquet'
    output_files = (exposures_file, clinvar_file, af_file)

    # Test the flag first so no remote existence checks are made when the
    # results are going to be regenerated anyway.
    if not OVERWRITE and all(gctools.blob_exists(f) for f in output_files):
        print(f'Files for {dataset} exist, skipping.')
        continue

    clinvar_classes_dfs = []
    joint_af_map = {}
    gene_result_dfs = []

    # Select relevant score sets.
    relevant_scoresets = [
        score_set for score_set in score_sets if score_set['Dataset'] == dataset
    ]

    for score_set in tqdm(relevant_scoresets):

        classifier, score_set_df = _prepare_score_set(dataset_df, score_set)

        # Nothing survived the consequence filter / dropna: no class can be
        # non-empty, so skip building the per-class frames.
        if score_set_df.empty:
            continue

        classification_dfs = _split_by_classification(
            score_set_df, score_set['Points']
        )

        for classification, variant_df in classification_dfs.items():

            if variant_df.empty:
                continue

            exposure_df, af_map, clinvar_df = hailtools.get_exposure_package(
                variant_df,
                wgs_mt,
                clinvar_bins_df,
                contig_col='Chromosome',
                pos_col='hg38_start',
                ref_col='ref_allele',
                alt_col='alt_allele',
                metadata_dict={
                    'Dataset': dataset,
                    'Gene': gene,
                    'Classifier': classifier,
                    'Classification': classification,
                    'Data Version': DATAFRAME_VERSION
                }
            )

            clinvar_classes_dfs.append(clinvar_df)
            joint_af_map.update(af_map)
            gene_result_dfs.append(exposure_df)

    # Each output is written only when there is something to write, so a
    # dataset with no results leaves no files behind.
    if clinvar_classes_dfs:
        pd.concat(clinvar_classes_dfs, ignore_index=True).to_parquet(clinvar_file)
    if joint_af_map:
        pd.Series(joint_af_map).to_frame(name='AF').to_parquet(af_file)
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_parquet(exposures_file)
