In [1]:
# Common imports and constants
import os

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import carrier_annotation, data

#hailtools, gctools, notation, data
#from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Workspace bucket location, taken from the environment. Indexing os.environ
# (rather than os.getenv) raises KeyError right here if WORKSPACE_BUCKET is unset.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Location under the bucket that is handed to the annotator as its result directory.
RESULT_DIR = f'{BUCKET}/classes_2025-12-16'
# Filter thresholds that are currently disabled (left commented out for reference):
#SPLICE_AI_FILTER_THRESHOLD = np.inf #0.2
#AF_FILTER_THRESHOLD = np.inf #0.01

In [4]:
# Variant-level REVEL calibration table: read only the columns listed in
# `usecols`, then keep the rows whose gene_symbol is one of the keys of
# data.gene_phenotypes. The callable passed to .loc receives the freshly read
# frame, so the read and the filter form a single expression.
revel_calibrations_df = pd.read_csv(
    f'{BUCKET}/calibrations/vep/cluster_genes_combined_variant_level_REVEL.csv.gz',
    usecols=['gene_symbol', '#CHROM', 'POS', 'REF', 'ALT', 'REVEL', 'old_call', 'cluster_call'],
).loc[lambda variants: variants['gene_symbol'].isin(data.gene_phenotypes.keys())]
revel_calibrations_df

In [4]:
# Hail is imported in this cell rather than with the imports at the top of the notebook.
import hail as hl

# Resolve the matrix-table path before starting Hail. Indexing os.environ raises
# KeyError immediately when WGS_EXOME_SPLIT_HAIL_PATH is unset, instead of letting
# a None path reach hl.read_matrix_table in the next cell.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]

# idempotent=True turns this call into a no-op when Hail is already initialized,
# so the cell can be re-run in a live kernel.
hl.init(idempotent=True)
wgs_mt_path

In [5]:
# Load wgs
# Open the Hail MatrixTable stored at wgs_mt_path, then call describe() on it
# to show its field layout in the cell output.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
wgs_mt.describe()

In [6]:
# Load the gene-specific REVEL calibration thresholds table from the workspace bucket.
# index_col=0 makes the first CSV column the DataFrame index; per the original
# note, that index is the gene.
revel_gene_thresholds_df = pd.read_csv(
    f'{BUCKET}/calibrations/vep/REVEL_gene_specific_calibration_thresholds_notfiltmp2train.csv.gz',
    index_col=0
)

In [7]:
# Shell escape: list the objects under calibrations/vep in the workspace bucket.
!gsutil ls $WORKSPACE_BUCKET/calibrations/vep

In [8]:
# Load the ClinVar bins table (gzip-compressed CSV) from the workspace bucket;
# it is later passed to the REVEL carrier annotator.
clinvar_bins_df = pd.read_csv(f'{BUCKET}/clinvar/clinvar-bins.csv.gz')

In [15]:
# Build the REVEL carrier annotator.
# Positional inputs, in order: variant-level calibrations, gene-specific
# thresholds, result directory, the WGS MatrixTable, and the ClinVar bins table.
# vat_loader reads `<gene>_vat.tsv` from the bucket's aux_data folder for the
# gene it is given; tqdm is supplied as the progress tracker.
revel_annotator = carrier_annotation.CarrierAnnotatorREVEL(
    revel_calibrations_df, revel_gene_thresholds_df,
    RESULT_DIR, wgs_mt, clinvar_bins_df,
    progress_tracker=tqdm,
    vat_loader=lambda gene: pd.read_table(f'{BUCKET}/aux_data/{gene}_vat.tsv'),
)

In [16]:
# Run the annotator's exposure-package build.
# NOTE(review): presumably this writes its outputs under RESULT_DIR and may be
# long-running — confirm against cvfgaou.carrier_annotation.
revel_annotator.build_exposure_package()