In [1]:
# Common imports and constants
import os

import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools

# Workspace bucket taken from the environment; indexing os.environ raises KeyError if it is unset.
BUCKET = os.environ["WORKSPACE_BUCKET"]
# NOTE(review): not referenced in the visible cells — presumably a SpliceAI score cutoff
# consumed elsewhere; confirm it is still needed.
SPLICE_AI_FILTER_MAX = 0.2
#AF_FILTER_MAX = 0.01 # We don't filter for rare variants in functional data
# Version identifier of the CVFG dataframe; stored under 'Data Version' in the result metadata.
DATAFRAME_VERSION = '17796333'
# Root location under BUCKET below which the per-gene parquet outputs are written.
RESULTS_DIR = f'{BUCKET}/classes_2025-12-15'

In [3]:
import hail as hl
hl.init()
# Index the environment directly (as is done for WORKSPACE_BUCKET) so that a missing
# variable raises KeyError in this cell instead of handing None to the matrix-table
# reader in the next one.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]
# Echo the resolved path as the cell output.
wgs_mt_path

In [4]:
# Load wgs
# Open the Hail matrix table stored at wgs_mt_path.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
# Display a description of the loaded matrix table as a sanity check.
wgs_mt.describe()

In [5]:
!gsutil ls $WORKSPACE_BUCKET/cvfg_17796333

In [6]:
# Load CVFG dataframe
# The directory name is built from DATAFRAME_VERSION so that the data read here is
# always the version recorded in the 'Data Version' field of the results.
cvfg_df = pd.read_csv(
    f'{BUCKET}/cvfg_{DATAFRAME_VERSION}/final_pillar_data_with_clinvar_18_25_gnomad_wREVEL_wAM_wspliceAI_wMutpred2_wtrainvar_gold_standards_expanded_111225.csv.gz',
    # Pin label-like columns to str; in particular 'Chrom' must stay textual because
    # the per-gene loop builds contig names by string concatenation ('chr' + Chrom).
    dtype={
        'Dataset': str,
        'Gene': str,
        'Chrom': str,
        # The commented-out columns are left to pandas' type inference. hg38_start is
        # cast to int later, after rows with missing values have been dropped.
        #'hg38_start': int,
        #'hg38_end': int,
        'ref_allele': str,
        'alt_allele': str,
        #'auth_reported_score': float,
        'auth_reported_func_class': str
    }
)
cvfg_df

In [7]:
# Check author-provided values on our genes
# NOTE(review): `data` is not defined or imported in any visible cell (execution counts
# skip In [2]) — presumably a cvfgaou module exposing `gene_phenotypes`; confirm and
# import it in the first cell so the notebook survives Restart & Run All.
in_target_genes = cvfg_df.Gene.isin(data.gene_phenotypes)
# Drop flagged variants (mapping errors etc.)
not_flagged = cvfg_df.Flag != '*'
working_df = cvfg_df[in_target_genes & not_flagged]

In [8]:
# Read the ClinVar bins table from the workspace bucket; it is passed unchanged to
# hailtools.get_exposure_package for every variant group in the loop below.
clinvar_bins_df = pd.read_csv(f'{BUCKET}/clinvar/clinvar-bins.csv.gz')

In [9]:
# Build and persist the exposure, ClinVar and allele-frequency tables for each gene.
# Work is split per gene, then per (dataset, author-reported functional class) group.
for gene, per_gene_df in tqdm(working_df.groupby('Gene')):

    exposures_file = f'{RESULTS_DIR}/exposures/author-reported_{gene}.parquet'
    clinvar_file = f'{RESULTS_DIR}/clinvar_maps/author-reported_{gene}.parquet'
    af_file = f'{RESULTS_DIR}/af_maps/author-reported_{gene}.parquet'

    # The exposures file is the last of the three outputs written at the end of this
    # loop body, so when gctools.blob_exists reports it the gene is treated as done.
    if gctools.blob_exists(exposures_file):
        continue

    gene_result_dfs = []
    clinvar_classes_dfs = []
    joint_af_map = {}

    for (dataset, functional_class), df in tqdm(per_gene_df.groupby(['Dataset', 'auth_reported_func_class'])):

        try:
            # Keep only rows where all four variant fields are present, add a
            # 'chr'-prefixed contig column, and cast the position to int.
            variant_df = df[
                ['Chrom', 'hg38_start', 'ref_allele', 'alt_allele']
            ].dropna(
                how='any',
            ).assign(
                Chromosome=lambda variants: 'chr' + variants['Chrom']
            ).astype({'hg38_start': int})
        except Exception:
            # Report which group failed, then let the original error propagate.
            print(f'Failed while making df for {dataset}, {gene}, {functional_class}')
            raise

        # Nothing to look up for groups without any fully specified variant.
        if variant_df.empty:
            continue

        try:
            exposure_df, af_map, clinvar_df = hailtools.get_exposure_package(
                variant_df,
                wgs_mt,
                clinvar_bins_df,
                contig_col='Chromosome',
                pos_col='hg38_start',
                ref_col='ref_allele',
                alt_col='alt_allele',
                metadata_dict={
                    'Dataset': dataset,
                    'Gene': gene,
                    'Classifier': 'Author Reported',
                    'Classification': functional_class,
                    'Data Version': DATAFRAME_VERSION,
                    # True only if every row of the group (before dropping incomplete
                    # variants) has splice_measure == 'Yes'.
                    'Measures splicing': (df.splice_measure == 'Yes').all()
                }
            )

            clinvar_classes_dfs.append(clinvar_df)
            joint_af_map.update(af_map)
            gene_result_dfs.append(exposure_df)
        except Exception:
            # Show the offending variants for debugging, then re-raise unchanged.
            print(f'Failed on {dataset}, {gene}, {functional_class}:')
            print(variant_df)
            raise

    # Write only non-empty results. Exposures go last because their presence is what
    # the skip check at the top of the loop looks for.
    if clinvar_classes_dfs:
        pd.concat(clinvar_classes_dfs, ignore_index=True).to_parquet(clinvar_file, index=False)
    if joint_af_map:
        pd.Series(joint_af_map).to_frame(name='AF').to_parquet(af_file)
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_parquet(exposures_file, index=False)
    