In [None]:
# Common imports and constants
import os

import pandas as pd
from tqdm import tqdm

from cvfgaou import hailtools, gctools

# Root of the workspace bucket; used below as the prefix for the per-gene VAT inputs
# (aux_data/) and the per-gene result files (prelim_exposures/).
# Indexing os.environ raises KeyError right here if WORKSPACE_BUCKET is not set.
BUCKET = os.environ["WORKSPACE_BUCKET"]

In [None]:
# Genes of interest (gene symbols, listed alphabetically).
# Each symbol is lower-cased to locate its VAT input file and used verbatim
# in the output file name and the 'Gene' column.
genes = [
    'APP', 'BAP1', 'BARD1', 'BRCA1', 'BRCA2', 'BRIP1',
    'CALM1', 'CALM2', 'CALM3', 'GCK', 'KCNH2', 'KCNQ4',
    'MSH2', 'OTC', 'PALB2', 'PRKN', 'PTEN', 'RAD51C',
    'RAD51D', 'SCN5A', 'SNCA', 'TARDBP', 'TP53', 'VWF',
]

In [None]:
import hail as hl

# idempotent=True turns a repeated call into a no-op, so this cell can be re-run
# in a live kernel without Hail complaining that it is already initialised.
hl.init(idempotent=True)

# Location of the WGS exome-split Hail MatrixTable.
# Indexing os.environ (as is done for WORKSPACE_BUCKET) raises KeyError here if the
# variable is missing, instead of passing None on to hl.read_matrix_table later.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]
wgs_mt_path

In [None]:
# Load wgs
# Open the MatrixTable found at wgs_mt_path; the resulting object is what the
# per-gene loop passes to hailtools.get_cols_with_variants.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
# Show the table's schema as a quick sanity check of what was loaded.
wgs_mt.describe()

In [None]:
# Dictionary for filtering VAT variants.
# Form is label: (inclusion substring, [list of exclusion substrings]).
# A variant receives a label when its `clinvar_classification` contains the
# inclusion substring and none of the exclusion substrings (case-insensitive,
# literal matching).
#
# 'conflicting' is excluded from the pathogenic labels because a classification
# such as "conflicting interpretations of pathogenicity" contains the substring
# 'pathogenic' and would otherwise be counted as Pathogenic.
#
# NOTE(review): the "Likely ..." labels do not require the substring 'likely', so
# they also select the plain 'pathogenic' / 'benign' variants, i.e. they behave as
# cumulative P+LP and B+LB sets. Left unchanged -- confirm this is intended.
vat_filters = {
    'Pathogenic': ('pathogenic', ['likely', 'benign', 'uncertain', 'conflicting']),
    'Likely Pathogenic': ('pathogenic', ['benign', 'uncertain', 'conflicting']),
    'Uncertain Significance': ('uncertain', []),
    'Likely Benign': ('benign', ['pathogenic', 'uncertain']),
    'Benign': ('benign', ['likely', 'pathogenic', 'uncertain'])
}

# Columns of the VAT table that identify a variant for the Hail lookup.
variant_cols = ['contig', 'position', 'ref_allele', 'alt_allele']

for gene in tqdm(genes):

    out_file = f'{BUCKET}/prelim_exposures/clinvar_vat_{gene}.csv'

    # The per-gene output doubles as a cache marker: genes already written are skipped.
    if gctools.blob_exists(out_file):
        continue

    gene_vat = pd.read_table(f'{BUCKET}/aux_data/{gene.lower()}_vat.tsv')
    gene_vat = gene_vat[gene_vat.clinvar_classification.notna()]

    # With no ClinVar-annotated rows the column is typically parsed as float, and the
    # `.str` accessor below would raise AttributeError and abort the remaining genes.
    if gene_vat.empty:
        continue

    clinvar_labels = gene_vat.clinvar_classification

    gene_result_dfs = []

    for classification, (inclusion_str, exclusion_strs) in tqdm(
        vat_filters.items(), leave=False
    ):

        # regex=False: the filter strings are plain substrings, not patterns.
        variant_selection = clinvar_labels.str.contains(
            inclusion_str, case=False, regex=False
        )
        for ex_str in exclusion_strs:
            variant_selection &= ~clinvar_labels.str.contains(
                ex_str, case=False, regex=False
            )

        # The VAT may list one variant on several rows (presumably one per
        # transcript -- verify), so keep each variant only once for the lookup.
        variant_df = gene_vat.loc[variant_selection, variant_cols].drop_duplicates()

        if variant_df.empty:
            continue

        result_df = (
            hailtools.get_cols_with_variants(
                variant_df,
                wgs_mt,
                contig_col='contig',
                pos_col='position',
                ref_col='ref_allele',
                alt_col='alt_allele'
            )
            # 's' is renamed to 'person_id' and the constant provenance columns are
            # attached without mutating the helper's return value in place.
            .rename(columns={'s': 'person_id'})
            .assign(
                Dataset='ClinVar',
                Gene=gene,
                Classifier='Curated (AoU CDR v8 VAT)',
                Classification=classification,
            )
        )

        gene_result_dfs.append(result_df)

    # Nothing is written when no classification produced results, so such genes are
    # re-evaluated on the next run.
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_csv(out_file, index=False)