In [1]:
# Common imports and constants
import os

import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools

# Base path prefix used for every input and output in this notebook.
# Indexing os.environ raises KeyError if WORKSPACE_BUCKET is not set.
BUCKET = os.environ["WORKSPACE_BUCKET"]

In [2]:
# Gene symbols processed by this notebook, in alphabetical order.
# Kept as a plain list of str; the whitespace-separated literal is split once here.
genes = """
    APP BAP1 BARD1 BRCA1 BRCA2 BRIP1
    CALM1 CALM2 CALM3
    GCK KCNH2 KCNQ4 MSH2 OTC
    PALB2 PRKN PTEN
    RAD51C RAD51D
    SCN5A SNCA TARDBP TP53 VWF
""".split()

In [3]:
# Start Hail and resolve the path of the WGS exome-split matrix table.
import hail as hl

# Read the path first so a missing variable fails fast with a KeyError naming it,
# rather than yielding None that only breaks later in hl.read_matrix_table.
# This also matches how WORKSPACE_BUCKET is read.
wgs_mt_path = os.environ["WGS_EXOME_SPLIT_HAIL_PATH"]

# idempotent=True turns a repeated call into a no-op, so this cell can be re-run
# in a live kernel without Hail complaining that it is already initialized.
hl.init(idempotent=True)

# Display the resolved path as the cell output.
wgs_mt_path

In [4]:
# Open the WGS matrix table located at wgs_mt_path and display a description
# of its fields. wgs_mt is reused by the per-gene carrier lookups below.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
wgs_mt.describe()

In [5]:
# For every gene, find the samples carrying rare variants according to two
# allele-frequency sources in the gene's VAT table, label the rows, and write
# one parquet file per gene to the workspace bucket.

# Variants whose AF is strictly below this value count as rare. Missing (NaN)
# AFs compare False against the threshold and are therefore not selected.
RARE_AF_THRESHOLD = 0.001

# VAT columns identifying a variant; these are handed to the hailtools lookups.
VARIANT_KEY_COLS = ['contig', 'position', 'ref_allele', 'alt_allele']

# One entry per allele-frequency source:
#   (VAT AF column, Classifier label, hailtools lookup, step exposing person_id)
# NOTE(review): the two sources use different hailtools lookups and obtain
# person_id differently (column 's' vs. the frame index), so the concatenated
# output may not share one schema across classifiers -- confirm this is intended.
AF_SOURCES = [
    (
        'gvs_all_af',
        'All of Us',
        hailtools.get_cols_with_variants,
        lambda df: df.rename(columns={'s': 'person_id'}),
    ),
    (
        'gnomad_all_af',
        'gnomAD',
        hailtools.get_detailed_cols_with_variants,
        lambda df: df.reset_index(names='person_id'),
    ),
]

for gene in tqdm(genes):

    out_file = f'{BUCKET}/prelim_exposures/rare_{gene}.parquet'

    # Outputs act as a per-gene cache: genes already written are skipped.
    if gctools.blob_exists(out_file):
        continue

    gene_vat = pd.read_table(f'{BUCKET}/aux_data/{gene.lower()}_vat.tsv')

    gene_result_dfs = []

    for af_col, classifier, find_carriers, with_person_id in AF_SOURCES:

        # Single .loc selection (rows by mask, columns by name) instead of
        # chained indexing.
        is_rare = gene_vat[af_col] < RARE_AF_THRESHOLD
        variant_df = gene_vat.loc[is_rare, VARIANT_KEY_COLS]

        # No rare variants for this source: no lookup is run for it.
        if variant_df.empty:
            continue

        carriers_df = find_carriers(
            variant_df,
            wgs_mt,
            contig_col='contig',
            pos_col='position',
            ref_col='ref_allele',
            alt_col='alt_allele'
        )

        # Build the labelled frame without mutating the lookup result in place.
        gene_result_dfs.append(
            with_person_id(carriers_df).assign(
                Dataset='Rare Variants',
                Gene=gene,
                Classifier=classifier,
                Classification=f'{classifier} AF < {RARE_AF_THRESHOLD}',
            )
        )

    # Nothing is written when neither source produced any rows, so such genes
    # are evaluated again on the next run.
    if gene_result_dfs:
        pd.concat(gene_result_dfs, ignore_index=True).to_parquet(out_file, index=False)