Per-gene lists of participants with splice variants

In [1]:
# Common imports and constants
import os

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import hailtools, gctools, notation, data
from cvfgaou.notation import GEQ_CHAR, LEQ_CHAR

# Workspace bucket prefix, taken from the WORKSPACE_BUCKET environment variable.
# os.environ[...] raises KeyError if the variable is not set.
BUCKET = os.environ["WORKSPACE_BUCKET"]

# Directory under the bucket where the per-gene result files are written.
RESULT_DIR = f'{BUCKET}/spliceAI_2025-12-10'
# Score cutoff: a variant is kept when any of its SpliceAI scores is strictly
# greater than this value.
SPLICE_AI_FILTER_THRESHOLD = 0.2

In [2]:
# Start Hail and look up where the WGS matrix table lives.
import hail as hl
hl.init()
# os.getenv returns None (rather than raising) when WGS_EXOME_SPLIT_HAIL_PATH
# is unset; the path is displayed below so a missing value is visible here.
wgs_mt_path = os.getenv("WGS_EXOME_SPLIT_HAIL_PATH")
# Last expression of the cell: shows the resolved path as the cell output.
wgs_mt_path

In [3]:
# Load WGS
# Open the Hail MatrixTable stored at wgs_mt_path, then print its schema so the
# available fields can be checked before it is used.
wgs_mt = hl.read_matrix_table(wgs_mt_path)
wgs_mt.describe()

In [8]:
# Use the VAT as the SpliceAI source.
#
# For every gene in data.gene_phenotypes: read that gene's VAT table, keep the
# variants whose SpliceAI score exceeds SPLICE_AI_FILTER_THRESHOLD in at least
# one of the four score columns, look up the matrix-table columns carrying any
# of those variants, and write the result to a per-gene parquet file.

# Columns read from each per-gene VAT table and the dtype each is parsed as.
# This mapping does not depend on the gene, so it is built once.
vat_col_types = {
    'contig': str,
    'position': int,
    'ref_allele': str,
    'alt_allele': str,
    'splice_ai_acceptor_gain_score': float,
    'splice_ai_acceptor_loss_score': float,
    'splice_ai_donor_gain_score': float,
    'splice_ai_donor_loss_score': float
}

for gene in tqdm(data.gene_phenotypes):

    splice_carriers_file = f'{RESULT_DIR}/splice_carriers_{gene}.parquet'

    # Results are cached per gene: an existing output file means this gene was
    # already processed, so the (expensive) lookup is not repeated.
    if gctools.blob_exists(splice_carriers_file):
        continue

    gene_vat = pd.read_table(
        f'{BUCKET}/aux_data/{gene}_vat.tsv',
        usecols=vat_col_types.keys(),
        dtype=vat_col_types
    )

    # Filter SpliceAI score: keep a variant if any of the four scores is above
    # the threshold. Missing scores compare as False and so never qualify.
    gene_vat = gene_vat[
        (gene_vat.splice_ai_acceptor_gain_score > SPLICE_AI_FILTER_THRESHOLD) |
        (gene_vat.splice_ai_acceptor_loss_score > SPLICE_AI_FILTER_THRESHOLD) |
        (gene_vat.splice_ai_donor_gain_score > SPLICE_AI_FILTER_THRESHOLD) |
        (gene_vat.splice_ai_donor_loss_score > SPLICE_AI_FILTER_THRESHOLD)
    ]

    # No qualifying variants: there is nothing to look up and nothing to write.
    # Skipping here (instead of falling through to the write) prevents either a
    # NameError on the first gene or silently saving the previous gene's
    # carriers under this gene's file name.
    if gene_vat.empty:
        continue

    try:
        persons_df = hailtools.get_cols_with_variants(
            gene_vat,
            wgs_mt,
            contig_col='contig',
            pos_col='position',
            ref_col='ref_allele',
            alt_col='alt_allele'
        )
    except Exception:
        # Show which variant table triggered the failure, then re-raise so the
        # run stops with the original traceback.
        print('Failed on')
        print(gene_vat)
        raise

    # Only reached with a persons_df computed for this gene in this iteration.
    persons_df.to_parquet(splice_carriers_file)