In [22]:
# Common imports and constants
import os
import json

import pandas as pd
from tqdm.notebook import tqdm

from cvfgaou import gctools

# Workspace bucket location, taken from the WORKSPACE_BUCKET environment variable.
# A missing variable raises KeyError here. BUCKET is used as the prefix of the
# data paths read and written in the cells below.
BUCKET = os.environ["WORKSPACE_BUCKET"]

In [5]:
# Genes of interest: the gene symbols every downstream table is restricted to.
# The list is kept in alphabetical order.
genes = [
    'APP', 'BAP1', 'BARD1', 'BRCA1', 'BRCA2', 'BRIP1',
    'CALM1', 'CALM2', 'CALM3', 'GCK', 'KCNH2', 'KCNQ4',
    'MSH2', 'OTC', 'PALB2', 'PRKN', 'PTEN', 'RAD51C',
    'RAD51D', 'SCN5A', 'SNCA', 'TARDBP', 'TP53', 'VWF',
]

## Load ClinVar

In [6]:
# Columns to read from the ClinVar variant summary, mapped to the dtype each is parsed as.
# Only these columns are loaded.
col_types = {
    'Type': str,
    'GeneSymbol': str,
    'ClinicalSignificance': str,
    'Assembly': str,
    'Chromosome': str,
    'PositionVCF': int,
    'ReferenceAlleleVCF': str,
    'AlternateAlleleVCF': str,
}

# Read the 2025-01 variant summary from the workspace bucket
clinvar_df = pd.read_table(
    f'{BUCKET}/clinvar/variant_summary_2025-01.txt.gz',
    engine='pyarrow',
    dtype=col_types,
    usecols=list(col_types),
)

In [7]:
# Filter to:
# - rows with non-null position and alleles
# - positive VCF positions
# - GRCh38
# - genes of interest
clinvar_df = clinvar_df.dropna(
    subset=['PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF'],
    how='any'
)
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below writes to a real frame rather than to a slice
# (which is what triggers pandas' SettingWithCopyWarning).
clinvar_df = clinvar_df[
    (clinvar_df.PositionVCF > 0) &
    (clinvar_df.Assembly == 'GRCh38') &
    (clinvar_df.GeneSymbol.isin(genes))
].copy()
# Make chromosome notation consistent with Hail.
# Any existing 'chr' prefix is stripped first, so re-running this cell leaves
# the values unchanged instead of producing 'chrchr...'.
clinvar_df['Chromosome'] = 'chr' + clinvar_df['Chromosome'].str.replace(r'^chr', '', regex=True)
clinvar_df

In [8]:
# Tally the retained rows by their raw ClinicalSignificance value
clinvar_df['ClinicalSignificance'].value_counts()

## Build table of ClinVar bins that will be used to annotate other classifications

We will bin into the following mutex significance categories:

- Benign
- Likely benign, includes
    - Likely benign
    - Benign/Likely benign
- Uncertain significance, includes
    - Uncertain significance
    - Uncertain significance/Uncertain risk allele
- Conflicting
    - Conflicting classifications of pathogenicity
    - Conflicting classifications of pathogenicity; risk factor
- Likely pathogenic, includes
    - Likely pathogenic
    - Pathogenic/Likely pathogenic
    - Likely pathogenic/Likely risk allele
    - Pathogenic/Likely pathogenic/Likely risk allele
- Pathogenic
- Other / not in ClinVar
    - All other classifications.

Notably, "Other" will also include variants not in ClinVar

In [9]:
# Mutually exclusive bins: bin label -> raw ClinVar significance strings that fall in it.
# Every raw string is listed under exactly one bin.
mutex_binning = {
    # Pathogenic side
    'Pathogenic': ['Pathogenic'],
    'Likely pathogenic': [
        'Likely pathogenic',
        'Pathogenic/Likely pathogenic',
        'Likely pathogenic/Likely risk allele',
        'Pathogenic/Likely pathogenic/Likely risk allele',
    ],
    # Unresolved
    'Uncertain significance': [
        'Uncertain significance',
        'Uncertain significance/Uncertain risk allele',
    ],
    'Conflicting': [
        'Conflicting classifications of pathogenicity',
        'Conflicting classifications of pathogenicity; risk factor',
    ],
    # Benign side
    'Likely benign': ['Likely benign', 'Benign/Likely benign'],
    'Benign': ['Benign'],
}

In [11]:
# Build a table that defines variant: ClinVar bin

# Invert the bin definition into a lookup of raw significance string -> bin label
sig2bin_map = dict(
    (significance, sig_bin)
    for sig_bin, values in mutex_binning.items()
    for significance in values
)

# One row per ClinVar variant whose significance maps to a bin. Rows with an
# unmapped significance (or any other null) are dropped, and the coordinate
# columns are renamed to the short Pos/Ref/Alt names used for merging.
clinvar_bins_df = (
    clinvar_df[['Chromosome', 'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF']]
    .assign(**{'Clinvar significance': clinvar_df['ClinicalSignificance'].map(sig2bin_map)})
    .dropna(axis='index', how='any')
    .rename(
        columns={
            'PositionVCF': 'Pos',
            'ReferenceAlleleVCF': 'Ref',
            'AlternateAlleleVCF': 'Alt',
        }
    )
)

In [32]:
def annotate_variants_with_clinvar(
    variant_df,
    chrom_col,
    pos_col,
    ref_col,
    alt_col,
    clinvar_bins_df=None
):
    """Attach the ClinVar significance bin to each variant in ``variant_df``.

    Parameters
    ----------
    variant_df : pandas.DataFrame
        Table holding one variant per row. It is not modified.
    chrom_col, pos_col, ref_col, alt_col : str
        Names of the columns in ``variant_df`` holding chromosome, position,
        reference allele and alternate allele.
    clinvar_bins_df : pandas.DataFrame, optional
        Table with columns ``Chromosome``, ``Pos``, ``Ref``, ``Alt`` and
        ``Clinvar significance``. When omitted, the module-level
        ``clinvar_bins_df`` is looked up at call time, so rebuilding that table
        takes effect without re-defining this function.

    Returns
    -------
    pandas.DataFrame
        The four variant columns renamed to ``Chromosome``, ``Pos``, ``Ref``,
        ``Alt``, left-merged with ``clinvar_bins_df``. Variants with no match
        get ``'Other / not in ClinVar'`` as their ``Clinvar significance``.

    Raises
    ------
    NameError
        If no table is passed and no module-level ``clinvar_bins_df`` exists.
    """
    if clinvar_bins_df is None:
        # Late binding: a default of `clinvar_bins_df=clinvar_bins_df` would be
        # frozen to whatever table existed when this cell was executed.
        clinvar_bins_df = globals().get('clinvar_bins_df')
        if clinvar_bins_df is None:
            raise NameError(
                "clinvar_bins_df is not defined; build it first or pass it explicitly"
            )

    varspec = ['Chromosome', 'Pos', 'Ref', 'Alt']
    # Copy the selection so renaming the columns cannot touch the caller's frame
    df = variant_df[[chrom_col, pos_col, ref_col, alt_col]].copy()
    df.columns = varspec
    result_df = df.merge(
        clinvar_bins_df,
        on=varspec,
        how='left'
    )
    result_df['Clinvar significance'] = result_df['Clinvar significance'].fillna('Other / not in ClinVar')
    return result_df

## Get variant sets for all classifications

### ClinVar class groups

For OR classes, we will use the following significance categories:

- Benign
- Likely benign, includes
    - Benign
    - Likely benign
    - Benign/Likely benign
- Uncertain significance, includes
    - Uncertain significance
    - Uncertain significance/Uncertain risk allele
- Likely pathogenic, includes
    - Pathogenic
    - Likely pathogenic
    - Pathogenic/Likely pathogenic
    - Likely pathogenic/Likely risk allele
    - Pathogenic/Likely pathogenic/Likely risk allele
- Pathogenic

Notably, categories are not mutually exclusive, and we drop "uncertain risk allele," "conflicting," and "not provided" variants.

In [34]:
# Class label -> raw ClinVar significance strings counted towards it.
# Unlike mutex_binning these classes overlap: the "Likely ..." classes also
# contain the corresponding definite class.
binning = {
    'Pathogenic': ['Pathogenic'],
    'Likely pathogenic': [
        'Pathogenic',
        'Likely pathogenic',
        'Pathogenic/Likely pathogenic',
        'Likely pathogenic/Likely risk allele',
        'Pathogenic/Likely pathogenic/Likely risk allele',
    ],
    'Uncertain significance': [
        'Uncertain significance',
        'Uncertain significance/Uncertain risk allele'
    ],
    'Likely benign': [
        'Benign',
        'Likely benign',
        'Benign/Likely benign'
    ],
    'Benign': ['Benign']
}

# Work on an independent copy: a plain `subset_clinvar_df = clinvar_df` is only
# an alias, so the indicator columns below would be written into clinvar_df too.
subset_clinvar_df = clinvar_df.copy()

# One boolean column per class: True where the row's ClinicalSignificance is
# one of the strings listed for that class.
for label, classifications in binning.items():
    subset_clinvar_df[label] = subset_clinvar_df['ClinicalSignificance'].isin(classifications)

subset_clinvar_df

In [35]:
# One annotated frame per (gene, class) pair that has at least one ClinVar variant
clinvar_dfs = []

for gene in tqdm(genes):

    # Rows belonging to this gene; reused for every class below
    gene_mask = subset_clinvar_df['GeneSymbol'] == gene

    for classification in tqdm(binning):
        variant_df = subset_clinvar_df[gene_mask & subset_clinvar_df[classification]]

        if variant_df.empty:
            continue

        # Annotate with the mutually exclusive ClinVar bin, then tag provenance
        result_df = annotate_variants_with_clinvar(
            variant_df,
            'Chromosome',
            'PositionVCF',
            'ReferenceAlleleVCF',
            'AlternateAlleleVCF'
        ).assign(
            Dataset='ClinVar',
            Gene=gene,
            Classifier='Curated (2025-01)',
            Classification=classification,
        )

        clinvar_dfs.append(result_df)

### Assay calibrations 2025-06-05

In [24]:
# Load CVFG dataframe
# The listed columns are read as strings. hg38_start, hg38_end and
# auth_reported_score are given no explicit dtype here.
cvfg_df = pd.read_csv(
    f'{BUCKET}/cvfg_15577805/pillar_data_clinvar38_053125_wREVEL.csv.gz',
    dtype=dict.fromkeys(
        [
            'Dataset',
            'Gene',
            'Chrom',
            'ref_allele',
            'alt_allele',
            'auth_reported_func_class',
        ],
        str,
    ),
    index_col=0,
)

In [25]:
# Keep only the CVFG rows whose Gene is one of our genes of interest
working_df = cvfg_df.loc[cvfg_df['Gene'].isin(genes)]

In [26]:
# Evidence threshold mapping from points to labels.
# Positive points are pathogenic evidence, negative points benign evidence.
evidence_strength_series = pd.Series(
    data=[
        "Pathogenic very strong",
        "Pathogenic strong",
        "Pathogenic moderate",
        "Pathogenic supporting",
        "Benign supporting",
        "Benign moderate",
        "Benign strong",
        "Benign very strong",
    ],
    index=[+8, +4, +2, +1, -1, -2, -4, -8],
)

In [28]:
# Common container for frames
assay_calibration_dfs = []

for blob in tqdm(gctools.list_blobs(
    f'{BUCKET}/calibrations/functional_assay_calibration_results_06_06_2025/summaries_2025_06_05/',
    return_uris=False
)):
    # Only the .json blobs are parsed as calibration summaries
    if not blob.path.endswith('.json'):
        continue

    calibration = json.loads(blob.download_as_text())

    dataset = calibration['scoreset_name']

    dataset_df = working_df[working_df['Dataset'] == dataset]

    # Nothing to classify when none of our variants belong to this scoreset
    if dataset_df.empty:
        continue

    # Check how thresholds are provided. If lists, assume they are aligned to
    # evidence points 1, 2, 3, 4, 8 (negated for benign). The +/-3 entries have
    # no label in evidence_strength_series, so they are dropped when the
    # thresholds table is aligned below.
    pathogenic_thresholds = calibration['final_pathogenic_thresholds']
    if isinstance(pathogenic_thresholds, list):
        print(f'Warning: pathogenic thresholds provided as a list for {dataset}')
        pathogenic_thresholds = dict(zip([1,2,3,4,8], pathogenic_thresholds))

    benign_thresholds = calibration['final_benign_thresholds']
    if isinstance(benign_thresholds, list):
        print(f'Warning: benign thresholds provided as a list for {dataset}')
        benign_thresholds = dict(zip([-1,-2,-3,-4,-8], benign_thresholds))

    # Build series of thresholds, indexed by evidence points in ascending order
    thresholds = pd.concat(
        [
            pd.Series(pathogenic_thresholds),
            pd.Series(benign_thresholds)
        ]
    )
    thresholds.index = pd.to_numeric(thresholds.index)
    thresholds = thresholds.sort_index()

    # Can't work without thresholds
    if thresholds.isna().all():
        continue

    # Directionality of thresholds
    increasing = calibration.get('inverted') == 'inverted'

    # Check that thresholds are increasing (decreasing) and pick, per evidence
    # level, the comparison that selects scores at or beyond the threshold.
    # The first four comparisons apply to the benign levels (-8..-1) and the
    # last four to the pathogenic levels (+1..+8).
    if increasing:
        assert thresholds.dropna().is_monotonic_increasing, f'Thresholds for {dataset} are expected to be increasing: {thresholds}'
        comparisons = [pd.Series.le] * 4 + [pd.Series.ge] * 4
    else:
        assert thresholds.dropna().is_monotonic_decreasing, f'Thresholds for {dataset} are expected to be decreasing: {thresholds}'
        comparisons = [pd.Series.ge] * 4 + [pd.Series.le] * 4
    comparisons_series = pd.Series(comparisons, [-8,-4,-2,-1,1,2,4,8])

    # Align thresholds, labels, and comparisons on evidence points. Levels that
    # lack a label, a threshold or a comparison are dropped.
    thresholds_df = pd.concat(
        {
            'Classification': evidence_strength_series,
            'Threshold': thresholds,
            'Comparison': comparisons_series
        },
        axis='columns'
    ).dropna(how='any')

    for classification, threshold, compare in tqdm(thresholds_df.itertuples(index=False)):

        for gene, gene_df in dataset_df.groupby('Gene'):

            # Variants of this gene whose score meets the threshold, with
            # complete coordinates, a 'chr'-prefixed chromosome and integer position
            variant_df = gene_df[
                ['Chrom', 'hg38_start', 'ref_allele', 'alt_allele']
            ][
                compare(gene_df['auth_reported_score'].astype(float), threshold)
            ].dropna(how='any').assign(
                Chromosome = lambda frame: 'chr' + frame['Chrom']
            ).astype({'hg38_start': int})

            if variant_df.empty:
                continue

            # Use the shared helper instead of repeating the rename/merge/fill inline
            result_df = annotate_variants_with_clinvar(
                variant_df,
                'Chromosome',
                'hg38_start',
                'ref_allele',
                'alt_allele'
            )

            result_df['Dataset'] = dataset
            result_df['Gene'] = gene
            result_df['Classifier'] = 'Calibrated (2025-06-05)'
            result_df['Classification'] = classification

            assay_calibration_dfs.append(result_df)

### Author provided assay scores

In [37]:
# One annotated frame per (gene, dataset, author-reported functional class)
assay_author_dfs = []

for gene, per_gene_df in tqdm(working_df.groupby('Gene')):

    for (dataset, functional_class), class_df in tqdm(per_gene_df.groupby(['Dataset', 'auth_reported_func_class'])):

        # Variants with complete coordinates, a 'chr'-prefixed chromosome and
        # integer position
        try:
            variant_df = class_df[
                ['Chrom', 'hg38_start', 'ref_allele', 'alt_allele']
            ].dropna(
                how='any',
            ).assign(
                Chromosome = lambda frame: 'chr' + frame['Chrom']
            ).astype({'hg38_start': int})
        except Exception:
            # Report which group failed, then let the error propagate.
            # `except Exception` leaves KeyboardInterrupt/SystemExit alone.
            print(f'Failed while making df for {dataset}, {gene}, {functional_class}')
            raise

        if variant_df.empty:
            continue

        try:
            result_df = annotate_variants_with_clinvar(
                variant_df,
                'Chromosome',
                'hg38_start',
                'ref_allele',
                'alt_allele'
            )
        except Exception:
            # Show the offending input before re-raising
            print(f'Failed on {dataset}, {gene}, {functional_class}:')
            print(variant_df)
            raise

        result_df['Dataset'] = dataset
        result_df['Gene'] = gene
        result_df['Classifier'] = 'Author Reported'
        result_df['Classification'] = functional_class

        assay_author_dfs.append(result_df)

### AlphaMissense

We will need to re-run the main workflow anyway, so we will add the variant statistics collection to that file.

## Collect results and write to file

In [39]:
# Stack the author-reported, calibrated and ClinVar frames (in that order)
# under a fresh 0..n-1 index and write the combined table to the bucket
pd.concat(
    [*assay_author_dfs, *assay_calibration_dfs, *clinvar_dfs],
    ignore_index=True,
).to_csv(
    f'{BUCKET}/WIP/variant_clinvar_classes.csv'
)