In [1]:
# Common imports and constants
import os
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
from cvfgaou import aou, odds, gctools, data

# Workspace settings read from the environment; a KeyError is raised if either
# variable is unset.
# NOTE(review): DATASET is not referenced in the visible cells — confirm it is
# still needed.
DATASET = os.environ["WORKSPACE_CDR"]
BUCKET = os.environ["WORKSPACE_BUCKET"]
# Bucket location of the cohort input tables (ancestry, demographics, pools)
DATA_DIR = f'{BUCKET}/data_v2'
# Version tag used for the local classes directory and the output file names
CLASSES_VERSION = '2026-01-14' # Used for local dir

In [2]:
# Load the gene metadata table used to annotate results with ENSG IDs.
# Only the three columns needed downstream are read, all as strings, and
# repeated rows are collapsed.
gene_metadata_columns = [
    'Gene stable ID version',
    'Chromosome/scaffold name',
    'Gene name',
]
ensg_df = pd.read_table(
    f'{BUCKET}/aux_data/gene_metadata.txt',
    usecols=gene_metadata_columns,
    dtype=str,
)
ensg_df = ensg_df.drop_duplicates()

In [3]:
# Drop entries whose chromosome/scaffold name ends in 'PATCH'
on_patch_scaffold = ensg_df['Chromosome/scaffold name'].str.endswith('PATCH')
ensg_df = ensg_df[~on_patch_scaffold]

In [4]:
# Keep only genes that have a phenotype definition, then build a
# gene name -> versioned ENSG ID lookup Series.
has_phenotype_definition = ensg_df['Gene name'].isin(data.gene_phenotypes)
ensg_filtered_df = ensg_df[has_phenotype_definition]
ensg_map = ensg_filtered_df.set_index('Gene name')['Gene stable ID version']

In [5]:
# Grab table that describes which assays measure splicing.
# Only the Dataset / splice_measure pair is needed, one row per distinct pair.
pillar_data_path = f'{BUCKET}/cvfg_17796333/final_pillar_data_with_clinvar_18_25_gnomad_wREVEL_wAM_wspliceAI_wMutpred2_wtrainvar_gold_standards_expanded_111225.csv.gz'
splicing_map_df = pd.read_csv(
    pillar_data_path,
    usecols=['Dataset', 'splice_measure'],
)
splicing_map_df = splicing_map_df.drop_duplicates()


In [6]:
# Dataset -> splice_measure lookup.
# Built with set_index on purpose: handing an existing Series to the pd.Series
# constructor together with index= *reindexes* it (aligns on labels) instead of
# relabelling it, so the Dataset names would be looked up among the frame's
# integer row labels and every value would come back NaN.
# NOTE(review): if one Dataset carries more than one distinct splice_measure
# value, splicing_map.get(dataset) yields a Series rather than a scalar —
# confirm that cannot happen in this table.
splicing_map = splicing_map_df.set_index('Dataset')['splice_measure']

In [7]:
# Gene case pools definitions
# Pool file names are derived from the gene phenotypes defined for the project:
# the first phenotype set of each gene names its case pools, the second its
# control pools (the third element is not used here).

gene_pool_files = {
    gene_symbol: (
        {f'{p} case pool.tsv.gz' for p in inclusion_phenos},
        {f'{p} control pool.tsv.gz' for p in exclusion_phenos},
    )
    for gene_symbol, (inclusion_phenos, exclusion_phenos, _) in data.gene_phenotypes.items()
}
ancestry_pca_df = pd.read_table(f'{DATA_DIR}/ancestry_pca.tsv.gz', index_col='research_id')
demographics_df = pd.read_table(f'{DATA_DIR}/demo.tsv.gz', index_col='person_id')

cohorts = aou.CohortLoader(
    gene_cohort_map=gene_pool_files,
    ancestry_df=ancestry_pca_df,
    demo_df=demographics_df,
    data_dir=DATA_DIR,
)
    

In [8]:
# List the top-level contents of the workspace bucket (inspection only)
!gsutil ls $WORKSPACE_BUCKET

In [9]:
!mkdir -p classes_2026-01-14
# Calibrated assay scores
!gsutil -m cp -r $WORKSPACE_BUCKET/classes_2026-01-13/* classes_2026-01-14/
# Author reported assay scores
!gsutil -m cp -r $WORKSPACE_BUCKET/classes_2026-01-08/* classes_2026-01-14/
# Combined points classes
!gsutil -m cp -r $WORKSPACE_BUCKET/combined_classes_2026-01-08/* classes_2026-01-14/


In [10]:
ls classes_2025-12-20/exposures

In [11]:
# Spot-check a single exposures table by displaying it.
# NOTE(review): this reads from classes_2025-12-15, which is not the directory
# derived from CLASSES_VERSION — confirm the file still exists locally or
# repoint it before a fresh "Run All".
pd.read_parquet('classes_2025-12-15/exposures/calibrated_2025-12-08_BRCA2_unpublished.parquet')

In [13]:
# Estimate odds ratios for every (dataset, gene, classifier, classification)
# group found in the staged exposures tables.
common_dir = Path(f'classes_{CLASSES_VERSION}')

# Results keyed by (dataset, gene, classifier, classification); each value is
# the merged dict of OR estimates, variant statistics and auxiliary metadata.
or_estimate_dicts = {}

for exposures_path in tqdm((common_dir / 'exposures').iterdir()):

    try:
        # The AF and ClinVar maps are looked up under the same file name as
        # the exposures table.
        af_map_path = common_dir / 'af_maps' / exposures_path.name
        clinvar_map_path = common_dir / 'clinvar_maps' / exposures_path.name

        clinvar_big_df = pd.read_parquet(clinvar_map_path)
        af_map = pd.read_parquet(af_map_path)

        for (dataset, gene, classifier, classification), exposure_df in tqdm(
            pd.read_parquet(exposures_path).groupby(
                ['Dataset', 'Gene', 'Classifier', 'Classification'],
                group_keys=True
            )
        ):

            if gene not in cohorts.gene_cohort_map:
                print(f'{gene} has no cohort definition')
                continue

            # Work on a copy so that adding a column does not write into a
            # slice of the grouped frame.
            exposure_df = exposure_df.copy()

            # Patch in splice measure info
            exposure_df['Measures splicing'] = (splicing_map.get(dataset) == 'Yes')

            # Grab cohort
            gene_cohort = cohorts.gene_cohort(gene)

            # Init aux info
            aux_info_dict = {
                'Case inclusion phenotypes': list(data.gene_phenotypes[gene][0]),
                'Control exclusion phenotypes': list(data.gene_phenotypes[gene][1]),
                'ENSG': ensg_map[gene]
            }

            # Extract aux info: each of these is expected to be constant
            # within a group; warn (but continue) when it is not.
            for aux_info in ['Measures splicing', 'Data Version']:
                if aux_info in exposure_df.columns:
                    info_val = exposure_df[aux_info].iloc[0]
                    if not (exposure_df[aux_info] == info_val).all():
                        print(f'Warning: multiple values for {aux_info} encountered: {exposure_df}')
                    aux_info_dict[aux_info] = info_val

            # Filter splicing if needed: when the assay does not measure
            # splicing, remove splice-variant carriers from the cohort.
            if not aux_info_dict.get('Measures splicing'):
                splice_carriers = pd.read_parquet(f'{BUCKET}/spliceAI_2025-12-10/splice_carriers_{gene}.parquet')
                if not splice_carriers.empty:
                    gene_cohort = gene_cohort[
                        ~gene_cohort.index.isin(pd.to_numeric(splice_carriers.s, downcast='integer'))
                    ]

                # Hard code splice filter threshold for now since we know it
                aux_info_dict['SpliceAI filter max'] = 0.2

            # The real work
            clinvar_class_map = clinvar_big_df[
                (clinvar_big_df['Dataset'] == dataset) &
                (clinvar_big_df['Gene'] == gene) &
                (clinvar_big_df['Classifier'] == classifier) &
                (clinvar_big_df['Classification'] == classification)
            ].set_index('Variant')['Clinvar significance']

            person_ids = pd.to_numeric(exposure_df.person_id, downcast='integer')

            # Use the (possibly splice-filtered) gene_cohort here. Fetching
            # cohorts.gene_cohort(gene) again would silently discard the
            # filter applied above while still recording 'SpliceAI filter max'.
            or_estimate_dicts[
                (dataset, gene, classifier, classification)
            ] = odds.estimate_logOR(
                person_ids,
                gene_cohort
            ) | odds.collect_variant_stats(
                person_ids,
                gene_cohort,
                exposure_df.variants,
                af_map['AF'],
                clinvar_class_map
            ) | aux_info_dict

    except Exception:
        # Report which exposures file failed, then let the error propagate.
        print(f"Error on {exposures_path}")
        raise

In [14]:
# One row per key of or_estimate_dicts; the fields of each result dict become
# the columns.
or_estimates_df = pd.DataFrame.from_dict(
    data=or_estimate_dicts,
    orient='index',
)

In [15]:
# Save the estimates to the bucket as computed, before the small-count flag,
# rounding and ClinVar fill applied in the following cells.
raw_estimates_path = f'{BUCKET}/or-estimates/raw-or-estimates-{CLASSES_VERSION}.parquet'
or_estimates_df.to_parquet(raw_estimates_path)

In [16]:
# Flag rows backed by fewer than five cases carrying a variant
case_carrier_counts = or_estimates_df['Cases with variants']
or_estimates_df['Few samples'] = case_carrier_counts.lt(5)

In [17]:
# Round population count up to a multiple of 20
count_columns = [
    'Cases with variants',
    'Controls with variants',
    'Cases without variants',
    'Controls without variants',
]
# Ceiling division via negated floor division: -(-n // 20) == ceil(n / 20)
or_estimates_df[count_columns] = -(-or_estimates_df[count_columns] // 20) * 20

In [18]:
# Fill missing ClinVar classes as zero-counts:
# every column whose name contains 'ClinVar' gets NaN -> 0 and is then
# downcast to the smallest integer dtype that fits.
clinvar_columns = or_estimates_df.columns[or_estimates_df.columns.str.contains('ClinVar')]
clinvar_counts = or_estimates_df[clinvar_columns].fillna(0)
or_estimates_df[clinvar_columns] = clinvar_counts.apply(pd.to_numeric, downcast='integer')

In [19]:
# Name the four index levels after the components of the result keys
or_estimates_df.index.names = ['Dataset', 'Gene', 'Classifier', 'Classification']

In [20]:
# Display the processed estimates table for a visual check before saving
or_estimates_df

In [21]:
# Save a local copy of the processed estimates (reloaded by a later cell)
local_estimates_path = f'or-estimates-{CLASSES_VERSION}.parquet'
or_estimates_df.to_parquet(local_estimates_path)

In [22]:
# Save the processed estimates to the workspace bucket
bucket_estimates_path = f'{BUCKET}/or-estimates/or-estimates-{CLASSES_VERSION}.parquet'
or_estimates_df.to_parquet(bucket_estimates_path)

In [23]:
# Deliberate stop: raising here keeps the cells below from being executed
# unintentionally; they are meant to be reviewed and run by hand.
raise RuntimeError("Review below cells before running")

In [None]:
# Reload the locally saved processed estimates for the current CLASSES_VERSION
or_estimates_df = pd.read_parquet(
    path=f'or-estimates-{CLASSES_VERSION}.parquet',
)

In [None]:
# Display the reloaded table
or_estimates_df

In [None]:
# Show which columns are present in the reloaded table
or_estimates_df.columns

In [None]:
# Recompute the gene-level annotation columns from the 'Gene' index level.
gene_per_row = or_estimates_df.index.to_frame()['Gene']

or_estimates_df['ENSG'] = gene_per_row.map(ensg_map)

# First phenotype set of each gene: case inclusion phenotypes
case_phenotypes_by_gene = {
    gene_symbol: list(phenotype_sets[0])
    for gene_symbol, phenotype_sets in data.gene_phenotypes.items()
}
or_estimates_df['Case inclusion phenotypes'] = gene_per_row.map(case_phenotypes_by_gene)

# Second phenotype set of each gene: control exclusion phenotypes
control_phenotypes_by_gene = {
    gene_symbol: list(phenotype_sets[1])
    for gene_symbol, phenotype_sets in data.gene_phenotypes.items()
}
or_estimates_df['Control exclusion phenotypes'] = gene_per_row.map(control_phenotypes_by_gene)

or_estimates_df

In [None]:
# Move the index levels into ordinary columns.
# Not idempotent: run this cell once per reload of the table.
or_estimates_df = or_estimates_df.reset_index()

In [None]:
# Make classification styling consistent:
#  - labels in the verbatim set are left untouched,
#  - labels with an explicit replacement (acronym-bearing ones) are mapped,
#  - everything else has underscores turned into spaces and is capitalized.
explicit_map = {
    'possiblyLOF': 'Possibly LOF',
    'possiblyWT': 'Possibly WT',
    'moderate LOF': 'Moderate LOF',
    'severe LOF': 'Severe LOF'
}

raw_labels = or_estimates_df['Classification']

keep_verbatim = raw_labels.isin({
    'FUNC', 'INT', 'LOF', 'AF < 0.001', 'GOF', 'WT',
    'possiblyLOF', 'possiblyWT', 'moderate LOF', 'severe LOF'
})
prettified_labels = raw_labels.str.replace('_', ' ').str.capitalize()
has_explicit_label = raw_labels.isin(explicit_map)
explicit_labels = raw_labels.map(explicit_map)

or_estimates_df['Classification'] = (
    raw_labels
    .where(keep_verbatim, prettified_labels)
    .mask(has_explicit_label, explicit_labels)
)

In [None]:
## Save estimates
# Writes the restyled table to the bucket's WIP folder as gzipped CSV, without
# the row index.
# NOTE(review): the file name carries a fixed 2025-06-09 date rather than
# CLASSES_VERSION — confirm this is the intended target before running, since
# it would overwrite any existing file of that name.

or_estimates_df.to_csv(f'{BUCKET}/WIP/or-estimates_2025-06-09.csv.gz', index=False)

In [None]:
## Local save
# Same table as gzipped CSV in the working directory, without the row index.
# The file name carries a fixed 2025-06-09 date (not CLASSES_VERSION).

or_estimates_df.to_csv(
    'or-estimates_2025-06-09.csv.gz',
    index=False,
)

In [None]:
# Display the final restyled table
or_estimates_df