In [1]:
# Common imports and constants
import os
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
from cvfgaou import aou, odds, gctools, data

# Workspace settings are read from environment variables; a missing variable raises KeyError here.
DATASET = os.environ["WORKSPACE_CDR"]  # not referenced by the cells visible in this notebook
BUCKET = os.environ["WORKSPACE_BUCKET"]  # prefix of the bucket paths that are built in Python
DATA_DIR = f'{BUCKET}/data_v2'  # holds ancestry_pca.tsv.gz and demo.tsv.gz; also handed to the cohort loader
CLASSES_VERSION = '2025-12-11_predictors' # Names the local classes_<version> directory and the saved parquet files

In [2]:
# Per-gene case/control pools
# The pool file names are derived from the phenotype labels that the project
# data lists for each gene.

pool_files_by_gene = {
    gene_symbol: (
        {f'{label} case pool.tsv.gz' for label in case_labels},
        {f'{label} control pool.tsv.gz' for label in control_labels},
    )
    # Each entry unpacks into three parts; the third is not needed here
    for gene_symbol, (case_labels, control_labels, _) in data.gene_phenotypes.items()
}

ancestry_table = pd.read_table(f'{DATA_DIR}/ancestry_pca.tsv.gz', index_col='research_id')
demographics_table = pd.read_table(f'{DATA_DIR}/demo.tsv.gz', index_col='person_id')

cohorts = aou.CohortLoader(
    gene_cohort_map=pool_files_by_gene,
    ancestry_df=ancestry_table,
    demo_df=demographics_table,
    data_dir=DATA_DIR,
)
    

In [3]:
# List the top level of the workspace bucket.
!gsutil ls $WORKSPACE_BUCKET/

In [4]:
!mkdir -p classes_2025-12-11_predictors
!gsutil -m cp -r $WORKSPACE_BUCKET/classes_2025-11-11/* classes_2025-12-11_predictors/

In [5]:
ls classes_2025-12-11_predictors/exposures

In [2]:
# Spot-check one staged exposure table (revel_BARD1).
# The directory is derived from CLASSES_VERSION so this inspects the same local
# copy that the analysis loop reads.
test = pd.read_parquet(f'classes_{CLASSES_VERSION}/exposures/revel_BARD1.parquet')
test

In [4]:
# Estimate odds ratios for every (dataset, gene, classifier, classification)
# group found in the staged exposure tables.
#
# Names defined by this cell:
#   common_dir        - local directory holding the exposures / af_maps /
#                       clinvar_maps tables
#   or_estimate_dicts - maps (dataset, gene, classifier, classification) to the
#                       merged dictionaries returned by odds.estimate_logOR and
#                       odds.collect_variant_stats, plus the auxiliary annotations
#
# Any failure is reported with the exposure file being processed and re-raised.
common_dir = Path(f'classes_{CLASSES_VERSION}')

# Hard code the splicing filter for now.
# The threshold is only recorded as an annotation; this cell does not apply it.
SPLICEAI_FILTER_MAX = 0.2
SPLICE_CARRIERS_DIR = f'{BUCKET}/spliceAI_2025-12-10'

or_estimate_dicts = {}

# Splice-carrier IDs per gene (None when the carrier table is empty). The table
# depends only on the gene, so it is read from the bucket once per gene rather
# than once per classification group.
splice_carrier_ids = {}

# Sorted so the processing order, and therefore the row order of the results, is
# the same on every run; iterdir() alone yields entries in arbitrary order.
exposure_paths = sorted((common_dir / 'exposures').iterdir())

for exposures_path in tqdm(exposure_paths):

    try:
        # The allele-frequency and ClinVar maps share the exposure table's file name
        af_map_path = common_dir / 'af_maps' / exposures_path.name
        clinvar_map_path = common_dir / 'clinvar_maps' / exposures_path.name

        clinvar_big_df = pd.read_parquet(clinvar_map_path)
        af_map = pd.read_parquet(af_map_path)

        for (dataset, gene, classifier, classification), exposure_df in tqdm(
            pd.read_parquet(exposures_path).groupby(
                ['Dataset', 'Gene', 'Classifier', 'Classification'],
                group_keys=True
            )
        ):
            if gene not in cohorts.gene_cohort_map:
                print(f'{gene} has no cohort definition')
                continue

            # Grab cohort
            gene_cohort = cohorts.gene_cohort(gene)

            # Filter splicing: drop cohort members listed as splice carriers
            if gene not in splice_carrier_ids:
                splice_carriers = pd.read_parquet(
                    f'{SPLICE_CARRIERS_DIR}/splice_carriers_{gene}.parquet'
                )
                splice_carrier_ids[gene] = (
                    None if splice_carriers.empty
                    else pd.to_numeric(splice_carriers.s, downcast='integer')
                )

            if splice_carrier_ids[gene] is not None:
                gene_cohort = gene_cohort[~gene_cohort.index.isin(splice_carrier_ids[gene])]

            aux_info_dict = {
                'SpliceAI filter max': SPLICEAI_FILTER_MAX,
                'Case inclusion phenotypes': list(data.gene_phenotypes[gene][0]),
                'Control exclusion phenotypes': list(data.gene_phenotypes[gene][1])
            }

            # The real work
            # ClinVar significance per variant, restricted to the current group
            clinvar_class_map = clinvar_big_df[
                (clinvar_big_df['Dataset'] == dataset) &
                (clinvar_big_df['Gene'] == gene) &
                (clinvar_big_df['Classifier'] == classifier) &
                (clinvar_big_df['Classification'] == classification)
            ].set_index('Variant')['Clinvar significance']

            # Converted once and shared by both estimators
            exposed_person_ids = pd.to_numeric(exposure_df.person_id, downcast='integer')

            or_estimate_dicts[
                (dataset, gene, classifier, classification)
            ] = odds.estimate_logOR(
                exposed_person_ids,
                gene_cohort
            ) | odds.collect_variant_stats(
                exposed_person_ids,
                gene_cohort,
                exposure_df.variants,
                af_map['AF'],
                clinvar_class_map
            ) | aux_info_dict

    except Exception:
        # Name the offending file, then let the original error propagate
        print(f"Error on {exposures_path}")
        raise

In [5]:
# Assemble the results table: one row per key of or_estimate_dicts, with the
# entries of each inner dictionary as the columns.
or_estimates_df = pd.DataFrame.from_dict(or_estimate_dicts, orient='index')

In [11]:
# Mark the rows in which fewer than five cases carry a variant
or_estimates_df['Few samples'] = or_estimates_df['Cases with variants'].lt(5)

In [12]:
# Raise each of the four population counts to the next multiple of 20.
# Floor-dividing the negated counts and flipping the sign back rounds upwards.
count_columns = [
    'Cases with variants',
    'Controls with variants',
    'Cases without variants',
    'Controls without variants',
]
negated_counts = -or_estimates_df[count_columns]
or_estimates_df[count_columns] = -(negated_counts // 20) * 20

In [13]:
# ClinVar classes that are missing from a row count as zero.
# Applies to every column whose name contains 'ClinVar'; the filled columns are
# then downcast to the smallest integer type that fits.
clinvar_columns = or_estimates_df.columns[or_estimates_df.columns.str.contains('ClinVar')]
filled_clinvar_counts = or_estimates_df[clinvar_columns].fillna(0)
or_estimates_df[clinvar_columns] = filled_clinvar_counts.apply(
    lambda column: pd.to_numeric(column, downcast='integer')
)

In [14]:
# Label the four index levels
or_estimates_df.index.names = ['Dataset', 'Gene', 'Classifier', 'Classification']

In [15]:
# Inspect the assembled estimates table
or_estimates_df

In [16]:
# Write a versioned copy to the notebook's working directory
local_parquet_path = f'or-estimates-{CLASSES_VERSION}.parquet'
or_estimates_df.to_parquet(local_parquet_path)

In [17]:
# Write a versioned copy under the workspace bucket
bucket_parquet_path = f'{BUCKET}/or-estimates/or-estimates-{CLASSES_VERSION}.parquet'
or_estimates_df.to_parquet(bucket_parquet_path)

In [6]:
# Make-up annotations
# Adds the phenotype annotation columns to an estimates table, looked up per gene
# from data.gene_phenotypes.
# Gene symbols are taken from the 'Gene' column when there is one and from the
# 'Gene' index level otherwise, so the cell works on both table layouts.

if 'Gene' in or_estimates_df.columns:
    gene_symbols = or_estimates_df['Gene']
else:
    gene_symbols = pd.Series(
        or_estimates_df.index.get_level_values('Gene'),
        index=or_estimates_df.index
    )

or_estimates_df['Case inclusion phenotypes'] = gene_symbols.map(lambda g: list(data.gene_phenotypes[g][0]))
or_estimates_df['Control exclusion phenotypes'] = gene_symbols.map(lambda g: list(data.gene_phenotypes[g][1]))

# TODO(review): the ENSG column was previously filled with `.map()` called without
# a mapping, which always raises TypeError. No gene -> ENSG lookup is visible in
# this notebook, so the column is left unset until one is supplied, e.g.:
#     or_estimates_df['ENSG'] = gene_symbols.map(gene_to_ensg)
or_estimates_df

In [None]:
# Deliberate stop: execution halts here so that the cells below are only run after manual review.
raise RuntimeError("Review below cells before running")

In [16]:
# Move the index levels into ordinary columns.
# Guarded so that re-running the cell is harmless: calling reset_index() again on
# the already-flat table would insert a spurious 'index' column.
if not isinstance(or_estimates_df.index, pd.RangeIndex):
    or_estimates_df.reset_index(inplace=True)

In [17]:
# Harmonise the classification labels.
# Generic rule: underscores become spaces and only the first letter is upper-case.
# Labels containing acronyms are exempt from that rule; a few of them get an
# explicit replacement instead.
explicit_map = {
    'possiblyLOF': 'Possibly LOF',
    'possiblyWT': 'Possibly WT',
    'moderate LOF': 'Moderate LOF',
    'severe LOF': 'Severe LOF'
}

# Labels the generic rule must not touch
exempt_labels = {
    'FUNC', 'INT', 'LOF', 'AF < 0.001', 'GOF', 'WT',
    'possiblyLOF', 'possiblyWT', 'moderate LOF', 'severe LOF'
}

raw_labels = or_estimates_df['Classification']
is_exempt = raw_labels.isin(exempt_labels)
generic_labels = raw_labels.str.replace('_', ' ').str.capitalize()

# Both lookups below use the labels as they were before any restyling
has_explicit_label = raw_labels.isin(explicit_map)
explicit_labels = raw_labels.map(explicit_map)

or_estimates_df['Classification'] = (
    raw_labels
    .where(is_exempt, generic_labels)
    .mask(has_explicit_label, explicit_labels)
)

In [22]:
## Save estimates
# The file name carries CLASSES_VERSION, like the parquet outputs, instead of a
# fixed date, so a new run cannot overwrite the export of an earlier version.

or_estimates_df.to_csv(f'{BUCKET}/WIP/or-estimates_{CLASSES_VERSION}.csv.gz', index=False)

In [20]:
## Local save
# The file name carries CLASSES_VERSION, like the parquet outputs, instead of a
# fixed date, so a new run cannot overwrite the export of an earlier version.

or_estimates_df.to_csv(f'or-estimates_{CLASSES_VERSION}.csv.gz', index=False)

In [21]:
# Inspect the table as it was exported
or_estimates_df