In [1]:
# Common imports and constants
import os
import pandas as pd

from cvfgaou import data

# Path prefix for the bucket reads and writes in this notebook, taken from the
# WORKSPACE_BUCKET environment variable. Raises KeyError if it is not set.
BUCKET = os.environ["WORKSPACE_BUCKET"]

In [2]:
# Load the two OR-estimate tables (assay file and predictor file) from the
# bucket; reset_index() turns whatever index each file was stored with back
# into ordinary columns.
assays_or_df = pd.read_parquet(
    f'{BUCKET}/or-estimates/or-estimates-2025-12-20.parquet'
).reset_index()
veps_or_df = pd.read_parquet(
    f'{BUCKET}/or-estimates/or-estimates-2025-12-11_predictors.parquet'
).reset_index()

In [3]:
# Drop unused assays column.
# Rebinds the name to the frame returned by drop() instead of mutating with
# inplace=True; a missing 'Measures splicing' column still raises KeyError.
assays_or_df = assays_or_df.drop(columns='Measures splicing')

In [4]:
# Patch ENSG and phenotype column maps for VEP frame

# Grab map of ENSG IDs for annotation. Only the three needed columns are read,
# all as strings, and exact duplicate rows are removed.
ensg_df = pd.read_table(
    f'{BUCKET}/aux_data/gene_metadata.txt',
    usecols=['Gene stable ID version', 'Chromosome/scaffold name', 'Gene name'],
    dtype=str
).drop_duplicates()

# Drop entries located on scaffolds whose name ends in 'PATCH'.
# NOTE(review): any other non-chromosome scaffold names are kept by this
# filter — confirm that is intended.
ensg_df = ensg_df[~ensg_df['Chromosome/scaffold name'].str.endswith('PATCH')]

# Restrict to the genes listed in data.gene_phenotypes, then build the
# gene name -> versioned ENSG ID lookup as a Series indexed by gene name.
ensg_filtered_df = ensg_df[ensg_df['Gene name'].isin(data.gene_phenotypes)]
ensg_map = ensg_filtered_df.set_index('Gene name')['Gene stable ID version']

# Series.map() cannot look values up in a Series with repeated index labels,
# so fail here with a readable message rather than inside the map call below.
assert ensg_map.index.is_unique, (
    'Multiple ENSG IDs found for gene(s): '
    f'{sorted(set(ensg_map.index[ensg_map.index.duplicated()]))}'
)

# Attach
# Genes without an entry in ensg_map get a missing value in 'ENSG'.
veps_or_df['ENSG'] = veps_or_df['Gene'].map(ensg_map)
# Element 0 of a gene's data.gene_phenotypes entry supplies the case inclusion
# phenotypes and element 1 the control exclusion phenotypes; each is copied
# into a plain list per row.
veps_or_df['Case inclusion phenotypes'] = veps_or_df['Gene'].map(lambda gene: list(data.gene_phenotypes[gene][0]))
veps_or_df['Control exclusion phenotypes'] = veps_or_df['Gene'].map(lambda gene: list(data.gene_phenotypes[gene][1]))

In [5]:
# Merge: stack the assay rows on top of the VEP rows. ignore_index=True
# discards both frames' row labels and numbers the result 0..n-1.
collated_or_df = pd.concat([assays_or_df, veps_or_df], ignore_index=True)

In [6]:
# Drop small samples: keep only rows whose 'Few samples' flag is False, then
# remove the flag column itself. Written as a single chained expression
# (no inplace=True) so the name is bound to a fresh frame in one step.
collated_or_df = (
    collated_or_df
    .loc[~collated_or_df['Few samples']]
    .drop(columns='Few samples')
)

In [7]:
# Attach CDR version: the constant 'v8' is broadcast to every row of a new
# 'AoU CDR' column.
# NOTE(review): the version is hardcoded — confirm it matches the release the
# input estimate files were computed on.
collated_or_df['AoU CDR'] = 'v8'

In [8]:
# Display the collated frame for a visual check before it is written out.
collated_or_df

In [9]:
# Save file for portal: written to the current working directory (relative
# path) without the row index; pandas infers gzip compression from the '.gz'
# suffix.
# NOTE(review): phenotype columns that hold lists are stored as their string
# representation in CSV — confirm the portal parses that format.
collated_or_df.to_csv('all-or-estimates_2025-12-30.csv.gz', index=False)

In [11]:
# Save file for workspace: the same table, indexed by
# (Dataset, Gene, Classifier, Classification), written as parquet to the bucket.
(
    collated_or_df
    .set_index(['Dataset', 'Gene', 'Classifier', 'Classification'])
    .to_parquet(f'{BUCKET}/or-estimates/all-or-estimates_2025-12-30.parquet')
)