In [1]:
# Common imports and constants
import os
import pandas as pd

from cvfgaou import data

# Path prefix for every remote read and write in this notebook.
# Indexing os.environ directly raises KeyError when WORKSPACE_BUCKET is unset.
BUCKET = os.environ["WORKSPACE_BUCKET"]

In [2]:
# Main tables: the assay estimates and the predictor (VEP) estimates.
# reset_index() moves whatever index the parquet files carry back into
# ordinary columns.
assays_or_df = pd.read_parquet(
    f'{BUCKET}/or-estimates/or-estimates-2026-02-05.parquet'
).reset_index()
veps_or_df = pd.read_parquet(
    f'{BUCKET}/or-estimates/or-estimates-2026-02-03_predictors.parquet'
).reset_index()

In [3]:
# The TSC2 functional annotations for the two-part dataset are taken from
# an earlier estimates file; its index is flattened into columns as well.
tsc2_breakdown_df = pd.read_parquet(
    f'{BUCKET}/or-estimates/or-estimates-2026-01-14.parquet'
).reset_index()

In [4]:
# Keep only the StandardizedClass rows of the two unpublished TSC2 datasets.
# .copy() detaches the selection so later cells can modify it freely.
in_tsc2_datasets = tsc2_breakdown_df['Dataset'].isin(
    ['TSC2_rapgap_unpublished', 'TSC2_tuberin_unpublished']
)
is_standardized_class = tsc2_breakdown_df['Classifier'] == 'StandardizedClass'
tsc2_df = tsc2_breakdown_df.loc[in_tsc2_datasets & is_standardized_class].copy()

In [5]:
# Update names: relabel the two TSC2 datasets with their IGVF names.
# A dict passed straight to .map() turns every label that is not a key into
# NaN, so re-running this cell would silently wipe the already-renamed
# labels. Mapping through .get(name, name) passes unknown labels through
# unchanged, which makes the rename safe to repeat.
tsc2_dataset_names = {
    'TSC2_rapgap_unpublished': 'TSC2_rapgap_IGVF',
    'TSC2_tuberin_unpublished': 'TSC2_tuberin_IGVF'
}
tsc2_df['Dataset'] = tsc2_df['Dataset'].map(
    lambda name: tsc2_dataset_names.get(name, name)
)

# Drop unused columns. Reassigning instead of inplace=True keeps the step a
# plain expression; a missing column still raises KeyError, as before.
tsc2_df = tsc2_df.drop(columns=['Measures splicing', 'Few samples'])

In [6]:
# Display the filtered TSC2 rows for a visual check
tsc2_df

In [7]:
# Display the assay table as loaded, before any column is dropped from it
assays_or_df

In [8]:
# Display the predictor table as loaded, before its classifier names are updated
veps_or_df

In [9]:
# 'Measures splicing' is unused from here on, so remove it from the assay table
assays_or_df = assays_or_df.drop(columns=['Measures splicing'])

In [10]:
# Update VEP classifier names.
# A dict passed straight to .map() replaces every value that is not a key
# with NaN. Any classifier missing from this table, or every classifier on a
# second run of the cell, would then lose its label, and value_counts()
# skips NaN by default so the loss would not show up in a count check.
# Mapping through .get(name, name) keeps unlisted values as they are.
vep_classifier_names = {
    'domain-aggregated calibration': 'Domain aggregation',
    'single-gene calibration': 'Gene-specific',
    'gene-aggregated calibration': 'Genome-wide aggregation'
}
veps_or_df['Classifier'] = veps_or_df['Classifier'].map(
    lambda name: vep_classifier_names.get(name, name)
)

In [11]:
# Stack the assay, TSC2 and predictor tables into one frame.
# ignore_index=True discards the per-table row labels and renumbers the rows.
collated_or_df = pd.concat(
    objs=[assays_or_df, tsc2_df, veps_or_df],
    ignore_index=True,
)

In [12]:
# Attach the CDR version label to every row
collated_or_df = collated_or_df.assign(**{'AoU CDR': 'v8'})

In [13]:
# Flag a row as powered when its interval width (LogOR_UI - LogOR_LI) is
# below 20; rows with wider intervals are marked False, i.e. unpowered.
interval_width = collated_or_df['LogOR_UI'].sub(collated_or_df['LogOR_LI'])
collated_or_df['Powered'] = interval_width.lt(20)

In [14]:
# Flag rows whose Gene appears in data.p3_genes (the publication genes)
collated_or_df = collated_or_df.assign(
    Publication=collated_or_df['Gene'].isin(data.p3_genes)
)

In [15]:
# Number of rows per classifier label in the merged table
classifier_counts = collated_or_df['Classifier'].value_counts()
classifier_counts

In [16]:
# Save the combined table, indexed by the four identifying columns.
# This file is written before the row filter, so it keeps every row.
key_columns = ['Dataset', 'Gene', 'Classifier', 'Classification']
collated_or_df.set_index(key_columns).to_parquet(
    f'{BUCKET}/or-estimates/publication-or-estimates-2026-02-13.parquet'
)

In [17]:
# Keep only rows that are both powered and flagged as publication genes
keep_row = collated_or_df['Powered'] & collated_or_df['Publication']
collated_or_df = collated_or_df.loc[keep_row]

In [18]:
# The two flag columns have served their purpose in the row filter; remove them
collated_or_df = collated_or_df.drop(columns=['Powered', 'Publication'])

In [19]:
# Save the portal file twice: once next to the notebook and once in the
# bucket. The row index is left out of the CSV.
for destination in (
    'all-or-estimates_2026-02-13.csv.gz',
    f'{BUCKET}/or-estimates/all-or-estimates_2026-02-13.csv.gz',
):
    collated_or_df.to_csv(destination, index=False)