In [19]:
# Common imports and constants
import os
import pandas as pd

from cvfgaou import data

# Path prefix taken from the WORKSPACE_BUCKET environment variable; the parquet
# reads and writes below are rooted at it. Indexing os.environ raises KeyError
# when the variable is not set.
BUCKET = os.environ["WORKSPACE_BUCKET"]

In [20]:
# Read the three odds-ratio estimate tables (assays, predictors, MP2) in order and
# move whatever index each parquet file carries back into ordinary columns.
assays_or_df, veps_or_df, mp2_or_df = (
    pd.read_parquet(parquet_path).reset_index()
    for parquet_path in (
        f'{BUCKET}/or-estimates/or-estimates-2026-01-14.parquet',
        f'{BUCKET}/or-estimates/or-estimates-2026-01-19_predictors.parquet',
        f'{BUCKET}/or-estimates/or-estimates-2026-01-14_mp2.parquet',
    )
)

In [21]:
# The assays table carries a 'Measures splicing' column that is not used here; remove it.
assays_or_df = assays_or_df.drop(columns=['Measures splicing'])

In [22]:
# Harmonize MP2 levels: relabel the citation-specific classifier name with the
# generic 'Calibrated (genome-wide)' label. Values not in the mapping are left as-is.
mp2_classifier_renames = {
    'Calibrated (Bergquist et al. 10.1016/j.gim.2025.101402)': 'Calibrated (genome-wide)',
}
mp2_or_df['Classifier'] = mp2_or_df['Classifier'].replace(mp2_classifier_renames)

In [23]:
# Stack the three tables row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
or_frames = [assays_or_df, veps_or_df, mp2_or_df]
collated_or_df = pd.concat(or_frames, ignore_index=True)

In [24]:
# Attach CDR version: every row gets the constant 'v8' in a new 'AoU CDR' column.
# The column name contains a space, so it is passed to assign() via dict unpacking.
collated_or_df = collated_or_df.assign(**{'AoU CDR': 'v8'})

In [25]:
# Mark rows with a large interval as unpowered: 'Powered' is True only when
# LogOR_UI - LogOR_LI is strictly below 100. A NaN width compares False, so such
# rows are marked unpowered as well.
logor_interval_width = collated_or_df['LogOR_UI'].sub(collated_or_df['LogOR_LI'])
collated_or_df['Powered'] = logor_interval_width.lt(100)

In [26]:
# Erase small-sample info by removing the 'Few samples' column.
collated_or_df = collated_or_df.drop(columns=['Few samples'])

In [27]:
# Harmonization of column values

In [28]:
# It looks like there is a "calibrated" set of scores for TP53_Fayer_2021_Meta now, so the
# 'OP_points_18_25' classifier rows are not used and are filtered out here.
# .copy() makes the filtered result an independent frame, so later column assignments on
# collated_or_df do not trigger SettingWithCopyWarning.
keep_rows = collated_or_df['Classifier'] != 'OP_points_18_25'
collated_or_df = collated_or_df.loc[keep_rows].copy()

In [32]:
# Give each calibration method its publication name. Classifier values that are
# not keys of the mapping pass through unchanged.
calibration_method_names = {
    "Calibrated (2025-12-25)": "ExCALIBR",
    "Calibrated (genome-wide)": "Gene-aggregated calibration",
    "Calibrated (gene-specific)": "Gene-specific calibration",
    "Calibrated (cluster-based)": "Cluster-aggregated calibration",
}
collated_or_df['Classifier'] = collated_or_df['Classifier'].replace(calibration_method_names)

In [33]:
# Show how many rows carry each classifier label after the renaming steps.
classifier_counts = collated_or_df['Classifier'].value_counts()
classifier_counts

In [34]:
# Save combined table (still including the 'Powered' flag and unpowered rows),
# indexed by dataset, gene, classifier and classification.
publication_index_columns = ['Dataset', 'Gene', 'Classifier', 'Classification']
publication_or_df = collated_or_df.set_index(publication_index_columns)
publication_or_df.to_parquet(f'{BUCKET}/or-estimates/publication-or-estimates-2026-01-20.parquet')

In [35]:
# Keep only the rows whose 'Powered' flag is True.
is_powered = collated_or_df['Powered']
collated_or_df = collated_or_df.loc[is_powered]

In [36]:
# Every remaining row is powered, so the flag column carries no information; drop it.
# The result is rebound instead of using inplace=True: collated_or_df is the product of
# a boolean filter at this point, and an in-place drop on such a frame can emit
# SettingWithCopyWarning.
collated_or_df = collated_or_df.drop(columns=['Powered'])

In [37]:
# Save file for portal. The row index is not written, and gzip compression is
# inferred by pandas from the '.gz' suffix.
# NOTE(review): this is a relative path, so the file is written to the kernel's working
# directory rather than under BUCKET — confirm that is intended.
collated_or_df.to_csv('all-or-estimates_2026-01-20.csv.gz', index=False)