In [None]:
import pandas as pd
import numpy as np
import os

# 1. Input locations: the raw OAI ASCII export and the processed, model-ready cohort table
DATA_DIR = '../data/OAICompleteData_ASCII'
PARQUET_PATH = '../data/processed/OAI_model_ready_data.parquet'
# The Labcorp file is the chosen biospecimen source: 600 patients (vs. 129) and correct IDs
BIOSPEC_PATH = DATA_DIR + "/Biospec_FNIH_Labcorp00.txt"

# 2. Read the main cohort and report its (rows, columns) shape
df_cohort = pd.read_parquet(PARQUET_PATH)
print("Main Cohort: {} (Knees)".format(df_cohort.shape))

# --- SELECT SPECIFIC BIOMARKERS ---
# Maps raw column names in the biospecimen file to the feature names used downstream.
# 'V00' indicates the Baseline visit (matching our X-rays).
# NOTE(review): '_lc' presumably refers to LabCorp (the source file is the Labcorp
# export) rather than a unit -- confirm against the OAI data dictionary.
target_markers = {
    'V00Serum_Comp_lc': 'Bio_COMP',     # Cartilage Oligomeric Matrix Protein (Cartilage breakdown)
    'V00Serum_CTXI_lc': 'Bio_CTXI',     # C-Telopeptide of Type I Collagen (Bone resorption)
    'V00Serum_HA_lc':   'Bio_HA',       # Hyaluronic Acid (Inflammation/Synovitis)
    'V00Serum_C2C_lc':  'Bio_C2C',      # Collagen Type II Cleavage (Cartilage degradation)
    'V00Serum_CPII_lc': 'Bio_CPII',     # Procollagen Type II C-Propeptide (Cartilage synthesis)
}


def prepare_biomarkers(df_bio, markers):
    """Return a one-row-per-ID table of numeric, median-imputed biomarkers.

    Args:
        df_bio: Raw biomarker table; must contain 'ID' and every key of *markers*.
        markers: Mapping of raw column name -> output feature name.

    Returns:
        DataFrame with 'ID' plus the renamed marker columns. Non-numeric cells are
        coerced to NaN and then filled with that column's median.

    Raises:
        KeyError: If 'ID' or any requested marker column is absent from *df_bio*.
    """
    required = ['ID'] + list(markers)
    missing = [name for name in required if name not in df_bio.columns]
    if missing:
        raise KeyError(f"Biomarker file is missing expected column(s): {missing}")

    marker_cols = list(markers.values())
    bio = df_bio[required].rename(columns=markers)
    bio[marker_cols] = bio[marker_cols].apply(pd.to_numeric, errors='coerce')

    # A repeated ID would fan out the left merge and duplicate knees, so collapse
    # repeats to a single row per patient (mean of the available measurements).
    n_repeated = int(bio['ID'].duplicated().sum())
    if n_repeated:
        print(f"WARNING: {n_repeated} repeated ID row(s) in biomarker data; averaging per ID.")
        bio = bio.groupby('ID', as_index=False, sort=False)[marker_cols].mean()

    # Impute gaps inside the biomarker subset with each marker's own median.
    bio[marker_cols] = bio[marker_cols].fillna(bio[marker_cols].median())
    return bio


def fuse_biomarkers(cohort, bio, marker_cols):
    """Left-join biomarkers onto the cohort, impute the gaps, and z-score.

    Args:
        cohort: Main cohort table with an 'ID' column; every row is kept.
        bio: Output of ``prepare_biomarkers`` (unique 'ID' values).
        marker_cols: Names of the biomarker columns in *bio* to impute and scale.

    Returns:
        A new DataFrame with the cohort columns plus the standardized marker columns.

    Raises:
        ValueError: If no cohort ID is present in *bio* (a typical symptom of an
            ID dtype/format mismatch), since every feature would be pure imputation.
        pandas.errors.MergeError: If *bio* still has repeated IDs.
    """
    n_matched = int(cohort['ID'].isin(bio['ID']).sum())
    if n_matched == 0:
        raise ValueError(
            "No cohort ID matches the biomarker table; check that 'ID' has the same "
            "dtype and format in both sources."
        )
    print(f"Biomarker coverage: {n_matched}/{len(cohort)} cohort rows have a biomarker "
          f"record; the remainder are median-imputed.")

    # validate= guarantees the join cannot multiply cohort rows.
    merged = pd.merge(cohort, bio, on='ID', how='left', validate='many_to_one')

    for col in marker_cols:
        # Rows outside the biomarker sub-study receive the merged-column median.
        filled = merged[col].fillna(merged[col].median())

        # Z-score. A column with at most one distinct value has a zero (or NaN)
        # standard deviation; dividing by it would fill the feature with NaN/inf.
        # nunique() is used instead of std() == 0 to be immune to float round-off.
        if filled.nunique() <= 1:
            print(f"WARNING: {col} has no variance after imputation; setting it to 0.")
            merged[col] = 0.0
        else:
            merged[col] = (filled - filled.mean()) / filled.std()

    return merged


try:
    # 3. Load the Biomarker Data ('warn' still skips malformed lines but reports them)
    df_bio = pd.read_csv(BIOSPEC_PATH, sep='|', on_bad_lines='warn')
    print(f"Biomarker Data Loaded: {df_bio.shape} (Rows)")

    # 4. Select, rename, clean & impute (within the bio-subset first)
    df_bio_clean = prepare_biomarkers(df_bio, target_markers)
    clean_cols = list(target_markers.values())
    print(f"Selected {len(target_markers)} markers for fusion.")

    # 5./6. Merge with the main cohort (all knees kept), impute globally, normalize
    df_tri_modal = fuse_biomarkers(df_cohort, df_bio_clean, clean_cols)

    print(f"\nNew Tri-Modal Cohort Shape: {df_tri_modal.shape}")
    print(df_tri_modal[['ID'] + clean_cols].head())

    # 7. Save Final Tri-Modal Dataset
    OUTPUT_PATH = '../data/processed/OAI_tri_modal_real.parquet'
    df_tri_modal.to_parquet(OUTPUT_PATH, index=False)
    print(f"\nSUCCESS: Real Tri-Modal Data saved to {OUTPUT_PATH}")

except Exception as e:
    # Best-effort cell: report the failure without aborting, but show the full
    # traceback so the failing step can be located.
    import traceback

    print(f"ERROR: {e}")
    traceback.print_exc()