In [None]:
import pandas as pd
import numpy as np

# Root folder that holds the OAI pipe-delimited ASCII exports
DATA_DIR = '../data/OAICompleteData_ASCII'

# --- 1. Define File Paths ---
# Each path is DATA_DIR plus a forward slash and the file name.

# Enrollee list: primary key / demographics
enrollees_path = DATA_DIR + "/Enrollees.txt"

# Outcomes file: source of the survival target (TKR)
outcomes_path = DATA_DIR + "/OUTCOMES99.txt"

# Modality 1: baseline knee X-ray reading labels
xray_labels_path = DATA_DIR + "/KXR_SQ_BU00.txt"

# Modality 2: baseline clinical features
clinical_path = DATA_DIR + "/AllClinical00.txt"

# Modality 3: baseline biomarker features
biomarkers_path = DATA_DIR + "/Biomarkers00.txt"

print("File paths defined. Ready to load data.")

In [None]:
# --- 2. Load the DataFrames ---
# The source files are pipe-delimited text, hence sep='|'.


def _read_oai_table(path):
    """Read one pipe-delimited text file into a DataFrame.

    Malformed rows are still skipped, but on_bad_lines='warn' makes pandas
    emit a warning for each one instead of discarding it silently, so any
    data loss at load time is visible in the cell output.
    """
    return pd.read_csv(path, sep='|', on_bad_lines='warn')


try:
    df_enrollees = _read_oai_table(enrollees_path)
    df_outcomes = _read_oai_table(outcomes_path)
    df_xray_labels = _read_oai_table(xray_labels_path)
    df_clinical = _read_oai_table(clinical_path)
    df_biomarkers = _read_oai_table(biomarkers_path)
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check the file paths and ensure the files are not empty.")
    # Every later cell needs all five frames. Re-raise so execution stops here
    # with the real traceback instead of failing later with a NameError.
    raise

# Print the shape and column list of each loaded table.
for heading, frame in (
    ("1. Enrollees (Master List)", df_enrollees),
    ("2. Outcomes (Survival Target)", df_outcomes),
    ("3. X-Ray Labels (Modality 1)", df_xray_labels),
    ("4. Clinical Features (Modality 2)", df_clinical),
    ("5. Biomarkers (Modality 3)", df_biomarkers),
):
    print(f"--- {heading} ---")
    print(f"Shape: {frame.shape}")
    print(f"Columns: {frame.columns.tolist()}\n")

print("\nAll files loaded successfully.")

In [None]:
# --- 3. Standardize Keys & Inspect Key Columns ---

# 1. Standardize the patient ID column in the Outcomes table.
# The later merges join on 'ID' (uppercase), so a lowercase 'id' is renamed.
# DataFrame.rename silently ignores labels that are absent, so the column is
# checked explicitly and the message reports what actually happened.
if 'ID' in df_outcomes.columns:
    print("df_outcomes already has an 'ID' column; nothing to rename.\n")
elif 'id' in df_outcomes.columns:
    # Rebind instead of inplace=True so re-running the cell is harmless.
    df_outcomes = df_outcomes.rename(columns={'id': 'ID'})
    print("Standardized 'id' -> 'ID' in df_outcomes.\n")
else:
    print("WARNING: neither 'id' nor 'ID' found in df_outcomes; merging on 'ID' will fail.\n")


def _inspect_xray_column(column, heading, sort_counts=False):
    """Print the distinct values and value counts of one df_xray_labels column.

    column      -- column name to look up in df_xray_labels
    heading     -- text shown after 'Inspecting' in the section banner
    sort_counts -- when True, order the counts by value (sort_index) rather
                   than by frequency and label them '(sorted)'
    A missing column is reported with a message instead of raising KeyError.
    """
    print(f"--- Inspecting {heading} ---")
    if column not in df_xray_labels.columns:
        print(f"Column '{column}' not found in df_xray_labels!\n")
        return
    counts = df_xray_labels[column].value_counts()
    counts_label = "Value Counts"
    if sort_counts:
        counts = counts.sort_index()
        counts_label = "Value Counts (sorted)"
    print(f"Values: {df_xray_labels[column].unique()}")
    print(f"{counts_label}:\n{counts}\n")


# 2. 'SIDE': find out how right/left are coded (likely 1=Right, 2=Left).
_inspect_xray_column('SIDE', "X-Ray 'SIDE'")

# 3. 'V00XRKL': baseline KL grade, the primary image feature.
#    Author's key: 0=Healthy, 1=Doubtful, 2=Minimal, 3=Moderate, 4=Severe
_inspect_xray_column('V00XRKL', "'V00XRKL' (Baseline KL Grade)", sort_counts=True)

# 4. 'READPRJ': reading project. The table holds multiple reads per knee,
#    so one project has to be chosen downstream.
_inspect_xray_column('READPRJ', "'READPRJ' (Reader Project)")

In [None]:
# --- 4. Clean the X-Ray Labels DataFrame ---


def _leading_code(labels):
    """Return the numeric code that precedes ':' in 'code: label' strings.

    Examples: '1: Right' -> 1, '2: 2' -> 2.  The text before the first colon
    is parsed as a number. Any prefix that is not numeric (for instance the
    '.' of '.: Missing Form/Incomplete Workbook') becomes NaN instead of
    raising ValueError, so every missing-value label is handled, not just one
    hard-coded string.  A column that is already numeric is returned as-is,
    because the .str accessor would fail on it.
    """
    if pd.api.types.is_numeric_dtype(labels):
        return labels
    return pd.to_numeric(labels.str.split(':').str[0], errors='coerce')


# 1-2. Work on a copy restricted to the main reading project (READPRJ == 15);
#      df_xray_labels itself is left untouched.
df_xray_labels_clean = df_xray_labels[df_xray_labels['READPRJ'] == 15].copy()
print(f"Filtered by READPRJ=15. New shape: {df_xray_labels_clean.shape}")

# 3. Clean 'V00XRKL' (KL grade): numeric code, NaN where the label is a
#    missing-value marker.
df_xray_labels_clean['KL_Grade'] = _leading_code(df_xray_labels_clean['V00XRKL'])

# 4. Clean 'SIDE' (e.g. '1: Right' -> 1)
df_xray_labels_clean['Knee_Side'] = _leading_code(df_xray_labels_clean['SIDE'])
unparsed_sides = int(df_xray_labels_clean['Knee_Side'].isna().sum())
if unparsed_sides:
    # Knee_Side is part of the de-duplication key below, so surface any gaps.
    print(f"WARNING: {unparsed_sides} rows have no numeric SIDE code.")

# 5. Keep only the essential columns
df_xray_labels_clean = df_xray_labels_clean[['ID', 'Knee_Side', 'KL_Grade']]

# 6. Check for duplicates (e.g., two 'Right' knees for the same patient ID)
duplicates = df_xray_labels_clean.duplicated(subset=['ID', 'Knee_Side']).sum()
print(f"Found {duplicates} duplicate knee entries.\n")

# 7. If duplicates exist, keep the first row seen for each (ID, Knee_Side).
#    NOTE(review): 'first' is positional, so a row with a missing KL_Grade can
#    win over a later row that has one -- confirm this is acceptable.
if duplicates > 0:
    df_xray_labels_clean = df_xray_labels_clean.drop_duplicates(
        subset=['ID', 'Knee_Side'], keep='first'
    )
    print(f"Dropped duplicates. New shape: {df_xray_labels_clean.shape}\n")

# 8. Inspect the final clean DataFrame
print("--- Cleaned X-Ray Labels Info ---")
df_xray_labels_clean.info()
print("\n--- Cleaned X-Ray Labels Head ---")
print(df_xray_labels_clean.head())
print("\n--- New 'KL_Grade' Value Counts ---")
# dropna=False keeps the NaN bucket visible so missing grades are counted too.
print(df_xray_labels_clean['KL_Grade'].value_counts(dropna=False).sort_index())

In [None]:
# --- 5. Create the Master Patient DataFrame ---

# Patient-level tables joined onto the enrollee list, in this order.  Each is
# paired with the (left, right) suffixes pandas appends to column names that
# exist on both sides of that particular join.
patient_level_joins = [
    (df_outcomes, ('_enrol', '_out')),     # survival target
    (df_clinical, ('_left', '_clin')),     # Modality 2
    (df_biomarkers, ('_clin', '_bio')),    # Modality 3
]

# Begin with the master list of patients and fold each table in with an
# outer join on 'ID'.
df_master_patient = df_enrollees
for right_table, suffix_pair in patient_level_joins:
    df_master_patient = df_master_patient.merge(
        right_table,
        how='outer',
        on='ID',
        suffixes=suffix_pair,
    )

# Summarise the merged patient table
master_rows = df_master_patient.shape[0]
print("--- Master Patient DataFrame Info ---")
print(f"Shape: {df_master_patient.shape}")
print(f"Total Patients: {df_master_patient['ID'].nunique()}\n")

# Row-count sanity check: the result is expected to have 4796 rows.
if master_rows == 4796:
    print("Merge successful: 4796 patients, one row per patient.")
else:
    print(f"WARNING: Merge created {master_rows} rows, not 4796. Check for one-to-many joins.")

print("\n--- Master Patient DataFrame Head ---")
print(df_master_patient.head())

In [None]:
# --- 6. Create the Final Tri-Modal Cohort (Knee-Level) ---

# The cleaned knee table is the left side of a left join on 'ID': every knee
# row is kept and the patient-level columns from df_master_patient are
# attached to it.
df_final_cohort = df_xray_labels_clean.merge(
    df_master_patient,
    how='left',
    on='ID',
)

# --- Inspect the Final Cohort DataFrame ---
knee_rows, cohort_cols = df_final_cohort.shape
print("--- Final Tri-Modal Cohort Info ---")
print(f"Shape: {df_final_cohort.shape}")
print(f"Total Knees: {knee_rows}")
print(f"Total Unique Patients: {df_final_cohort['ID'].nunique()}\n")

# Verify the merge: 8982 rows are expected, and the column count should be the
# two inputs' column counts added together minus one for the shared 'ID' key.
expected_cols = df_xray_labels_clean.shape[1] + df_master_patient.shape[1] - 1
if (knee_rows, cohort_cols) != (8982, expected_cols):
    print(f"WARNING: Merge failed. Expected shape (8982, {expected_cols}), got {df_final_cohort.shape}")
else:
    print("SUCCESS: Final cohort created.")
    print("Each row now represents one knee and contains all 3 modalities + outcome data.")

print("\n--- Final Cohort DataFrame Head ---")
# The frame is very wide; head() is printed only to confirm the merge.
print(df_final_cohort.head())

In [None]:
# --- 7. Save the Final Cohort to a File ---

import os

# Directory for processed data.  exist_ok=True creates it when missing and is
# a no-op when it is already there, with no separate existence check that
# could race with another process.
PROCESSED_DATA_DIR = '../data/processed'
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Define the output file path
OUTPUT_FILE_PATH = f"{PROCESSED_DATA_DIR}/OAI_tri_modal_cohort_knee_level.parquet"

try:
    # Write the cohort as Parquet without the DataFrame index.
    df_final_cohort.to_parquet(OUTPUT_FILE_PATH, index=False)

    print(f"SUCCESS: Final cohort saved to:")
    print(OUTPUT_FILE_PATH)

    # Verify by reloading the file and comparing it with what was written.
    df_reloaded = pd.read_parquet(OUTPUT_FILE_PATH)
    print(f"\nVerification: Reloaded file with shape {df_reloaded.shape}")
    if df_reloaded.shape != df_final_cohort.shape:
        print(
            f"WARNING: Reloaded shape {df_reloaded.shape} does not match "
            f"the saved DataFrame's shape {df_final_cohort.shape}."
        )

except Exception as e:
    # Best-effort save: report the problem without raising.
    print(f"An error occurred while saving: {e}")