In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sksurv.preprocessing import OneHotEncoder

# 1. Read the model-ready table from disk.
#    The path is relative, so the notebook must be run from the directory
#    that contains the `data/` folder.
DATA_FILE = 'data/processed/OAI_model_ready_data.parquet'
df = pd.read_parquet(path=DATA_FILE)

# Quick sanity check: report the dimensions, then preview the first five rows.
print(f"Loaded model-ready data with shape: {df.shape}")
print(df.head(n=5))

# 2. Define Features (X) and Target (y)
# These are the 5 features for our baseline model
FEATURE_COLS = ['KL_Grade', 'Age', 'BMI', 'Sex', 'WOMAC_Score']
X = df[FEATURE_COLS]

# These are our two survival target columns
EVENT_COL = 'event'
TIME_COL = 'time_to_event'

# Fail fast on missing targets: `astype(bool)` maps NaN to True, so a missing
# event indicator would otherwise be silently recorded as an observed event.
target_nulls = df[[EVENT_COL, TIME_COL]].isna().sum()
if target_nulls.any():
    raise ValueError(
        "Survival targets contain missing values: "
        f"{target_nulls[target_nulls > 0].to_dict()}"
    )

# 3. Create the 'y' structured array for scikit-survival
# One record per row: a boolean event indicator followed by a float time.
# The field names are taken from EVENT_COL / TIME_COL so the array and the
# constants cannot drift apart.
y = np.empty(len(df), dtype=[
    (EVENT_COL, bool),
    (TIME_COL, np.float64),
])

y[EVENT_COL] = df[EVENT_COL].astype(bool).to_numpy()
y[TIME_COL] = df[TIME_COL].astype(np.float64).to_numpy()

print(f"\nCreated feature matrix X with shape: {X.shape}")
print(f"Created target array y with shape: {y.shape}")

# 4. Hold out a quarter of the rows as a test set.
#    The fixed random_state makes the partition reproducible between runs.
split_options = {'test_size': 0.25, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Split data into {len(X_train)} train and {len(X_test)} test samples.")

# --- 5. Preprocess the features (CORRECTED) ---

# NOTE: this import rebinds `OneHotEncoder` (imported from sksurv at the top of
# the notebook) to scikit-learn's implementation, which is the one used below.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define which columns are which
categorical_cols = ['KL_Grade', 'Sex']
numerical_cols = ['Age', 'BMI', 'WOMAC_Score']

# Create a preprocessor
# 1. StandardScaler: Scales numerical features.
# 2. OneHotEncoder: Expands categorical features. 'drop=first' avoids dummy variable trap.
# `verbose_feature_names_out=False` keeps the output names un-prefixed
# ('Age' rather than 'num__Age').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough',  # Pass through any other columns (though we have none)
    verbose_feature_names_out=False,
)

# Fit on training data only, then apply the same fitted transform to both
# sets so no test-set statistics leak into the scaler/encoder.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Ask the fitted transformer for its output column names instead of
# assembling them by hand: this stays correct if transformers are reordered
# or if a column ever falls into the 'passthrough' remainder.
feature_names = preprocessor.get_feature_names_out().tolist()
assert len(feature_names) == X_train_processed.shape[1], (
    "feature_names is out of sync with the processed feature matrix"
)

print("Features successfully preprocessed (scaled and one-hot encoded).")
print(f"Final processed training shape: {X_train_processed.shape}")
print(f"Feature names: {feature_names}")

In [None]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# 1. Initialize the CoxPH model
# We set a baseline alpha for regularization
cox_model = CoxPHSurvivalAnalysis(alpha=0.1)

# 2. Train the model
# We use the processed features and the structured survival target
cox_model.fit(X_train_processed, y_train)

# 3. Evaluate on the Test Set
# Get the predicted risk scores for the test set
test_risk_scores = cox_model.predict(X_test_processed)

# Read the (event, time) field names from the structured array itself so this
# cell does not repeat the column-name literals used when `y` was built.
event_field, time_field = y_test.dtype.names

# Calculate the Concordance Index (C-index)
# This measures how well the model's risk score predictions
# match the actual order of events.
# 1.0 is perfect, 0.5 is random chance.
c_index_result = concordance_index_censored(
    y_test[event_field],  # Ground truth (did an event happen?)
    y_test[time_field],   # Ground truth (how long did it take?)
    test_risk_scores      # Our model's prediction
)

print("--- Baseline Model Evaluation ---")
# The first element of the returned tuple is the C-index itself.
print(f"Concordance Index (C-index) on Test Set: {c_index_result[0]:.4f}")

# 4. Show Feature Importance
# Rank by coefficient magnitude: a large negative coefficient is as
# influential as a large positive one, so sorting on the signed value would
# push strongly negative features to the bottom of the list.
# The signs are kept in the displayed values.
feature_importance = pd.Series(
    cox_model.coef_,
    index=feature_names
).sort_values(key=lambda coefs: coefs.abs(), ascending=False)

print("\n--- Feature Importance (Model Coefficients) ---")
print(feature_importance)