In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np

from src import config
import src.optimization_utils as ou


### 1. Data Cleaning & Preprocessing

In [None]:
# Show every column whenever a frame is displayed.
pd.set_option("display.max_columns", None)

# Read the processed dataset from the path declared in the project config.
df = pd.read_csv(config.DATA_PROCESSED)

# Normalise the header: cast names to str, turn newlines into spaces,
# squeeze whitespace runs to a single space, then trim both ends.
header = df.columns.astype(str)
header = header.str.replace("\n", " ")
header = header.str.replace(r"\s+", " ", regex=True)
df.columns = header.str.strip()

n_rows, n_cols = df.shape
print(f"Loaded {n_rows} patients, {n_cols} columns")
print(f"Data path: {config.DATA_PROCESSED}")


In [None]:
# Coerce the score columns to numeric so they can be used in calculations;
# entries that cannot be parsed become missing values (errors="coerce").
GAP_SCORE_COLS = ["gap_score_preop", "gap_score_postop"]
ODI_COLS = ["ODI_preop", "ODI_12mo"]

# GAP scores are stored with the nullable integer dtype.
df[GAP_SCORE_COLS] = (
    df[GAP_SCORE_COLS].apply(pd.to_numeric, errors="coerce").astype("Int64")
)

# ODI scores are stored as plain floats.
df[ODI_COLS] = (
    df[ODI_COLS].apply(pd.to_numeric, errors="coerce").astype(float)
)

#### 1.1 Define input variables

In [None]:
# Patient pre-operative fixed parameters (declared in the project config).
PATIENT_FIXED_COLS = config.PATIENT_FIXED_COLS

# Predictors are the fixed columns whose name does not contain "gap".
PREDICTORS = [col for col in PATIENT_FIXED_COLS if "gap" not in col]
print("== Predictors ==")
for col in PREDICTORS:
    print(col)

# Surgical plan columns (declared in the project config).
PLAN_COLS = config.PLAN_COLS
print("\n == Surgical plan ==")
for col in PLAN_COLS:
    print(col)

# Model inputs: patient fixed parameters + surgical plan.
FEATURES = PREDICTORS + PLAN_COLS

# Fail early, listing every configured feature that is absent from the data.
missing_features = [col for col in FEATURES if col not in df.columns]
if missing_features:
    raise KeyError(f"Configured features missing from data: {missing_features}")

# Split on "is the column numeric?" rather than "is the dtype object?":
# string columns that are not stored as `object` (e.g. pandas string or
# categorical dtypes) would otherwise land in the numeric branch and be sent
# to the median imputer.
FEATURES_NUM = [col for col in FEATURES if pd.api.types.is_numeric_dtype(df[col])]
FEATURES_CAT = [col for col in FEATURES if col not in FEATURES_NUM]

print("\n == Categorical Features ==")
for col in FEATURES_CAT:
    print(col)

print("\n == Numerical Features (includes binary) ==")
for col in FEATURES_NUM:
    print(col)



### 2. Build Models

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, PredictionErrorDisplay, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_predict, cross_val_score, KFold, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBRegressor

#### 2.1 Set up for model building and hyperparameter tuning

In [None]:

# Numeric features: fill missing values with the column median.
# NOTE(review): no scaling is applied although StandardScaler is imported;
# the Ridge penalty is scale-sensitive -- confirm this is intentional.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode. Levels unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own transformer.
# sparse_threshold=0 always yields a dense matrix: the one-hot encoder emits a
# sparse block, and the "svd" / "cholesky" Ridge solvers in the search space
# below cannot fit an intercept on sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, FEATURES_NUM),
        ("cat", categorical_transformer, FEATURES_CAT),
    ],
    sparse_threshold=0,
)

# Search space for the Random Forest pipelines (keys address the "regressor" step).
param_distributions = {
    "regressor__n_estimators": [200, 300, 400, 500, 600],
    "regressor__max_depth": [8, 12, 15, 20, None],
    "regressor__min_samples_split": [2, 5, 10, 15, 20],
    "regressor__min_samples_leaf": [2, 4, 6, 8, 10],
    "regressor__max_features": ["sqrt", "log2"],
}

# Search space for the Ridge pipelines.
ridge_param_distributions = {
    "regressor__alpha": [0.01, 0.1, 1, 10, 100, 1000],
    "regressor__solver": ["auto", "svd", "cholesky"],
}


In [None]:
def train_rf_model(X_train, X_test, y_train, y_test):
    """Fit a Random Forest pipeline and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target used only for the printed metrics.

    Returns
    -------
    tuple
        ``(pipeline, y_pred)``: the fitted pipeline and its predictions
        for ``X_test``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place. Cloning gives this model its
        # own preprocessor, so fitting another model later cannot overwrite
        # the imputer/encoder state this pipeline was trained with.
        ("preprocessor", clone(preprocessor)),
        ("regressor", RandomForestRegressor(n_estimators=300,
                                            random_state=42,
                                            max_depth=8,
                                            min_samples_leaf=5,
                                            oob_score=True)),
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    # Out-of-bag score computed by the forest on the training data.
    oob = pipeline.named_steps["regressor"].oob_score_

    print("-- Random Forest Model --")

    print(f"R² score: {round(r2,3)}")
    print(f"RMSE: {round(rmse,3)}")
    print(f"MAE: {round(mae, 3)}")
    print(f"OOB score: {round(oob,3)}")

    return pipeline, y_pred
                   
def randomized_search_cv(pipeline, param_distributions, X_train, y_train, X_test, y_test,
                         n_iter=10, cv=5, random_state=42):
    """Tune a pipeline with randomized-search cross-validation.

    Parameters
    ----------
    pipeline
        Estimator to tune; the search fits clones of it.
    param_distributions : dict
        Search space passed to ``RandomizedSearchCV``.
    X_train, y_train
        Data used for the cross-validated search and the final refit.
    X_test, y_test
        Hold-out data used only for the printed test metrics.
    n_iter : int, default=10
        Number of parameter settings sampled.
    cv : int, default=5
        Number of cross-validation folds.
    random_state : int, default=42
        Seed for sampling the candidates, so repeated runs select the same
        candidates.

    Returns
    -------
    tuple
        ``(best_model, y_pred)``: the refitted best estimator and its
        predictions for ``X_test``.
    """
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring="r2",
        n_jobs=-1,
        random_state=random_state,
    )
    # Fit the search on the training data.
    random_search.fit(X_train, y_train)

    # Best configuration found by the search.
    best_parameters = random_search.best_params_
    best_cv_R2 = random_search.best_score_
    best_model = random_search.best_estimator_

    # Hold-out predictions from the refitted best estimator.
    y_pred = best_model.predict(X_test)

    # Hold-out metrics.
    test_R2 = r2_score(y_test, y_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print("--Randomized search results--")
    print("Best parameters:")
    for k, v in best_parameters.items():
        print(f"  {k}: {v}")
    print(f"Best CV R2: {round(best_cv_R2,3)}")
    print(f"Test R2: {round(test_R2,3)}")
    print(f"Test RMSE: {round(test_rmse,3)}")

    return best_model, y_pred

def train_xgboost_model(X_train, X_test, y_train, y_test):
    """Fit an XGBoost pipeline and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target used only for the printed metrics.

    Returns
    -------
    tuple
        ``(pipeline, y_pred)``: the fitted pipeline and its predictions
        for ``X_test``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place. Cloning gives this model its
        # own preprocessor, so fitting another model later cannot overwrite
        # the imputer/encoder state this pipeline was trained with.
        ("preprocessor", clone(preprocessor)),
        ("regressor", XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    print("-- XGBOOST Model --")

    print(f"R² score: {round(r2,3)}")
    print(f"RMSE: {round(rmse,3)}")
    print(f"MAE: {round(mae, 3)}")

    return pipeline, y_pred

def train_ridge_model(X_train, X_test, y_train, y_test, alpha=1):
    """Fit a Ridge regression pipeline and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target used only for the printed metrics.
    alpha : float, default=1
        Regularisation strength passed to ``Ridge``.

    Returns
    -------
    tuple
        ``(pipeline, y_pred)``: the fitted pipeline and its predictions
        for ``X_test``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place. Cloning gives this model its
        # own preprocessor, so fitting another model later cannot overwrite
        # the imputer/encoder state this pipeline was trained with.
        ("preprocessor", clone(preprocessor)),
        ("regressor", Ridge(alpha=alpha)),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    print("-- Ridge Regression Model --")

    print(f"R² score: {round(r2,3)}")
    print(f"RMSE: {round(rmse,3)}")
    print(f"MAE: {round(mae, 3)}")

    return pipeline, y_pred


### 3. Model Evaluation 
#### 3.1 Model 1: L4S1

In [None]:
# Target: post-op minus pre-op L4-S1 value.
# NOTE(review): "L4_S1_postop" and "L4S1_preop" are spelled differently --
# confirm both names match the dataset headers.
df["delta_L4S1"] = df["L4_S1_postop"] - df["L4S1_preop"]

# Keep only rows with a target, in a dedicated frame. Overwriting `df` here
# would shrink the cohort for every later model and make the result depend on
# which cells have already been run.
df_L4S1 = df.dropna(subset=["delta_L4S1"])

X = df_L4S1[FEATURES]
y = df_L4S1["delta_L4S1"]

# Split into training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Model comparison: fit the three candidate models on the same split.
L4S1_rf_pipeline, L4S1_ypred = train_rf_model(X_train, X_test, y_train, y_test)
print()
L4S1_xgb_pipeline, L4S1_xgb_ypred = train_xgboost_model(X_train, X_test, y_train, y_test)
print()
# Ridge regression - might be good for overfitting and correlated features.
L4S1_ridge_pipeline, L4S1_ridge_ypred = train_ridge_model(X_train, X_test, y_train, y_test)

In [None]:
# Tune the Ridge hyperparameters with randomized-search cross-validation.
finetuned_L4S1_model, finetuned_L4S1_ypred = randomized_search_cv(
    L4S1_ridge_pipeline,
    ridge_param_distributions,
    X_train,
    y_train,
    X_test,
    y_test,
)

In [None]:
# Residual diagnostics for the untuned L4S1 Ridge model.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.suptitle("L4S1 Ridge Regression Model", fontsize=16)
scatter_ax, hist_ax = ax

# Residuals = observed minus predicted on the hold-out set.
L4S1_ridge_residuals = y_test - L4S1_ridge_ypred

# Left panel: residuals against predictions.
PredictionErrorDisplay.from_predictions(
    y_true=y_test,
    y_pred=L4S1_ridge_ypred,
    kind="residual_vs_predicted",
    ax=scatter_ax,
)
scatter_ax.set_title("Residual vs. Predicted")

# Right panel: residual histogram with a KDE overlay.
sns.histplot(x=L4S1_ridge_residuals, kde=True, ax=hist_ax, color="steelblue")
hist_ax.set_title("Residual Distribution")
hist_ax.set_xlabel("Residuals")

plt.show()

#### 3.2 Model 2: LL

In [None]:
# Target: post-op minus pre-op LL value.
df["delta_LL"] = df["LL_postop"] - df["LL_preop"]

# Keep only rows with a target, in a dedicated frame. Overwriting `df` here
# would shrink the cohort for every later model and make the result depend on
# which cells have already been run.
df_LL = df.dropna(subset=["delta_LL"])

X = df_LL[FEATURES]
y = df_LL["delta_LL"]

# Split into training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model comparison: fit the three candidate models on the same split.
LL_rf_pipeline, LL_ypred = train_rf_model(X_train, X_test, y_train, y_test)
print()

LL_xgb_pipeline, LL_xgb_ypred = train_xgboost_model(X_train, X_test, y_train, y_test)
print()

# Ridge regression - might be good for overfitting and correlated features.
LL_ridge_pipeline, LL_ridge_ypred = train_ridge_model(X_train, X_test, y_train, y_test)

In [None]:
# Tune the Ridge hyperparameters with randomized-search cross-validation.
finetuned_ll_model, finetuned_ll_ypred = randomized_search_cv(
    LL_ridge_pipeline,
    ridge_param_distributions,
    X_train,
    y_train,
    X_test,
    y_test,
)


In [None]:
# Residual diagnostics: untuned Ridge (top row) vs. tuned Ridge (bottom row).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle("LL Ridge Regression Model", fontsize=16)

# Residuals = observed minus predicted on the hold-out set.
ll_ridge_residuals = y_test - LL_ridge_ypred
LL_ridge_cv_residuals = y_test - finetuned_ll_ypred

# One entry per figure row: (title prefix, predictions, residuals, colour).
diagnostics = [
    ("Original Model", LL_ridge_ypred, ll_ridge_residuals, "steelblue"),
    ("Finetuned Model", finetuned_ll_ypred, LL_ridge_cv_residuals, "seagreen"),
]

for axes_row, (label, preds, resid, colour) in zip(ax, diagnostics):
    scatter_ax, hist_ax = axes_row

    # Left panel: residuals against predictions.
    PredictionErrorDisplay.from_predictions(
        y_true=y_test, y_pred=preds, kind="residual_vs_predicted", ax=scatter_ax
    )
    scatter_ax.set_title(f"{label}: Residual vs. Predicted")

    # Right panel: residual histogram with a KDE overlay.
    sns.histplot(x=resid, kde=True, ax=hist_ax, color=colour)
    hist_ax.set_title(f"{label}: Residual Distribution")
    hist_ax.set_xlabel("Residuals")

plt.show()

#### 3.3 Model 3: T4PA Change

In [None]:
# Target: post-op minus pre-op T4PA value.
df["delta_T4PA"] = df["T4PA_postop"] - df["T4PA_preop"]

# Keep only rows with a target, in a dedicated frame. Overwriting `df` here
# would shrink the cohort for every later model and make the result depend on
# which cells have already been run.
df_T4PA = df.dropna(subset=["delta_T4PA"])

X = df_T4PA[FEATURES]
y = df_T4PA["delta_T4PA"]

# Split into training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model comparison: fit the three candidate models on the same split.
T4PA_rf_pipeline, T4PA_ypred = train_rf_model(X_train, X_test, y_train, y_test)
print()

T4PA_xgb_pipeline, T4PA_xgb_ypred = train_xgboost_model(X_train, X_test, y_train, y_test)
print()

T4PA_ridge_pipeline, T4PA_ridge_ypred = train_ridge_model(X_train, X_test, y_train, y_test)

In [None]:
# Tune the Ridge hyperparameters with randomized-search cross-validation.
finetuned_T4PA_ridge, finetuned_T4PA_ypred = randomized_search_cv(
    T4PA_ridge_pipeline,
    ridge_param_distributions,
    X_train,
    y_train,
    X_test,
    y_test,
)


In [None]:
# Residual diagnostics: untuned Ridge (top row) vs. tuned Ridge (bottom row).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle("T4PA Ridge Regression Model", fontsize=16)

# Residuals = observed minus predicted on the hold-out set.
T4PA_ridge_residuals = y_test - T4PA_ridge_ypred
T4PA_cv_residuals = y_test - finetuned_T4PA_ypred

# One entry per figure row: (title prefix, predictions, residuals, colour).
diagnostics = [
    ("Original Model", T4PA_ridge_ypred, T4PA_ridge_residuals, "steelblue"),
    ("Finetuned Model", finetuned_T4PA_ypred, T4PA_cv_residuals, "seagreen"),
]

for axes_row, (label, preds, resid, colour) in zip(ax, diagnostics):
    scatter_ax, hist_ax = axes_row

    # Left panel: residuals against predictions.
    PredictionErrorDisplay.from_predictions(
        y_true=y_test, y_pred=preds, kind="residual_vs_predicted", ax=scatter_ax
    )
    scatter_ax.set_title(f"{label}: Residual vs. Predicted")

    # Right panel: residual histogram with a KDE overlay.
    sns.histplot(x=resid, kde=True, ax=hist_ax, color=colour)
    hist_ax.set_title(f"{label}: Residual Distribution")
    hist_ax.set_xlabel("Residuals")

plt.show()

#### 3.4 Model 4: L1PA Change

In [None]:
# Target: post-op minus pre-op L1PA value.
df["delta_L1PA"] = df["L1PA_postop"] - df["L1PA_preop"]

# Keep only rows with a target, in a dedicated frame. Overwriting `df` here
# would shrink the cohort for every later model and make the result depend on
# which cells have already been run.
df_L1PA = df.dropna(subset=["delta_L1PA"])

X = df_L1PA[FEATURES]
y = df_L1PA["delta_L1PA"]

# Split into training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model comparison: fit the three candidate models on the same split.
L1PA_rf_pipeline, L1PA_ypred = train_rf_model(X_train, X_test, y_train, y_test)
print()

L1PA_xgb_pipeline, L1PA_xgb_ypred = train_xgboost_model(X_train, X_test, y_train, y_test)
print()

L1PA_ridge_pipeline, L1PA_ridge_ypred = train_ridge_model(X_train, X_test, y_train, y_test)

In [None]:
# Tune the Random Forest hyperparameters with randomized-search cross-validation.
finetuned_L1PA_pipeline, finetuned_L1PA_ypred = randomized_search_cv(
    L1PA_rf_pipeline,
    param_distributions,
    X_train,
    y_train,
    X_test,
    y_test,
)


In [None]:
# Residual diagnostics: untuned Random Forest (top row) vs. tuned (bottom row).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
fig.suptitle("L1PA Model Comparison", fontsize=16)

# Residuals = observed minus predicted on the hold-out set.
L1PA_residuals = y_test - L1PA_ypred
L1PA_cv_residuals = y_test - finetuned_L1PA_ypred

# One entry per figure row: (title prefix, predictions, residuals, colour).
diagnostics = [
    ("Original Model", L1PA_ypred, L1PA_residuals, "steelblue"),
    ("Finetuned Model", finetuned_L1PA_ypred, L1PA_cv_residuals, "seagreen"),
]

for axes_row, (label, preds, resid, colour) in zip(ax, diagnostics):
    scatter_ax, hist_ax = axes_row

    # Left panel: residuals against predictions.
    PredictionErrorDisplay.from_predictions(
        y_true=y_test, y_pred=preds, kind="residual_vs_predicted", ax=scatter_ax
    )
    scatter_ax.set_title(f"{label}: Residual vs. Predicted")

    # Right panel: residual histogram with a KDE overlay.
    sns.histplot(x=resid, kde=True, ax=hist_ax, color=colour)
    hist_ax.set_title(f"{label}: Residual Distribution")
    hist_ax.set_xlabel("Residuals")

plt.show()

#### 3.5 Model 5: ODI After 12 Months

In [None]:
# Target: 12-month ODI minus pre-op ODI.
df["delta_ODI"] = df["ODI_12mo"] - df["ODI_preop"]

# Keep only rows with a target (many rows are null), in a dedicated frame.
# Overwriting `df` here would shrink the shared frame and make earlier cells
# give different results when re-run.
df_ODI = df.dropna(subset=["delta_ODI"])

X = df_ODI[FEATURES]
y = df_ODI["delta_ODI"]

# Split into training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model comparison: fit the three candidate models on the same split.
ODI_rf_pipeline, ODI_ypred = train_rf_model(X_train, X_test, y_train, y_test)
print()

ODI_xgb_pipeline, ODI_xgb_ypred = train_xgboost_model(X_train, X_test, y_train, y_test)
print()

ODI_ridge_pipeline, ODI_ridge_ypred = train_ridge_model(X_train, X_test, y_train, y_test)

In [None]:
# Tune the Ridge hyperparameters with randomized-search cross-validation.
finetuned_ODI_pipeline, finetuned_ODI_ypred = randomized_search_cv(
    ODI_ridge_pipeline,
    ridge_param_distributions,
    X_train,
    y_train,
    X_test,
    y_test,
)


### 4. Export Model

In [None]:
import joblib, sklearn
from src import config

def export_model_bundle(pipe, target, model_name, out_dir):
    """Save a fitted pipeline and its metadata as a joblib bundle.

    Creates ``out_dir`` if needed and writes ``<model_name>.joblib`` inside it.

    Parameters
    ----------
    pipe
        Fitted pipeline to persist.
    target : str
        Name of the target column the pipeline predicts.
    model_name : str
        Identifier stored in the bundle and used as the file stem.
    out_dir : pathlib.Path
        Directory that receives the bundle.

    Returns
    -------
    tuple
        ``(bundle, out_path)``: the saved dictionary and the file location.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    bundle = {
        "pipe": pipe,
        "features": FEATURES,
        "target": target,
        # Recorded so a loader can compare it with its own scikit-learn version.
        "sklearn_version": sklearn.__version__,
        "model_name": model_name,
    }

    out_path = out_dir / f"{model_name}.joblib"
    joblib.dump(bundle, out_path)
    print("Saved:", out_path)
    return bundle, out_path


# L4S1 model
L4S1_dir = config.ARTIFACTS_DIR / "L4S1"
bundle_L4S1, out_path = export_model_bundle(
    L4S1_ridge_pipeline, "delta_L4S1", "L4S1_ridge_reg", L4S1_dir
)

# LL model
LL_dir = config.ARTIFACTS_DIR / "LL"
bundle_LL, out_path = export_model_bundle(
    LL_ridge_pipeline, "delta_LL", "LL_ridge_reg", LL_dir
)

# T4PA model
T4PA_dir = config.ARTIFACTS_DIR / "T4PA"
bundle_T4PA, out_path = export_model_bundle(
    T4PA_ridge_pipeline, "delta_T4PA", "T4PA_ridge_reg", T4PA_dir
)

# L1PA model (the tuned Random Forest)
L1PA_dir = config.ARTIFACTS_DIR / "L1PA"
bundle_L1PA, out_path = export_model_bundle(
    finetuned_L1PA_pipeline, "delta_L1PA", "L1PA_rf_reg", L1PA_dir
)

# ODI model: export is disabled. It was previously wrapped in a string
# literal, which the notebook echoed as cell output. Uncomment to export.
# ODI_dir = config.ARTIFACTS_DIR / "ODI"
# bundle_ODI, out_path = export_model_bundle(
#     ODI_ridge_pipeline, "delta_ODI", "ODI_ridge_reg", ODI_dir
# )