In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np

from src import config
import src.optimization_utils as ou


### 1. Data Cleaning & Preprocessing

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read the processed dataset from the path configured in src.config.
df = pd.read_csv(config.DATA_PROCESSED)

# Normalise the header labels: cast to str, turn newlines into spaces,
# collapse any run of whitespace to a single space, then trim both ends.
column_labels = df.columns.astype(str)
column_labels = column_labels.str.replace("\n", " ")
column_labels = column_labels.str.replace(r"\s+", " ", regex=True)
df.columns = column_labels.str.strip()

n_patients, n_columns = df.shape
print(f"Loaded {n_patients} patients, {n_columns} columns")
print(f"Data path: {config.DATA_PROCESSED}")


In [None]:
# Convert the score columns to numeric dtypes so they can be used in arithmetic.
# Values that cannot be parsed become missing (errors="coerce").
GAP_SCORE_COLS = ["gap_score_preop", "gap_score_postop"]
ODI_COLS = ["ODI_preop", "ODI_12mo"]

# GAP scores are stored as nullable integers (Int64) so missing values survive.
gap_numeric = df[GAP_SCORE_COLS].apply(pd.to_numeric, errors="coerce")
df[GAP_SCORE_COLS] = gap_numeric.astype("Int64")

# ODI scores are stored as plain floats.
odi_numeric = df[ODI_COLS].apply(pd.to_numeric, errors="coerce")
df[ODI_COLS] = odi_numeric.astype(float)

### 1.1 Define input and response variables

In [None]:
# Patient pre-op fixed parameters and the model feature list both come from src.config.
PATIENT_FIXED_COLS = config.PATIENT_FIXED_COLS
# Work on a copy so the removals below do not touch the list held by config.
FEATURES = config.DELTA_MODEL_FEATURES.copy()

# NOTE: this listing is printed BEFORE the two GAP columns are removed.
print("== FEATURES ==")
for feature_name in FEATURES:
    print(feature_name)

# Drop the GAP-derived columns from the working feature list.
for dropped_feature in ("gap_score_preop", "gap_category"):
    FEATURES.remove(dropped_feature)

# Split the remaining features by dtype (from Hari's code): anything that is
# not object/string/category is treated as numeric.
NON_NUMERIC_DTYPES = ["object", "string", "category"]
feature_frame = df[FEATURES]

NUMERIC_FEATURES = (
    feature_frame.select_dtypes(exclude=NON_NUMERIC_DTYPES).columns.tolist()
)
CATEGORICAL_FEATURES = (
    feature_frame.select_dtypes(include=NON_NUMERIC_DTYPES).columns.tolist()
)

print("\n == Numerical Features (includes binary) ==")
for feature_name in NUMERIC_FEATURES:
    print(feature_name)

print("\n == Categorical Features ==")
for feature_name in CATEGORICAL_FEATURES:
    print(feature_name)


In [None]:
# Response variables: post-op (or 12-month) value minus pre-op value.
# Each entry maps the delta column to its (later, pre-op) source columns.
# NOTE(review): the L4-S1 post-op column is spelled "L4_S1_postop" while the
# pre-op column is "L4S1_preop" — confirm both spellings against the dataset.
DELTA_SOURCES = {
    "delta_L4S1": ("L4_S1_postop", "L4S1_preop"),
    "delta_LL": ("LL_postop", "LL_preop"),
    "delta_T4PA": ("T4PA_postop", "T4PA_preop"),
    "delta_L1PA": ("L1PA_postop", "L1PA_preop"),
    "delta_ODI": ("ODI_12mo", "ODI_preop"),
}

for delta_col, (later_col, preop_col) in DELTA_SOURCES.items():
    # Only compute a delta that the loaded file does not already provide.
    if delta_col not in df.columns:
        df[delta_col] = df[later_col] - df[preop_col]

### 2. Build Models

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_predict, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, PredictionErrorDisplay, make_scorer

from xgboost import XGBRegressor

### 2.1 Set up for model building and hyperparameter tuning

In [None]:
# Numeric columns for the tree model: median imputation only (same as Hari's).
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Numeric columns for Ridge: median imputation followed by standardisation.
ridge_numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation then one-hot encoding
# (same as Hari's). handle_unknown="ignore" keeps prediction from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Column-wise preprocessing for heterogeneous data (same as Hari's).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, NUMERIC_FEATURES),
    ("cat", categorical_transformer, CATEGORICAL_FEATURES),
])

# Same layout, but numeric columns go through the scaled Ridge transformer.
ridge_preprocessor = ColumnTransformer([
    ("num", ridge_numeric_transformer, NUMERIC_FEATURES),
    ("cat", categorical_transformer, CATEGORICAL_FEATURES),
])



In [None]:
def train_rf_model(target_col, model_name):
    """Fit a Random Forest pipeline for ``target_col`` and print hold-out metrics.

    Rows of the module-level ``df`` whose target is missing are dropped, the
    remainder is split 80/20 with ``random_state=42``, and a pipeline made of
    a clone of ``preprocessor`` plus a ``RandomForestRegressor`` is fitted on
    the training part.

    Parameters
    ----------
    target_col : str
        Name of the column in ``df`` to predict.
    model_name : str
        Label printed above the metrics.

    Returns
    -------
    tuple
        ``(pipe, X_train, X_test, y_train, y_test, preds)`` where ``preds``
        are the pipeline's predictions for ``X_test``.
    """
    labelled = df[FEATURES + [target_col]].dropna(subset=[target_col])
    X, y = labelled[FEATURES], labelled[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(
        n_estimators=300,
        max_depth=8,
        min_samples_leaf=5,
        oob_score=True,
        random_state=42,
    )
    # Clone the shared preprocessor so each trained pipeline owns its own fit.
    pipe = Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", forest),
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    # Hold-out metrics, plus the forest's out-of-bag score from training.
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    # The pipeline fits this very estimator object, so read the score from it.
    oob = forest.oob_score_

    print(f"\n{model_name}")
    print(f"R² score: {round(r2,3)}")
    print(f"RMSE: {round(rmse,3)}")
    print(f"MAE: {round(mae, 3)}")
    print(f"OOB score: {round(oob,3)}")

    return pipe, X_train, X_test, y_train, y_test, preds

def train_ridge_model(target_col, model_name, alpha=1):
    """Fit a Ridge regression pipeline for ``target_col`` and print hold-out metrics.

    Rows of the module-level ``df`` whose target is missing are dropped, the
    remainder is split 80/20 with ``random_state=42``, and a pipeline made of
    a clone of ``ridge_preprocessor`` plus ``Ridge(alpha=alpha)`` is fitted on
    the training part.

    Parameters
    ----------
    target_col : str
        Name of the column in ``df`` to predict.
    model_name : str
        Label printed above the metrics.
    alpha : float, default 1
        Regularisation strength passed to ``Ridge``.

    Returns
    -------
    tuple
        ``(pipe, X_train, X_test, y_train, y_test, preds)`` where ``preds``
        are the pipeline's predictions for ``X_test``.
    """
    labelled = df[FEATURES + [target_col]].dropna(subset=[target_col])
    X, y = labelled[FEATURES], labelled[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Clone the shared preprocessor so each trained pipeline owns its own fit.
    pipe = Pipeline(steps=[
        ("preprocessor", clone(ridge_preprocessor)),
        ("regressor", Ridge(alpha=alpha)),
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    # Hold-out metrics.
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)

    print(f"\n{model_name}")
    print(f"R² score: {round(r2,3)}")
    print(f"RMSE: {round(rmse,3)}")
    print(f"MAE: {round(mae, 3)}")

    return pipe, X_train, X_test, y_train, y_test, preds


### 3. Model Evaluation 
#### 3.1 Model 1: L4S1

In [None]:
# Fit the ΔL4S1 Ridge model; keep the fitted pipeline, the split and the test predictions.
(
    L4S1_model,
    X_train_L4S1,
    X_test_L4S1,
    y_train_L4S1,
    y_test_L4S1,
    y_pred_L4S1,
) = train_ridge_model(target_col="delta_L4S1", model_name="ΔL4S1 Model")

#### Visualization

In [None]:
# Hold-out diagnostics: one figure per PredictionErrorDisplay kind.
l4s1_error_plots = (
    ("actual_vs_predicted", "L4S1 Ridge Regression Model: Actual vs Predicted"),
    ("residual_vs_predicted", "L4S1 Ridge Regression Model: Residual vs Predicted"),
)
for plot_kind, plot_title in l4s1_error_plots:
    fig, ax = plt.subplots()
    ax.set_title(plot_title)
    PredictionErrorDisplay.from_predictions(
        y_true=y_test_L4S1,
        y_pred=y_pred_L4S1,
        kind=plot_kind,
        scatter_kwargs={"alpha": 0.5},
        ax=ax,
    )
    plt.show()

# Residual histogram (actual minus predicted) with a KDE overlay.
L4S1_residuals = y_test_L4S1 - y_pred_L4S1

fig, ax = plt.subplots()
ax.set_title("L4S1 Ridge Regression Model: Residual Histogram")
sns.histplot(x=L4S1_residuals, kde=True, ax=ax)
plt.show()


#### Feature Importance

In [None]:
import shap

# Pull the fitted steps out of the ΔL4S1 pipeline once.
l4s1_prep = L4S1_model.named_steps["preprocessor"]
l4s1_ridge = L4S1_model.named_steps["regressor"]

feature_names = l4s1_prep.get_feature_names_out()
X_test_transformed = l4s1_prep.transform(X_test_L4S1)

# The transformed training rows are passed as the explainer's data argument.
explainer = shap.LinearExplainer(
    l4s1_ridge,
    l4s1_prep.transform(X_train_L4S1),
    feature_names=feature_names,
)
shap_values = explainer(X_test_transformed)

# Bar summary of the SHAP values on the test rows.
shap.summary_plot(
    shap_values,
    X_test_transformed,
    feature_names=feature_names,
    plot_type="bar",
)


In [None]:
# Beeswarm plot of whatever `shap_values` currently holds; when the notebook is
# run top to bottom that is the ΔL4S1 explanation computed in the preceding cell.
shap.plots.beeswarm(shap_values)

#### 3.2 Model 2: LL

In [None]:
# Fit the ΔLL Random Forest model; keep the fitted pipeline, the split and the test predictions.
(
    LL_model,
    X_train_LL,
    X_test_LL,
    y_train_LL,
    y_test_LL,
    y_pred_LL,
) = train_rf_model(target_col="delta_LL", model_name="ΔLL Model")

#### Visualization of model performance

In [None]:
# Hold-out diagnostics: one figure per PredictionErrorDisplay kind.
ll_error_plots = (
    ("actual_vs_predicted", "LL Random Forest Model: Actual vs Predicted"),
    ("residual_vs_predicted", "LL Random Forest Model: Residual vs Predicted"),
)
for plot_kind, plot_title in ll_error_plots:
    fig, ax = plt.subplots()
    ax.set_title(plot_title)
    PredictionErrorDisplay.from_predictions(
        y_true=y_test_LL,
        y_pred=y_pred_LL,
        kind=plot_kind,
        scatter_kwargs={"alpha": 0.5},
        ax=ax,
    )
    plt.show()

# Residual histogram (actual minus predicted) with a KDE overlay.
LL_residuals = y_test_LL - y_pred_LL

fig, ax = plt.subplots()
ax.set_title("LL Random Forest Model: Residual Histogram")
sns.histplot(x=LL_residuals, kde=True, ax=ax)
plt.show()

#### Feature Importance

In [None]:
# SHAP feature importance for the ΔLL model.
# LL_model comes from train_rf_model, so its regressor is a
# RandomForestRegressor. A forest has no linear coefficients, so it must be
# explained with shap.TreeExplainer rather than shap.LinearExplainer.
ll_prep = LL_model.named_steps["preprocessor"]
ll_forest = LL_model.named_steps["regressor"]

feature_names = ll_prep.get_feature_names_out()
X_test_transformed = ll_prep.transform(X_test_LL)

# The one-hot step may make the ColumnTransformer return a sparse matrix;
# densify it so the explainer and the plots receive a plain array.
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()

# No background data is passed, so SHAP relies on the trees' own structure.
explainer = shap.TreeExplainer(ll_forest, feature_names=feature_names)
shap_values = explainer(X_test_transformed)

# Bar summary of the SHAP values on the test rows.
shap.summary_plot(
    shap_values,
    X_test_transformed,
    feature_names=feature_names,
    plot_type="bar",
)



In [None]:
# Beeswarm plot of whatever `shap_values` currently holds; when the notebook is
# run top to bottom that is the ΔLL explanation computed in the preceding cell.
shap.plots.beeswarm(shap_values)

#### 3.3 Model 3: T4PA Change

In [None]:
# Fit the ΔT4PA Ridge model; keep the fitted pipeline, the split and the test predictions.
(
    T4PA_model,
    X_train_T4PA,
    X_test_T4PA,
    y_train_T4PA,
    y_test_T4PA,
    y_pred_T4PA,
) = train_ridge_model(target_col="delta_T4PA", model_name="ΔT4PA Model")

In [None]:
# Hold-out diagnostics: one figure per PredictionErrorDisplay kind.
t4pa_error_plots = (
    ("actual_vs_predicted", "T4PA Ridge Regression Model: Actual vs Predicted"),
    ("residual_vs_predicted", "T4PA Ridge Regression Model: Residual vs Predicted"),
)
for plot_kind, plot_title in t4pa_error_plots:
    fig, ax = plt.subplots()
    ax.set_title(plot_title)
    PredictionErrorDisplay.from_predictions(
        y_true=y_test_T4PA,
        y_pred=y_pred_T4PA,
        kind=plot_kind,
        scatter_kwargs={"alpha": 0.5},
        ax=ax,
    )
    plt.show()

# Residual histogram (actual minus predicted) with a KDE overlay.
T4PA_residuals = y_test_T4PA - y_pred_T4PA

fig, ax = plt.subplots()
ax.set_title("T4PA Ridge Regression Model: Residual Histogram")
sns.histplot(x=T4PA_residuals, kde=True, ax=ax)
plt.show()


#### Feature Importance

In [None]:
# Pull the fitted steps out of the ΔT4PA pipeline once.
t4pa_prep = T4PA_model.named_steps["preprocessor"]
t4pa_ridge = T4PA_model.named_steps["regressor"]

feature_names = t4pa_prep.get_feature_names_out()
X_test_transformed = t4pa_prep.transform(X_test_T4PA)

# The transformed training rows are passed as the explainer's data argument.
explainer = shap.LinearExplainer(
    t4pa_ridge,
    t4pa_prep.transform(X_train_T4PA),
    feature_names=feature_names,
)
shap_values = explainer(X_test_transformed)

# Bar summary of the SHAP values on the test rows.
shap.summary_plot(
    shap_values,
    X_test_transformed,
    feature_names=feature_names,
    plot_type="bar",
)


In [None]:
# Beeswarm plot of whatever `shap_values` currently holds; when the notebook is
# run top to bottom that is the ΔT4PA explanation computed in the preceding cell.
shap.plots.beeswarm(shap_values)

#### 3.4 Model 4: L1PA Change

In [None]:
# Fit the ΔL1PA Ridge model; keep the fitted pipeline, the split and the test predictions.
(
    L1PA_model,
    X_train_L1PA,
    X_test_L1PA,
    y_train_L1PA,
    y_test_L1PA,
    y_pred_L1PA,
) = train_ridge_model(target_col="delta_L1PA", model_name="ΔL1PA Model")

In [None]:
# Hold-out diagnostics: one figure per PredictionErrorDisplay kind.
l1pa_error_plots = (
    ("actual_vs_predicted", "L1PA Ridge Regression Model: Actual vs Predicted"),
    ("residual_vs_predicted", "L1PA Ridge Regression Model: Residual vs Predicted"),
)
for plot_kind, plot_title in l1pa_error_plots:
    fig, ax = plt.subplots()
    ax.set_title(plot_title)
    PredictionErrorDisplay.from_predictions(
        y_true=y_test_L1PA,
        y_pred=y_pred_L1PA,
        kind=plot_kind,
        scatter_kwargs={"alpha": 0.5},
        ax=ax,
    )
    plt.show()

# Residual histogram (actual minus predicted) with a KDE overlay.
L1PA_residuals = y_test_L1PA - y_pred_L1PA

fig, ax = plt.subplots()
ax.set_title("L1PA Ridge Regression Model: Residual Histogram")
sns.histplot(x=L1PA_residuals, kde=True, ax=ax)
plt.show()


#### Feature Importance

In [None]:
# Pull the fitted steps out of the ΔL1PA pipeline once.
l1pa_prep = L1PA_model.named_steps["preprocessor"]
l1pa_ridge = L1PA_model.named_steps["regressor"]

feature_names = l1pa_prep.get_feature_names_out()
X_test_transformed = l1pa_prep.transform(X_test_L1PA)

# The transformed training rows are passed as the explainer's data argument.
explainer = shap.LinearExplainer(
    l1pa_ridge,
    l1pa_prep.transform(X_train_L1PA),
    feature_names=feature_names,
)
shap_values = explainer(X_test_transformed)

# Bar summary of the SHAP values on the test rows.
shap.summary_plot(
    shap_values,
    X_test_transformed,
    feature_names=feature_names,
    plot_type="bar",
)


In [None]:
# Beeswarm plot of whatever `shap_values` currently holds; when the notebook is
# run top to bottom that is the ΔL1PA explanation computed in the preceding cell.
shap.plots.beeswarm(shap_values)

#### 3.5 Model 5: ODI Change After 12 Months

In [None]:
# Fit the ΔODI Ridge model; keep the fitted pipeline, the split and the test predictions.
(
    ODI_model,
    X_train_ODI,
    X_test_ODI,
    y_train_ODI,
    y_test_ODI,
    y_pred_ODI,
) = train_ridge_model(target_col="delta_ODI", model_name="ΔODI Model")


In [None]:
# Hold-out diagnostics: one figure per PredictionErrorDisplay kind.
odi_error_plots = (
    ("actual_vs_predicted", "ODI Ridge Regression Model: Actual vs Predicted"),
    ("residual_vs_predicted", "ODI Ridge Regression Model: Residual vs Predicted"),
)
for plot_kind, plot_title in odi_error_plots:
    fig, ax = plt.subplots()
    ax.set_title(plot_title)
    PredictionErrorDisplay.from_predictions(
        y_true=y_test_ODI,
        y_pred=y_pred_ODI,
        kind=plot_kind,
        scatter_kwargs={"alpha": 0.5},
        ax=ax,
    )
    plt.show()

# Residual histogram (actual minus predicted) with a KDE overlay.
ODI_residuals = y_test_ODI - y_pred_ODI

fig, ax = plt.subplots()
ax.set_title("ODI Ridge Regression Model: Residual Histogram")
sns.histplot(x=ODI_residuals, kde=True, ax=ax)
plt.show()


#### Feature Importance

In [None]:
# Pull the fitted steps out of the ΔODI pipeline once.
odi_prep = ODI_model.named_steps["preprocessor"]
odi_ridge = ODI_model.named_steps["regressor"]

feature_names = odi_prep.get_feature_names_out()
X_test_transformed = odi_prep.transform(X_test_ODI)

# The transformed training rows are passed as the explainer's data argument.
explainer = shap.LinearExplainer(
    odi_ridge,
    odi_prep.transform(X_train_ODI),
    feature_names=feature_names,
)
shap_values = explainer(X_test_transformed)

# Bar summary of the SHAP values on the test rows.
shap.summary_plot(
    shap_values,
    X_test_transformed,
    feature_names=feature_names,
    plot_type="bar",
)


In [None]:
# Beeswarm plot of whatever `shap_values` currently holds; when the notebook is
# run top to bottom that is the ΔODI explanation computed in the preceding cell.
shap.plots.beeswarm(shap_values)

### 4. Export Model

In [None]:
def final_rf_model(target_col):
    """Fit the Random Forest pipeline on every row that has a target value.

    No train/test split is made here: all rows of the module-level ``df``
    with a non-missing ``target_col`` are used for fitting.

    Parameters
    ----------
    target_col : str
        Name of the column in ``df`` to predict.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (a clone of ``preprocessor`` followed by a
        ``RandomForestRegressor``).
    """
    labelled = df[FEATURES + [target_col]].dropna(subset=[target_col])

    forest = RandomForestRegressor(
        n_estimators=300,
        max_depth=8,
        min_samples_leaf=5,
        oob_score=True,
        random_state=42,
    )
    deployment_pipe = Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", forest),
    ])

    # Pipeline.fit returns the pipeline itself, so fit and return in one step.
    return deployment_pipe.fit(labelled[FEATURES], labelled[target_col])

def final_ridge_model(target_col, alpha=1):
    """Fit the Ridge regression pipeline on every row that has a target value.

    No train/test split is made here: all rows of the module-level ``df``
    with a non-missing ``target_col`` are used for fitting.

    Parameters
    ----------
    target_col : str
        Name of the column in ``df`` to predict.
    alpha : float, default 1
        Regularisation strength passed to ``Ridge``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (a clone of ``ridge_preprocessor`` followed by
        ``Ridge(alpha=alpha)``).
    """
    labelled = df[FEATURES + [target_col]].dropna(subset=[target_col])

    deployment_pipe = Pipeline(steps=[
        ("preprocessor", clone(ridge_preprocessor)),
        ("regressor", Ridge(alpha=alpha)),
    ])

    # Pipeline.fit returns the pipeline itself, so fit and return in one step.
    return deployment_pipe.fit(labelled[FEATURES], labelled[target_col])

In [None]:
# Deployment model for ΔL4S1: Ridge regression fitted on all rows with a target.
L4S1_pipe = final_ridge_model(target_col="delta_L4S1")
L4S1_pipe  # bare expression so the notebook renders the fitted pipeline


In [None]:
# Deployment model for ΔLL: Random Forest fitted on all rows with a target.
LL_pipe = final_rf_model(target_col="delta_LL")
LL_pipe  # bare expression so the notebook renders the fitted pipeline

In [None]:
# Deployment model for ΔT4PA: Ridge regression fitted on all rows with a target.
T4PA_pipe = final_ridge_model(target_col="delta_T4PA")
T4PA_pipe  # bare expression so the notebook renders the fitted pipeline


In [None]:
# Deployment model for ΔL1PA: Ridge regression fitted on all rows with a target.
L1PA_pipe = final_ridge_model(target_col="delta_L1PA")
L1PA_pipe  # bare expression so the notebook renders the fitted pipeline

In [None]:
# Deployment model for ΔODI: Ridge regression fitted on all rows with a target.
# The target must be "delta_ODI": this pipeline is exported below in the bundle
# whose "target" field is "delta_ODI", so it has to be trained on that column.
ODI_pipe = final_ridge_model("delta_ODI")
ODI_pipe  # bare expression so the notebook renders the fitted pipeline

In [None]:
import joblib
import sklearn
from src import config


def _export_bundle(pipe, subdir, target, model_name, filename):
    """Write one model bundle under ``config.ARTIFACTS_DIR / subdir``.

    The bundle is a dict holding the fitted pipeline, the feature list, the
    target name, the running scikit-learn version and a model label. The
    directory is created if it does not exist.

    Returns ``(model_dir, bundle, out_path)``.
    """
    model_dir = config.ARTIFACTS_DIR / subdir
    model_dir.mkdir(parents=True, exist_ok=True)

    bundle = {
        "pipe": pipe,
        "features": FEATURES,
        "target": target,
        "sklearn_version": sklearn.__version__,
        "model_name": model_name,
    }

    destination = model_dir / filename
    joblib.dump(bundle, destination)
    print("Saved:", destination)
    return model_dir, bundle, destination


# L4S1 model
L4S1_dir, bundle_L4S1, out_path = _export_bundle(
    L4S1_pipe, "L4S1", "delta_L4S1",
    "RidgeRegressor_delta_L4S1", "delta_L4S1_model.joblib",
)

# LL model
LL_dir, bundle_LL, out_path = _export_bundle(
    LL_pipe, "LL", "delta_LL",
    "RandomForest_delta_LL", "delta_LL_model.joblib",
)

# T4PA model
T4PA_dir, bundle_T4PA, out_path = _export_bundle(
    T4PA_pipe, "T4PA", "delta_T4PA",
    "RidgeRegressor_delta_T4PA", "delta_T4PA_model.joblib",
)

# L1PA model
L1PA_dir, bundle_L1PA, out_path = _export_bundle(
    L1PA_pipe, "L1PA", "delta_L1PA",
    "RidgeRegressor_delta_L1PA", "delta_L1PA_model.joblib",
)

# ODI model
ODI_dir, bundle_ODI, out_path = _export_bundle(
    ODI_pipe, "ODI", "delta_ODI",
    "RidgeRegressor_delta_ODI", "delta_ODI_model.joblib",
)