In [None]:
import pandas as pd
from pathlib import Path

# Workbook location, resolved relative to the current working directory.
DATA_PATH = Path("../data/MSDS_database_cleaned_deidentified_revised.xlsx")
df = pd.read_excel(DATA_PATH)

# Tidy the header row: force every label to str, turn embedded newlines into
# spaces, collapse any run of whitespace to a single space, and trim the ends.
df.columns = (
    df.columns.astype(str)
    .str.replace("\n", " ")
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'data/MSDS_database_cleaned_deidentified_revised.xlsx'

In [None]:
# Outcome column that the model below is trained to predict.
target = "mech_fail_last"
# Class balance of the outcome; missing labels are counted as their own bucket.
df.loc[:, target].value_counts(dropna=False)


In [None]:
# Frequency of each recorded ALIF value (missing entries are not counted).
df.loc[:, "ALIF"].value_counts()

In [None]:
# Candidate input columns whose names mark them as pre-operative fields.
preop_cols = [
    "age", "sex", "PI_preop", "PT_preop", "LL_preop", "SS_preop",
    "T4PA_preop", "L1PA_preop", "SVA_preop", "cobb_main_curve_preop",
    "FC_preop", "tscore_femneck_preop", "HU_UIV_preop",
    "HU_UIVplus1_preop", "HU_UIVplus2_preop", "num_levels",
]

# Candidate input columns describing the surgical plan.
plan_cols = [
    "UIV_implant", "num_fused_levels", "ALIF", "XLIF", "TLIF",
    "num_rods", "num_screws", "osteotomy",
]

# Keep only the candidates that actually exist in the loaded sheet;
# anything absent from df is silently left out.
features = [c for c in preop_cols + plan_cols if c in df.columns]

# Rows without a recorded outcome cannot be used for training or evaluation.
mask = df[target].notna()
X = df.loc[mask, features].copy()
y = df.loc[mask, target].astype(int)

X.shape, y.value_counts()


In [None]:
# Show the full candidate pre-op list as declared; `features` holds the subset
# (plus plan columns) that was actually found in df.
preop_cols

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Route every feature to a preprocessing branch based on its dtype.
# Numeric (and boolean) columns go to the numeric branch; everything else --
# object, pandas string, category, ... -- is treated as categorical. Testing only
# for dtype == "object" would send string/category columns to the median
# imputer and scaler, which cannot handle them.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

preprocess = ColumnTransformer(
    transformers=[
        # Numeric branch: fill gaps with the column median, then standardise.
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                          ("scaler", StandardScaler())]), num_cols),
        # Categorical branch: fill gaps with the most frequent value, then
        # one-hot encode; categories not seen during fit are ignored.
        ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), cat_cols),
    ]
)

# class_weight="balanced" compensates for unequal class frequencies.
model = LogisticRegression(max_iter=2000, class_weight="balanced")
pipe = Pipeline([("preprocess", preprocess), ("model", model)])

# Fixed seed so the fold assignment is reproducible between runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold probabilities: each row is scored by a model that never saw it.
# Column 1 is the second class (label 1 when y is coded 0/1).
probs = cross_val_predict(pipe, X, y, cv=cv, method="predict_proba")[:, 1]
probs[:10]


In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, precision_score, recall_score, f1_score

# Threshold-free ranking metrics on the out-of-fold probabilities.
auc = roc_auc_score(y, probs)
ap = average_precision_score(y, probs)

# Hard labels at a 0.5 cut-off feed the threshold-dependent metrics.
preds = (probs >= 0.5).astype(int)
tn, fp, fn, tp = confusion_matrix(y, preds).ravel()

# Print each metric on its own line, then the raw confusion-matrix counts.
for label, value in [
    ("ROC-AUC:", auc),
    ("Avg Precision (PR-AUC):", ap),
    ("Precision:", precision_score(y, preds, zero_division=0)),
    ("Recall:", recall_score(y, preds, zero_division=0)),
    ("F1:", f1_score(y, preds, zero_division=0)),
]:
    print(label, value)
print(dict(tn=tn, fp=fp, fn=fn, tp=tp))


In [None]:
# Attach the out-of-fold probabilities to the labelled rows; `probs` was computed
# on X, which holds exactly the rows selected by `mask`, in the same order.
results = df.loc[mask, :].copy()
results["pred_fail_prob_cv"] = probs

# Highest predicted risk first, shown next to the observed outcome. The outcome
# column is referenced through `target` so this view cannot drift from the
# column the model was actually trained on.
results[["pred_fail_prob_cv", target]].sort_values("pred_fail_prob_cv", ascending=False).head(15)


In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Cut-offs at which the out-of-fold probabilities are turned into hard labels.
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

# Metric name -> scoring function, in the column order of the final table.
scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}

rows = []
for t in thresholds:
    preds_t = (probs >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y, preds_t).ravel()

    # One record per threshold: metrics first, then the raw counts.
    row = {"threshold": t}
    row.update({name: scorer(y, preds_t, zero_division=0) for name, scorer in scorers.items()})
    row.update(tp=tp, fp=fp, tn=tn, fn=fn)
    rows.append(row)

pd.DataFrame(rows)


In [None]:
# Fit on all data for demonstration (not evaluation)
pipe.fit(X, y)

# Pick one patient row (the first labelled one) as a single-row frame.
i = X.index[0]
x0 = X.loc[[i]].copy()

p_base = pipe.predict_proba(x0)[:, 1][0]

# Toggle ALIF (assumed to be coded 0/1) when the column exists and this patient
# has a value for it. int(NaN) raises, so a missing entry is left untouched,
# matching the NaN guard used by toggle_and_diff.
x1 = x0.copy()
if "ALIF" in x1.columns and pd.notna(x1["ALIF"].iloc[0]):
    x1["ALIF"] = 1 - int(x1["ALIF"].iloc[0])
else:
    print("ALIF is absent or missing for this patient; row left unchanged.")

p_new = pipe.predict_proba(x1)[:, 1][0]

print("Base risk:", p_base)
print("Toggled ALIF risk:", p_new)
print("Change:", p_new - p_base)


In [None]:
# VG added to save the model

from pathlib import Path
import joblib

# Output folder for persisted models; created on demand.
ARTIFACT_DIR = Path("../artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Build the destination once so the dump and the log line cannot disagree.
MODEL_PATH = ARTIFACT_DIR / "mech_fail_model.joblib"

# Everything a consumer needs to score new rows: the pipeline object, the input
# column order, and the outcome name. The outcome is taken from `target` rather
# than a repeated literal so the metadata always matches the trained model.
# NOTE: `pipe` is saved as-is; run the full-data fit cell first.
bundle = {
    "pipe": pipe,
    "features": features,
    "target": target,
}

joblib.dump(bundle, MODEL_PATH)
print("Saved:", MODEL_PATH)


In [None]:
# Fit on all data for demonstration
# (repeats the earlier full-data fit, so this cell only needs X, y and pipe to exist)
pipe.fit(X, y)

def toggle_and_diff(col, n=10, data=None, model=None):
    """Flip a 0/1 column for the first ``n`` rows and measure the risk change.

    For each of the first ``n`` rows, the row is scored as-is and again with
    ``col`` replaced by ``1 - int(value)``; the difference (new - base) in the
    second ``predict_proba`` column is collected.

    Parameters
    ----------
    col : str
        Column to toggle. Assumed to hold 0/1 flags.
    n : int, default 10
        Number of leading rows to examine.
    data : pandas.DataFrame, optional
        Rows to score. Defaults to the notebook-level ``X``.
    model : object with ``predict_proba``, optional
        Fitted estimator. Defaults to the notebook-level ``pipe``.

    Returns
    -------
    list
        One difference per usable row. Rows where ``col`` is missing (NaN) or
        cannot be converted to an integer are skipped. An empty list is
        returned when ``col`` is not a column of the data.

    Notes
    -----
    Only the value conversion is treated as best-effort. Errors raised by the
    model itself propagate instead of being silently discarded.
    """
    frame = X if data is None else data
    estimator = pipe if model is None else model

    if col not in frame.columns:
        return []

    diffs = []
    for i in list(frame.index)[:n]:
        x0 = frame.loc[[i]].copy()
        value = x0[col].iloc[0]
        if pd.isna(value):
            continue
        try:
            flipped = 1 - int(value)  # assumes 0/1
        except (TypeError, ValueError):
            # Value is not integer-like, so it cannot be toggled as a flag.
            continue
        base = estimator.predict_proba(x0)[:, 1][0]
        x1 = x0.copy()
        x1[col] = flipped
        new = estimator.predict_proba(x1)[:, 1][0]
        diffs.append(new - base)
    return diffs

# Summarise, per interbody-fusion flag, how many of the first 30 rows change
# their predicted risk when the flag is flipped, and by how much on average.
for c in ("ALIF", "TLIF", "XLIF"):
    if c not in X.columns:
        continue
    d = toggle_and_diff(c, n=30)
    n_changed = sum(abs(x) > 1e-6 for x in d)
    mean_abs_change = np.mean(np.abs(d)) if d else None
    print(c, "nonzero diffs:", n_changed, "avg abs change:", mean_abs_change)


In [None]:
import numpy as np
import pandas as pd

# Train on every labelled row so the scorer below (and any optimizer built on
# it) uses a model fitted to all available data.
pipe.fit(X, y)

# Input columns, in the order the fitted pipeline saw them.
FEATURES = X.columns.tolist()

def score_mech_fail(preop_plan_dict: dict, model=None, feature_names=None) -> float:
    """Return the predicted probability of mechanical failure for one patient.

    Parameters
    ----------
    preop_plan_dict : dict
        Patient preop + plan fields keyed by feature name. Keys that are not
        in the feature list are ignored; features without a key are filled
        with NaN and left to the pipeline's imputers.
    model : object with ``predict_proba``, optional
        Fitted estimator to score with. Defaults to the notebook-level
        ``pipe``; pass the ``"pipe"`` entry of a reloaded bundle to score
        outside this notebook session.
    feature_names : sequence of str, optional
        Column names, in the order the model expects. Defaults to the
        notebook-level ``FEATURES``.

    Returns
    -------
    float
        Value from the second ``predict_proba`` column (0..1).
    """
    estimator = pipe if model is None else model
    columns = FEATURES if feature_names is None else list(feature_names)

    # Build a single-row frame with exactly the expected columns, in order.
    row = {c: preop_plan_dict.get(c, np.nan) for c in columns}
    X_new = pd.DataFrame([row], columns=columns)
    return float(estimator.predict_proba(X_new)[:, 1][0])

# Demo: score the first labelled patient, passed in as a plain dict.
demo_dict = X.iloc[0].to_dict()
demo_risk = score_mech_fail(demo_dict)
print("Predicted risk:", demo_risk)
