In [8]:
import json, os, warnings, joblib, inspect
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")

# Auto-config is created for you already; adjust the path if needed
AUTOCONFIG_PATH = "autoconfig.json"
with open(AUTOCONFIG_PATH, "r") as f:
    CFG = json.load(f)

DATA_PATH = CFG["data_path"]
TARGET_COL = CFG.get("target_col")

df = pd.read_csv(DATA_PATH)
print(f"Loaded: {df.shape} rows x cols")
df.head(10)


Loaded: (5110, 12) rows x cols


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,28.1,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [9]:
# Overview
display(df.describe(include='all').T)

# Missingness
missing = df.isna().sum().sort_values(ascending=False)
missing = missing[missing > 0]
missing_df = missing.to_frame('missing')
missing_df['missing_percent'] = (missing_df['missing'] / len(df) * 100).round(2)
missing_df.head(20)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,5110.0,,,,36517.829354,21161.721625,67.0,17741.25,36932.0,54682.0,72940.0
gender,5110.0,3.0,Female,2994.0,,,,,,,
age,5110.0,,,,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
hypertension,5110.0,,,,0.097456,0.296607,0.0,0.0,0.0,0.0,1.0
heart_disease,5110.0,,,,0.054012,0.226063,0.0,0.0,0.0,0.0,1.0
ever_married,5110.0,2.0,Yes,3353.0,,,,,,,
work_type,5110.0,5.0,Private,2925.0,,,,,,,
Residence_type,5110.0,2.0,Urban,2596.0,,,,,,,
avg_glucose_level,5110.0,,,,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,5110.0,,,,28.862035,7.699562,10.3,23.8,28.1,32.8,97.6


Unnamed: 0,missing,missing_percent


In [12]:
# Numeric histograms
numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != TARGET_COL]
for c in numeric_cols[:8]:  # limit to avoid dozens of plots
    fig = px.histogram(df, x=c, nbins=50, title=f"Histogram — {c}")
    fig.show()

# Box plots by class
for c in numeric_cols[:6]:
    fig = px.box(df, x=TARGET_COL, y=c, title=f"Boxplot by {TARGET_COL} — {c}")
    fig.show()

# Correlation heatmap
if len(numeric_cols) >= 2:
    corr = df[numeric_cols].corr(numeric_only=True)
    fig = px.imshow(corr, title="Correlation heatmap (numeric)")
    fig.show()


In [13]:
# Split and preprocess
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

numeric_features = [c for c in X.columns if pd.api.types.is_numeric_dtype(df[c])]
categorical_features = [c for c in X.columns if c not in numeric_features]

numeric_pre = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),  # plays nice with sparse downstream
])

# OneHotEncoder arg compatibility across sklearn versions
ohe_kwargs = {}
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:
    ohe_kwargs["sparse_output"] = True
else:
    ohe_kwargs["sparse"] = True

categorical_pre = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", **ohe_kwargs)),
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_pre, numeric_features),
        ("cat", categorical_pre, categorical_features),
    ]
)

# Two baseline models
log_reg = Pipeline(steps=[("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
rf = Pipeline(steps=[("pre", pre), ("clf", RandomForestClassifier(n_estimators=200, random_state=42))])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() <= 20 else None
)

log_reg.fit(X_train, y_train)
rf.fit(X_train, y_train)

from sklearn.metrics import f1_score
def eval_model(pipe, name):
    y_pred = pipe.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    f1_macro = report["macro avg"]["f1-score"]
    print(f"{name} — F1 macro: {f1_macro:.4f}")
    return f1_macro, y_pred

f1_lr, ypred_lr = eval_model(log_reg, "LogReg")
f1_rf, ypred_rf = eval_model(rf, "RandomForest")
best = rf if f1_rf >= f1_lr else log_reg
best_name = "RandomForest" if best is rf else "LogReg"
print("Selected best:", best_name)

# Classification report
y_pred = best.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
fig = px.imshow(cm, text_auto=True, title=f"Confusion Matrix — {best_name}", labels=dict(x="Pred", y="True"))
fig.update_xaxes(side="top")
fig.show()

# ROC curve if binary
if y.nunique() == 2:
    if hasattr(best, "predict_proba"):
        y_proba = best.predict_proba(X_test)[:, 1]
    elif hasattr(best, "decision_function"):
        from sklearn.preprocessing import MinMaxScaler
        y_proba = MinMaxScaler().fit_transform(best.decision_function(X_test).reshape(-1,1)).ravel()
    else:
        y_proba = None
    if y_proba is not None:
        auc = roc_auc_score(y_test, y_proba)
        # Use 1 as the positive label when present
        pos_label = 1 if 1 in np.unique(y) else list(np.unique(y))[0]
        fpr, tpr, thr = roc_curve(y_test, y_proba, pos_label=pos_label)
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC AUC={auc:.3f}"))
        fig.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Chance", line=dict(dash="dash")))
        fig.update_layout(title=f"ROC Curve — {best_name} (AUC={auc:.3f})", xaxis_title="FPR", yaxis_title="TPR")
        fig.show()


LogReg — F1 macro: 0.5254
RandomForest — F1 macro: 0.5055
Selected best: LogReg
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       972
           1       0.67      0.04      0.08        50

    accuracy                           0.95      1022
   macro avg       0.81      0.52      0.53      1022
weighted avg       0.94      0.95      0.93      1022



In [16]:
from sklearn.metrics import precision_recall_curve, average_precision_score, f1_score, roc_auc_score
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

assert y.nunique() == 2, "PR analysis expects a binary target."

# Choose positive label (prefer 1 if present; otherwise the minority class)
classes = np.array(sorted(y.unique()))
pos_label = 1 if 1 in classes else y.value_counts().idxmin()

# Get probabilities for positive class if possible
if hasattr(best, "predict_proba"):
    y_scores = best.predict_proba(X_test)[:, list(best.classes_).index(pos_label)]
elif hasattr(best, "decision_function"):
    from sklearn.preprocessing import MinMaxScaler
    y_scores = MinMaxScaler().fit_transform(best.decision_function(X_test).reshape(-1,1)).ravel()
else:
    raise ValueError("Selected model has no probability/score output.")

ap = average_precision_score(y_test==pos_label, y_scores)
roc = roc_auc_score(y_test==pos_label, y_scores)

prec, rec, thr = precision_recall_curve(y_test==pos_label, y_scores)
f1s = (2 * prec * rec) / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1s[:-1])  # thr has len-1 vs prec/rec
best_thr = float(thr[best_idx])
best_f1 = float(f1s[best_idx])

print(f"Average Precision (PR AUC): {ap:.4f}")
print(f"ROC AUC: {roc:.4f}")
print(f"Best F1 threshold: {best_thr:.4f}  | Best F1: {best_f1:.4f}")

fig = go.Figure()
fig.add_trace(go.Scatter(x=rec, y=prec, mode="lines", name=f"PR curve (AP={ap:.3f})"))
fig.update_layout(title="Precision–Recall Curve", xaxis_title="Recall", yaxis_title="Precision")
fig.show()

# Threshold–F1 sweep
fig2 = px.line(x=thr, y=f1s[:-1], labels={"x":"Threshold","y":"F1"}, title="F1 vs Threshold")
fig2.add_vline(x=best_thr, line_dash="dash", annotation_text=f"Best F1={best_f1:.3f} @ {best_thr:.2f}")
fig2.show()

# Metrics at best-F1 threshold
y_pred_thr = (y_scores >= best_thr).astype(int)
print("Classification report @ best-F1 threshold:")
print(classification_report((y_test==pos_label).astype(int), y_pred_thr, zero_division=0))


Average Precision (PR AUC): 0.2606
ROC AUC: 0.8400
Best F1 threshold: 0.1411  | Best F1: 0.3522


Classification report @ best-F1 threshold:
              precision    recall  f1-score   support

           0       0.98      0.92      0.95       972
           1       0.26      0.56      0.35        50

    accuracy                           0.90      1022
   macro avg       0.62      0.74      0.65      1022
weighted avg       0.94      0.90      0.92      1022



In [17]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import confusion_matrix

# Define your costs here (example: FN is more costly than FP)
COST_FP = 1.0
COST_FN = 5.0

grid = np.linspace(0.05, 0.95, 19)
costs = []
for t in grid:
    yp = (y_scores >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix((y_test==pos_label).astype(int), yp).ravel()
    total_cost = COST_FP*fp + COST_FN*fn
    costs.append({"threshold": t, "FP": fp, "FN": fn, "cost": total_cost})

cost_df = pd.DataFrame(costs).sort_values("threshold")
display(cost_df)

best_row = cost_df.loc[cost_df["cost"].idxmin()]
best_thr_cost = float(best_row["threshold"])
print(f"Min-cost threshold: {best_thr_cost:.2f}  | Cost={best_row['cost']}  (FP={best_row['FP']}, FN={best_row['FN']})")

fig = px.line(cost_df, x="threshold", y="cost", title=f"Cost vs Threshold (FP={COST_FP}, FN={COST_FN})")
fig.add_vline(x=best_thr_cost, line_dash="dash", annotation_text=f"Min-cost @ {best_thr_cost:.2f}")
fig.show()


Unnamed: 0,threshold,FP,FN,cost
0,0.05,254,9,299.0
1,0.1,133,15,208.0
2,0.15,73,24,193.0
3,0.2,47,36,227.0
4,0.25,26,42,236.0
5,0.3,9,45,234.0
6,0.35,3,46,233.0
7,0.4,1,47,236.0
8,0.45,1,48,241.0
9,0.5,1,48,241.0


Min-cost threshold: 0.15  | Cost=193.0  (FP=73.0, FN=24.0)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score

# (Re)use preprocessing: 'pre' from earlier cell
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if y.nunique()<=20 else None
)

# Baseline: class_weight='balanced'
pipe_bal = Pipeline(steps=[("pre", pre), ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))])
pipe_bal.fit(X_train, y_train)
if hasattr(pipe_bal, "predict_proba"):
    ap_bal = average_precision_score((y_test==pos_label).astype(int),
                                     pipe_bal.predict_proba(X_test)[:, list(pipe_bal.classes_).index(pos_label)])
else:
    ap_bal = np.nan
print("Class-weighted Logistic Regression:")
print(classification_report(y_test, pipe_bal.predict(X_test), zero_division=0))
print(f"AP: {ap_bal:.4f}")

# SMOTE (optional) — requires imbalanced-learn
try:
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.over_sampling import SMOTE
    smote_pipe = ImbPipeline(steps=[
        ("pre", pre),
        ("smote", SMOTE(random_state=42)),
        ("clf", LogisticRegression(max_iter=1000))
    ])
    smote_pipe.fit(X_train, y_train)
    ap_sm = average_precision_score((y_test==pos_label).astype(int),
                                    smote_pipe.predict_proba(X_test)[:, list(smote_pipe.classes_).index(pos_label)])
    print("\nSMOTE + Logistic Regression:")
    print(classification_report(y_test, smote_pipe.predict(X_test), zero_division=0))
    print(f"AP: {ap_sm:.4f}")
except Exception as e:
    print("SMOTE not available. To enable: pip install imbalanced-learn")
    smote_pipe, ap_sm = None, np.nan


Class-weighted Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.75      0.85       972
           1       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.75      0.82      1022

AP: 0.2598

SMOTE + Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.75      0.85       972
           1       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.78      0.55      1022
weighted avg       0.95      0.75      0.82      1022

AP: 0.2624


In [20]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

scorer = "f1_macro" if y.nunique()>2 else "average_precision"

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression grid
lr_grid = {"clf__C": [0.1, 0.5, 1.0, 2.0, 5.0]}
lr_pipe = Pipeline(steps=[("pre", pre), ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))])
lr_search = GridSearchCV(lr_pipe, lr_grid, cv=cv, scoring=scorer, n_jobs=-1, verbose=1)
lr_search.fit(X_train, y_train)

# Random Forest grid
rf_grid = {
    "clf__n_estimators": [200, 400],
    "clf__max_depth": [None, 8, 16],
    "clf__min_samples_leaf": [1, 2, 5]
}
rf_pipe = Pipeline(steps=[("pre", pre), ("clf", RandomForestClassifier(random_state=42))])
rf_search = GridSearchCV(rf_pipe, rf_grid, cv=cv, scoring=scorer, n_jobs=-1, verbose=1)
rf_search.fit(X_train, y_train)

print("Best LR params:", lr_search.best_params_, "| CV score:", lr_search.best_score_)
print("Best RF params:", rf_search.best_params_, "| CV score:", rf_search.best_score_)

# Pick best-by-CV
cv_best_est = lr_search.best_estimator_ if lr_search.best_score_ >= rf_search.best_score_ else rf_search.best_estimator_
cv_best_name = "LogReg (tuned)" if cv_best_est is lr_search.best_estimator_ else "RandomForest (tuned)"
print("Selected by CV:", cv_best_name)

# Test-set evaluation
y_pred_cv = cv_best_est.predict(X_test)
print("\nTest set report (CV-selected model):")
print(classification_report(y_test, y_pred_cv, zero_division=0))

# Optional: probabilities for PR/ROC
if hasattr(cv_best_est, "predict_proba") and y.nunique()==2:
    y_scores_cv = cv_best_est.predict_proba(X_test)[:, list(cv_best_est.classes_).index(pos_label)]
    ap_cv = average_precision_score((y_test==pos_label).astype(int), y_scores_cv)
    auc_cv = roc_auc_score((y_test==pos_label).astype(int), y_scores_cv)
    print(f"AP: {ap_cv:.4f} | ROC AUC: {auc_cv:.4f}")
else:
    y_scores_cv = None


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best LR params: {'clf__C': 0.1} | CV score: 0.19235827755986612
Best RF params: {'clf__max_depth': 8, 'clf__min_samples_leaf': 5, 'clf__n_estimators': 200} | CV score: 0.19524121243826112
Selected by CV: RandomForest (tuned)

Test set report (CV-selected model):
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022

AP: 0.2304 | ROC AUC: 0.8261


In [21]:
from sklearn.calibration import CalibratedClassifierCV

cal_best = None
if y.nunique()==2:
    if hasattr(cv_best_est, "predict_proba") or hasattr(cv_best_est, "decision_function"):
        method = "isotonic"  # try 'sigmoid' if overfitting
        cal = CalibratedClassifierCV(cv_best_est, method=method, cv=3)
        cal.fit(X_train, y_train)
        cal_best = cal

        y_scores_cal = cal_best.predict_proba(X_test)[:, list(cal_best.classes_).index(pos_label)]
        ap_cal = average_precision_score((y_test==pos_label).astype(int), y_scores_cal)
        auc_cal = roc_auc_score((y_test==pos_label).astype(int), y_scores_cal)
        print(f"Calibrated ({method}) AP: {ap_cal:.4f} | ROC AUC: {auc_cal:.4f}")
    else:
        print("Calibration skipped: estimator lacks score/proba.")
else:
    print("Calibration skipped: multi-class not enabled here.")


Calibrated (isotonic) AP: 0.2242 | ROC AUC: 0.8266


In [22]:
import pandas as pd

rows = []

def eval_row(name, est, scores=None):
    try:
        yp = est.predict(X_test)
        r = classification_report(y_test, yp, output_dict=True, zero_division=0)
        macro_f1 = r["macro avg"]["f1-score"]
        ap = np.nan
        auc = np.nan
        if y.nunique()==2 and hasattr(est, "predict_proba"):
            sc = est.predict_proba(X_test)[:, list(est.classes_).index(pos_label)]
            ap = average_precision_score((y_test==pos_label).astype(int), sc)
            auc = roc_auc_score((y_test==pos_label).astype(int), sc)
        rows.append({"model": name, "macro_f1": macro_f1, "AP": ap, "ROC_AUC": auc})
    except Exception as e:
        rows.append({"model": name, "macro_f1": np.nan, "AP": np.nan, "ROC_AUC": np.nan})

eval_row(f"Selected earlier — {best_name}", best)
eval_row(cv_best_name, cv_best_est)
if cal_best is not None:
    eval_row(cv_best_name + " + Calibrated", cal_best)
if 'pipe_bal' in globals():
    eval_row("LogReg (class_weight=balanced)", pipe_bal)
if 'smote_pipe' in globals() and smote_pipe is not None:
    eval_row("SMOTE + LogReg", smote_pipe)

comp_df = pd.DataFrame(rows).sort_values(by=["AP","macro_f1"], ascending=False)
display(comp_df)


Unnamed: 0,model,macro_f1,AP,ROC_AUC
4,SMOTE + LogReg,0.547583,0.262421,0.84321
0,Selected earlier — LogReg,0.52543,0.260576,0.840041
3,LogReg (class_weight=balanced),0.544106,0.259845,0.841337
1,RandomForest (tuned),0.487462,0.230398,0.82607
2,RandomForest (tuned) + Calibrated,0.487462,0.224223,0.826553


In [23]:
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance

# Utility to extract feature names after OneHot
def get_feature_names(preprocessor, numeric_features, categorical_features):
    # numeric: as-is
    num_names = list(numeric_features)
    # categorical: get OHE categories
    cat_names = []
    try:
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        cats = ohe.categories_
        for col, cats_for_col in zip(categorical_features, cats):
            cat_names.extend([f"{col}={c}" for c in cats_for_col])
    except Exception as e:
        cat_names = list(categorical_features)  # fallback
    return num_names + cat_names

feature_names = get_feature_names(pre, numeric_features, categorical_features)

def show_top_k_importances(estimator, names, k=20, title="Importances"):
    imp = None
    if hasattr(estimator, "named_steps"):
        est = estimator.named_steps.get("clf", estimator)
    else:
        est = estimator
    # Linear model coefficients
    if hasattr(est, "coef_"):
        coef = est.coef_
        if coef.ndim > 1:  # pick positive class if binary
            idx = 1 if coef.shape[0] > 1 else 0
            coef = coef[idx]
        imp = pd.DataFrame({"feature": names, "importance": coef})
        imp["abs"] = imp["importance"].abs()
        imp = imp.sort_values("abs", ascending=False).head(k)
        imp = imp.drop(columns="abs")
    # Tree-based feature importances
    elif hasattr(est, "feature_importances_"):
        fi = est.feature_importances_
        imp = pd.DataFrame({"feature": names, "importance": fi}).sort_values("importance", ascending=False).head(k)
    if imp is not None:
        fig = px.bar(imp, x="importance", y="feature", orientation="h", title=title)
        fig.update_layout(yaxis={"categoryorder":"total ascending"})
        fig.show()
    else:
        print("Model doesn't expose native importances; using permutation below.")

# Native importances if available
show_top_k_importances(best, feature_names, title=f"{best_name} — native importances")

# Permutation importance (model-agnostic) — can be slow
try:
    if hasattr(best, "predict"):
        perm = permutation_importance(best, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
        imp_perm = pd.DataFrame({"feature": feature_names, "importance": perm.importances_mean}) \
                    .sort_values("importance", ascending=False).head(20)
        fig = px.bar(imp_perm, x="importance", y="feature", orientation="h", title="Permutation importance (test set)")
        fig.update_layout(yaxis={"categoryorder":"total ascending"})
        fig.show()
except Exception as e:
    print("Permutation importance failed:", e)


Permutation importance failed: All arrays must be of the same length


In [24]:
# Choose the final model to export
final_model = cal_best or cv_best_est or best
final_name = ("Calibrated " + cv_best_name) if cal_best is not None else (cv_best_name if cv_best_est is not None else best_name)

# Choose a default threshold:
#  - If calibrated scores exist: use min-cost threshold (from Cell 10) if defined; else best-F1; else 0.5
default_threshold = None
if y.nunique()==2 and (hasattr(final_model, "predict_proba") or hasattr(final_model, "decision_function")):
    default_threshold = locals().get("best_thr_cost") or locals().get("best_thr") or 0.5
else:
    default_threshold = 0.5

# Save
ARTIFACT_DIR = ""
MODEL_PATH = os.path.join(ARTIFACT_DIR, "model.joblib")
META_PATH = os.path.join(ARTIFACT_DIR, "model_meta.json")

joblib.dump(final_model, MODEL_PATH)
meta = {
    "best_model": final_name,
    "target_col": TARGET_COL,
    "numeric_features": numeric_features,
    "categorical_features": categorical_features,
    "default_threshold": float(default_threshold),
    "pos_label": int(pos_label) if isinstance(pos_label, (int, np.integer)) else str(pos_label)
}
with open(META_PATH, "w") as f:
    json.dump(meta, f, indent=2)

print("Saved:", MODEL_PATH, META_PATH)
print("Default threshold:", default_threshold, "| Positive label:", pos_label)


Saved: model.joblib model_meta.json
Default threshold: 0.15 | Positive label: 1
