In [None]:
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import traceback
import joblib
import warnings
warnings.filterwarnings("ignore")


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)

In [None]:
# Optional dependency: XGBoost. HAS_XGB records whether the import succeeded.
# Any exception raised while importing (not only ImportError) is treated as
# "not available".
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    HAS_XGB = False

# Optional dependency: the SMOTE oversampler from imbalanced-learn.
try:
    from imblearn.over_sampling import SMOTE
    HAS_SMOTE = True
except Exception:
    HAS_SMOTE = False

# Optional dependency: SHAP, used for the RandomForest explanation step.
try:
    import shap
    HAS_SHAP = True
except Exception:
    HAS_SHAP = False

In [None]:
# Input dataset (required) and companion Excel workbook (optional; only scanned
# for a hint about the target column).
CSV_PATH = "rwsi_data.csv"
XLSX_PATH = "RWSI.xlsx"
# Directory that receives the saved pipelines, the text report and the results JSON.
OUTPUT_DIR = Path("rwsi_full_outputs")
# Create the output directory up front; a pre-existing directory is not an error.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Fraction of rows held out by train_test_split.
TEST_SIZE = 0.2
# Seed shared by the split, the CV folds, SMOTE and every seeded estimator.
RANDOM_STATE = 42
# Number of KFold splits used for cross-validation.
CV_FOLDS = 5
# Categorical columns with at most this many distinct values are one-hot
# encoded; columns above it are ordinal encoded.
LOW_CARDINALITY_THRESHOLD = 20
# When True (and imblearn is installed) the training split is oversampled with SMOTE.
USE_SMOTE = False
# When True, LogisticRegression / DecisionTree / RandomForest get class_weight="balanced".
CLASS_WEIGHT_BALANCED = False

In [None]:
# Every line emitted through write_report_line, in call order.
report_lines = []


def write_report_line(s=""):
    """Echo ``s`` to stdout and remember its string form in ``report_lines``."""
    print(s)
    as_text = str(s)
    report_lines.append(as_text)

In [None]:
import re

# Load the CSV dataset (required) and, when the companion Excel workbook is
# present, scan its text for a hint about which column is the prediction target.
# Leaves `df` (the loaded frame) and `detected_target` (column name or None).
write_report_line("==== Loading files ====")
xlsx_path = Path(XLSX_PATH)
csv_path = Path(CSV_PATH)
if not csv_path.exists():
    raise FileNotFoundError(f"CSV not found at {csv_path}")
df = pd.read_csv(csv_path)
write_report_line(f"Loaded CSV: {csv_path} shape={df.shape}")

detected_target = None
if xlsx_path.exists():
    try:
        # The context manager releases the workbook handle even if parsing fails.
        with pd.ExcelFile(xlsx_path) as xls:
            write_report_line(f"Loaded Excel: {xlsx_path} sheets={xls.sheet_names}")

            # Flatten the first 400 rows of every sheet into one lower-cased string.
            sheet_text = ""
            for name in xls.sheet_names:
                sheet_df = xls.parse(name, nrows=400).fillna("").astype(str)
                rows_joined = sheet_df.agg(' '.join, axis=1).tolist()
                sheet_text += " " + " ".join(rows_joined).lower()

        m = re.search(r"(monetaryconversion|monetary conversion|target variable|target)\s*(?:[:\-]|\s)\s*([a-zA-Z0-9_]+)?", sheet_text)

        # First preference: the workbook mentions "monetary conversion" and the
        # CSV actually has that column.
        if "monetaryconversion" in sheet_text.replace(" ", "") or "monetary conversion" in sheet_text:

            if "MonetaryConversion" in df.columns:
                detected_target = "MonetaryConversion"
                write_report_line("Detected target from Excel text: MonetaryConversion")

        # Second preference: the word following "target" in the workbook text.
        if detected_target is None and m and m.group(2):
            cand = m.group(2)
            # sheet_text was lower-cased, so `cand` is lower-case as well; map it
            # back to the real column name instead of comparing case-sensitively.
            columns_by_lower = {str(col).lower(): col for col in df.columns}
            matched_col = cand if cand in df.columns else columns_by_lower.get(cand)
            if matched_col is not None:
                detected_target = matched_col
                write_report_line(f"Detected target from Excel regex: {matched_col}")
    except Exception as e:
        # Best effort: a broken workbook must not stop the run.
        write_report_line("Failed to parse Excel for instructions: " + repr(e))
else:
    write_report_line("Excel file not found; proceeding with heuristics.")


==== Loading files ====
Loaded CSV: rwsi_data.csv shape=(12330, 20)
Excel file not found; proceeding with heuristics.


In [None]:
# Fallback 1: take the first well-known target name that exists in the CSV.
if detected_target is None:
    candidates = ["MonetaryConversion", "monetaryconversion", "Monetary_Conversion", "conversion", "Conversion", "target", "label", "y"]
    first_match = next((cand_name for cand_name in candidates if cand_name in df.columns), None)
    if first_match is not None:
        detected_target = first_match
        write_report_line(f"Heuristic selected target: {detected_target}")

# Fallback 2: nothing matched, so assume the last column is the target.
if detected_target is None:
    detected_target = df.columns[-1]
    write_report_line(f"No explicit target found; falling back to last column: {detected_target}")


Heuristic selected target: MonetaryConversion


In [None]:
# Record a short overview of the loaded frame in the report.
column_names = df.columns.tolist()
for eda_line in (
    "\n==== Quick EDA ====",
    f"Columns ({len(df.columns)}): {column_names}",
    f"Dataset shape: {df.shape}",
    f"Target column chosen: {detected_target}",
):
    write_report_line(eda_line)


==== Quick EDA ====
Columns (20): ['SessionID', 'AdClicks', 'InfoSectionCount', 'InfoSectionTime', 'HelpPageVisits', 'HelpPageTime', 'ItemBrowseCount', 'ItemBrowseTime', 'ExitRateFirstPage', 'SessionExitRatio', 'PageEngagementScore', 'HolidayProximityIndex', 'VisitMonth', 'UserPlatformID', 'WebClientCode', 'MarketZone', 'TrafficSourceCode', 'UserCategory', 'IsWeekendVisit', 'MonetaryConversion']
Dataset shape: (12330, 20)
Target column chosen: MonetaryConversion


In [None]:
# Fail fast when the chosen target is missing; otherwise report its class balance
# (missing values are counted as their own bucket).
if detected_target not in df.columns:
    raise KeyError(f"Target column {detected_target} not present in CSV")

targ_counts = df[detected_target].value_counts(dropna=False)
write_report_line(f"Target value counts:\n{targ_counts.to_dict()}")

Target value counts:
{'No': 10422, 'Yes': 1908}


In [None]:
write_report_line("\n==== Preprocessing & feature engineering ====")

# Keep only the columns that hold at least one non-missing value.
entirely_missing = df.isna().all()
df = df.loc[:, ~entirely_missing]
write_report_line(f"After dropping empty columns: {df.shape}")


==== Preprocessing & feature engineering ====
After dropping empty columns: (12330, 20)


In [None]:
# Try to parse text columns whose name suggests a date or time. A successfully
# parsed column is added next to the original as "<name>_parsed" and its name is
# recorded in datetime_cols for the feature-expansion step.
datetime_cols = []
for col in list(df.columns):  # snapshot: the loop adds columns to df
    col_lower = col.lower()
    looks_temporal = "date" in col_lower or "time" in col_lower
    # The dtype check must apply to BOTH name patterns. Without the grouping,
    # `and` binds tighter than `or`, so numeric columns with "date" in their
    # name were parsed as nanosecond epochs.
    if looks_temporal and df[col].dtype == object:
        parsed = pd.to_datetime(df[col], errors="coerce")
        parsed_count = parsed.notna().sum()
        if parsed_count > 0:
            df[col + "_parsed"] = parsed
            datetime_cols.append(col + "_parsed")
            write_report_line(f"Parsed {col} -> {col+'_parsed'} (parsed count {parsed_count})")

In [None]:
# Columns that already carry a datetime dtype are queued for expansion as well,
# unless they are already on the list.
native_datetime_cols = df.select_dtypes(include=["datetime64[ns]", "datetime64"]).columns.tolist()
for col in native_datetime_cols:
    if col in datetime_cols:
        continue
    datetime_cols.append(col)

In [None]:
# Replace every datetime column by numeric calendar features plus a Unix
# timestamp in seconds. Column names are de-duplicated while keeping order.
for dcol in list(dict.fromkeys(datetime_cols)):
    try:
        stamps = pd.to_datetime(df[dcol], errors="coerce")
        df[dcol] = stamps
        df[f"{dcol}_year"] = stamps.dt.year
        df[f"{dcol}_month"] = stamps.dt.month
        df[f"{dcol}_day"] = stamps.dt.day
        df[f"{dcol}_weekday"] = stamps.dt.weekday
        df[f"{dcol}_dayofyear"] = stamps.dt.dayofyear

        # Seconds since the epoch via timedelta arithmetic: NaT becomes NaN
        # (instead of the int64 minimum that an integer cast produces) and the
        # result does not depend on the column's storage resolution. The epoch
        # takes the column's timezone so tz-aware data subtracts cleanly.
        epoch = pd.Timestamp(0, tz=stamps.dt.tz)
        df[f"{dcol}_unix"] = (stamps - epoch) // pd.Timedelta(seconds=1)

        df = df.drop(columns=[dcol])
        write_report_line(f"Converted datetime column {dcol} into numeric features")
    except Exception as e:
        # Best effort: one bad column must not stop the remaining conversions.
        write_report_line(f"Datetime conversion failed for {dcol}: {e}")


In [None]:
# Rows without a target value cannot be used for training or evaluation.
has_target = df[detected_target].notna()
df = df.loc[has_target].reset_index(drop=True)
write_report_line(f"After dropping rows with missing target: {df.shape}")

After dropping rows with missing target: (12330, 20)


In [None]:
# Separate the feature matrix from the raw target; both are copies, so later
# edits to X or y_raw leave df untouched.
X = df.drop(detected_target, axis="columns").copy()
y_raw = df[detected_target].copy()


In [None]:
# Decide between regression and classification from the target's dtype and
# cardinality.
y = y_raw
if y.dtype == object:
    # A text target whose values are mostly numeric is treated as numeric.
    y_num = pd.to_numeric(y, errors="coerce")
    numeric_share = y_num.notna().sum() / len(y)
    if numeric_share > 0.5:
        y = y_num
        write_report_line("Converted majority of target values to numeric; treating as numeric.")

# Numeric target with more than 20 distinct values -> regression.
is_regression = bool(pd.api.types.is_numeric_dtype(y) and y.nunique() > 20)
problem_type = "Regression" if is_regression else "Classification"
write_report_line("Problem type: " + problem_type)


Problem type: Classification


In [None]:
# Build y_enc, the array handed to the estimators. label_encoder stays None
# unless a non-numeric classification target had to be label-encoded.
label_encoder = None
if is_regression:
    y_enc = y.astype(float).values
elif pd.api.types.is_numeric_dtype(y):
    y_enc = y.astype(int).values
else:
    label_encoder = LabelEncoder()
    y_enc = label_encoder.fit_transform(y.astype(str))
    write_report_line(f"Label mapping: {dict(enumerate(label_encoder.classes_))}")


Label mapping: {0: 'No', 1: 'Yes'}


In [None]:
# Cast boolean feature columns to integers so they are picked up as numeric.
bool_cols = [col_name for col_name, col_dtype in X.dtypes.items() if col_dtype == "bool"]
if bool_cols:
    X = X.astype({col_name: int for col_name in bool_cols})

In [None]:
# Partition the features into numeric, low-cardinality categorical (one-hot
# encoded) and high-cardinality categorical (ordinal encoded) columns.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

write_report_line(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
write_report_line(f"Categorical columns ({len(cat_cols)}): {cat_cols}")

# Distinct-value count per categorical column, computed once (NaN is a level).
cat_cardinality = {c: X[c].nunique(dropna=False) for c in cat_cols}

# A non-numeric column with a different value in every row is an identifier
# (e.g. a session id). Ordinal-encoding it is harmful: the codes reflect only
# the sort order of the training ids and every unseen id maps to the same
# "unknown" code, so tree models can overfit on it. The preprocessor uses
# remainder="drop", so leaving such columns out of both lists removes them
# from the model inputs.
id_like = [c for c in cat_cols if len(X) > 1 and cat_cardinality[c] == len(X)]
if id_like:
    write_report_line(f"Identifier-like columns excluded from modelling: {id_like}")

low_card = [c for c in cat_cols
            if c not in id_like and cat_cardinality[c] <= LOW_CARDINALITY_THRESHOLD]
high_card = [c for c in cat_cols
             if c not in id_like and cat_cardinality[c] > LOW_CARDINALITY_THRESHOLD]
write_report_line(f"Low-card categorical: {low_card}")
write_report_line(f"High-card categorical: {high_card}")


Numeric columns (13): ['AdClicks', 'InfoSectionCount', 'InfoSectionTime', 'HelpPageVisits', 'HelpPageTime', 'ItemBrowseCount', 'ItemBrowseTime', 'ExitRateFirstPage', 'SessionExitRatio', 'PageEngagementScore', 'HolidayProximityIndex', 'TrafficSourceCode', 'IsWeekendVisit']
Categorical columns (6): ['SessionID', 'VisitMonth', 'UserPlatformID', 'WebClientCode', 'MarketZone', 'UserCategory']
Low-card categorical: ['VisitMonth', 'UserPlatformID', 'WebClientCode', 'MarketZone', 'UserCategory']
High-card categorical: ['SessionID']


In [None]:
import sklearn

# The dense-output flag of OneHotEncoder is passed as `sparse_output` from
# scikit-learn 1.2 on and as `sparse` for older releases.
sk_ver = tuple(int(x) for x in sklearn.__version__.split('.')[:2])
ohe_kwargs = {'sparse_output': False} if sk_ver >= (1, 2) else {'sparse': False}

# Numeric features: median imputation followed by standardisation.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Low-cardinality categoricals: mode imputation + one-hot (unseen levels ignored).
cat_low_transformer = (
    Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", **ohe_kwargs)),
    ])
    if low_card
    else None
)

# High-cardinality categoricals: mode imputation + ordinal codes (unseen -> -1).
cat_high_transformer = (
    Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])
    if high_card
    else None
)

# Only register a transformer when its column group is non-empty.
transformer_specs = [
    ("num", num_transformer, numeric_cols),
    ("cat_low", cat_low_transformer, low_card),
    ("cat_high", cat_high_transformer, high_card),
]
transformers = [spec for spec in transformer_specs if spec[2]]
preprocessor = ColumnTransformer(transformers=transformers, remainder="drop", sparse_threshold=0)


In [None]:
# Hold out TEST_SIZE of the rows; classification splits are stratified on the
# encoded labels, regression splits are not.
stratify_labels = None if is_regression else y_enc
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=stratify_labels,
)

write_report_line(f"Train/test shapes: {X_train.shape}, {X_test.shape}")


Train/test shapes: (9864, 19), (2466, 19)


In [None]:
# Optional oversampling of the training split (classification only).
if USE_SMOTE and (not is_regression) and HAS_SMOTE:
    write_report_line("Applying SMOTE on training set to handle class imbalance (USE_SMOTE=True)")
    sm = SMOTE(random_state=RANDOM_STATE)
    try:
        # SMOTE runs on the raw, un-preprocessed frame here, so it rejects
        # text columns and missing values. Report that and keep the original
        # split instead of aborting the whole notebook.
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    except (ValueError, TypeError) as e:
        write_report_line(f"SMOTE could not be applied to the raw training frame; continuing without resampling: {e!r}")
    else:
        X_train, y_train = X_train_res, y_train_res
        write_report_line(f"After SMOTE train shape: {X_train.shape}")
elif USE_SMOTE and not HAS_SMOTE:
    write_report_line("USE_SMOTE requested but imblearn not installed. Skipping SMOTE.")


In [None]:
write_report_line("\n==== Model definitions ====")


def _with_preprocessing(step_name, estimator):
    """Chain the shared preprocessor with ``estimator`` stored under ``step_name``."""
    return Pipeline([("pre", preprocessor), (step_name, estimator)])


# name -> pipeline; insertion order is the training order.
models = {}

if is_regression:
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    models["LinearRegression"] = _with_preprocessing("reg", LinearRegression())
    models["DecisionTreeRegressor"] = _with_preprocessing("reg", DecisionTreeRegressor(random_state=RANDOM_STATE))
    models["RandomForestRegressor"] = _with_preprocessing("reg", RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE))
    if HAS_XGB:
        models["XGBoostRegressor"] = _with_preprocessing("reg", xgb.XGBRegressor(random_state=RANDOM_STATE))
else:
    # Estimators that accept class_weight get "balanced" when requested.
    balance_kwargs = {"class_weight": "balanced"} if CLASS_WEIGHT_BALANCED else {}
    lr_kwargs = {"max_iter": 2000, **balance_kwargs}
    dt_kwargs = {"random_state": RANDOM_STATE, **balance_kwargs}
    rf_kwargs = {"n_estimators": 200, "random_state": RANDOM_STATE, "n_jobs": 1, **balance_kwargs}

    models["LogisticRegression"] = _with_preprocessing("clf", LogisticRegression(**lr_kwargs))
    models["DecisionTree"] = _with_preprocessing("clf", DecisionTreeClassifier(**dt_kwargs))
    models["RandomForest"] = _with_preprocessing("clf", RandomForestClassifier(**rf_kwargs))
    models["NaiveBayes"] = _with_preprocessing("clf", GaussianNB())
    models["SVC"] = _with_preprocessing("clf", SVC(probability=True, random_state=RANDOM_STATE))
    if HAS_XGB:
        models["XGBoost"] = _with_preprocessing("clf", xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE))

write_report_line(f"Models to train: {list(models.keys())}")



==== Model definitions ====
Models to train: ['LogisticRegression', 'DecisionTree', 'RandomForest', 'NaiveBayes', 'SVC', 'XGBoost']


In [None]:
# Fit every pipeline on the training split, score it on the hold-out split,
# cross-validate it on the full data set and persist it with joblib.
# `results` maps model name -> metrics dict (or {"error": ...} on failure).
write_report_line("\n==== Training & evaluation ====")
results = {}
for name, pipe in models.items():
    try:
        write_report_line(f"\n-- Training {name} ...")
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        if not is_regression:
            # Class probabilities are needed for ROC-AUC. They must come from the
            # whole pipeline so that X_test is preprocessed first. The previous
            # code called the bare classifier with a bound method as its
            # argument; that always raised and was silently swallowed, leaving
            # ROC-AUC as None for every model.
            y_proba = None
            final_estimator = pipe.steps[-1][1]
            if hasattr(final_estimator, "predict_proba"):
                try:
                    y_proba = pipe.predict_proba(X_test)
                except Exception as e:
                    write_report_line(f"predict_proba failed for {name}: {e!r}")

            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
            rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
            f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

            roc = None
            if y_proba is not None:
                try:
                    if y_proba.shape[1] == 2:
                        # Binary case: score with the probability of class 1.
                        roc = roc_auc_score(y_test, y_proba[:, 1])
                    else:
                        # Multi-class case: macro average over one-hot targets.
                        roc = roc_auc_score(pd.get_dummies(y_test), y_proba, average="macro")
                except Exception as e:
                    write_report_line(f"ROC-AUC could not be computed for {name}: {e!r}")

            cm = confusion_matrix(y_test, y_pred)
            results[name] = {"accuracy": float(acc), "precision_macro": float(prec), "recall_macro": float(rec), "f1_macro": float(f1), "roc_auc": float(roc) if roc is not None else None, "confusion_matrix": cm.tolist()}
            write_report_line(f"{name} metrics -> Acc: {acc:.4f}, Precision (macro): {prec:.4f}, Recall (macro): {rec:.4f}, F1 (macro): {f1:.4f}, ROC-AUC: {roc}")
        else:
            from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            results[name] = {"rmse": float(rmse), "mae": float(mae), "r2": float(r2)}
            write_report_line(f"{name} metrics -> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

        # Cross-validation on the full data set; cross_val_score fits clones,
        # so the pipeline fitted above is left untouched.
        try:
            cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
            if not is_regression:
                scoring = "f1_macro"
                cv_scores = cross_val_score(pipe, X, y_enc, scoring=scoring, cv=cv, n_jobs=1)
                write_report_line(f"{name} CV ({scoring}) mean ± std: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
                results[name]["cv_"+scoring] = float(cv_scores.mean())
            else:
                neg_mse = cross_val_score(pipe, X, y, scoring="neg_mean_squared_error", cv=cv, n_jobs=1)
                cv_rmse = np.sqrt(-neg_mse)
                write_report_line(f"{name} CV RMSE mean ± std: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")
                results[name]["cv_rmse_mean"] = float(cv_rmse.mean())
        except Exception as e:
            write_report_line(f"Cross-validation failed for {name}: {e}")
            results[name]["cv_error"] = str(e)

        model_path = OUTPUT_DIR / f"{name}.joblib"
        joblib.dump(pipe, model_path)
        write_report_line(f"Saved pipeline to: {model_path}")

    except Exception as e:
        # One failing model must not stop the others; keep the traceback in the report.
        write_report_line(f"Training failed for {name}: {repr(e)}")
        write_report_line(traceback.format_exc())
        results[name] = {"error": str(e)}


==== Training & evaluation ====

-- Training LogisticRegression ...
LogisticRegression metrics -> Acc: 0.8788, Precision (macro): 0.8110, Recall (macro): 0.6599, F1 (macro): 0.6993, ROC-AUC: None
LogisticRegression CV (f1_macro) mean ± std: 0.7140 ± 0.0145
Saved pipeline to: rwsi_full_outputs/LogisticRegression.joblib

-- Training DecisionTree ...
DecisionTree metrics -> Acc: 0.7936, Precision (macro): 0.6497, Recall (macro): 0.7004, F1 (macro): 0.6658, ROC-AUC: None
DecisionTree CV (f1_macro) mean ± std: 0.6705 ± 0.0241
Saved pipeline to: rwsi_full_outputs/DecisionTree.joblib

-- Training RandomForest ...
RandomForest metrics -> Acc: 0.8990, Precision (macro): 0.8354, Recall (macro): 0.7425, F1 (macro): 0.7773, ROC-AUC: None
RandomForest CV (f1_macro) mean ± std: 0.7871 ± 0.0069
Saved pipeline to: rwsi_full_outputs/RandomForest.joblib

-- Training NaiveBayes ...
NaiveBayes metrics -> Acc: 0.7174, Precision (macro): 0.6260, Recall (macro): 0.7195, F1 (macro): 0.6264, ROC-AUC: None
Nai

In [None]:
# Report the strongest features of each tree-based model, reading the fitted
# pipelines back from disk.
write_report_line("\n==== Feature importances (tree models) ====")
for name in models:
    is_tree_model = "DecisionTree" in name or "RandomForest" in name or ("XGBoost" in name and HAS_XGB)
    if not is_tree_model:
        continue
    try:
        pipe = joblib.load(OUTPUT_DIR / f"{name}.joblib")
        pre = pipe.named_steps["pre"]

        # Rebuild the output column names in the order the transformers are listed.
        feature_names = []
        for tname, trans, cols in pre.transformers_:
            if tname in ("num", "cat_high"):
                # These transformers keep one output column per input column.
                feature_names.extend(cols)
            elif tname == "cat_low":
                onehot_names = trans.named_steps["onehot"].get_feature_names_out(cols)
                feature_names.extend(onehot_names.tolist())

        model_obj = pipe.named_steps[list(pipe.named_steps.keys())[-1]]
        if not hasattr(model_obj, "feature_importances_"):
            write_report_line(f"{name} model has no feature_importances_ attribute")
            continue

        importances = model_obj.feature_importances_
        # Indices of the 30 largest importances, strongest first.
        idx = np.argsort(importances)[::-1][:30]
        fi = []
        for i in idx:
            label = feature_names[i] if i < len(feature_names) else f"f{i}"
            fi.append((label, float(importances[i])))
        write_report_line(f"{name} top features: {fi[:20]}")
        results[name]["feature_importances"] = fi
    except Exception as e:
        write_report_line(f"Failed to extract importances for {name}: {e}")



==== Feature importances (tree models) ====
DecisionTree top features: [('PageEngagementScore', 0.41218393736379244), ('SessionExitRatio', 0.068517975531233), ('SessionID', 0.06826503019226485), ('ItemBrowseTime', 0.06474474503844489), ('ExitRateFirstPage', 0.05995865675188367), ('ItemBrowseCount', 0.050620335277355), ('InfoSectionTime', 0.04447332683315313), ('InfoSectionCount', 0.03419893011514298), ('VisitMonth_November', 0.019962064287095797), ('HelpPageTime', 0.018804226474127817), ('TrafficSourceCode', 0.01751812542441936), ('AdClicks', 0.01470519790204845), ('HelpPageVisits', 0.011183121949028832), ('MarketZone_North America', 0.007351915519834998), ('VisitMonth_March', 0.006861964114417008), ('MarketZone_Other', 0.006738687201029202), ('IsWeekendVisit', 0.005799569893359543), ('MarketZone_Asia-Pacific', 0.0056211766553364555), ('UserCategory_Returning', 0.005289259396823774), ('UserPlatformID_MacOS', 0.005282900497130034)]
RandomForest top features: [('PageEngagementScore', 0.

In [None]:
# SHAP explanation of the saved RandomForest pipeline on a sample of at most
# 200 rows; stores the 20 features with the largest mean |SHAP| value.
if HAS_SHAP and "RandomForest" in models:
    try:
        write_report_line("\n==== SHAP explanation for RandomForest (top 20) ====")
        rf_pipe = joblib.load(OUTPUT_DIR / "RandomForest.joblib")
        rf_pre = rf_pipe.named_steps["pre"]

        X_sample = X.sample(n=min(200, len(X)), random_state=RANDOM_STATE)
        X_trans = rf_pre.transform(X_sample)
        model_rf = rf_pipe.named_steps["clf"]
        explainer = shap.TreeExplainer(model_rf)
        shap_values = explainer.shap_values(X_trans)

        try:
            n_features = X_trans.shape[1]
            if isinstance(shap_values, list):
                # One (samples, features) array per class: average |SHAP| over
                # samples, then over classes.
                mean_abs = np.mean([np.abs(sv).mean(0) for sv in shap_values], axis=0)
            else:
                abs_vals = np.abs(np.asarray(shap_values))
                if abs_vals.ndim == 3:
                    # A single 3-D array carries an extra class axis. Reducing
                    # over samples only leaves a 2-D result, and indexing the
                    # name list with its argsort raised "only integer scalar
                    # arrays can be converted to a scalar index". Locate the
                    # feature axis by size and average over the other two.
                    feature_axis = 1 if abs_vals.shape[1] == n_features else 2
                    reduce_axes = tuple(ax for ax in range(3) if ax != feature_axis)
                    mean_abs = abs_vals.mean(axis=reduce_axes)
                else:
                    mean_abs = abs_vals.mean(0)

            # Rebuild the output column names in transformer order.
            feat_names = []
            for tname, trans, cols in rf_pre.transformers_:
                if tname == "num":
                    feat_names.extend(cols)
                elif tname == "cat_low":
                    ohe = trans.named_steps["onehot"]
                    feat_names.extend(ohe.get_feature_names_out(cols).tolist())
                elif tname == "cat_high":
                    feat_names.extend(cols)
            idx = np.argsort(mean_abs)[::-1][:20]
            shap_top = [(feat_names[i], float(mean_abs[i])) for i in idx]
            write_report_line(f"SHAP top features (RandomForest): {shap_top}")
            results["RandomForest"]["shap_top20"] = shap_top
        except Exception as e:
            write_report_line("SHAP summarization failed: " + str(e))
    except Exception as e:
        write_report_line("SHAP step failed: " + str(e))


==== SHAP explanation for RandomForest (top 20) ====
SHAP summarization failed: only integer scalar arrays can be converted to a scalar index


In [None]:
# Write everything collected in report_lines so far to the text report.
# NOTE(review): lines reported after this cell runs are not part of the file.
write_report_line("\n==== Saving report & artifacts ====")
report_path = OUTPUT_DIR / "rwsi_full_report.txt"
# The report contains non-ASCII characters (the "±" in the CV lines), so the
# encoding is fixed to UTF-8 instead of relying on the platform default.
with open(report_path, "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines))
write_report_line(f"Report written to: {report_path}")


==== Saving report & artifacts ====
Report written to: rwsi_full_outputs/rwsi_full_report.txt


In [None]:
import json

# Persist the collected metrics as pretty-printed JSON next to the saved models.
results_json_path = OUTPUT_DIR / "rwsi_results.json"
with results_json_path.open("w") as json_file:
    json.dump(results, json_file, indent=2)
write_report_line(f"Results JSON saved to: {results_json_path}")

Results JSON saved to: rwsi_full_outputs/rwsi_results.json


In [None]:
# Condensed per-model summary: only the headline metrics that are present.
write_report_line("\n==== Summary of trained models & key metrics ====")
summary_keys = ("accuracy", "precision_macro", "recall_macro", "f1_macro", "roc_auc", "cv_f1_macro", "cv_rmse_mean", "rmse")
for m, info in results.items():
    write_report_line(f"Model: {m}")
    if not isinstance(info, dict):
        write_report_line(f"  info: {info}")
        continue
    for k in (key for key in summary_keys if key in info):
        write_report_line(f"  {k}: {info[k]}")
    if "confusion_matrix" in info:
        write_report_line(f"  confusion_matrix: {info['confusion_matrix']}")

write_report_line("\nAll artifacts (models, report, results) saved to: " + str(OUTPUT_DIR))
write_report_line("If you want: I can (1) run class-balance experiments (SMOTE / class_weight), (2) hyperparameter tune RandomForest/XGBoost, or (3) produce plots (ROC/PR/feature distributions). Tell me which and I'll run it next.")


==== Summary of trained models & key metrics ====
Model: LogisticRegression
  accuracy: 0.8787510137875101
  precision_macro: 0.8110464051122082
  recall_macro: 0.6599496538071168
  f1_macro: 0.6993088868416899
  roc_auc: None
  cv_f1_macro: 0.7139921749531424
  confusion_matrix: [[2036, 48], [251, 131]]
Model: DecisionTree
  accuracy: 0.7935928629359287
  precision_macro: 0.6496782872253903
  recall_macro: 0.7004288470621338
  f1_macro: 0.6657745602419471
  roc_auc: None
  cv_f1_macro: 0.6704925855242673
  confusion_matrix: [[1741, 343], [166, 216]]
Model: RandomForest
  accuracy: 0.8990267639902676
  precision_macro: 0.8354445226366409
  recall_macro: 0.7424983167690005
  f1_macro: 0.7773486154135121
  roc_auc: None
  cv_f1_macro: 0.7871181839036276
  confusion_matrix: [[2020, 64], [185, 197]]
Model: NaiveBayes
  accuracy: 0.7173560421735604
  precision_macro: 0.626023834100465
  recall_macro: 0.7194619187828482
  f1_macro: 0.6263528335565547
  roc_auc: None
  cv_f1_macro: 0.6467863