
# IFN580 – Assignment 1: Starter Notebook


## 1) Config & Imports

In [None]:

# ---- User config ----
# Seed reused for NumPy, the train/test split, CV folds and the estimators below.
RANDOM_STATE = 42

# Name of the label column; change if different.
TARGET_COL = "IsBadBuy"

# Candidate CSV locations, checked in this order when the data is loaded.
DATA_PATHS = [
    "kick.csv",                    # CSV placed next to this notebook
    "assignment 1 data kick.csv",  # alternative file name (rename as needed)
    "./data/kick.csv",             # common project structure
]

# ---- Imports ----
import os, sys, math, json, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             roc_curve, auc, RocCurveDisplay, classification_report,
                             accuracy_score)
from sklearn.feature_selection import RFE, RFECV, SelectFromModel

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt

# Seed NumPy's global RNG from the shared config value.
np.random.seed(RANDOM_STATE)
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")


## 2) Load Data

In [None]:

def find_data(paths):
    """Return the first entry of `paths` that exists on disk, or None if none do."""
    existing = (candidate for candidate in paths if os.path.exists(candidate))
    return next(existing, None)

# Resolve the CSV location from the configured candidates; fail early if none exists.
DATA_PATH = find_data(DATA_PATHS)
if DATA_PATH is None:
    raise FileNotFoundError(
        f"CSV not found. Place your kick dataset next to this notebook as 'kick.csv' "
        f"or update DATA_PATHS."
    )

print(f"Using data at: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# Replace the '?' sentinel with real missing values so the pipeline can impute correctly.
df.replace('?', np.nan, inplace=True)

# Try to convert columns that look numeric into real numbers.
coerced = []
for c in df.columns:
    # Never touch the label column; use the configured name rather than a literal
    # so changing TARGET_COL in the config cell is honoured here too.
    if c == TARGET_COL:
        continue
    s = pd.to_numeric(df[c], errors="coerce")
    # Treat as numeric if more than 60% of rows (and more than 100 rows) convert.
    # Values that do not convert become NaN in the coerced column.
    if (s.notna().mean() > 0.6) and (s.notna().sum() > 100):
        df[c] = s
        coerced.append(c)
print("Coerced to numeric (first few):", coerced[:10], "| total:", len(coerced))



## 3) Quick Audit (Task 1)



In [None]:

# Basic info.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" under the report.
df.info()
display(df.describe(include='all').T)

# Target distribution (before preprocessing)
if TARGET_COL not in df.columns:
    raise KeyError(f"TARGET_COL '{TARGET_COL}' not found. Set TARGET_COL correctly.")

# dropna=False keeps a row for missing labels so they show up in the audit.
target_counts = df[TARGET_COL].value_counts(dropna=False)
target_ratio  = target_counts / len(df)
print("Target counts (before):")
display(pd.DataFrame({"count": target_counts, "ratio": target_ratio}))


## 4) Train/Test Split (stratified)

In [None]:
# TARGET_COL comes from the config cell; it is deliberately not re-assigned here
# so a user-configured target name is not silently reset.

# Drop the identifier and raw date columns when present.
# NOTE(review): the original comment says a PurchaseTimestamp column already covers
# the date — confirm against the dataset.
drop_cols = [c for c in ["PurchaseID", "PurchaseDate"] if c in df.columns]
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL] + drop_cols)

# Stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
print("Train/Test sizes:", X_train.shape, X_test.shape)



## 5) Preprocessing Pipeline (ColumnTransformer)
- **Numeric**: median imputation + (optional) scaling
- **Categorical**: mode imputation + OneHotEncoder(handle_unknown="ignore")


In [None]:
import sklearn
from packaging import version

# Split the training columns by dtype: numeric ones first, everything else is
# treated as categorical.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.columns.difference(numeric_features).tolist()

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# The dense-output flag of OneHotEncoder is called `sparse_output` from sklearn 1.2
# onwards and `sparse` before that; pick the name that matches the installed version.
ohe_kwargs = {
    "handle_unknown": "ignore",
    (
        "sparse_output"
        if version.parse(sklearn.__version__) >= version.parse("1.2")
        else "sparse"
    ): False,
}

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(**ohe_kwargs)),
])

# Route each column group through its own branch; unlisted columns are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

print("Numeric features:", len(numeric_features))
print("Categorical features:", len(categorical_features))


## 6) Helpers: ROC / Confusion Matrix / Summary

In [None]:
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc, RocCurveDisplay,
    precision_recall_curve, average_precision_score,
    accuracy_score
)
import matplotlib.pyplot as plt
import numpy as np

def get_scores(model, X):
    """Return 1-D scores for ROC/PR curves.

    Uses column 1 of `predict_proba` when the model has it, otherwise the raw
    output of `decision_function`. Raises ValueError when the model has neither.
    """
    extractors = (
        ("predict_proba", lambda out: out[:, 1]),
        ("decision_function", lambda out: out),
    )
    for method_name, pick in extractors:
        if hasattr(model, method_name):
            return pick(getattr(model, method_name)(X))
    raise ValueError("Model has neither predict_proba nor decision_function.")

def plot_roc(model, X_test, y_test, title=None):
    """Plot the ROC curve of `model` on the given data.

    Parameters
    ----------
    model : fitted estimator accepted by `get_scores`.
    X_test, y_test : features and true labels to evaluate on.
    title : optional string used as the curve label and the title prefix;
        defaults to the model's class name (label) and "ROC" (title).

    Returns
    -------
    (roc_auc, y_score) : the area under the curve and the score vector used.
        Callers that only need the AUC must unpack the tuple.
    """
    y_score = get_scores(model, X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    # Draw on an explicit Axes: RocCurveDisplay.plot() opens its own figure when no
    # `ax` is passed, so a separate plt.figure() call would be left behind empty.
    fig, ax = plt.subplots()
    RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                    estimator_name=title or type(model).__name__).plot(ax=ax)
    ax.set_title((title or "ROC") + f" | AUC={roc_auc:.3f}")
    plt.show()
    return roc_auc, y_score

def plot_cm(model, X_test, y_test, title=None):
    """Draw the confusion matrix of `model.predict(X_test)` against `y_test`.

    The predictions come from the model's own `predict`, i.e. its default
    decision rule. Returns the confusion-matrix array.
    """
    cm = confusion_matrix(y_test, model.predict(X_test))
    _, ax = plt.subplots()
    display_obj = ConfusionMatrixDisplay(confusion_matrix=cm)
    display_obj.plot(ax=ax)
    heading = title or "Confusion Matrix"
    ax.set_title(heading)
    plt.show()
    return cm

def plot_pr(model, X_test, y_test, title=None):
    """Plot the precision-recall curve of `model` (useful for imbalanced data).

    Returns a tuple (average_precision, y_score).
    """
    y_score = get_scores(model, X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    _, ax = plt.subplots()
    ax.plot(recall, precision)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title((title or "Precision-Recall") + f" | AP={ap:.3f}")
    ax.grid(True, alpha=0.3)
    plt.show()
    return ap, y_score
    
def summary_scores(model, X_tr, y_tr, X_te, y_te):
    """Compute accuracy of `model.predict` on the train and test splits.

    Returns a dict with the keys "train_acc" and "test_acc".
    """
    splits = (
        ("train_acc", X_tr, y_tr),
        ("test_acc", X_te, y_te),
    )
    return {
        key: accuracy_score(y_true, model.predict(features))
        for key, features, y_true in splits
    }



## 7) Baselines: Decision Tree & Logistic Regression

In [None]:

# --- Decision tree baseline (default hyper-parameters) ---
dt_clf = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))
])
dt_clf.fit(X_train, y_train)
dt_scores = summary_scores(dt_clf, X_train, y_train, X_test, y_test)
# plot_roc returns (auc, y_score); keep only the scalar AUC so the print below
# shows a number instead of the whole score array.
dt_auc, _ = plot_roc(dt_clf, X_test, y_test, title="ROC – DecisionTree (baseline)")
_ = plot_cm(dt_clf, X_test, y_test, title="CM – DecisionTree (baseline)")
print("DT (baseline) scores:", dt_scores, "AUC:", dt_auc)

# --- Logistic regression baseline ---
log_clf = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=2000, solver="lbfgs"))
])
log_clf.fit(X_train, y_train)
log_scores = summary_scores(log_clf, X_train, y_train, X_test, y_test)
log_auc, _ = plot_roc(log_clf, X_test, y_test, title="ROC – LogisticRegression (baseline)")
_ = plot_cm(log_clf, X_test, y_test, title="CM – LogisticRegression (baseline)")
print("LogReg (baseline) scores:", log_scores, "AUC:", log_auc)


## 8) GridSearchCV Stubs (DT / Logistic)

In [None]:

# Shared stratified 5-fold splitter, reused by every search below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# --- Decision Tree: search depth and split/leaf size limits ---
dt_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE)),
])
dt_grid = dict(
    clf__max_depth=[None, 5, 8, 12],
    clf__min_samples_split=[2, 10, 50],
    clf__min_samples_leaf=[1, 5, 20],
)
dt_gs = GridSearchCV(estimator=dt_pipe, param_grid=dt_grid,
                     scoring="roc_auc", cv=cv, n_jobs=-1)
dt_gs.fit(X_train, y_train)
dt_best = dt_gs.best_estimator_
print("DT best params:", dt_gs.best_params_, "best AUC:", dt_gs.best_score_)
_ = plot_roc(dt_best, X_test, y_test, title="ROC – DecisionTree (tuned)")
_ = plot_cm(dt_best, X_test, y_test, title="CM – DecisionTree (tuned)")

# --- Logistic Regression: search the L2 regularisation strength C ---
log_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=5000, solver="lbfgs")),
])
log_grid = dict(
    clf__C=np.logspace(-6, 3, num=10),
    clf__penalty=["l2"],
)
log_gs = GridSearchCV(estimator=log_pipe, param_grid=log_grid,
                      scoring="roc_auc", cv=cv, n_jobs=-1)
log_gs.fit(X_train, y_train)
log_best = log_gs.best_estimator_
print("LogReg best params:", log_gs.best_params_, "best AUC:", log_gs.best_score_)
_ = plot_roc(log_best, X_test, y_test, title="ROC – LogisticRegression (tuned)")
_ = plot_cm(log_best, X_test, y_test, title="CM – LogisticRegression (tuned)")


## 9) Feature Selection (RFE / SelectFromModel via DT)

In [None]:

# Refit preprocess on full training to get feature names after OHE.
# Note: fit() returns the same object, so `prep_only` is an alias of `preprocess`.
prep_only = preprocess.fit(X_train)
feature_names = []
# Obtain transformed feature names (best effort; fall back to index positions below)
try:
    feature_names = prep_only.get_feature_names_out().tolist()
except Exception as e:
    print("Could not extract feature names from ColumnTransformer:", e)

# --- RFE with Logistic ---
# We do RFE on a *dense* matrix; transform X_train -> matrix first
Xtr_mat = prep_only.transform(X_train)
Xte_mat = prep_only.transform(X_test)

base_log = LogisticRegression(max_iter=5000, solver="lbfgs")
rfe = RFECV(base_log, step=1, cv=cv, scoring="roc_auc", n_jobs=-1)
rfe.fit(Xtr_mat, y_train)

selected_mask = rfe.support_
if feature_names and len(feature_names) == len(selected_mask):
    selected_features = [f for f, keep in zip(feature_names, selected_mask) if keep]
    print(f"RFE kept {len(selected_features)} features")
else:
    selected_features = np.where(selected_mask)[0].tolist()
    print(f"RFE kept {len(selected_features)} (index positions; names unavailable)")

# Train Logistic on RFE-selected features
base_log.fit(Xtr_mat[:, selected_mask], y_train)
y_score = base_log.predict_proba(Xte_mat[:, selected_mask])[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
# Draw on an explicit Axes: RocCurveDisplay.plot() opens its own figure when no
# `ax` is passed, so a separate plt.figure() call would be left behind empty.
fig, ax = plt.subplots()
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc(fpr, tpr),
                estimator_name="LogReg (RFE)").plot(ax=ax)
ax.set_title("ROC – Logistic (RFE-selected)")
plt.show()

# --- SelectFromModel using DT (from tuned DT) ---
# NOTE(review): with threshold="median", a tree whose importances are mostly zero
# may have a median of 0 and then keep every feature — check the count printed below.
dt_selector = SelectFromModel(dt_best["clf"], prefit=True, threshold="median")
# transform on features *after* preprocess
# need to rerun dt_best['prep'] to get same mapping
Xtr_dt = dt_best["prep"].transform(X_train)
Xte_dt = dt_best["prep"].transform(X_test)
Xtr_sel = dt_selector.transform(Xtr_dt)
Xte_sel = dt_selector.transform(Xte_dt)
print(f"DT selector kept {Xtr_sel.shape[1]} of {Xtr_dt.shape[1]} features")

# Fit logistic on DT-selected features
log_sel = LogisticRegression(max_iter=5000, solver="lbfgs")
log_sel.fit(Xtr_sel, y_train)
y_score = log_sel.predict_proba(Xte_sel)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
fig, ax = plt.subplots()
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc(fpr, tpr),
                estimator_name="LogReg (DT-selected)").plot(ax=ax)
ax.set_title("ROC – Logistic (DT-selected)")
plt.show()


## 10) Neural Networks (MLPClassifier) – full & reduced

In [None]:

# Grid stubs (small first): one hidden layer of varying width, and L2 penalty alpha
mlp_grid = {
    "clf__hidden_layer_sizes": [(3,), (5,), (7,), (9,)],
    "clf__alpha": [1e-2, 1e-3, 1e-4, 1e-5]
}

mlp_pipe = Pipeline([("prep", preprocess),
                     ("clf", MLPClassifier(random_state=RANDOM_STATE, max_iter=500))])

mlp_gs = GridSearchCV(mlp_pipe, mlp_grid, cv=cv, n_jobs=-1, scoring="roc_auc")
mlp_gs.fit(X_train, y_train)
print("MLP best params:", mlp_gs.best_params_, "best AUC:", mlp_gs.best_score_)
mlp_best = mlp_gs.best_estimator_
_ = plot_roc(mlp_best, X_test, y_test, title="ROC – MLP (tuned, full features)")
_ = plot_cm(mlp_best, X_test, y_test, title="CM – MLP (tuned, full features)")

# Reduced features with DT-selected mask
# Reuse Xtr_sel/Xte_sel from DT-select above (requires the feature-selection cell to have run)
mlp_reduced = MLPClassifier(random_state=RANDOM_STATE, max_iter=500)
mlp_reduced.fit(Xtr_sel, y_train)
y_score = mlp_reduced.predict_proba(Xte_sel)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
# Draw on an explicit Axes: RocCurveDisplay.plot() opens its own figure when no
# `ax` is passed, so a separate plt.figure() call would be left behind empty.
fig, ax = plt.subplots()
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc(fpr, tpr),
                estimator_name="MLP (DT-selected)").plot(ax=ax)
ax.set_title("ROC – MLP (DT-selected features)")
plt.show()


## 11) Final comparisons (ROC winners only – prepare for Task 5)

In [None]:
# Imports used by this cell, grouped at the top so nothing is imported inside a loop.
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score

# Winners: update these to the tuned/best models you actually want to compare
winners = [
    ("DecisionTree (tuned)", dt_best),
    ("Logistic (tuned)",     log_best),
    ("MLP (tuned full)",     mlp_best),
]

# 1) Sanity print: confirm different estimators and params
for name, model in winners:
    est = model.named_steps["clf"]
    print(f"{name}: {type(est).__name__} | params={getattr(est, 'get_params', lambda: {})()}")

# 2) Overlay ROC curves in ONE figure with labels/AUC
plt.figure()
scores = {}  # model label -> test-set score vector, reused by steps 3 and 4
for name, model in winners:
    y_score = get_scores(model, X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    scores[name] = y_score
    plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")

# Diagonal reference line (TPR == FPR)
plt.plot([0,1], [0,1], linestyle="--")
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title("ROC Comparison – Winners")
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

# 3) (Optional) Correlation check of score vectors: if ~1.0 then curves will look identical
names = list(scores.keys())
for i in range(len(names)):
    for j in range(i+1, len(names)):
        corr = np.corrcoef(scores[names[i]], scores[names[j]])[0,1]
        print(f"score corr({names[i]} vs {names[j]}) = {corr:.4f}")

# 4) (Optional) PR curves – more informative with class imbalance
plt.figure()
for name, model in winners:
    y_score = scores[name]
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    ap = average_precision_score(y_test, y_score)
    plt.plot(recall, precision, label=f"{name} (AP={ap:.3f})")
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.title("Precision–Recall Comparison – Winners")
plt.legend(loc="lower left"); plt.grid(True, alpha=0.3)
plt.show()


## 12) Helpers to save artifacts (optional)

In [None]:

# Make sure the default output folder exists as soon as this cell runs.
os.makedirs("figures", exist_ok=True)

def save_scores_table(models, X_tr, y_tr, X_te, y_te, path="figures/scores.csv"):
    """Write one row of train/test accuracy per model to a CSV file.

    Parameters
    ----------
    models : iterable of (name, fitted_model) pairs.
    X_tr, y_tr, X_te, y_te : train/test features and labels passed to `summary_scores`.
    path : destination CSV path. Its parent directory is created if missing, so a
        location other than the default "figures/" folder also works.
    """
    rows = []
    for name, model in models:
        sc = summary_scores(model, X_tr, y_tr, X_te, y_te)
        rows.append({"model": name, **sc})
    # Only "figures/" is created at cell level; create the parent of a custom path too.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    pd.DataFrame(rows).to_csv(path, index=False)
    print(f"Saved {path}")

# Example usage:
# save_scores_table(winners, X_train, y_train, X_test, y_test)


In [None]:
import os
def save_current_figure(path):
    """Save the current Matplotlib figure to 'figures/path'."""
    out_dir = "figures"
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, path)
    current_fig = plt.gcf()
    current_fig.savefig(target, dpi=150, bbox_inches="tight")
    print("Saved figure ->", target)


In [None]:
# --- call save_current_figure("name.png") after each plot ---
import os, matplotlib.pyplot as plt
def save_current_figure(path):
    """Save the current Matplotlib figure as 'figures/<path>' (150 dpi, tight bbox).

    NOTE: this redefines `save_current_figure` from the previous cell; once this
    cell has run, this later definition is the one in effect.
    """
    os.makedirs("figures", exist_ok=True)
    destination = os.path.join("figures", path)
    plt.gcf().savefig(destination, dpi=150, bbox_inches="tight")
    print("Saved ->", destination)

In [None]:
# --- QUICK HEALTH CHECK ---
import numpy as np, pandas as pd, sklearn
print("sklearn:", sklearn.__version__)

# 1) Basic shapes
print("X_train/X_test shapes:", X_train.shape, X_test.shape)  # expect ~ (33180, ?), (8296, ?)

# 2) Target ratio stays similar after split (stratified)
overall = y.value_counts(normalize=True).to_dict()
testset = y_test.value_counts(normalize=True).to_dict()
print("IsBadBuy ratio overall:", overall)
print("IsBadBuy ratio testset:", testset)

# 3) No '?' left (we replaced with NaN)
q_left = int((df == '?').sum().sum())
status = "OK" if q_left == 0 else "CHECK"
print("Remaining '?' cells:", q_left, "->", status)

# 4) Column counts by dtype in the training frame
print("Numeric features:", X_train.select_dtypes(np.number).shape[1])
print("Categorical features:", X_train.select_dtypes('object').shape[1])

# 5) Best models exist (means GridSearch cells ran)
for var_name in ("dt_best", "log_best", "mlp_best"):
    print(f"Has {var_name}:", var_name in globals())


In [None]:
import numpy as np, pandas as pd, sklearn, os
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

def ok(msg, cond):
    """Print `msg` prefixed with a pass/fail mark and return `cond` as a bool."""
    passed = bool(cond)
    marker = "✓ " if passed else "✗ "
    print(marker + msg)
    return passed

def get_scores(model, X):
    """Return 1-D scores for `X`: positive-class probabilities if available,
    otherwise decision-function values. Raises ValueError if the model has neither.
    """
    has_proba = hasattr(model, "predict_proba")
    if not has_proba and not hasattr(model, "decision_function"):
        raise ValueError("Model has neither predict_proba nor decision_function.")
    if has_proba:
        return model.predict_proba(X)[:, 1]
    return model.decision_function(X)

print("sklearn version:", sklearn.__version__)
print("--- QUICK QA ---")

# 1) Data shape (expected ~41476 rows; test ~0.2)
ok("DataFrame exists", 'df' in globals())
# Use >= so the check agrees with the message printed below.
rows_ok = 'df' in globals() and (len(df) >= 40000)  # allow small variations
ok(f"Rows >= 40000 (got {len(df) if 'df' in globals() else 'N/A'})", rows_ok)

# 2) Sentinels handled (-1 signals that df is not defined)
q_left = int(((df == '?').sum().sum()) if 'df' in globals() else -1)
ok(f"No '?' remaining (found {q_left})", q_left == 0)

# 3) Split sanity (sizes + stratify)
splits_ok = all(v in globals() for v in ["X_train","X_test","y_train","y_test"])
ok("Train/Test defined", splits_ok)
if splits_ok:
    print("X_train/X_test:", X_train.shape, X_test.shape)
    # expected test rows ≈ 0.2 * 41476 ≈ 8296
    ok("Test size reasonable (~20%)", 0.18 <= (len(X_test)/ (len(X_test)+len(X_train))) <= 0.22)
    # stratify check (tolerance 2%)
    ov = y.value_counts(normalize=True).to_dict() if 'y' in globals() else {}
    ts = y_test.value_counts(normalize=True).to_dict()
    # Compare over the union of classes so a class missing from the test split is
    # counted as drift; default=0.0 avoids a ValueError when both mappings are empty.
    drift = max((abs(ov.get(k,0)-ts.get(k,0)) for k in set(ov) | set(ts)), default=0.0)
    ok(f"Stratification drift < 2% (Δ={drift:.3f})", drift < 0.02)

# 4) Pipeline & dtypes
pipe_ok = 'preprocess' in globals()
ok("Preprocessing pipeline exists", pipe_ok)
if pipe_ok:
    # OneHotEncoder outputs dense?
    try:
        # named_transformers_ is read from the fitted ColumnTransformer; any failure
        # here is reported as a failed check instead of stopping the cell.
        ohe = preprocess.named_transformers_['cat'].named_steps['onehot']
        params = ohe.get_params()
        dense = (("sparse_output" in params and params["sparse_output"] is False) or
                 ("sparse" in params and params["sparse"] is False))
        ok("OneHotEncoder outputs dense", dense)
    except Exception as e:
        ok("OneHotEncoder check", False); print("  note:", e)
# Numeric features should be more than 4 after coercion
num_cnt = len(X_train.select_dtypes(np.number).columns) if splits_ok else 0
ok(f"Numeric features > 4 (got {num_cnt})", num_cnt > 4)

# 5) Baseline models trained?
base_ok = True
for name in ["dt_clf","log_clf"]:
    has = name in globals()
    base_ok &= has
    ok(f"Baseline '{name}' exists", has)

# 6) Tuned models exist (GridSearch ran)?
tuned_ok = True
for name in ["dt_best","log_best","mlp_best"]:
    has = name in globals()
    tuned_ok &= has
    ok(f"Tuned '{name}' exists", has)
# 7) Compute quick AUCs (if models exist)
def quick_auc(label, model):
    """Print the test-set ROC AUC of `model` under `label`.

    Reads the global X_test / y_test. Returns True when the AUC was computed and
    printed, False (after printing the error) when anything raised.
    """
    try:
        auc_val = roc_auc_score(y_test, get_scores(model, X_test))
        print(f"  {label} AUC(test) = {auc_val:.3f}")
    except Exception as err:
        print(f"  {label} AUC FAILED:", err)
        return False
    return True

if splits_ok:
    # Report a test AUC for every model variable that has been defined so far.
    for label, var_name in (
        ("DT baseline", "dt_clf"),
        ("LogReg baseline", "log_clf"),
        ("DT tuned", "dt_best"),
        ("LogReg tuned", "log_best"),
        ("MLP tuned", "mlp_best"),
    ):
        if var_name in globals():
            quick_auc(label, globals()[var_name])

# 8) Winners overlay sanity: three distinct estimators?
if tuned_ok:
    # Note: this rebinds the global `winners` list.
    winners = [
        ("DT (tuned)", dt_best),
        ("LogReg (tuned)", log_best),
        ("MLP (tuned)", mlp_best),
    ]
    try:
        est_names = [type(pipe.named_steps["clf"]).__name__ for _, pipe in winners]
        ok(f"Distinct estimators for winners: {est_names}", len(set(est_names)) == 3)
    except Exception as e:
        ok("Winners distinctness check", False); print("  note:", e)

print("-
