
# IFN580 – Assignment 1: Starter Notebook


## 1) Config & Imports

In [1]:

# ---- User config ----
# Seed used for NumPy's global RNG and passed as random_state to the train/test split.
RANDOM_STATE = 42
# Name of the label column expected in the loaded CSV.
TARGET_COL   = "IsBadBuy"   # change if different
# Candidate CSV locations, checked in the order listed; the first one found is loaded.
DATA_PATHS   = [
    "kick.csv",                         # put the CSV next to this notebook
    "assignment 1 data kick.csv",       # alt name (rename as needed)
    "./data/kick.csv",                  # common project structure
]

# ---- Imports ----
import os, sys, math, json, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             roc_curve, auc, RocCurveDisplay, classification_report,
                             accuracy_score)
from sklearn.feature_selection import RFE, RFECV, SelectFromModel

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides library warnings (e.g. from pandas or
# scikit-learn) that can point at real problems — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG so NumPy-driven randomness repeats across runs.
np.random.seed(RANDOM_STATE)


## 2) Load Data

In [2]:

def find_data(paths):
    """Return the first entry of ``paths`` that is an existing regular file.

    Parameters
    ----------
    paths : iterable of str or path-like
        Candidate file locations, checked in the order given.

    Returns
    -------
    str, path-like or None
        The first candidate that points at a file, exactly as it was passed
        in, or ``None`` when no candidate does (including an empty ``paths``).
    """
    # isfile() rather than exists(): a directory that happens to share a
    # candidate's name must not be returned as if it were the data file.
    return next((p for p in paths if os.path.isfile(p)), None)

DATA_PATH = find_data(DATA_PATHS)
if DATA_PATH is None:
    raise FileNotFoundError(
        "CSV not found. Place your kick dataset next to this notebook as 'kick.csv' "
        "or update DATA_PATHS."
    )

print(f"Using data at: {DATA_PATH}")
# Cells holding exactly '?' are turned into NaN so the imputers in the
# preprocessing pipeline can treat them as missing values.
df = pd.read_csv(DATA_PATH).replace('?', np.nan)

# Thresholds for deciding that a column is "numeric-like": the share of rows
# that parse as numbers must exceed MIN_NUMERIC_RATIO and the number of
# parsed rows must exceed MIN_NUMERIC_COUNT.
MIN_NUMERIC_RATIO = 0.6
MIN_NUMERIC_COUNT = 100

# Convert numeric-like columns to real numbers. Values that fail to parse in a
# converted column become NaN (errors="coerce").
coerced = []
for c in df.columns:
    # Skip the configured target (not a hard-coded name) so changing
    # TARGET_COL in the config cell keeps the label untouched.
    if c == TARGET_COL:
        continue
    s = pd.to_numeric(df[c], errors="coerce")
    parsed = s.notna()
    if (parsed.mean() > MIN_NUMERIC_RATIO) and (parsed.sum() > MIN_NUMERIC_COUNT):
        df[c] = s
        coerced.append(c)
print("Coerced to numeric (first few):", coerced[:10], "| total:", len(coerced))


FileNotFoundError: CSV not found. Place your kick dataset next to this notebook as 'kick.csv' or update DATA_PATHS.


## 3) Quick Audit



In [3]:

# Basic info.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
df.info()
display(df.describe(include='all').T)

# Target distribution (before preprocessing)
if TARGET_COL not in df.columns:
    raise KeyError(f"TARGET_COL '{TARGET_COL}' not found. Set TARGET_COL correctly.")

# dropna=False keeps missing labels visible as their own row in the table.
target_counts = df[TARGET_COL].value_counts(dropna=False)
target_ratio  = target_counts / len(df)
print("Target counts (before):")
display(pd.DataFrame({"count": target_counts, "ratio": target_ratio}))


NameError: name 'df' is not defined

## 4) Train/Test Split (stratified)

In [4]:
# TARGET_COL is taken from the config cell. It is deliberately not re-assigned
# here: a second assignment would silently override whatever the user set there.

# Drop identifier / raw date columns when present.
# NOTE(review): assumes the purchase date is still available through another
# column (e.g. PurchaseTimestamp) — confirm against the dataset.
drop_cols = [c for c in ["PurchaseID", "PurchaseDate"] if c in df.columns]
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL] + drop_cols)

# 80/20 split; stratify=y keeps the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
# Sanity check: every row ended up in exactly one of the two parts.
assert len(X_train) + len(X_test) == len(X)
print("Train/Test sizes:", X_train.shape, X_test.shape)


NameError: name 'df' is not defined

## 5) Preprocessing Pipeline (ColumnTransformer)

In [5]:
import sklearn
from packaging import version

# Partition the training columns by dtype: numeric ones versus everything else.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.columns.difference(numeric_features).tolist()

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# The keyword that requests dense one-hot output depends on the installed
# scikit-learn: "sparse_output" from 1.2 onwards, "sparse" before that.
uses_new_ohe_api = version.parse(sklearn.__version__) >= version.parse("1.2")
ohe_kwargs = {
    "handle_unknown": "ignore",
    ("sparse_output" if uses_new_ohe_api else "sparse"): False,
}

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(**ohe_kwargs)),
])

# Route each column group through its branch; columns in neither list are dropped.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocess = ColumnTransformer(
    transformers=column_branches,
    remainder="drop",
    verbose_feature_names_out=False,
)

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


NameError: name 'X_train' is not defined