
# IFN580 – Assignment 1: Starter Notebook


## 1) Config & Imports

In [None]:
# 1) Config & Imports
# Seed handed to train_test_split(random_state=...) so the split is reproducible.
RANDOM_STATE = 42
# Name of the label column: it is skipped by the numeric coercion step and
# separated from the feature matrix as y before splitting.
TARGET_COL = "IsBadBuy"  # change if your target column differs

import os, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
# Silence all warnings from here on to keep cell output compact.
# NOTE(review): with no category given this hides every warning, including
# deprecation notices from pandas / scikit-learn; consider narrowing the filter
# if results look suspicious.
warnings.filterwarnings("ignore")

# Ensure inline plots in classic Notebook
%matplotlib inline


## 2) Load Data

In [None]:
# 2) Load Data (robust search + sentinel handling)
# Candidate dataset locations, relative to the current working directory.
# find_data() checks them in this order and returns the first one that exists.
# Note: "./data/kick.csv" resolves to the same file as "data/kick.csv".
DATA_PATHS = [
    "data/kick.csv",
    "kick.csv",
    "assignment 1 data kick.csv",
    "./data/kick.csv",
]

def find_data(paths):
    """Return the first entry of ``paths`` that is an existing regular file.

    Parameters
    ----------
    paths : iterable of str or path-like
        Candidate locations, checked in the order given.

    Returns
    -------
    str, path-like or None
        The first candidate that exists as a file, or ``None`` when no
        candidate matches (including when ``paths`` is empty).
    """
    # isfile() rather than exists(): a *directory* that happens to carry one of
    # the candidate names must be skipped, otherwise it would be returned here
    # and only fail later when it is handed to pd.read_csv.
    return next((p for p in paths if os.path.isfile(p)), None)

# Resolve the dataset location up front and stop with an actionable message
# when none of the candidate paths exists.
DATA_PATH = find_data(DATA_PATHS)
if DATA_PATH is None:
    raise FileNotFoundError("CSV not found. Put your dataset as 'data/kick.csv' or update DATA_PATHS.")
print(f"Using data at: {DATA_PATH}")

# Read the CSV and map the '?' placeholder to NaN in the same expression so
# that the imputers further down treat those cells as missing.
df = pd.read_csv(DATA_PATH).replace('?', np.nan)

# Sanity check: dimensions followed by the first three rows.
print(df.shape)
display(df.head(3))



## 3) Quick Audit



In [None]:
# 3) Quick Audit + numeric coercion (promote numeric-like text columns)

# Heuristic thresholds: a column is promoted to numeric only when more than
# 60% of its cells parse as numbers AND more than 100 cells parse in total.
MIN_NUMERIC_RATIO = 0.6
MIN_NUMERIC_COUNT = 100

# Convert non-numeric columns that look numeric into real numbers (heuristic).
# Cells that fail to parse in a promoted column become NaN (errors="coerce").
coerced = []
for c in df.columns:
    # Never touch the target. Columns that already have a numeric dtype are
    # skipped as well: converting them is a no-op, and listing them in
    # `coerced` would overstate how many columns were actually changed.
    if c == TARGET_COL or pd.api.types.is_numeric_dtype(df[c]):
        continue
    s = pd.to_numeric(df[c], errors="coerce")
    parsed = s.notna()
    # treat as numeric if majority converts and at least some support
    if parsed.mean() > MIN_NUMERIC_RATIO and parsed.sum() > MIN_NUMERIC_COUNT:
        df[c] = s
        coerced.append(c)
print("Coerced to numeric:", len(coerced), "| sample:", coerced[:8])

# Basic info for Task 1 evidence
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would append a stray "None" to the output.
df.info()

# Both tables keep missing targets (dropna=False) so that the counts and the
# ratios describe the same set of rows.
print("Target counts (before):")
display(df[TARGET_COL].value_counts(dropna=False))
print("Target ratio (before):")
display(df[TARGET_COL].value_counts(normalize=True, dropna=False))


## 4) Train/Test Split (stratified)

In [None]:
# 4) Train/Test Split (stratified) and drop non-predictive columns

# Identifier/date columns to discard, restricted to the ones actually present
# in the frame (PurchaseTimestamp is deliberately kept as a feature).
drop_cols = [
    name
    for name in ("PurchaseID", "PurchaseDate")
    if name in df.columns
]

# Features are everything except the target and the discarded columns.
X = df.drop(columns=[TARGET_COL, *drop_cols])
y = df[TARGET_COL]

# Hold out 20% of the rows for testing; stratifying on y preserves the class
# proportions in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print("Train/Test sizes:", X_train.shape, X_test.shape)


## 5) Preprocessing Pipeline (ColumnTransformer)

In [None]:
# 5) Preprocessing Pipeline (ColumnTransformer) – sklearn-version safe

# Partition the training columns by dtype: numeric columns first, then every
# remaining column is treated as categorical.
numeric_features = X_train.select_dtypes(include="number").columns.tolist()
categorical_features = X_train.columns.difference(numeric_features).tolist()

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical: impute mode then One-Hot Encode
# Guard for sklearn param rename: 'sparse' (old) -> 'sparse_output' (new).
# The encoder is built exactly once; no throw-away probe instance is assigned
# to `_`, which IPython reserves for the most recent cell output.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older scikit-learn rejects the unknown keyword; fall back to the old name.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Fill missing categories with the most frequent value, then expand each
# category into dense indicator columns; categories unseen during fit are
# ignored at transform time (handle_unknown="ignore").
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", ohe)
])

# Route each column group through its own sub-pipeline. Columns that belong to
# neither group are discarded (remainder="drop"), and output feature names are
# not prefixed with the transformer name.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Report how many columns ended up in each branch.
print("Numeric features:", len(numeric_features))
print("Categorical features:", len(categorical_features))
