
# IFN580 – Assignment 1: Starter Notebook


## 1) Config & Imports

In [None]:

# ---- User config ----
# Seed value; it is handed to np.random.seed during setup.
RANDOM_STATE = 42

# Name of the label column in the CSV (change if your file uses another name).
TARGET_COL = "IsBadBuy"

# Candidate CSV locations, checked in the order listed here.
DATA_PATHS = [
    # CSV placed right next to this notebook
    "kick.csv",
    # alternative file name (rename as needed)
    "assignment 1 data kick.csv",
    # common project structure with a data/ folder
    "./data/kick.csv",
]

# ---- Imports ----
import os, sys, math, json, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             roc_curve, auc, RocCurveDisplay, classification_report,
                             accuracy_score)
from sklearn.feature_selection import RFE, RFECV, SelectFromModel

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides warnings raised by the imported libraries
# that may be worth seeing -- confirm the blanket filter is intended.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator with the configured seed.
np.random.seed(RANDOM_STATE)


## 2) Load Data

In [None]:

def find_data(paths):
    """Return the first candidate in *paths* that is an existing file.

    Args:
        paths: Iterable of candidate file paths, checked in the given order.

    Returns:
        The first path that refers to an existing regular file, or ``None``
        when no candidate does (including when *paths* is empty).
    """
    # isfile rather than exists: a directory that happens to share a
    # candidate's name must be skipped, otherwise it would be returned here
    # and only fail later when it is opened as a CSV.
    return next((p for p in paths if os.path.isfile(p)), None)

# Resolve the dataset location from the configured candidates, then load it.
DATA_PATH = find_data(DATA_PATHS)
if DATA_PATH is not None:
    print(f"Using data at: {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)
else:
    # None of the candidate paths exists: stop with an actionable message.
    raise FileNotFoundError(
        "CSV not found. Place your kick dataset next to this notebook as 'kick.csv' "
        "or update DATA_PATHS."
    )
# --- PATCH AFTER "2) Load Data": handle '?' and coerce numeric-like columns ---
# Replace the literal '?' placeholder with real missing values (NaN) so the
# pipeline can impute them correctly.
df.replace('?', np.nan, inplace=True)

# Try to convert columns that look numeric into real numbers.
# The target column is skipped; it is identified through TARGET_COL (not a
# hard-coded name) so the skip still applies when the user config points at a
# different label column.
coerced = []
for c in df.columns:
    if c == TARGET_COL:
        continue
    # Values that cannot be parsed as numbers become NaN.
    s = pd.to_numeric(df[c], errors="coerce")
    # Treat the column as numeric only if most rows parse (> 60% of all rows)
    # and there are more than 100 parsed values in absolute terms. Any value
    # that did not parse ends up as NaN in the coerced column.
    if (s.notna().mean() > 0.6) and (s.notna().sum() > 100):
        df[c] = s
        coerced.append(c)
print("Coerced to numeric (first few):", coerced[:10], "| total:", len(coerced))



## 3) Quick Audit (Task 1)



In [None]:

# Basic info
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally show a stray "None".
df.info()
# Summary statistics for every column, transposed so each column is one row.
display(df.describe(include='all').T)

# Target distribution (before preprocessing)
if TARGET_COL not in df.columns:
    raise KeyError(f"TARGET_COL '{TARGET_COL}' not found. Set TARGET_COL correctly.")

# dropna=False keeps missing target values visible as their own row.
target_counts = df[TARGET_COL].value_counts(dropna=False)
# Share of all rows that falls into each target value.
target_ratio  = target_counts / len(df)
print("Target counts (before):")
display(pd.DataFrame({"count": target_counts, "ratio": target_ratio}))
