# Preprocessing: Clean, Encode, and Split (student_clean.csv)
This notebook loads the cleaned data, fixes quoting artifacts, encodes categoricals (mixed: One-Hot + Ordinal), and saves:
- `data/student_preprocessed.csv`
- `data/train.csv`, `data/test.csv` (80/20 split, stratified if `G3` exists)

In [27]:
# Preprocessing: UCI Student Performance (Math + Portuguese)
# ----------------------------------------------------------
# Inputs (the repo root is the 'Final-Project-Team-4-AAI500' folder itself):
#   Final-Project-Team-4-AAI500/data/raw/student-mat.csv
#   Final-Project-Team-4-AAI500/data/raw/student-por.csv
#
# Outputs (written under the same repo root):
#   Final-Project-Team-4-AAI500/data/clean/student_clean_classification.csv
#   Final-Project-Team-4-AAI500/data/clean/student_clean_classification_noG1G2.csv
#
# Notes:
# - Target: 'performance_level' derived from G3  (Low:<10, Medium:10–14, High:15–20)
# - Encoding: one-hot (drop_first=True)
# - Two feature sets:
#     (1) FULL  : includes G1, G2 (baseline; can leak early grades)
#     (2) NO-LEAK: drops G1, G2  ✅ recommended for factor-impact analysis

import pandas as pd
import numpy as np
from pathlib import Path

REPO_NAME = "Final-Project-Team-4-AAI500"

# -------- Locate repo root from current working directory --------
# Search the working directory and then each of its ancestors, nearest first.
# A directory identifies the repo in one of two ways:
#   * it is itself named REPO_NAME (notebook started at or below the repo root), or
#   * it directly contains a folder named REPO_NAME (notebook started above it).
# The matched directory is what gets used as the root. Assigning cwd.parent
# unconditionally is only correct when the notebook runs exactly one level
# below the repo root, and picks the wrong folder from any other depth.
cwd = Path.cwd()
candidates = [cwd] + list(cwd.parents)
repo_root = None
for p in candidates:
    if p.name == REPO_NAME:
        repo_root = p
        break
    if (p / REPO_NAME).is_dir():
        repo_root = p / REPO_NAME
        break
# If you're already at the root (e.g. a renamed checkout), allow that too
if repo_root is None and (cwd / "data" / "raw").exists():
    repo_root = cwd

if repo_root is None:
    raise RuntimeError(
        f"Could not find '{REPO_NAME}' in current path tree. "
        "Open the notebook inside the repo or adjust REPO_NAME."
    )

print(f"[info] repo_root = {repo_root}")

# -------- Project-relative input / output locations --------
# Cleaned outputs live in data/clean, which is created on demand
# (existing directories are left as they are).
out_dir = repo_root.joinpath("data", "clean")
out_dir.mkdir(parents=True, exist_ok=True)
out_full = out_dir.joinpath("student_clean_classification.csv")
out_noleak = out_dir.joinpath("student_clean_classification_noG1G2.csv")

# Raw inputs: the math and Portuguese course files under data/raw.
path_mat, path_por = (
    repo_root.joinpath("data", "raw", fname)
    for fname in ("student-mat.csv", "student-por.csv")
)

# -------- Load raw data --------
# Stop at the first input that is absent, before any parsing is attempted.
for p in (path_mat, path_por):
    if p.exists():
        continue
    raise FileNotFoundError(f"Missing input file: {p}")

# Both files are read with ';' as the field separator. Each frame is tagged
# with its course, then the two are stacked row-wise under a fresh 0..n-1
# index (a plain concat rather than a merge).
mat = pd.read_csv(path_mat, sep=";").assign(course="math")
por = pd.read_csv(path_por, sep=";").assign(course="por")
raw = pd.concat([mat, por], ignore_index=True)

# -------- Target bucketing from G3 --------
# The target label is computed from the G3 column, so fail early when the
# combined frame does not carry it.
if "G3" not in raw:
    raise ValueError("Expected column 'G3' not found.")

def bucket_performance(g):
    """Map a final grade to a performance label.

    Args:
        g: A single grade value (the G3 column, documented above as 0–20).

    Returns:
        "Low" when g < 10, "Medium" when 10 <= g < 15, "High" otherwise,
        or None when g is missing (None / NaN / pd.NA).
    """
    # NaN compares False against every threshold, so without this guard a
    # missing grade would fall through to the last branch and be labelled
    # "High". Leave the label missing instead.
    if pd.isna(g):
        return None
    if g < 10:
        return "Low"
    if g < 15:
        return "Medium"
    return "High"

# Label every row by passing its G3 value through bucket_performance.
raw["performance_level"] = raw["G3"].map(bucket_performance)

# -------- Separate types --------
numeric_cols = raw.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = raw.select_dtypes(include=["object"]).columns.tolist()
# The target is itself a string column, so keep it out of the feature list.
categorical_for_X = [col for col in categorical_cols if col != "performance_level"]

# -------- Build X / y, encode categoricals --------
# NOTE(review): numeric_cols is every numeric column, which appears to include
# G3 -- the column performance_level was just derived from. Confirm that
# keeping G3 among the features is intended.
feature_names = [*categorical_for_X, *numeric_cols]
X = raw.loc[:, feature_names].copy()
y = raw.loc[:, "performance_level"].copy()

# One-hot encode the categorical features; drop_first=True omits the first
# level of each so a k-level column yields k-1 indicator columns.
X_enc = pd.get_dummies(X, columns=categorical_for_X, drop_first=True)

# -------- Two feature sets --------
# NOTE(review): only G1 and G2 are removed for the NO-LEAK set. Any other
# column of X_enc -- presumably including G3, from which performance_level is
# derived -- is written to both files. Confirm downstream classifiers drop it.

# NO-LEAK column list: every encoded column except the two early grades.
cols_no_leak = [col for col in X_enc.columns if col != "G1" and col != "G2"]

# FULL keeps all encoded features; both frames get the target appended last.
clean_full = pd.concat([X_enc, y], axis=1)
clean_no_leak = pd.concat([X_enc.loc[:, cols_no_leak], y], axis=1)

# -------- Save --------
# The row index is not written to either file.
for frame, dest in ((clean_full, out_full), (clean_no_leak, out_noleak)):
    frame.to_csv(dest, index=False)

# -------- Quick summary prints --------
def summarize(df, name):
    """Print a one-line size report for a DataFrame.

    Args:
        df: The frame whose row and column counts are reported.
        name: Label printed in front of the counts.
    """
    n_rows = len(df.index)
    n_cols = len(df.columns)
    print(f"{name}: {n_rows} rows, {n_cols} columns")

# Report the combined input shape, then the size of each cleaned frame.
print("\n=== Preprocessing Summary ===")
print(f"Combined raw shape: {raw.shape}")
for frame, label in (
    (clean_full, "Clean (FULL, includes G1/G2)"),
    (clean_no_leak, "Clean (NO-LEAK, drops G1/G2)"),
):
    summarize(frame, label)

# Per-class row counts, ordered by label rather than by frequency.
print("\nClass distribution (performance_level):")
print(y.value_counts().sort_index())

print("\nFiles written:")
for written in (out_full, out_noleak):
    print(" -", written.as_posix())


[info] repo_root = /Users/gisselletosta/Final-Project-Team-4-AAI500

=== Preprocessing Summary ===
Combined raw shape: (1044, 35)
Clean (FULL, includes G1/G2): 1044 rows, 44 columns
Clean (NO-LEAK, drops G1/G2): 1044 rows, 42 columns

Class distribution (performance_level):
performance_level
High      204
Low       230
Medium    610
Name: count, dtype: int64

Files written:
 - /Users/gisselletosta/Final-Project-Team-4-AAI500/data/clean/student_clean_classification.csv
 - /Users/gisselletosta/Final-Project-Team-4-AAI500/data/clean/student_clean_classification_noG1G2.csv
