# In [None]:
# 02_feature_engineering.ipynb
# Data Cleaning & Feature Engineering
# Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import joblib

# Read the raw student-performance CSV into a DataFrame and preview the first rows
raw_csv_path = "../data/raw/student_performance.csv"
df = pd.read_csv(raw_csv_path)
df.head()

# Creating Risk Level (Target for Classification). Risk levels are based on total_score:
#   High Risk   ->  0 <= total_score <= 60
#   Medium Risk -> 60 <  total_score <= 75
#   Low Risk    -> 75 <  total_score <= 100
# include_lowest=True closes the first bin on the left. Without it pd.cut builds
# the interval (0, 60], so a total_score of exactly 0 would get a NaN label
# instead of "High Risk".
# NOTE(review): scores outside [0, 100] still map to NaN -- confirm the data
# never contains such values.
df["risk_level"] = pd.cut(
    df["total_score"],
    bins=[0, 60, 75, 100],
    labels=["High Risk", "Medium Risk", "Low Risk"],
    include_lowest=True,
)

# Show percentage distribution for clarity
risk_dist = df["risk_level"].value_counts(normalize=True) * 100
risk_dist.round(2)

# Model inputs: the three predictor columns shared by both tasks
feature_cols = [
    "weekly_self_study_hours",
    "attendance_percentage",
    "class_participation",
]

# Targets: total_score for regression, risk_level for classification
y_score, y_risk = df["total_score"], df["risk_level"]
X = df[feature_cols]

# Report the shapes of the feature matrix and of both targets
print(
    f"Feature matrix shape: {X.shape}",
    f"Regression target shape: {y_score.shape}",
    f"Classification target shape: {y_risk.shape}",
    sep="\n",
)

# Train/Test Split and Feature Scaling
#
# The raw (unscaled) features are split FIRST and each scaler is then fit on
# its training partition only. Fitting StandardScaler on the full dataset
# before splitting would let the test rows influence the mean/std used to
# transform the training rows (data leakage).
#
# The row partitions are unchanged: they depend only on the number of rows,
# random_state and (for classification) the stratification labels.

# Regression split
X_train_reg_raw, X_test_reg_raw, y_train_reg, y_test_reg = train_test_split(
    X, y_score, test_size=0.2, random_state=42
)

# Classification split (stratified so each risk level keeps its proportion)
X_train_clf_raw, X_test_clf_raw, y_train_clf, y_test_clf = train_test_split(
    X, y_risk, test_size=0.2, random_state=42, stratify=y_risk
)

# Feature Scaling: the two splits partition the rows differently, so each one
# needs its own scaler fit on its own training rows.
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg_raw)
X_test_reg = scaler_reg.transform(X_test_reg_raw)

scaler = StandardScaler()  # classification scaler; keeps the original name
X_train_clf = scaler.fit_transform(X_train_clf_raw)
X_test_clf = scaler.transform(X_test_clf_raw)

# Kept so that any later cell referring to X_scaled still works; it is the
# full feature matrix transformed with the training-fit classification scaler.
X_scaled = scaler.transform(X)

print("Features scaled successfully.")

# Displaying train/test counts
print("Train/Test split completed:")
print(f"Regression train/test: {X_train_reg.shape[0]}/{X_test_reg.shape[0]} samples")
print(f"Classification train/test: {X_train_clf.shape[0]}/{X_test_clf.shape[0]} samples")

# Persist the classification split so the modeling step can load it from disk
os.makedirs("../data/processed", exist_ok=True)

# Output path -> object to serialize (feature matrices and targets)
# NOTE(review): the regression split and the fitted scaler are not written
# here -- confirm whether the modeling step needs them.
clf_artifacts = {
    "../data/processed/X_train_clf.pkl": X_train_clf,
    "../data/processed/X_test_clf.pkl": X_test_clf,
    "../data/processed/y_train_clf.pkl": y_train_clf,
    "../data/processed/y_test_clf.pkl": y_test_clf,
}
for out_path, payload in clf_artifacts.items():
    joblib.dump(payload, out_path)

print("Processed datasets saved successfully.")