# Feature Engineering
This notebook preprocesses the student dataset (demonstrated on the Portuguese data) by scaling the numeric features and one-hot encoding the categorical ones.

In [7]:
from pathlib import Path
import sys
import pandas as pd

# Make the project's src/ folder importable. The path is resolved relative to
# the current working directory, so the notebook is expected to be run from a
# sibling folder of src/.
src_path = Path.cwd().parent / "src"
# Guard the insert so that re-running this cell does not keep prepending the
# same entry to sys.path.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

from data_loader import load_data
from preprocessing import build_preprocessor


In [8]:
# Load the Portuguese dataset for demonstration. load_data() returns two
# values; only the second one is used in this notebook.
_, por = load_data()
if por is None:
    # Fail fast: the following cells call por.drop(...) and por["G3"], which
    # would otherwise fail with an unrelated AttributeError/TypeError on None.
    raise FileNotFoundError("Dataset not found!")
print("Dataset loaded successfully!")

2025-09-29 09:20:52,289 [INFO] Math dataset loaded successfully with shape (395, 33)
2025-09-29 09:20:52,292 [INFO] Portuguese dataset loaded successfully with shape (649, 33)
Dataset loaded successfully!


In [9]:
# "G3" is the prediction target; every other column is kept as a feature.
y = por["G3"]
X = por.drop(columns=["G3"])

# Sanity-check the split: X loses exactly one column, y is one-dimensional.
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (649, 32)
Target shape: (649,)


In [10]:
# Identify column types by dtype family rather than by exact width, so that
# numeric columns stored as e.g. int32/float32 and categorical columns stored
# with the pandas "category" dtype are not silently skipped.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# Surface any column that matched neither group (e.g. bool or datetime).
# NOTE(review): such columns are not handed to build_preprocessor below —
# confirm how they should be treated if this message ever appears.
assigned_cols = set(numeric_cols) | set(categorical_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print("Unassigned columns:", unassigned_cols)

Numeric columns: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2']
Categorical columns: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [11]:
# Build preprocessor and transform
# build_preprocessor is a project helper (imported from preprocessing); it is
# given the numeric and categorical column lists identified above.
preprocessor = build_preprocessor(numeric_cols, categorical_cols)
# Fit on X and transform X in a single call.
# NOTE(review): the preprocessor is fitted on every row of X. If a train/test
# split happens downstream, fit on the training portion only — confirm.
X_transformed = preprocessor.fit_transform(X)

# Compare shapes before and after the transformation.
print("Original shape:", X.shape)
print("Transformed shape:", X_transformed.shape)

Original shape: (649, 32)
Transformed shape: (649, 41)


In [12]:
# Display feature names after transformation
# Ask the fitted preprocessor for the names of the columns it outputs.
feature_names = preprocessor.get_feature_names_out()
print("Number of features after preprocessing:", len(feature_names))
# Show only the first ten names to keep the output short.
print("First 10 features:", feature_names[:10])

Number of features after preprocessing: 41
First 10 features: ['num__age' 'num__Medu' 'num__Fedu' 'num__traveltime' 'num__studytime'
 'num__failures' 'num__famrel' 'num__freetime' 'num__goout' 'num__Dalc']
