# Feature Engineering
This notebook demonstrates the preprocessing pipeline on the Portuguese dataset: numeric features are scaled and categorical features are one-hot encoded.

In [None]:
import os
import sys

# A notebook has no __file__, so the project root is resolved as the parent of
# the current working directory (presumably the notebook's own folder — confirm).
_project_root = os.path.abspath(os.pardir)
# Guard so that re-running this cell does not stack duplicate sys.path entries.
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)

import pandas as pd
from src.data_loader import load_data
from src.preprocessing import build_preprocessor

In [None]:
# Load the Portuguese dataset for demonstration; the first value returned by
# load_data() is not used in this notebook.
_, por = load_data()
if por is None:
    # Fail fast with a clear error: the following cells call methods on `por`
    # and would otherwise die with an opaque AttributeError on None.
    raise FileNotFoundError("Dataset not found!")
print("Dataset loaded successfully!")

In [None]:
# Separate the target column "G3" from the remaining feature columns.
y = por["G3"]
X = por.drop(columns="G3")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical. Columns of any other dtype end up in
# neither list.
numeric_frame = X.select_dtypes(include=["int64", "float64"])
categorical_frame = X.select_dtypes(include=["object"])

numeric_cols = numeric_frame.columns.tolist()
categorical_cols = categorical_frame.columns.tolist()

print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

In [None]:
# Construct the preprocessor from the two column lists, then fit it on X and
# transform X in a single step.
preprocessor = build_preprocessor(
    numeric_cols,
    categorical_cols,
)
X_transformed = preprocessor.fit_transform(X)

# Compare dimensions before and after preprocessing.
print(f"Original shape: {X.shape}")
print(f"Transformed shape: {X_transformed.shape}")

In [None]:
# Inspect the output column names produced by the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()
feature_count = len(feature_names)
print(f"Number of features after preprocessing: {feature_count}")
print(f"First 10 features: {feature_names[:10]}")