# 03 - Feature Engineering
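This notebook imputes missing values, one-hot encodes categorical features, standardizes numerical ones, splits the data into training and validation sets, and saves the fitted preprocessor and the processed arrays for model training.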

In [25]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
sns.set(style='whitegrid')
%matplotlib inline

In [26]:
# Read the cleaned training CSV into a DataFrame and display its (rows, columns) shape.
# NOTE(review): the location below is an absolute, machine-specific path — confirm it
# exists on the machine running this notebook.
train_cleaned_csv = "/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/train_cleaned.csv"
df = pd.read_csv(train_cleaned_csv)
df.shape

(1460, 77)

In [40]:
# Discard the 'Id' column when the frame still carries it; otherwise keep df as it is.
df = df.drop(columns='Id') if 'Id' in df.columns else df


In [41]:
# Target vector is the 'SalePrice' column; the feature matrix is every other column.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

In [35]:
# Identify categorical and numerical columns.
# Numerical columns are those with a NumPy numeric dtype. Text-like columns are
# matched on 'object', 'string' and 'category' so that the selection does not
# depend on which string dtype pandas inferred when the CSV was read.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'string', 'category']).columns.tolist()
print(f"Numerical: {len(num_cols)} columns")
print(f"Categorical: {len(cat_cols)} columns")

# A column that lands in neither list (for example bool or datetime) would be
# dropped without notice by a ColumnTransformer built from these two lists with
# its default remainder='drop', so report such columns explicitly.
unassigned_cols = [col for col in X.columns if col not in num_cols and col not in cat_cols]
if unassigned_cols:
    print(f"Not assigned to either group: {unassigned_cols}")

Numerical: 36 columns
Categorical: 39 columns


In [36]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
(X_train.shape, X_val.shape)

((1168, 75), (292, 75))

In [37]:
# Preprocessing pipeline (defined here, not fitted yet)

# Numerical branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array, ignoring categories unseen during fitting.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column list through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [42]:
# Fit the preprocessor on the training split only, so nothing is learned from the validation rows.
preprocessor.fit(X_train)

# SAVE updated preprocessor
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
preprocessor_path = "/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl"
# joblib.dump does not create missing parent folders, so make sure the target folder exists first.
Path(preprocessor_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, preprocessor_path)


['/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl']

In [38]:
# Encode both splits with the preprocessor.
# fit_transform (re)fits the preprocessor on X_train and returns the transformed training
# matrix; the validation split is only transformed, so nothing is learned from it.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
(X_train_processed.shape, X_val_processed.shape)

((1168, 272), (292, 272))

In [39]:
# Persist the processed splits for the model-training step.
# Feature matrices are written as .npy arrays; targets are written as CSV without the index.
for npy_path, features in (
    ("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/X_train.npy", X_train_processed),
    ("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/X_val.npy", X_val_processed),
):
    np.save(npy_path, features)

for csv_path, target in (
    ("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/y_train.csv", y_train),
    ("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/y_val.csv", y_val),
):
    target.to_csv(csv_path, index=False)

print("Processed data saved.")

Processed data saved.


In [33]:
import joblib

# Write the current `preprocessor` object to the same file the earlier save cell uses,
# overwriting whatever is stored there.
preprocessor_pkl = "/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl"
joblib.dump(preprocessor, preprocessor_pkl)


['/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl']

In [43]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # 🔥 this is the key
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

# ⛔ DO NOT refit in 05-submission notebook later
preprocessor.fit(X_train)

import joblib
joblib.dump(preprocessor, "/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")


['/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl']