# Feature Engineering

This notebook prepares features for modelling customer behaviour. It encodes categorical variables, creates binary target variables, splits the data into training and test sets for the high‑value target, and saves the feature matrix and targets for reuse.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned customer table; purchase_date is parsed as a datetime column
clean_csv_path = '../data/processed/customers_clean.csv'
df = pd.read_csv(clean_csv_path, parse_dates=['purchase_date'])

# Preview the first rows to confirm the load
df.head()

In [None]:
# Ordinal encoding of membership tier: position in this list is the rank
# (Bronze=0, Silver=1, Gold=2). Tiers missing from the map become NaN.
membership_tiers = ['Bronze', 'Silver', 'Gold']
ordinal_map = {tier: rank for rank, tier in enumerate(membership_tiers)}
df['membership_ordinal'] = df['membership_type'].map(ordinal_map)

# Binary targets:
#   target_high_value - True when total_spend is at least 1000
#   target_satisfied  - True when satisfaction_level is exactly 'Satisfied'
df['target_high_value'] = df['total_spend'].ge(1000)
df['target_satisfied'] = df['satisfaction_level'].eq('Satisfied')

In [None]:
# Build the feature matrix: numeric columns first, then one-hot encoded categoricals
base_columns = [
    'age',
    'total_spend',
    'items_purchased',
    'avg_rating',
    'days_since_last_purchase',
    'membership_ordinal',
]
features = df[base_columns].copy()

# Relabel the boolean discount flag so its dummy column gets a readable name
discount_labels = df['discount_applied'].map({True: 'Discount', False: 'No Discount'})

# drop_first=True drops the first level of each variable, leaving k-1 dummy columns
gender_dummies = pd.get_dummies(df['gender'], prefix='gender', drop_first=True)
city_dummies = pd.get_dummies(df['city'], prefix='city', drop_first=True)
discount_dummies = pd.get_dummies(discount_labels, prefix='discount', drop_first=True)

# Append the dummy columns to the numeric block, column-wise
encoded_blocks = [features, gender_dummies, city_dummies, discount_dummies]
features = pd.concat(encoded_blocks, axis=1)
features.head()

In [None]:
# Split into train and test sets for the high‑value target.
#
# target_high_value is derived directly from total_spend (total_spend >= 1000),
# so leaving total_spend in the design matrix would leak the label: any model
# could recover the target exactly from that single column. It is dropped here
# only; the shared `features` frame is left unchanged for other targets.
X_high_value = features.drop(columns=['total_spend'])
y_high_value = df['target_high_value']

# 80/20 split; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_high_value, y_high_value, test_size=0.2, random_state=42
)
print('Training set size:', X_train.shape)
print('Test set size:', X_test.shape)

In [None]:
# Persist the feature matrix and both targets so later notebooks can reuse them
import os

# Create the output folder if it does not exist yet
os.makedirs('../data/interim', exist_ok=True)

# Each (frame, destination) pair is written without the index column;
# rows in the two files line up by position.
outputs = (
    (features, '../data/interim/features.csv'),
    (df[['target_high_value', 'target_satisfied']], '../data/interim/targets.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)

print('Features and targets saved to data/interim/')