# Stage 09 — Feature Engineering Homework

This notebook derives three new features from the sample dataset — income per age, spend-to-income ratio, and a high-income flag — and documents the rationale for each.

## 1. Load Dataset

In [None]:
import pandas as pd, numpy as np
from pathlib import Path

DATA = Path('../data/raw/sample.csv')
# Make sure the raw-data folder is there before anything reads or writes it.
DATA.parent.mkdir(exist_ok=True, parents=True)


def _synthetic_sample(n_rows, seed):
    """Build a seeded synthetic customer table with `n_rows` rows.

    Seeds NumPy's global RNG, then draws age, income, spend and gender in
    that order; the order of the draws determines the generated values.
    """
    np.random.seed(seed)
    columns = {'id': range(1, n_rows + 1)}
    columns['age'] = np.random.normal(35, 10, n_rows).round(0)
    columns['income'] = np.random.normal(60000, 15000, n_rows).round(0)
    columns['spend'] = np.random.normal(2000, 500, n_rows).round(0)
    columns['gender'] = np.random.choice(['M', 'F'], n_rows)
    return pd.DataFrame(columns)


# No dataset on disk yet: write a synthetic stand-in so the notebook still runs.
if not DATA.exists():
    _synthetic_sample(200, 42).to_csv(DATA, index=False)

# Always load from disk so both paths (real file / synthetic fallback) yield
# a frame that went through the same CSV parsing.
df = pd.read_csv(DATA)
df.head()

## 2. Feature 1 — Income per Age
Rationale: Normalizes income by age, capturing relative earning capacity across different age groups.

In [None]:
# Income earned per year of age.
# Only strictly positive ages are meaningful denominators: an age of 0 would
# divide by zero, and a negative age (possible in dirty data or from an
# unbounded random generator) would flip the sign of the feature. Every
# non-positive age is therefore masked to NaN, so the feature is NaN for
# those rows instead of inf or a misleading negative value.
valid_age = df['age'].where(df['age'] > 0)
df['income_per_age'] = df['income'] / valid_age
df[['age','income','income_per_age']].head()

## 3. Feature 2 — Spend to Income Ratio
Rationale: Proportion of spending relative to income; higher ratios may indicate financial stress or higher consumption preference.

In [None]:
# Share of income that is spent. A zero income is swapped for NaN first so
# the ratio comes out as NaN for those rows rather than +/-inf.
income_nonzero = df['income'].replace(0, np.nan)
df['spend_to_income_ratio'] = df['spend'].div(income_nonzero)
df.loc[:, ['income', 'spend', 'spend_to_income_ratio']].head()

## 4. Feature 3 — High Income Flag
Rationale: Binary feature (1 if income is strictly above the sample mean, otherwise 0) indicating above-average income; useful for classification-type tasks.

In [None]:
# Threshold is the mean income of the loaded frame; rows strictly above it
# are flagged 1, all others (including rows equal to the mean) are flagged 0.
mean_income = df['income'].mean()
is_above_mean = df['income'].gt(mean_income)
df['high_income_flag'] = is_above_mean.astype(int)
df.loc[:, ['income', 'high_income_flag']].head()

## 5. Visualization and Correlation Checks

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot: how the per-age income feature varies with age itself.
fig, ax = plt.subplots()
sns.scatterplot(x='age', y='income_per_age', data=df, ax=ax)
ax.set_title('Age vs Income per Age')
plt.show()

# Box plot: distribution of the spend-to-income ratio within each
# high-income-flag group (0 vs 1).
fig, ax = plt.subplots()
sns.boxplot(x='high_income_flag', y='spend_to_income_ratio', data=df, ax=ax)
ax.set_title('Spend to Income Ratio by High Income Flag')
plt.show()

# Correlation check: pairwise Pearson correlations between the raw numeric
# inputs and the engineered features, to spot features that merely duplicate
# an input. Left as the cell's last expression so it renders as a table.
corr_cols = [
    'age', 'income', 'spend',
    'income_per_age', 'spend_to_income_ratio', 'high_income_flag',
]
df[corr_cols].corr().round(2)

## 6. Save Engineered Dataset

In [None]:
# Write the frame, including the engineered columns, to the processed-data
# folder, creating that folder (and any missing parents) on first run.
OUT = Path('../data/processed/engineered.csv')
out_dir = OUT.parent
out_dir.mkdir(exist_ok=True, parents=True)
df.to_csv(OUT, index=False)
# Report the absolute location so the reader can find the file.
print('Saved ->', OUT.resolve())

## 7. Conclusion
- Added three engineered features.
- Each feature has clear rationale documented.
- Saved the engineered dataset to `../data/processed/engineered.csv` for downstream modeling.