In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load the raw dataset if it is available; otherwise build a small, seeded
# synthetic frame with the same three columns the feature cells rely on
# ("income", "spend", "date") so the notebook still runs end to end.
RAW_PATH = Path("../data/raw/your_dataset.csv")  # adjust to your dataset
if RAW_PATH.exists():
    df = pd.read_csv(RAW_PATH)
    # read_csv leaves dates as plain strings; parse them so the later
    # sort on "date" is chronological rather than lexicographic.
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"])
else:
    # Synthetic fallback (seeded so every rerun produces identical rows)
    np.random.seed(42)
    df = pd.DataFrame({
        "income": np.random.randint(3000, 10000, 100),
        "spend": np.random.randint(1000, 8000, 100),
        # Month-end dates. An explicit offset object is used instead of the
        # "M" alias, which is deprecated (FutureWarning) in recent pandas;
        # the offset object produces the same dates on old and new versions.
        "date": pd.date_range("2024-01-01", periods=100, freq=pd.offsets.MonthEnd()),
    })

df.head()


  "date": pd.date_range("2024-01-01", periods=100, freq="M"),


Unnamed: 0,income,spend,date
0,3860,7863,2024-01-31
1,8390,1337,2024-02-29
2,8226,1878,2024-03-31
3,8191,2076,2024-04-30
4,6772,5887,2024-05-31


**Feature 1 — spend_income_ratio**  
*Rationale:* Captures proportionality of spending relative to income.  
Higher ratios may indicate financial stress or aggressive consumption. 

In [2]:
# Feature 1: Spending-to-Income Ratio
# Zero incomes are masked to NaN before dividing: spend / 0 would otherwise
# produce +/-inf, which silently breaks downstream statistics and models.
# Rows with a non-zero income get exactly spend / income.
df["spend_income_ratio"] = df["spend"] / df["income"].where(df["income"] != 0)

**Feature 2 — rolling_spend_mean**  
*Rationale:* Smooths short-term volatility; reflects the 3-month spending trend (a trailing window of 3 rows, assuming one row per month).  
Useful for detecting upward or downward momentum in expenditures.  

In [3]:
# Feature 2: trailing mean of spend over the current and two preceding rows.
# Rows are ordered by date first so the window runs in chronological order.
# The window counts rows, not calendar time, so it equals a 3-month mean only
# when there is one row per month (as in the synthetic fallback data).
df = df.sort_values(by="date")
spend_window = df["spend"].rolling(3, min_periods=1)  # first rows use what is available
df["rolling_spend_mean"] = spend_window.mean()

**Feature 3 — log_income**  
*Rationale:* Reduces skew and stabilizes variance in the income distribution (computed as log(1 + income), so a zero income maps to 0).  
Helps linear models capture relationships more effectively.  

In [None]:
# Feature 3: log-transformed income, computed as log(1 + income) to reduce skew.
income_values = df["income"]
df["log_income"] = income_values.pipe(np.log1p)

## Reflection
- Features engineered directly from EDA insights (spend-to-income ratio, rolling mean, log income).  
- They add proportionality, temporal smoothing, and variance stabilization.  
- These features will feed into later modeling and risk analysis.  