# Stage 09 — Homework Starter Notebook

In the lecture, we learned how to create engineered features. Now it’s your turn to apply those ideas to your own project data.

In [3]:
import pandas as pd
import numpy as np

n = 180  # ~6 months of daily data
dates = pd.date_range('2021-01-01', periods=n, freq='D')

regions = np.random.choice(['North','South','East','West'], size=n, p=[0.25,0.25,0.30,0.20])
age = np.random.normal(40, 8, size=n).clip(22, 70)
income = np.random.lognormal(mean=10.6, sigma=0.3, size=n)  # lognormal skew
transactions = np.random.poisson(lam=3, size=n) + (np.random.rand(n) < 0.05).astype(int)*8  # rare spikes
base_spend = (income * (0.0015 + 0.00002*(age-40)) + transactions*20)
noise = np.random.normal(0, 50, size=n)
spend = (base_spend + noise).clip(0)

# Assemble
df = pd.DataFrame({
    'date': dates,
    'region': regions,
    'age': age.round(1),
    'income': income.round(2),
    'transactions': transactions,
    'spend': spend.round(2)
})

# Inject missingness and outliers
df.loc[np.random.choice(df.index, size=6, replace=False), 'income'] = np.nan
df.loc[np.random.choice(df.index, size=4, replace=False), 'spend'] = np.nan
df.loc[np.random.choice(df.index, size=2, replace=False), 'transactions'] = df['transactions'].max() + 15  # extreme outliers

df.head()

Unnamed: 0,date,region,age,income,transactions,spend
0,2021-01-01,East,36.8,49165.05,2,138.97
1,2021-01-02,South,30.7,35789.05,5,154.36
2,2021-01-03,North,39.3,37523.49,0,80.71
3,2021-01-04,South,41.6,36655.73,3,79.23
4,2021-01-05,South,47.0,35862.8,5,98.63


## TODO: Implement at least 2 engineered features here

In [11]:
# Example template:
# Add rationale in markdown below
df['lg_income'] = np.where(
    df['income'] > 0, 
    np.log(df['income']), 
    0
)
df['lg_transactions'] = np.where(
    df['transactions'] > 0, 
    np.log(df['transactions']), 
    0
)
df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,date,region,age,income,transactions,spend,lg_income,lg_transactions
0,2021-01-01,East,36.8,49165.05,2,138.97,10.802938,0.693147
1,2021-01-02,South,30.7,35789.05,5,154.36,10.485397,1.609438
2,2021-01-03,North,39.3,37523.49,0,80.71,10.532722,0.000000
3,2021-01-04,South,41.6,36655.73,3,79.23,10.509325,1.098612
4,2021-01-05,South,47.0,35862.80,5,98.63,10.487456,1.609438
...,...,...,...,...,...,...,...,...
175,2021-06-25,West,43.2,30647.84,6,208.28,10.330317,1.791759
176,2021-06-26,East,41.8,41043.01,8,181.27,10.622376,2.079442
177,2021-06-27,West,42.6,29052.80,2,37.77,10.276870,0.693147
178,2021-06-28,East,50.3,35334.00,3,177.69,10.472601,1.098612


### Rationale for Feature 1
lg_income and lg_transaction to adjust their skewness 

In [12]:
# TODO: Add another feature
df_onehot = pd.get_dummies(df, columns=['region'])
df_onehot 

Unnamed: 0,date,age,income,transactions,spend,lg_income,lg_transactions,region_East,region_North,region_South,region_West
0,2021-01-01,36.8,49165.05,2,138.97,10.802938,0.693147,True,False,False,False
1,2021-01-02,30.7,35789.05,5,154.36,10.485397,1.609438,False,False,True,False
2,2021-01-03,39.3,37523.49,0,80.71,10.532722,0.000000,False,True,False,False
3,2021-01-04,41.6,36655.73,3,79.23,10.509325,1.098612,False,False,True,False
4,2021-01-05,47.0,35862.80,5,98.63,10.487456,1.609438,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
175,2021-06-25,43.2,30647.84,6,208.28,10.330317,1.791759,False,False,False,True
176,2021-06-26,41.8,41043.01,8,181.27,10.622376,2.079442,True,False,False,False
177,2021-06-27,42.6,29052.80,2,37.77,10.276870,0.693147,False,False,False,True
178,2021-06-28,50.3,35334.00,3,177.69,10.472601,1.098612,True,False,False,False


### Rationale for Feature 2
Change category feature into one-hot could benefit later analysis for data, getting away the string, which model can't indentify.
