# 03 - Feature Engineering

Create lag, rolling-window, and temporal (calendar) features for time series forecasting.

In [1]:
import pandas as pd
import numpy as np
import sys

sys.path.append('../src')
from features import create_lag_features, create_rolling_features, create_temporal_features, engineer_features

## Load Dataset

In [2]:
# Read the processed dataset from disk.
source_csv = '../data/processed/final_dataset.csv'
df = pd.read_csv(source_csv)

# Quick sanity report: dimensions, in-memory footprint, and leading column names.
# deep=True makes pandas measure the contents of object (string) columns too.
frame_shape = df.shape
total_bytes = df.memory_usage(deep=True).sum()
leading_columns = list(df.columns)[:10]

print(f"Dataset shape: {frame_shape}")
print(f"Memory usage: {total_bytes / 1e9:.2f} GB")
print(f"Columns: {leading_columns}...")  # Show first 10 columns
df.head()

Original shape: (4210160, 28)
Columns: ['date', 'item_id', 'quantity', 'price_base', 'sum_total', 'store_id', 'division', 'format', 'city', 'area', 'dept_name', 'class_name', 'subclass_name', 'item_type', 'weight_volume', 'weight_netto', 'fatness', 'price', 'code', 'promo_price_before', 'promo_price_after', 'promo_days', 'online_qty', 'online_price', 'online_revenue', 'markdown_normal_price', 'markdown_price', 'markdown_qty']


Unnamed: 0,date,item_id,quantity,price_base,sum_total,store_id,division,format,city,area,...,code,promo_price_before,promo_price_after,promo_days,online_qty,online_price,online_revenue,markdown_normal_price,markdown_price,markdown_qty
0,2023-08-04,293375605257,1.0,47.86,47.86,1,Div1,Format-1,City1,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-08-04,a66fdf2c0ae7,3.0,49.6,148.8,1,Div1,Format-1,City1,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-08-04,daa46ef49b7a,0.822,379.0,311.54,1,Div1,Format-1,City1,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-08-04,a3b49c1bf758,1.0,129.0,129.0,1,Div1,Format-1,City1,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-08-04,ab611c5cef62,7.0,79.9,559.3,1,Div1,Format-1,City1,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create Temporal Features

In [3]:
# Parse date column
df['date'] = pd.to_datetime(df['date'])

# Derive calendar features from the parsed dates through the .dt accessor.
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['quarter'] = date_parts.quarter
df['day_of_week'] = date_parts.dayofweek  # Monday=0 ... Sunday=6
df['day_of_month'] = date_parts.day
df['week_of_year'] = date_parts.isocalendar().week  # ISO 8601 week number

# Status lines use a plain ASCII marker (as the lag/rolling cells do) so the
# output cannot be garbled by an encoding mismatch.
print("* Temporal features created")
print(f"Shape after temporal features: {df.shape}")
print("\nSample temporal features:")
print(df[['date', 'year', 'month', 'day_of_week', 'week_of_year']].head())

Temporal features created:
   day_of_week  month  quarter  year  week_of_year
0            4      8        3  2023            31
1            4      8        3  2023            31
2            4      8        3  2023            31
3            4      8        3  2023            31
4            4      8        3  2023            31
5            4      8        3  2023            31
6            4      8        3  2023            31
7            4      8        3  2023            31
8            4      8        3  2023            31
9            4      8        3  2023            31


## Create Lag Features

In [None]:
# Create lag features (sales 7 and 30 observations back, per item/store series)
#
# groupby().shift() is positional: it moves values by a number of ROWS inside
# each group. The lags are therefore only meaningful when each item/store
# series is in chronological order, so the shift runs on a date-ordered view.
# Results are assigned back by index label, which leaves df's row order as is
# (this relies on df having a unique index, e.g. the default RangeIndex).
#
# NOTE(review): a shift of 7 rows equals "7 days ago" only if every item/store
# pair has exactly one row per calendar day with no gaps - confirm against the
# data, or reindex each series to a daily calendar before lagging.
lag_group_keys = ['item_id', 'store_id']
lag_value_cols = ['quantity', 'sum_total']

lag_ordered = df[lag_group_keys + ['date'] + lag_value_cols].sort_values(lag_group_keys + ['date'])
lag_grouped = lag_ordered.groupby(lag_group_keys, sort=False)[lag_value_cols]

for lag_days in [7, 30]:
    shifted = lag_grouped.shift(lag_days)  # keeps the original index labels
    df[f'quantity_lag_{lag_days}'] = shifted['quantity']
    df[f'sum_total_lag_{lag_days}'] = shifted['sum_total']

print("* Lag features created (7, 30 days)")
print(f"Shape after lag features: {df.shape}")
print("\nLag features sample:")
print(df[['quantity', 'quantity_lag_7', 'quantity_lag_30']].head(40))

Column 'sales' not found


## Create Rolling Features

In [None]:
# Create rolling mean and std features (7, 14, 30 observation windows)
#
# Rolling windows are positional, so each item/store series must be in
# chronological order; the windows are computed on a date-ordered view and
# assigned back by index label (df's own row order is untouched; this relies
# on df having a unique index, e.g. the default RangeIndex).
#
# min_periods=1 keeps a value from the first observation onwards. The std of a
# single observation is undefined, so the first row of every series gets NaN
# in the *_rolling_std_* columns.
#
# NOTE(review): each window includes the current row. If `quantity` (or
# `sum_total`) is the forecasting target, these features leak it; consider
# rolling over values shifted by one step - confirm with the modelling setup.
roll_group_keys = ['item_id', 'store_id']
roll_value_cols = ['quantity', 'sum_total']

roll_ordered = df[roll_group_keys + ['date'] + roll_value_cols].sort_values(roll_group_keys + ['date'])
roll_grouped = roll_ordered.groupby(roll_group_keys, sort=False)[roll_value_cols]

for window in [7, 14, 30]:
    roller = roll_grouped.rolling(window=window, min_periods=1)
    # groupby().rolling() prepends the group keys to the index; drop them so
    # the result is indexed by the original row labels again.
    window_mean = roller.mean().droplevel(roll_group_keys)
    window_std = roller.std().droplevel(roll_group_keys)

    df[f'quantity_rolling_mean_{window}'] = window_mean['quantity']
    df[f'sum_total_rolling_mean_{window}'] = window_mean['sum_total']
    df[f'quantity_rolling_std_{window}'] = window_std['quantity']
    df[f'sum_total_rolling_std_{window}'] = window_std['sum_total']

print("* Rolling features created (windows: 7, 14, 30)")
print(f"Shape after rolling features: {df.shape}")
print("\nRolling features sample:")
print(df[['quantity', 'quantity_rolling_mean_7', 'quantity_rolling_mean_14']].head(40))

## Remove NaN Rows from Lag and Rolling Features

In [None]:
# Check missing values
# df.isnull() scans the whole frame, so count once and reuse the result.
missing_per_column = df.isnull().sum()
print("Missing values before cleaning:")
print(missing_per_column.sort_values(ascending=False))
print(f"\nTotal missing values: {missing_per_column.sum()}")

# Drop every row that has a NaN in ANY column. The intent is to remove the
# warm-up rows where lag/rolling features are undefined, but dropna() without
# a subset also removes rows with gaps in unrelated columns.
df_clean = df.dropna()
print(f"\n* Cleaned dataset shape: {df_clean.shape}")
print(f"Rows removed: {df.shape[0] - df_clean.shape[0]}")

# Make that side effect visible instead of silent: report NaN counts for
# columns that are not engineered lag/rolling features.
engineered_cols = [c for c in df.columns if '_lag_' in c or '_rolling_' in c]
unrelated_missing = missing_per_column.drop(labels=engineered_cols)
unrelated_missing = unrelated_missing[unrelated_missing > 0]
if not unrelated_missing.empty:
    print("\nNote: columns other than lag/rolling features also contain NaN, "
          "so dropna() removed rows for reasons unrelated to feature warm-up:")
    print(unrelated_missing.sort_values(ascending=False))

df = df_clean

## Feature Engineering Summary

In [None]:
# Summarize the engineered feature set.
# The CSV export is intentionally not done here: the "Save Featured Dataset"
# cell writes the same file, and writing a multi-million-row CSV twice only
# doubles the run time.
print(f"Final shape: {df.shape}")

# Build the summary from the columns actually present so it cannot drift from
# what the feature cells really produced.
expected_temporal = ('year', 'month', 'quarter', 'day_of_week', 'day_of_month', 'week_of_year')
temporal_cols = [c for c in expected_temporal if c in df.columns]
lag_cols = [c for c in df.columns if '_lag_' in c]
rolling_cols = [c for c in df.columns if '_rolling_' in c]

print("\nFeature Engineering Summary:")
print(f"  - Temporal features ({len(temporal_cols)}): {', '.join(temporal_cols)}")
print(f"  - Lag features ({len(lag_cols)}): {', '.join(lag_cols)}")
print(f"  - Rolling features ({len(rolling_cols)}): {', '.join(rolling_cols)}")
print(f"  - Total features: {df.shape[1]}")
print(f"  - Total samples: {df.shape[0]:,}")

## Save Featured Dataset

In [None]:
# Save the dataset with engineered features
# index=False keeps the row index out of the file, so no extra unnamed column
# appears when the CSV is read back.
output_path = '../data/processed/featured_dataset.csv'
df.to_csv(output_path, index=False)
# ASCII status marker, matching the lag/rolling cells, so the message cannot
# be garbled by an encoding mismatch.
print(f"* Featured dataset saved: {output_path}")
print(f"Final shape: {df.shape}")

print("\nSample:")
df.head()

## Feature Statistics

In [None]:
# Descriptive statistics for the numeric columns. Left as the cell's last
# expression so the notebook renders it as a table instead of truncated text.
df.describe()

## Ready for Model Training!

The dataset now has:
- Temporal features (year, month, quarter, day_of_week, day_of_month, week_of_year)
- Lag features (quantity and sum_total at lags 7 and 30)
- Rolling features (rolling_mean, rolling_std for windows 7, 14, 30)
- No missing values

Proceed to **04_model_training.ipynb**