# 03 - Feature Engineering

Create lag, rolling, and temporal features for time series forecasting

In [None]:
import pandas as pd
import numpy as np
import sys
import os

sys.path.append(os.path.abspath('../src'))
from src.features import create_lag_features, create_rolling_features, create_temporal_features, engineer_features

## Load Dataset

In [2]:
df = pd.read_csv('../data/processed/final_dataset.csv')
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB")
print(f"Columns: {list(df.columns)[:10]}...")  # Show first 10 columns
df.head()

Dataset shape: (743268, 28)
Memory usage: 0.50 GB
Columns: ['date', 'item_id', 'quantity', 'price_base', 'sum_total', 'store_id', 'division', 'format', 'city', 'area']...


Unnamed: 0,date,item_id,quantity,price_base,sum_total,store_id,division,format,city,area,...,code,promo_price_before,promo_price_after,promo_days,online_qty,online_price,online_revenue,markdown_normal_price,markdown_price,markdown_qty
0,2023-02-22,ef09dbc9fa66,2.0,44.91,89.82,2,Div2,MaxiEuro,City2,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-03-09,95416d766ab9,4.0,54.9,219.6,1,Div1,Regular,City1,1200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-09-19,65093e8d67e6,1.0,401.0,401.0,4,Div1,MaxiEuro,City3,1887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-02-09,4fa30d19baa7,2.0,144.9,289.8,1,Div1,Regular,City1,1200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-07-10,53774f35b377,7.0,20.9,146.3,1,Div1,Regular,City1,1200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create Temporal Features

In [3]:
      # Parse date column
df['date'] = pd.to_datetime(df['date'])

# Extract temporal features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_month'] = df['date'].dt.day
df['week_of_year'] = df['date'].dt.isocalendar().week

print("âœ“ Temporal features created")
print(f"Shape after temporal features: {df.shape}")
print(f"\nSample temporal features:")
print(df[['date', 'year', 'month', 'day_of_week', 'week_of_year']].head())

âœ“ Temporal features created
Shape after temporal features: (743268, 34)

Sample temporal features:
        date  year  month  day_of_week  week_of_year
0 2023-02-22  2023      2            2             8
1 2024-03-09  2024      3            5            10
2 2024-09-19  2024      9            3            38
3 2023-02-09  2023      2            3             6
4 2023-07-10  2023      7            0            28


## Create Lag Features

In [4]:
# Create lag features (previous 7 and 30 days sales)
for lag_days in [7, 30]:
    df[f'quantity_lag_{lag_days}'] = df.groupby(['item_id', 'store_id'])['quantity'].shift(lag_days)
    df[f'sum_total_lag_{lag_days}'] = df.groupby(['item_id', 'store_id'])['sum_total'].shift(lag_days)

print("* Lag features created (7, 30 days)")
print(f"Shape after lag features: {df.shape}")
print(f"\nLag features sample:")
print(df[['quantity', 'quantity_lag_7', 'quantity_lag_30']].head(40))

* Lag features created (7, 30 days)
Shape after lag features: (743268, 38)

Lag features sample:
    quantity  quantity_lag_7  quantity_lag_30
0      2.000             NaN              NaN
1      4.000             NaN              NaN
2      1.000             NaN              NaN
3      2.000             NaN              NaN
4      7.000             NaN              NaN
5      2.895             NaN              NaN
6      8.000             NaN              NaN
7      4.000             NaN              NaN
8      1.000             NaN              NaN
9      3.000             NaN              NaN
10     1.000             NaN              NaN
11     1.000             NaN              NaN
12     2.000             NaN              NaN
13     1.000             NaN              NaN
14     1.000             NaN              NaN
15     1.000             NaN              NaN
16    11.000             NaN              NaN
17     2.000             NaN              NaN
18     1.010             NaN 

## Create Rolling Features

In [5]:
# Create rolling mean and std features (7, 14, 30 day windows)
for window in [7, 14, 30]:
    df[f'quantity_rolling_mean_{window}'] = df.groupby(['item_id', 'store_id'])['quantity'].transform(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    )
    df[f'sum_total_rolling_mean_{window}'] = df.groupby(['item_id', 'store_id'])['sum_total'].transform(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    )

print("* Rolling features created (windows: 7, 14, 30)")
print(f"Shape after rolling features: {df.shape}")
print(f"\nRolling features sample:")
print(df[['quantity', 'quantity_rolling_mean_7', 'quantity_rolling_mean_14']].head(40))

* Rolling features created (windows: 7, 14, 30)
Shape after rolling features: (743268, 44)

Rolling features sample:
    quantity  quantity_rolling_mean_7  quantity_rolling_mean_14
0      2.000                    2.000                     2.000
1      4.000                    4.000                     4.000
2      1.000                    1.000                     1.000
3      2.000                    2.000                     2.000
4      7.000                    7.000                     7.000
5      2.895                    2.895                     2.895
6      8.000                    8.000                     8.000
7      4.000                    4.000                     4.000
8      1.000                    1.000                     1.000
9      3.000                    3.000                     3.000
10     1.000                    1.000                     1.000
11     1.000                    1.000                     1.000
12     2.000                    2.000              

## Feature Summary

In [6]:
# Check missing values
print("Missing values before cleaning:")
print(df.isnull().sum().sort_values(ascending=False))
print(f"\nTotal missing values: {df.isnull().sum().sum()}")

# Drop rows with NaN created by lag and rolling features
df_clean = df.dropna()
print(f"\nâœ“ Cleaned dataset shape: {df_clean.shape}")
print(f"Rows removed: {df.shape[0] - df_clean.shape[0]}")

df = df_clean

Missing values before cleaning:
sum_total_lag_30             596660
quantity_lag_30              596660
quantity_lag_7               251178
sum_total_lag_7              251178
sum_total                         0
store_id                          0
division                          0
format                            0
date                              0
item_id                           0
quantity                          0
price_base                        0
class_name                        0
dept_name                         0
area                              0
city                              0
subclass_name                     0
item_type                         0
weight_volume                     0
weight_netto                      0
promo_price_after                 0
promo_days                        0
online_qty                        0
online_price                      0
fatness                           0
price                             0
code                            

## Remove NaN rows from lags and rolling

In [7]:
# Save featured dataset
output_path = '../data/processed/featured_dataset.csv'
df.to_csv(output_path, index=False)
print(f"âœ“ Featured dataset saved: {output_path}")
print(f"Final shape: {df.shape}")
print(f"Features: {df.shape[1]}")

# Show summary
print(f"\nðŸ“Š Feature Engineering Summary:")
print(f"  - Temporal features: year, month, quarter, day_of_week, day_of_month, week_of_year")
print(f"  - Lag features: quantity_lag_7/30, sum_total_lag_7/30")
print(f"  - Rolling features: quantity_rolling_mean/std_(7,14,30)")
print(f"  - Total features: {df.shape[1]}")
print(f"  - Total samples: {df.shape[0]:,}")

âœ“ Featured dataset saved: ../data/processed/featured_dataset.csv
Final shape: (146608, 44)
Features: 44

ðŸ“Š Feature Engineering Summary:
  - Temporal features: year, month, quarter, day_of_week, day_of_month, week_of_year
  - Lag features: quantity_lag_7/30, sum_total_lag_7/30
  - Rolling features: quantity_rolling_mean/std_(7,14,30)
  - Total features: 44
  - Total samples: 146,608


## Save Featured Dataset

In [8]:
# Save the dataset with engineered features
output_path = '../data/processed/featured_dataset.csv'
df.to_csv(output_path, index=False)
print(f"âœ“ Featured dataset saved: {output_path}")
print(f"Final shape: {df.shape}")

print(f"\nSample:")
df.head()

âœ“ Featured dataset saved: ../data/processed/featured_dataset.csv
Final shape: (146608, 44)

Sample:


Unnamed: 0,date,item_id,quantity,price_base,sum_total,store_id,division,format,city,area,...,quantity_lag_7,sum_total_lag_7,quantity_lag_30,sum_total_lag_30,quantity_rolling_mean_7,sum_total_rolling_mean_7,quantity_rolling_mean_14,sum_total_rolling_mean_14,quantity_rolling_mean_30,sum_total_rolling_mean_30
163618,2023-08-23,0be3804714de,8.0,129.9,1039.2,1,Div1,Regular,City1,1200,...,15.0,1948.5,4.0,389.7,7.142857,927.728571,10.785714,1390.882143,9.433333,1209.353
172011,2023-03-15,0be3804714de,3.0,129.9,389.7,1,Div1,Regular,City1,1200,...,15.0,1947.6,10.0,1280.61,5.428571,705.171429,9.285714,1196.771429,9.2,1179.656
175320,2022-12-30,18d0dd039cd8,3.332,699.9,2332.08,1,Div1,Regular,City1,1200,...,1.516,1061.05,0.374,246.8,6.827429,3097.511429,5.230714,2523.572857,5.209267,2605.126333
175534,2023-08-30,b55a901fb8ff,7.0,59.89,419.2,1,Div1,Regular,City1,1200,...,6.0,406.68,2.0,119.8,3.571429,223.922857,4.428571,276.397143,3.9,239.049667
180354,2024-08-22,0be3804714de,25.0,129.9,3247.5,1,Div1,Regular,City1,1200,...,6.0,779.4,3.0,389.7,8.142857,1057.757143,9.428571,1224.642857,9.933333,1274.916


## Feature Statistics

In [9]:
print(df.describe())

                                date      quantity     price_base  \
count                         146608  146608.00000  146608.000000   
mean   2023-09-18 03:50:57.426607360       9.53005     169.377886   
min              2022-08-28 00:00:00     -28.00000    -304.620000   
25%              2023-03-18 00:00:00       2.00000      51.000000   
50%              2023-09-21 00:00:00       3.25500      89.000000   
75%              2024-03-23 00:00:00       8.00000     169.000000   
max              2024-09-26 00:00:00    4243.00000    4999.000000   
std                              NaN      42.07140     256.422225   

           sum_total       store_id           area  weight_volume  \
count  146608.000000  146608.000000  146608.000000  146608.000000   
mean     1110.147845       1.517809    1235.311641       0.075207   
min      -899.200000       1.000000    1000.000000       0.000000   
25%       149.900000       1.000000    1200.000000       0.000000   
50%       336.880000       1.0000

## Ready for Model Training!

The dataset now has:
- Temporal features (day_of_week, month, quarter, year, week_of_year)
- Lag features (lag_7, lag_14, lag_30)
- Rolling features (rolling_mean, rolling_std for windows 7, 14, 30)
- No missing values

Proceed to **04_model_training.ipynb**