# MMM Data Preparation Pipeline

This notebook demonstrates the comprehensive data preparation pipeline for Marketing Mix Modeling (MMM), including:
- Handling weekly seasonality and trends
- Zero-spend period treatment
- Feature scaling and transformations
- Adstock and saturation transformations
- Interaction feature creation


In [None]:
# Import necessary libraries
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler

# Import our custom modules
from data_preparation import DataPreparator
from utils import set_random_seed, adstock_transform, saturation_transform

# Fix the random seed (42) up front so repeated runs are reproducible
set_random_seed(42)

# Plot appearance: apply the base style first, then layer the palette and
# the default figure size on top of it (order matters, later calls override)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})


In [None]:
# Read the MMM dataset from ../data/raw/mmm_data.csv and convert the
# 'date' column to datetime in the same step
data = pd.read_csv('../data/raw/mmm_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Quick sanity summary: dimensions, covered date span, and column names
print(
    f"Loaded data shape: {data.shape}",
    f"Date range: {data['date'].min()} to {data['date'].max()}",
    f"Columns: {list(data.columns)}",
    sep="\n",
)

# Last expression of the cell: rich preview of the first rows
data.head()


## 1. Data Preparation Pipeline

Let's apply the comprehensive data preparation pipeline:


In [None]:
# Initialize data preparator
prep = DataPreparator(random_seed=42)

# Apply comprehensive data preparation
prepared_data = prep.prepare_data(data, apply_transformations=True)

# Count the columns that are genuinely new (absent from the input frame).
# A plain difference of column counts only gives the *net* change, so it
# under-reports (or even goes negative) whenever the pipeline drops or
# renames input columns.
n_new_features = sum(col not in data.columns for col in prepared_data.columns)

print(f"Prepared data shape: {prepared_data.shape}")
print(f"New features created: {n_new_features}")
print(f"Total features: {len(prepared_data.columns)}")


In [None]:
# Collect every prepared column whose name is absent from the input frame,
# keeping the order in which the columns appear in prepared_data
new_features = [name for name in prepared_data.columns if name not in data.columns]

# Numbered listing (1-based, index right-aligned to width 2)
print("New features created:")
for i, feature in enumerate(new_features, start=1):
    print(f"{i:2d}. {feature}")

# Column-count summary: input frame, added columns, prepared frame
print(
    "\nFeature categories:",
    f"- Original features: {len(data.columns)}",
    f"- New features: {len(new_features)}",
    f"- Total features: {len(prepared_data.columns)}",
    sep="\n",
)
