In [2]:
# FEATURE ENGINEERING FOR DEMAND FORECASTING

print("CANADIAN GROCERY DEMAND FORECASTING - FEATURE ENGINEERING")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Set random seed
np.random.seed(42)

print("Libraries imported successfully!")
print(f"Feature engineering started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

CANADIAN GROCERY DEMAND FORECASTING - FEATURE ENGINEERING
Libraries imported successfully!
Feature engineering started: 2025-09-05 13:37:53


In [8]:
#LOAD PREVIOUSLY GENERATED DATA
print("Loading saved datasets...")

import os
print(f"Current working directory: {os.getcwd()}")

# We're in notebooks/ but need to go up one level to access data/
data_path = '../data/raw/'
print(f"Looking for data in: {os.path.abspath(data_path)}")

try:
    print(f"Files in data/raw/: {os.listdir(data_path)}")
    
    # Load the datasets with correct relative paths
    sales_df = pd.read_csv('../data/raw/sales_data.csv')
    stores_df = pd.read_csv('../data/raw/stores_data.csv')
    products_df = pd.read_csv('../data/raw/products_data.csv')
    
    # Convert date column
    sales_df['date'] = pd.to_datetime(sales_df['date'])
    
    print(f"Data loaded successfully!")
    print(f"Sales records: {len(sales_df):,}")
    print(f"Date range: {sales_df['date'].min().date()} to {sales_df['date'].max().date()}")
    print(f"Stores: {sales_df['store_id'].nunique()}")
    print(f"Products: {sales_df['product_id'].nunique()}")
    
    # Quick data check
    print("\nData sample:")
    display(sales_df.head())
    print(f"\nData info:")
    print(f"Sales data shape: {sales_df.shape}")
    print(f"Stores data shape: {stores_df.shape}")
    print(f"Products data shape: {products_df.shape}")
    
except FileNotFoundError as e:
    print(f"Files not found: {e}")
    print("Let me check the directory structure...")
    
    # Check parent directory structure
    parent_dir = '..'
    print(f"\nContents of parent directory:")
    for item in os.listdir(parent_dir):
        item_path = os.path.join(parent_dir, item)
        if os.path.isdir(item_path):
            print(f"{item}/")
        else:
            print(f"{item}")

Loading saved datasets...
Current working directory: c:\Users\artha\OneDrive\Desktop\grocery-demand-forecasting\notebooks
Looking for data in: c:\Users\artha\OneDrive\Desktop\grocery-demand-forecasting\data\raw
Files in data/raw/: ['products_data.csv', 'sales_data.csv', 'stores_data.csv']
Data loaded successfully!
Sales records: 89,482
Date range: 2022-01-01 to 2023-12-31
Stores: 25
Products: 123

Data sample:


Unnamed: 0,date,store_id,product_id,product_name,category,brand,sales_quantity,price,revenue,promotion_flag,chain,province,store_size,population_density
0,2022-01-01,ST_002,PR_0029,No Name Ground Beef/kg,Meat,No Name,70,9.16,641.2,0,Metro,BC,Large,Suburban
1,2022-01-01,ST_013,PR_0048,National Brand Bananas/kg,Produce,National Brand,10,1.57,15.7,0,FreshCo,NS,Medium,Suburban
2,2022-01-01,ST_003,PR_0074,No Name Bagels 6pk,Bakery,No Name,99,3.66,362.34,0,Metro,PE,Medium,Urban
3,2022-01-01,ST_020,PR_0115,President's Choice Olive Oil 500ml,Pantry,President's Choice,76,7.47,567.72,0,FreshCo,NS,Small,Urban
4,2022-01-01,ST_014,PR_0078,National Brand Croissants 4pk,Bakery,National Brand,42,5.65,237.3,0,Sobeys,MB,Large,Urban



Data info:
Sales data shape: (89482, 14)
Stores data shape: (25, 6)
Products data shape: (123, 5)


In [10]:
# COMPREHENSIVE FEATURE ENGINEERING
print("Creating advanced features for ML models...")

def create_advanced_features(df):
    """
    Create comprehensive feature set for demand forecasting
    """
    df = df.copy()
    df = df.sort_values(['store_id', 'product_id', 'date'])
    
    print("Creating time-based features...")
    
    # === TIME-BASED FEATURES ===
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek  # 0=Monday, 6=Sunday
    df['week_of_year'] = df['date'].dt.isocalendar().week
    df['quarter'] = df['date'].dt.quarter
    
    # Weekend and weekday indicators
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_monday'] = (df['day_of_week'] == 0).astype(int)
    df['is_friday'] = (df['day_of_week'] == 4).astype(int)
    
    # Month patterns
    df['is_month_start'] = (df['day'] <= 7).astype(int)
    df['is_month_middle'] = ((df['day'] > 7) & (df['day'] <= 21)).astype(int)
    df['is_month_end'] = (df['day'] > 21).astype(int)
    
    # Seasonal features (cyclical encoding for better ML performance)
    df['sin_month'] = np.sin(2 * np.pi * df['month'] / 12)
    df['cos_month'] = np.cos(2 * np.pi * df['month'] / 12)
    df['sin_day_of_week'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['cos_day_of_week'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['sin_day_of_year'] = np.sin(2 * np.pi * df['date'].dt.dayofyear / 365)
    df['cos_day_of_year'] = np.cos(2 * np.pi * df['date'].dt.dayofyear / 365)
    
    print("🇨🇦 Adding Canadian holiday features...")
    
    # === CANADIAN HOLIDAYS ===
    def is_canadian_holiday(date):
        """Check if date is a Canadian statutory holiday"""
        month, day = date.month, date.day
        
        # Fixed holidays
        holidays = {
            (1, 1): 'New Year',
            (7, 1): 'Canada Day', 
            (12, 25): 'Christmas',
            (12, 26): 'Boxing Day'
        }
        
        # Floating holidays (simplified)
        if month == 2 and 10 <= day <= 16:  # Family Day
            return 'Family Day'
        if month == 5 and 18 <= day <= 24:  # Victoria Day
            return 'Victoria Day'
        if month == 9 and 1 <= day <= 7:   # Labour Day
            return 'Labour Day'
        if month == 10 and 8 <= day <= 14: # Thanksgiving
            return 'Thanksgiving'
        if month == 11 and day == 11:      # Remembrance Day
            return 'Remembrance Day'
            
        return holidays.get((month, day), None)
    
    df['holiday_name'] = df['date'].apply(is_canadian_holiday)
    df['is_holiday'] = (df['holiday_name'].notna()).astype(int)
    df['is_major_holiday'] = df['holiday_name'].isin(['Christmas', 'New Year', 'Canada Day']).astype(int)
    
    # Days before/after holidays
    df['days_to_holiday'] = 0
    df['days_from_holiday'] = 0
    
    holiday_dates = df[df['is_holiday'] == 1]['date'].unique()
    for holiday_date in holiday_dates:
        mask_before = (df['date'] >= holiday_date - timedelta(days=3)) & (df['date'] < holiday_date)
        mask_after = (df['date'] > holiday_date) & (df['date'] <= holiday_date + timedelta(days=3))
        
        df.loc[mask_before, 'days_to_holiday'] = (holiday_date - df.loc[mask_before, 'date']).dt.days
        df.loc[mask_after, 'days_from_holiday'] = (df.loc[mask_after, 'date'] - holiday_date).dt.days
    
    print("Creating lag and rolling features...")
    
    # === LAG FEATURES ===
    # Sort by store, product, and date for proper lag calculation
    df = df.sort_values(['store_id', 'product_id', 'date'])
    
    # Sales quantity lags
    for lag in [1, 3, 7, 14, 30]:
        df[f'sales_lag_{lag}'] = df.groupby(['store_id', 'product_id'])['sales_quantity'].shift(lag)
    
    # Revenue lags
    for lag in [1, 7, 30]:
        df[f'revenue_lag_{lag}'] = df.groupby(['store_id', 'product_id'])['revenue'].shift(lag)
    
    # Price lags
    df['price_lag_1'] = df.groupby(['store_id', 'product_id'])['price'].shift(1)
    df['price_lag_7'] = df.groupby(['store_id', 'product_id'])['price'].shift(7)
    
    print("Creating rolling window features...")
    
    # === ROLLING WINDOW FEATURES ===
    for window in [3, 7, 14, 30]:
        # Rolling means
        df[f'sales_rolling_mean_{window}'] = df.groupby(['store_id', 'product_id'])['sales_quantity'].rolling(
            window=window, min_periods=1).mean().values
        
        # Rolling standard deviations (volatility)
        df[f'sales_rolling_std_{window}'] = df.groupby(['store_id', 'product_id'])['sales_quantity'].rolling(
            window=window, min_periods=1).std().values
        
        # Rolling max/min
        df[f'sales_rolling_max_{window}'] = df.groupby(['store_id', 'product_id'])['sales_quantity'].rolling(
            window=window, min_periods=1).max().values
        df[f'sales_rolling_min_{window}'] = df.groupby(['store_id', 'product_id'])['sales_quantity'].rolling(
            window=window, min_periods=1).min().values
    
    print("Creating price and promotion features...")
    
    # === PRICE FEATURES ===
    df['price_change'] = df['price'] - df['price_lag_1']
    df['price_change_pct'] = df['price_change'] / df['price_lag_1']
    df['price_vs_avg'] = df['price'] / df.groupby('product_id')['price'].transform('mean')
    
    # Price elasticity indicators
    df['is_price_drop'] = (df['price_change'] < -0.10).astype(int)
    df['is_price_increase'] = (df['price_change'] > 0.10).astype(int)
    
    # === PROMOTION FEATURES ===
    df['promotion_lag_1'] = df.groupby(['store_id', 'product_id'])['promotion_flag'].shift(1)
    df['promotion_lag_7'] = df.groupby(['store_id', 'product_id'])['promotion_flag'].shift(7)
    
    # Days since last promotion
    df['days_since_promotion'] = df.groupby(['store_id', 'product_id'])['promotion_flag'].apply(
        lambda x: (x == 0).cumsum() - (x == 0).cumsum().where(x == 1).ffill().fillna(0)
    ).values
    
    # Promotion frequency (in last 30 days)
    df['promotion_frequency_30d'] = df.groupby(['store_id', 'product_id'])['promotion_flag'].rolling(
        window=30, min_periods=1).sum().values
    
    print("Creating store and product aggregation features...")
    
    # === AGGREGATION FEATURES ===
    # Store-level daily totals
    df['store_daily_sales'] = df.groupby(['store_id', 'date'])['sales_quantity'].transform('sum')
    df['store_daily_revenue'] = df.groupby(['store_id', 'date'])['revenue'].transform('sum')
    df['store_daily_transactions'] = df.groupby(['store_id', 'date'])['sales_quantity'].transform('count')
    
    # Product-level daily totals (across all stores)
    df['product_daily_sales'] = df.groupby(['product_id', 'date'])['sales_quantity'].transform('sum')
    df['product_daily_revenue'] = df.groupby(['product_id', 'date'])['revenue'].transform('sum')
    
    # Category-level daily totals
    df['category_daily_sales'] = df.groupby(['category', 'date'])['sales_quantity'].transform('sum')
    df['category_daily_revenue'] = df.groupby(['category', 'date'])['revenue'].transform('sum')
    
    # Chain-level daily totals
    df['chain_daily_sales'] = df.groupby(['chain', 'date'])['sales_quantity'].transform('sum')
    
    print("Creating market share and relative features...")
    
    # === MARKET SHARE FEATURES ===
    # Product's share of store sales
    df['product_store_share'] = df['sales_quantity'] / df['store_daily_sales']
    df['product_store_revenue_share'] = df['revenue'] / df['store_daily_revenue']
    
    # Store's share of product sales
    df['store_product_share'] = df['sales_quantity'] / df['product_daily_sales']
    
    # Product's share of category sales
    df['product_category_share'] = df['sales_quantity'] / df['category_daily_sales']
    
    # Store performance relative to chain
    df['store_vs_chain_performance'] = df['store_daily_sales'] / df['chain_daily_sales']
    
    print("Creating product lifecycle features...")
    
    # === PRODUCT LIFECYCLE FEATURES ===
    # Days since product first appeared in store
    df['product_age_in_store'] = df.groupby(['store_id', 'product_id'])['date'].transform(
        lambda x: (x - x.min()).dt.days
    )
    
    # Product velocity (average daily sales)
    df['product_velocity'] = df.groupby(['store_id', 'product_id'])['sales_quantity'].transform('mean')
    
    print("Feature engineering complete!")
    
    return df

# Apply feature engineering
print("Starting comprehensive feature engineering...")
engineered_df = create_advanced_features(sales_df)

print(f"\nFEATURE ENGINEERING SUMMARY:")
print(f"Original features: {sales_df.shape[1]}")
print(f"New features: {engineered_df.shape[1]}")
print(f"Features added: {engineered_df.shape[1] - sales_df.shape[1]}")
print(f"Total rows: {len(engineered_df):,}")

Creating advanced features for ML models...
Starting comprehensive feature engineering...
Creating time-based features...
🇨🇦 Adding Canadian holiday features...
Creating lag and rolling features...
Creating rolling window features...
Creating price and promotion features...
Creating store and product aggregation features...
Creating market share and relative features...
Creating product lifecycle features...
Feature engineering complete!

FEATURE ENGINEERING SUMMARY:
Original features: 14
New features: 87
Features added: 73
Total rows: 89,482


In [11]:
# CATEGORICAL VARIABLE ENCODING
print("Encoding categorical variables for ML models...")

from sklearn.preprocessing import LabelEncoder

def encode_categorical_features(df):
    """
    Encode categorical variables using Label Encoding
    """
    df = df.copy()
    encoders = {}
    
    # Categorical columns to encode
    categorical_columns = [
        'store_id', 'product_id', 'category', 'brand', 'chain', 
        'province', 'store_size', 'population_density', 'holiday_name'
    ]
    
    for col in categorical_columns:
        if col in df.columns:
            print(f"  Encoding {col}...")
            le = LabelEncoder()
            
            # Handle missing values
            df[f'{col}_encoded'] = df[col].fillna('unknown')
            df[f'{col}_encoded'] = le.fit_transform(df[f'{col}_encoded'])
            
            # Store encoder for later use
            encoders[col] = le
            
            print(f"    {col}: {len(le.classes_)} unique values")
    
    return df, encoders

# Apply categorical encoding
engineered_df, label_encoders = encode_categorical_features(engineered_df)

print(f"\nCategorical encoding complete!")
print(f"Encoded columns: {len(label_encoders)}")

# Display feature summary
print(f"\nFINAL FEATURE SET:")
feature_columns = [col for col in engineered_df.columns if col not in ['date', 'product_name']]
print(f"Total features available: {len(feature_columns)}")

# Group features by type
time_features = [col for col in feature_columns if any(x in col for x in ['year', 'month', 'day', 'week', 'quarter', 'sin_', 'cos_', 'holiday', 'weekend'])]
lag_features = [col for col in feature_columns if 'lag_' in col]
rolling_features = [col for col in feature_columns if 'rolling_' in col]
price_features = [col for col in feature_columns if 'price' in col]
promotion_features = [col for col in feature_columns if 'promotion' in col]
aggregation_features = [col for col in feature_columns if any(x in col for x in ['daily_', 'share', 'vs_'])]
encoded_features = [col for col in feature_columns if '_encoded' in col]

print(f"\nFEATURE BREAKDOWN:")
print(f"  Time features: {len(time_features)}")
print(f"  Lag features: {len(lag_features)}")
print(f"  Rolling features: {len(rolling_features)}")
print(f"  Price features: {len(price_features)}")
print(f"  Promotion features: {len(promotion_features)}")
print(f"  Aggregation features: {len(aggregation_features)}")
print(f"  Encoded features: {len(encoded_features)}")

Encoding categorical variables for ML models...
  Encoding store_id...
    store_id: 25 unique values
  Encoding product_id...
    product_id: 123 unique values
  Encoding category...
    category: 6 unique values
  Encoding brand...
    brand: 3 unique values
  Encoding chain...
    chain: 7 unique values
  Encoding province...
    province: 10 unique values
  Encoding store_size...
    store_size: 3 unique values
  Encoding population_density...
    population_density: 3 unique values
  Encoding holiday_name...
    holiday_name: 10 unique values

Categorical encoding complete!
Encoded columns: 9

FINAL FEATURE SET:
Total features available: 94

FEATURE BREAKDOWN:
  Time features: 25
  Lag features: 12
  Rolling features: 16
  Price features: 8
  Promotion features: 5
  Aggregation features: 14
  Encoded features: 9


In [13]:
# FEATURE QUALITY ANALYSIS
print("Analyzing feature quality and correlations...")

# Remove rows with missing target variable
analysis_df = engineered_df.dropna(subset=['sales_quantity']).copy()

print(f"Analysis dataset: {len(analysis_df):,} rows")

# Select numerical features for analysis
numerical_features = analysis_df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features = [col for col in numerical_features if col != 'sales_quantity']  # Remove target

print(f"Numerical features for analysis: {len(numerical_features)}")

# Handle infinite values and missing data
analysis_df = analysis_df.replace([np.inf, -np.inf], np.nan)

# Fill missing values with median for numerical columns
missing_filled = 0
for col in numerical_features:
    if analysis_df[col].isnull().sum() > 0:
        median_val = analysis_df[col].median()
        missing_count = analysis_df[col].isnull().sum()
        analysis_df[col] = analysis_df[col].fillna(median_val)
        missing_filled += missing_count

print(f"Filled {missing_filled:,} missing values with median")

# Calculate correlations with target variable
print("Calculating correlations with target variable...")
correlations = analysis_df[numerical_features + ['sales_quantity']].corr()['sales_quantity'].abs().sort_values(ascending=False)

# Top 20 most correlated features
top_features = correlations.head(21).drop('sales_quantity')  # Exclude target itself

print(f"\nTOP 20 FEATURES CORRELATED WITH SALES:")
for i, (feature, corr) in enumerate(top_features.items(), 1):
    print(f"{i:2d}. {feature:<35} | Correlation: {corr:.4f}")

# Visualize top correlations
fig = px.bar(
    x=top_features.head(15).values,
    y=top_features.head(15).index,
    orientation='h',
    title='Top 15 Features Correlated with Sales Quantity',
    labels={'x': 'Absolute Correlation', 'y': 'Features'},
    color=top_features.head(15).values,
    color_continuous_scale='viridis'
)
fig.update_layout(height=600, showlegend=False)
fig.show()

# Feature importance based on variance
feature_variance = analysis_df[numerical_features].var().sort_values(ascending=False)
print(f"\nFEATURES WITH HIGHEST VARIANCE:")
for i, (feature, var) in enumerate(feature_variance.head(10).items(), 1):
    print(f"{i:2d}. {feature:<35} | Variance: {var:.2e}")

# Check for multicollinearity among top features
print(f"\nChecking multicollinearity among top 10 features...")
top_10_features = top_features.head(10).index.tolist()
correlation_matrix = analysis_df[top_10_features].corr()

# Find highly correlated feature pairs
high_corr_pairs = []
for i in range(len(top_10_features)):
    for j in range(i+1, len(top_10_features)):
        corr_val = abs(correlation_matrix.iloc[i, j])
        if corr_val > 0.8:  # High correlation threshold
            high_corr_pairs.append((top_10_features[i], top_10_features[j], corr_val))

if high_corr_pairs:
    print("Highly correlated feature pairs (>0.8):")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"  {feat1} <-> {feat2}: {corr:.3f}")
else:
    print("No highly correlated features found (good for model performance)")

print(f"\nDATA QUALITY SUMMARY:")
print(f"  Total engineered features: {len(numerical_features)}")
print(f"  Best feature correlation: {top_features.iloc[0]:.4f}")
print(f"  Features with >0.1 correlation: {sum(top_features > 0.1)}")
print(f"  Ready for model training: ")

Analyzing feature quality and correlations...
Analysis dataset: 89,482 rows
Numerical features for analysis: 84
Filled 337,351 missing values with median
Calculating correlations with target variable...

TOP 20 FEATURES CORRELATED WITH SALES:
 1. sales_rolling_mean_3                | Correlation: 0.7309
 2. sales_rolling_max_3                 | Correlation: 0.6678
 3. product_category_share              | Correlation: 0.6555
 4. revenue                             | Correlation: 0.6375
 5. sales_rolling_mean_7                | Correlation: 0.6260
 6. sales_rolling_mean_14               | Correlation: 0.5914
 7. sales_rolling_mean_30               | Correlation: 0.5840
 8. sales_rolling_min_3                 | Correlation: 0.5601
 9. sales_rolling_max_7                 | Correlation: 0.5463
10. product_daily_sales                 | Correlation: 0.5392
11. store_daily_sales                   | Correlation: 0.5177
12. product_velocity                    | Correlation: 0.5176
13. sales_rol


FEATURES WITH HIGHEST VARIANCE:
 1. category_daily_revenue              | Variance: 6.18e+06
 2. store_daily_revenue                 | Variance: 5.06e+05
 3. product_daily_revenue               | Variance: 1.20e+05
 4. chain_daily_sales                   | Variance: 1.10e+05
 5. category_daily_sales                | Variance: 8.00e+04
 6. product_age_in_store                | Variance: 4.49e+04
 7. revenue                             | Variance: 2.72e+04
 8. revenue_lag_1                       | Variance: 2.57e+04
 9. revenue_lag_7                       | Variance: 2.06e+04
10. store_daily_sales                   | Variance: 1.34e+04

Checking multicollinearity among top 10 features...
Highly correlated feature pairs (>0.8):
  sales_rolling_mean_3 <-> sales_rolling_max_3: 0.908
  sales_rolling_mean_3 <-> sales_rolling_mean_7: 0.869
  sales_rolling_mean_3 <-> sales_rolling_mean_14: 0.821
  sales_rolling_mean_3 <-> sales_rolling_mean_30: 0.811
  sales_rolling_max_3 <-> sales_rolling_mea

In [15]:
# SAVE ENGINEERED FEATURES

print("Saving engineered features and artifacts...")

import pickle
import json
import os

# Create processed directory
os.makedirs('../data/processed', exist_ok=True)

# Save the full engineered dataset
engineered_df.to_csv('../data/processed/engineered_features.csv', index=False)
print("Saved: engineered_features.csv")

# Save label encoders
with open('../data/processed/label_encoders.pkl', 'wb') as f:
   pickle.dump(label_encoders, f)
print("Saved: label_encoders.pkl")

# Save feature lists for model training
feature_sets = {
   'all_features': numerical_features,
   'top_features': top_features.head(50).index.tolist(),
   'top_20_features': top_features.head(20).index.tolist(),
   'time_features': time_features,
   'lag_features': lag_features,
   'rolling_features': rolling_features,
   'price_features': price_features,
   'promotion_features': promotion_features,
   'aggregation_features': aggregation_features,
   'encoded_features': encoded_features
}

with open('../data/processed/feature_sets.pkl', 'wb') as f:
   pickle.dump(feature_sets, f)
print("Saved: feature_sets.pkl")

# Create summary report with JSON-safe data types
summary_report = {
   'feature_engineering_timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
   'dataset_info': {
       'total_rows': int(len(engineered_df)),
       'total_features': int(len(numerical_features)),
       'date_range': {
           'start': engineered_df['date'].min().strftime('%Y-%m-%d'),
           'end': engineered_df['date'].max().strftime('%Y-%m-%d')
       },
       'stores': int(engineered_df['store_id'].nunique()),
       'products': int(engineered_df['product_id'].nunique()),
       'categories': int(engineered_df['category'].nunique()),
       'provinces': int(engineered_df['province'].nunique())
   },
   'feature_breakdown': {
       'time_features': int(len(time_features)),
       'lag_features': int(len(lag_features)),
       'rolling_features': int(len(rolling_features)),
       'price_features': int(len(price_features)),
       'promotion_features': int(len(promotion_features)),
       'aggregation_features': int(len(aggregation_features)),
       'encoded_features': int(len(encoded_features))
   },
   'top_features_correlation': {str(f): float(corr) for f, corr in top_features.head(20).items()},
   'data_quality': {
       'missing_values_filled': int(missing_filled),
       'features_with_high_correlation': int(sum(top_features > 0.1)),
       'multicollinearity_issues': int(len(high_corr_pairs) if 'high_corr_pairs' in locals() else 0)
   },
   'ready_for_modeling': True
}

with open('../data/processed/feature_engineering_summary.json', 'w') as f:
   json.dump(summary_report, f, indent=2)
print("Saved: feature_engineering_summary.json")

# Display final summary
print("\nFEATURE ENGINEERING COMPLETE!")
print("="*70)
print("FINAL DATASET SUMMARY:")
print(f"   Total records: {len(engineered_df):,}")
print(f"   Total features: {len(numerical_features)}")
print(f"   Best feature correlation: {top_features.iloc[0]:.4f} ({top_features.index[0]})")
print(f"   Data span: {summary_report['dataset_info']['date_range']['start']} to {summary_report['dataset_info']['date_range']['end']}")
print(f"   Stores: {summary_report['dataset_info']['stores']}")
print(f"   Products: {summary_report['dataset_info']['products']}")
print(f"   Categories: {summary_report['dataset_info']['categories']}")
print("="*70)
print("READY FOR MODEL TRAINING PHASE!")
print("\nFiles saved in data/processed/:")
print("   engineered_features.csv - Complete feature set")
print("   label_encoders.pkl - Categorical encoders")
print("   feature_sets.pkl - Feature groupings")
print("   feature_engineering_summary.json - Summary report")

# Quick preview of what we'll use for training
print("\nNEXT PHASE PREVIEW:")
print(f"   Target variable: sales_quantity")
print(f"   Training features: {len(feature_sets['top_20_features'])} (top correlated)")
print(f"   Models: LightGBM + XGBoost")
print(f"   Evaluation: Time series cross-validation")

Saving engineered features and artifacts...
Saved: engineered_features.csv
Saved: label_encoders.pkl
Saved: feature_sets.pkl
Saved: feature_engineering_summary.json

FEATURE ENGINEERING COMPLETE!
FINAL DATASET SUMMARY:
   Total records: 89,482
   Total features: 84
   Best feature correlation: 0.7309 (sales_rolling_mean_3)
   Data span: 2022-01-01 to 2023-12-31
   Stores: 25
   Products: 123
   Categories: 6
READY FOR MODEL TRAINING PHASE!

Files saved in data/processed/:
   engineered_features.csv - Complete feature set
   label_encoders.pkl - Categorical encoders
   feature_sets.pkl - Feature groupings
   feature_engineering_summary.json - Summary report

NEXT PHASE PREVIEW:
   Target variable: sales_quantity
   Training features: 20 (top correlated)
   Models: LightGBM + XGBoost
   Evaluation: Time series cross-validation
