In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv


In [2]:
import numpy as np
import pandas as pd
import os
import warnings
from math import ceil
import gc

warnings.filterwarnings('ignore')

# **Configuration**

In [3]:
class Config:
    """Settings shared by every step of the feature engineering pipeline."""

    # --- Inputs: directory plus the three competition CSV files ---
    DATA_PATH = '../input/m5-forecasting-accuracy/'
    TRAIN_FILE = 'sales_train_validation.csv'
    CALENDAR_FILE = 'calendar.csv'
    PRICES_FILE = 'sell_prices.csv'

    # --- Output: pickle file that receives the processed grid ---
    OUTPUT_FILE = 'processed_data.pkl'

    # --- Target column and day-number boundaries ---
    TARGET = 'sales'
    START_DAY = 1000       # earlier days are dropped to save memory
    END_TRAIN = 1913       # rows after this day form the test period
    VALIDATION_DAYS = 28   # trailing training days held out for validation

    # --- Feature engineering: lag offsets and rolling window sizes (days) ---
    LAGS = [7, 28]
    ROLLING_WINDOWS = [7, 28]

    # --- Reproducibility seed ---
    RANDOM_STATE = 42


# **DATA LOADING**

In [4]:
def load_data():
    """Read the sales, price and calendar CSV files named in ``Config``.

    Returns:
        tuple: ``(train, prices, calendar)`` DataFrames, in that order.
    """
    rule = '=' * 80
    print(rule)
    print('STEP 1: LOADING DATA')
    print(rule)

    print('\nLoading files...')

    def _read(file_name):
        # Every input file lives in the same Config.DATA_PATH directory.
        return pd.read_csv(os.path.join(Config.DATA_PATH, file_name))

    train = _read(Config.TRAIN_FILE)
    prices = _read(Config.PRICES_FILE)
    calendar = _read(Config.CALENDAR_FILE)

    for label, frame in (('Train', train), ('Prices', prices), ('Calendar', calendar)):
        print(f' {label} dataset: {frame.shape}')

    return train, prices, calendar

# **CREATE BASE GRID**

In [5]:
def create_base_grid(train_df):
    """
    Reshape the wide sales table into a long grid and append the test period.

    Each ``d_*`` column of ``train_df`` becomes one row per item, and 28 extra
    days following ``Config.END_TRAIN`` are appended with a NaN target.

    Args:
        train_df: wide DataFrame holding the six identifier columns plus one
            ``d_<n>`` column per day.

    Returns:
        Long DataFrame with the identifier columns (as ``category``), an
        int16 day number ``d`` and the ``Config.TARGET`` column.
    """
    rule = '=' * 80
    print('\n' + rule)
    print('STEP 2: CREATING BASE GRID')
    print(rule)

    # Columns that identify an item/store series
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    print('\nTransforming data from wide to long format...')
    # Unpivot: every day column (d_1, d_2, ...) turns into rows
    grid = train_df.melt(
        id_vars=id_columns,
        var_name='d',
        value_name=Config.TARGET,
    )

    print(f' Grid created with {len(grid):,} rows')

    # One placeholder frame per future day, target unknown
    print('\nAdding test period rows...')
    identifiers = train_df[id_columns]
    future_frames = [
        identifiers.assign(**{'d': f'd_{Config.END_TRAIN + offset}', Config.TARGET: np.nan})
        for offset in range(1, 29)
    ]

    test_grid = pd.concat(future_frames, ignore_index=True)
    grid = pd.concat([grid, test_grid], ignore_index=True)

    print(f' Added test period: {len(test_grid):,} rows')
    print(f' Total grid size: {len(grid):,} rows')

    print('\nOptimizing data types...')
    # 'd_17' -> 17
    day_labels = grid['d'].str.replace('d_', '')
    grid['d'] = day_labels.astype(np.int16)

    # Identifier strings repeat heavily, so store them as categories
    for name in id_columns:
        grid[name] = grid[name].astype('category')

    print(f' Memory optimized')

    return grid

# **ADD PRICE FEATURES**

In [6]:
def add_price_features(grid, prices, calendar):
    """Add price-related features to the grid.

    The weekly price table is joined through the calendar's ``wm_yr_wk``
    week key. Added columns: ``sell_price``, ``price_norm`` (price divided by
    the item's maximum price), ``price_max``, ``price_min``, ``price_mean``
    and ``price_std`` (statistics per store/item over the whole price table).

    Args:
        grid: long DataFrame with ``store_id``, ``item_id`` and a numeric
            day column ``d``.
        prices: DataFrame with ``store_id``, ``item_id``, ``wm_yr_wk`` and
            ``sell_price``. Not modified.
        calendar: DataFrame with ``wm_yr_wk`` and a ``d`` column of
            ``'d_<n>'`` labels. Not modified.

    Returns:
        A new DataFrame: ``grid`` plus the price columns, in the same row order.
    """
    print('\n' + '='*80)
    print('STEP 3: ADDING PRICE FEATURES')
    print('='*80)

    key_cols = ['store_id', 'item_id']

    # Get week information from calendar
    print('\nMerging calendar week information...')
    calendar_prices = calendar[['wm_yr_wk', 'd']].drop_duplicates()
    calendar_prices = calendar_prices.assign(
        d=calendar_prices['d'].str.replace('d_', '').astype(np.int16)
    )

    grid = grid.merge(calendar_prices, on='d', how='left')

    # Calculate price statistics per item
    print('Calculating price statistics...')
    price_stats = prices.groupby(key_cols)['sell_price'].agg([
        ('price_max', 'max'),
        ('price_min', 'min'),
        ('price_mean', 'mean'),
        ('price_std', 'std')
    ]).reset_index()

    # Merge statistics to prices (rebinding the local name leaves the caller's frame untouched)
    prices = prices.merge(price_stats, on=key_cols, how='left')

    # Price relative to the item's maximum price
    prices['price_norm'] = prices['sell_price'] / prices['price_max']

    price_table = prices[key_cols + ['wm_yr_wk', 'sell_price', 'price_norm',
                                     'price_max', 'price_min', 'price_mean', 'price_std']]

    # Merging categorical grid keys against plain string keys would coerce the
    # grid's key columns to object dtype and undo the memory optimisation.
    # Casting the price keys to the grid's exact categorical dtype avoids that.
    key_dtypes = {
        col: grid[col].dtype
        for col in key_cols
        if isinstance(grid[col].dtype, pd.CategoricalDtype)
    }
    if key_dtypes:
        price_table = price_table.astype(key_dtypes)
        # Keys unknown to the grid became NaN; they can never match a grid row.
        price_table = price_table.dropna(subset=list(key_dtypes))

    # Merge prices to grid
    print('Merging prices to grid...')
    grid = grid.merge(price_table, on=key_cols + ['wm_yr_wk'], how='left')

    # Safety net: restore the categorical dtype if the merge still changed it.
    for col, dtype in key_dtypes.items():
        if grid[col].dtype != dtype:
            grid[col] = grid[col].astype(dtype)

    # Drop temporary week column
    grid.drop(columns=['wm_yr_wk'], inplace=True)

    print(f' Price features added')
    print(f' Grid shape: {grid.shape}')

    return grid


# **ADD CALENDAR FEATURES**

In [7]:

def add_calendar_features(grid, calendar):
    """Join calendar events/SNAP flags onto the grid and derive date parts.

    Args:
        grid: long DataFrame with a numeric day column ``d``.
        calendar: DataFrame with ``d`` (``'d_<n>'`` labels), ``date``, the
            four event columns and the three ``snap_*`` columns. Not modified.

    Returns:
        A new DataFrame with the event/SNAP columns, the date components
        (``day``, ``week``, ``month``, ``year``, ``dayofweek``) and the
        ``is_weekend`` / ``is_month_start`` / ``is_month_end`` flags. The raw
        ``date`` column is not kept.
    """
    rule = '=' * 80
    print('\n' + rule)
    print('STEP 4: ADDING CALENDAR FEATURES')
    print(rule)

    # Work on a derived calendar whose day key is numeric ('d_17' -> 17)
    day_numbers = calendar['d'].str.replace('d_', '').astype(np.int16)
    calendar_temp = calendar.assign(d=day_numbers)

    event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    wanted = ['d', 'date'] + event_columns + ['snap_CA', 'snap_TX', 'snap_WI']

    print('\nMerging calendar information...')
    grid = grid.merge(calendar_temp[wanted], on='d', how='left')

    print('Extracting date features...')
    dates = pd.to_datetime(grid['date'])

    # Plain date components, stored in the smallest integer type that fits
    grid['day'] = dates.dt.day.astype(np.int8)
    grid['week'] = dates.dt.isocalendar().week.astype(np.int8)
    grid['month'] = dates.dt.month.astype(np.int8)
    grid['year'] = dates.dt.year.astype(np.int16)
    grid['dayofweek'] = dates.dt.dayofweek.astype(np.int8)

    # Flags derived from the components above
    grid['is_weekend'] = grid['dayofweek'].ge(5).astype(np.int8)
    grid['is_month_start'] = grid['day'].le(7).astype(np.int8)
    grid['is_month_end'] = grid['day'].ge(24).astype(np.int8)

    # The raw date is no longer needed once its parts are extracted
    del grid['date']

    # Event names/types are repeated strings: store them as categories
    for name in event_columns:
        grid[name] = grid[name].astype('category')

    print(f' Calendar features added')
    print(f' Grid shape: {grid.shape}')

    return grid


# **ADD LAG FEATURES**

In [8]:
def add_lag_features(grid):
    """
    Add lag features - the target value of the same series N rows earlier.

    Rows with ``d < Config.START_DAY`` are dropped first to save memory, then
    one ``lag_<N>`` column is added per entry of ``Config.LAGS``.

    The lag is positional within each ``id``: it equals "N days ago" only if
    each id's rows are in ascending day order with no missing days. The grid
    built by melting the wide table has that layout.

    Args:
        grid: long DataFrame with ``id``, ``d`` and the ``Config.TARGET`` column.

    Returns:
        The filtered DataFrame (fresh 0..n-1 index) with the lag columns added.
    """
    print('\n' + '='*80)
    print('STEP 5: ADDING LAG FEATURES')
    print('='*80)

    # Filter to recent data to save memory
    print(f'\nFiltering data from day {Config.START_DAY}...')
    grid = grid[grid['d'] >= Config.START_DAY].reset_index(drop=True)
    print(f' Grid size after filtering: {len(grid):,} rows')

    # GroupBy.shift is vectorised; it gives the same values as
    # transform(lambda x: x.shift(lag)) without one Python call per series.
    # observed=True only skips categories that have no rows.
    target_by_id = grid.groupby('id', observed=True)[Config.TARGET]

    print('\nCreating lag features:')
    for lag in Config.LAGS:
        print(f'  - lag_{lag} (sales from {lag} days ago)')
        grid[f'lag_{lag}'] = target_by_id.shift(lag)

    print(f' Lag features added')
    print(f' Grid shape: {grid.shape}')

    return grid

# **ADD ROLLING FEATURES**

In [9]:
def add_rolling_features(grid):
    """
    Add rolling window statistics of the 28-row-shifted target, per series.

    For every window in ``Config.ROLLING_WINDOWS`` the columns
    ``rolling_mean_<w>``, ``rolling_std_<w>``, ``rolling_min_<w>`` and
    ``rolling_max_<w>`` are added to ``grid`` in place.

    Both the shift and the rolling window are evaluated inside each ``id``
    group. Rolling over the flat column would mix values of different items,
    because consecutive rows of the grid belong to different ids.

    Shift and window are positional within each id; they correspond to days
    only if each id's rows are in ascending day order with no missing days.

    Args:
        grid: long DataFrame with ``id`` and the ``Config.TARGET`` column.

    Returns:
        The same DataFrame with the rolling columns added.
    """
    print('\n' + '='*80)
    print('STEP 6: ADDING ROLLING WINDOW FEATURES')
    print('='*80)

    # Shift by 28 rows per series so a feature never sees values from its own
    # 28-day forecast horizon (prevents data leakage).
    shift_days = 28

    # Work on a positional 0..n-1 index so results can be written back by
    # position, whatever index the caller's frame carries.
    ids = grid['id'].reset_index(drop=True)
    shifted = (
        grid.groupby('id', observed=True)[Config.TARGET]
        .shift(shift_days)
        .reset_index(drop=True)
    )

    print('\nCreating rolling features:')
    for window in Config.ROLLING_WINDOWS:
        print(f'\n  Window size: {window} days')

        # One rolling object per window, restricted to each id's own rows
        rolling = shifted.groupby(ids, observed=True).rolling(window)

        for stat in ('mean', 'std', 'min', 'max'):
            name = f'rolling_{stat}_{window}'
            print(f'    - {name}')
            # The result is indexed by (id, position): drop the id level and
            # restore the original row order before assigning.
            values = getattr(rolling, stat)().droplevel(0)
            grid[name] = values.reindex(shifted.index).to_numpy()

    print(f'\n Rolling features added')
    print(f' Grid shape: {grid.shape}')

    return grid

# **ADD CUSTOM FEATURES**

In [10]:
def add_custom_features(grid):
    """
    Add custom engineered features derived from existing columns.

    Added in place:
      * ``is_christmas_season``: December 15 or later.
      * ``is_thanksgiving``: fourth week of November (days 22-28), the span
        that always contains the fourth Thursday.
      * ``is_new_year``: January 1-7.
      * ``sales_velocity``: ``(lag_7 - lag_28) / 21`` (only when both lag
        columns exist).
      * ``price_change`` / ``price_increased``: row-to-row price difference
        within each store/item (only when ``sell_price`` exists).
      * ``is_summer`` (Jun-Aug), ``is_winter`` (Dec-Feb), ``week_of_month``.

    Args:
        grid: DataFrame with ``month`` and ``day``; optionally ``lag_7``,
            ``lag_28``, and ``sell_price`` with ``store_id`` / ``item_id``.

    Returns:
        The same DataFrame with the new columns added.
    """
    print('\n' + '='*80)
    print('STEP 7: ADDING CUSTOM FEATURES')
    print('='*80)

    print('\nCreating custom features:')

    # Week of month (1-5): days 1-7 -> 1, 8-14 -> 2, ...
    week_of_month = (grid['day'] - 1) // 7 + 1

    # 1. Holiday indicators
    print('  - Holiday indicators (christmas, thanksgiving, new_year)')
    grid['is_christmas_season'] = (
        (grid['month'] == 12) & (grid['day'] >= 15)
    ).astype(np.int8)

    # The calendar 'week' column is the ISO week of the YEAR (44-48 in
    # November), so comparing it with 4 never matches. The fourth week of the
    # MONTH is what identifies Thanksgiving week.
    grid['is_thanksgiving'] = (
        (grid['month'] == 11) & (week_of_month == 4)
    ).astype(np.int8)

    grid['is_new_year'] = (
        (grid['month'] == 1) & (grid['day'] <= 7)
    ).astype(np.int8)

    # 2. Sales velocity: average daily change between the two lags (28 - 7 = 21 days apart)
    if 'lag_7' in grid.columns and 'lag_28' in grid.columns:
        print('  - Sales velocity')
        grid['sales_velocity'] = (grid['lag_7'] - grid['lag_28']) / 21.0

    # 3. Price change indicators (difference to the previous row of the same store/item)
    if 'sell_price' in grid.columns:
        print('  - Price change indicators')
        # observed=True only skips store/item combinations that have no rows
        grid['price_change'] = grid.groupby(
            ['store_id', 'item_id'], observed=True
        )['sell_price'].diff()
        grid['price_increased'] = (grid['price_change'] > 0).astype(np.int8)

    # 4. Seasonal indicators
    print('  - Seasonal indicators (summer, winter)')
    grid['is_summer'] = grid['month'].isin([6, 7, 8]).astype(np.int8)
    grid['is_winter'] = grid['month'].isin([12, 1, 2]).astype(np.int8)

    # 5. Week of month
    print('  - Week of month')
    grid['week_of_month'] = week_of_month.astype(np.int8)

    print(f'\n Custom features added')
    print(f' Grid shape: {grid.shape}')

    return grid

# **HANDLE MISSING VALUES**

In [11]:
def handle_missing_values(grid, target_col=None):
    """
    Impute missing feature values, with one strategy per feature family.

    Strategies, applied in this order:
      1. ``lag_*`` columns: forward fill within each ``id``, then 0.
      2. ``rolling_*`` columns: the id's median of that column, then 0.
      3. price columns: backward then forward fill within each store/item.
      4. any other numerical column: 0; categorical columns: ``'Unknown'``.

    The target column is never imputed. Its NaNs mark the rows to forecast;
    filling them with 0 would make them look like real zero sales.

    Args:
        grid: feature DataFrame, modified in place.
        target_col: name of the target column to leave untouched. Defaults to
            ``Config.TARGET``.

    Returns:
        The same DataFrame with feature NaNs filled.
    """
    if target_col is None:
        target_col = Config.TARGET

    def _missing_features():
        # NaN count over every column except the target
        per_column = grid.isnull().sum()
        return int(per_column.drop(labels=[target_col], errors='ignore').sum())

    print('\n' + '='*80)
    print('STEP 8: HANDLING MISSING VALUES')
    print('='*80)

    print(f'\nMissing values before: {_missing_features():,}')

    # 1. Lag features - forward fill (use last known value)
    print('\nHandling lag features:')
    lag_cols = [col for col in grid.columns if 'lag_' in col]
    if lag_cols:
        print(f'  - Forward filling {len(lag_cols)} lag features')
        for col in lag_cols:
            # GroupBy.ffill replaces the deprecated groupby(...).fillna(method='ffill')
            grid[col] = grid.groupby('id', observed=True)[col].ffill().fillna(0)

    # 2. Rolling features - use the median of the same id
    print('\nHandling rolling features:')
    rolling_cols = [col for col in grid.columns if 'rolling_' in col]
    if rolling_cols:
        print(f'  - Median filling {len(rolling_cols)} rolling features')
        for col in rolling_cols:
            # Broadcast each id's median to its rows; ids with no value at all
            # have a NaN median and fall through to 0.
            medians = grid.groupby('id', observed=True)[col].transform('median')
            grid[col] = grid[col].fillna(medians).fillna(0)

    # 3. Price features - backward then forward fill
    print('\nHandling price features:')
    price_keys = ['store_id', 'item_id']
    price_cols = ['sell_price', 'price_norm', 'price_max', 'price_min',
                  'price_mean', 'price_std']
    for col in price_cols:
        if col in grid.columns:
            print(f'  - Filling {col}')
            grid[col] = grid.groupby(price_keys, observed=True)[col].bfill()
            grid[col] = grid.groupby(price_keys, observed=True)[col].ffill()

    # 4. Fill remaining NaN - categorical columns need separate handling
    print('\nFilling remaining NaN...')

    categorical_cols = grid.select_dtypes(include=['category']).columns.tolist()
    numerical_cols = [
        col for col in grid.columns
        if col not in categorical_cols and col != target_col
    ]

    # Fill numerical columns with 0
    print(f'  - Filling {len(numerical_cols)} numerical columns with 0')
    for col in numerical_cols:
        if grid[col].isnull().any():
            grid[col] = grid[col].fillna(0)

    # Fill categorical columns with 'Unknown'
    print(f'  - Filling {len(categorical_cols)} categorical columns')
    for col in categorical_cols:
        if grid[col].isnull().any():
            # A category must exist before it can be used as a fill value
            if 'Unknown' not in grid[col].cat.categories:
                grid[col] = grid[col].cat.add_categories(['Unknown'])
            grid[col] = grid[col].fillna('Unknown')

    print(f'\n Missing values after: {_missing_features():,}')
    if target_col in grid.columns:
        print(f' Target NaNs left untouched: {int(grid[target_col].isnull().sum()):,}')
    print(' All missing values handled')

    return grid


# **FEATURE SELECTION**

In [12]:
def get_feature_list(grid):
    """
    Work out which grid columns are model features and which are categorical.

    Identifier columns, the day number ``d`` and the target are left out;
    every other column counts as a feature.

    Args:
        grid: processed feature DataFrame.

    Returns:
        tuple: ``(features, categorical)`` - all feature names in column
        order, and the subset whose dtype is ``category`` or ``object``.
    """
    rule = '=' * 80
    print('\n' + rule)
    print('STEP 9: FEATURE SELECTION')
    print(rule)

    # Identifiers, day number and target are never used as features
    non_features = {
        'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
        'd', Config.TARGET,
    }

    features = []
    categorical = []
    for name in grid.columns:
        if name in non_features:
            continue
        features.append(name)
        dtype = grid[name].dtype
        if dtype.name == 'category' or dtype == 'object':
            categorical.append(name)

    numerical_count = len(features) - len(categorical)
    print(f'\n Total features: {len(features)}')
    print(f'  - Categorical: {len(categorical)}')
    print(f'  - Numerical: {numerical_count}')

    # Preview at most the first 15 features with their kind
    preview_size = 15
    print('\nSample features (first 15):')
    for position, feat in enumerate(features[:preview_size], 1):
        if feat in categorical:
            feat_type = 'categorical'
        else:
            feat_type = 'numerical'
        print(f'  {position:2d}. {feat:30s} ({feat_type})')

    hidden = len(features) - preview_size
    if hidden > 0:
        print(f'  ... and {hidden} more features')

    return features, categorical

# **TRAIN/VALIDATION/TEST SPLIT**

In [13]:
def create_train_valid_split(grid):
    """
    Cut the grid into training, validation and test parts by day number.

    * train: ``d <= END_TRAIN - VALIDATION_DAYS``
    * valid: the following ``VALIDATION_DAYS`` days, up to ``END_TRAIN``
    * test:  ``d > END_TRAIN``

    Rows with a NaN target are dropped from train and valid (which are then
    re-indexed from 0); the test part keeps all rows and its original index.

    Args:
        grid: DataFrame with ``d`` and the ``Config.TARGET`` column.

    Returns:
        tuple: ``(train, valid, test)`` DataFrames, each an independent copy.
    """
    rule = '=' * 80
    print('\n' + rule)
    print('STEP 10: TRAIN/VALIDATION/TEST SPLIT')
    print(rule)

    print('\nCreating time-based splits...')

    target = Config.TARGET
    last_train_day = Config.END_TRAIN - Config.VALIDATION_DAYS
    day = grid['d']

    # Three consecutive, non-overlapping day ranges
    in_train = day <= last_train_day
    in_valid = (day > last_train_day) & (day <= Config.END_TRAIN)
    in_test = day > Config.END_TRAIN

    train = grid[in_train].copy()
    valid = grid[in_valid].copy()
    test = grid[in_test].copy()

    # Only train/valid need a known target
    train = train.dropna(subset=[target]).reset_index(drop=True)
    valid = valid.dropna(subset=[target]).reset_index(drop=True)

    for label, part in (('Training', train), ('Validation', valid), ('Test', test)):
        print(f'\n {label} set: {len(part):,} rows')
        print(f'    Date range: d_{part["d"].min()} to d_{part["d"].max()}')

    return train, valid, test

# **SAVE PROCESSED DATA**

In [14]:
def save_processed_data(grid, features, categorical):
    """Write the processed grid, the feature metadata and a text report.

    Files created in the current working directory:
      * ``Config.OUTPUT_FILE`` - the grid, pickled.
      * ``feature_info.pkl`` - dict with feature names, categorical names,
        all column names and the target name.
      * ``feature_summary.txt`` - feature counts and names grouped by family.

    Args:
        grid: processed DataFrame to persist.
        features: list of feature column names.
        categorical: list of categorical feature names.
    """
    import pickle

    rule = '=' * 80
    print('\n' + rule)
    print('STEP 11: SAVING PROCESSED DATA')
    print(rule)

    # --- 1. The dataset itself ---
    print(f'\nSaving processed dataset to {Config.OUTPUT_FILE}...')
    grid.to_pickle(Config.OUTPUT_FILE)
    n_rows, n_cols = grid.shape[0], grid.shape[1]
    print(f' Dataset saved ({n_rows:,} rows, {n_cols:,} columns)')

    # --- 2. Feature metadata ---
    print('\nSaving feature information...')
    feature_info = {
        'features': features,
        'categorical': categorical,
        'all_columns': list(grid.columns),
        'target': Config.TARGET
    }

    with open('feature_info.pkl', 'wb') as info_file:
        pickle.dump(feature_info, info_file)

    print(' Feature information saved to feature_info.pkl')

    # --- 3. Text report ---
    print('\nCreating feature summary report...')

    # Assign features to families by name; a name may match several families
    time_tokens = ['day', 'week', 'month', 'year']
    lag_feats = [name for name in features if 'lag_' in name]
    rolling_feats = [name for name in features if 'rolling_' in name]
    price_feats = [name for name in features if 'price' in name.lower()]
    time_feats = [name for name in features if any(token in name for token in time_tokens)]
    event_feats = [name for name in features if 'event' in name]
    already_grouped = lag_feats + rolling_feats + price_feats + time_feats + event_feats
    custom_feats = [name for name in features if name not in already_grouped]

    sections = [
        ('Lag', lag_feats),
        ('Rolling', rolling_feats),
        ('Price', price_feats),
        ('Time', time_feats),
        ('Event', event_feats),
        ('Custom', custom_feats),
    ]

    report = [
        rule + '\n',
        'M5 FORECASTING - FEATURE ENGINEERING SUMMARY\n',
        rule + '\n\n',
        f'Total Features: {len(features)}\n',
        f'Categorical Features: {len(categorical)}\n',
        f'Numerical Features: {len(features) - len(categorical)}\n\n',
        rule + '\n',
        'FEATURE LIST\n',
        rule + '\n\n',
    ]
    for position, (title, names) in enumerate(sections):
        # Every section except the first is preceded by a blank line
        separator = '' if position == 0 else '\n'
        report.append(f'{separator}{title} Features ({len(names)}):\n')
        report.extend(f'  - {name}\n' for name in names)

    with open('feature_summary.txt', 'w') as summary_file:
        for chunk in report:
            summary_file.write(chunk)

    print(' Feature summary saved to feature_summary.txt')


# **MAIN PIPELINE**

In [15]:
# MAIN PIPELINE
# ============================================================================

def run_feature_engineering_pipeline():
    """Run every feature engineering step in order and save the results.

    Steps: load the CSV files, build the long grid, add price and calendar
    features, add lag / rolling / custom features, impute missing values,
    select features, split by time, save the outputs, and print a summary.
    """
    rule = '=' * 80

    print('\n' + rule)
    print('M5 WALMART SALES FORECASTING')
    print('PART 1: FEATURE ENGINEERING PIPELINE')
    print(rule + '\n')

    # Raw inputs
    train_df, prices_df, calendar_df = load_data()

    # Long-format grid; the wide table is released as soon as it is consumed
    grid = create_base_grid(train_df)
    del train_df
    gc.collect()

    # Features that need the price and calendar tables
    grid = add_price_features(grid, prices_df, calendar_df)
    grid = add_calendar_features(grid, calendar_df)
    del prices_df, calendar_df
    gc.collect()

    # Steps that only need the grid itself, applied in this fixed order
    grid_only_steps = (
        add_lag_features,
        add_rolling_features,
        add_custom_features,
        handle_missing_values,
    )
    for step in grid_only_steps:
        grid = step(grid)

    # Feature selection
    features, categorical = get_feature_list(grid)

    # Time-based split
    train, valid, test = create_train_valid_split(grid)

    # Persist the grid and the feature metadata
    save_processed_data(grid, features, categorical)

    # Closing report
    closing_lines = [
        '\n' + rule,
        'FEATURE ENGINEERING COMPLETED SUCCESSFULLY!',
        rule,
        '\nGenerated files:',
        '   processed_data.pkl - Complete processed dataset',
        '   feature_info.pkl - Feature names and types',
        '   feature_summary.txt - Detailed feature report',
        '\nDataset statistics:',
        f'  - Total rows: {len(grid):,}',
        f'  - Total columns: {len(grid.columns):,}',
        f'  - Features: {len(features):,}',
        f'  - Training samples: {len(train):,}',
        f'  - Validation samples: {len(valid):,}',
        f'  - Test samples: {len(test):,}',
        '\nNext step: Run Part 2 (model_training.py) for model training!',
        rule,
    ]
    for line in closing_lines:
        print(line)


In [16]:
# EXECUTE
# Run the whole pipeline only when this code is the entry point
# (__name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    run_feature_engineering_pipeline()


M5 WALMART SALES FORECASTING
PART 1: FEATURE ENGINEERING PIPELINE

STEP 1: LOADING DATA

Loading files...
 Train dataset: (30490, 1919)
 Prices dataset: (6841121, 4)
 Calendar dataset: (1969, 14)

STEP 2: CREATING BASE GRID

Transforming data from wide to long format...
 Grid created with 58,327,370 rows

Adding test period rows...
 Added test period: 853,720 rows
 Total grid size: 59,181,090 rows

Optimizing data types...
 Memory optimized

STEP 3: ADDING PRICE FEATURES

Merging calendar week information...
Calculating price statistics...
Merging prices to grid...
 Price features added
 Grid shape: (59181090, 14)

STEP 4: ADDING CALENDAR FEATURES

Merging calendar information...
Extracting date features...
 Calendar features added
 Grid shape: (59181090, 29)

STEP 5: ADDING LAG FEATURES

Filtering data from day 1000...
 Grid size after filtering: 28,721,580 rows

Creating lag features:
  - lag_7 (sales from 7 days ago)
  - lag_28 (sales from 28 days ago)
 Lag features added
 Grid sha