In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available dataset files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/model-training/processed_data.pkl
/kaggle/input/model-training/feature_info.pkl


In [2]:
import numpy as np
import pandas as pd
import pickle
import time
import gc
import warnings
import psutil
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, BayesianRidge
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# ML Models
import lightgbm as lgb

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from pandas and LightGBM.
warnings.filterwarnings('ignore')

In [3]:
class Config:
    """Pipeline-wide settings: data location, target column, CV layout and LightGBM hyper-parameters."""

    # Pickled feature grid read by the loading step (an input, despite the name).
    OUTPUT_FILE = '/kaggle/input/model-training/processed_data.pkl'

    # Column to predict and the seed shared with the model parameters below.
    TARGET = 'sales'
    RANDOM_STATE = 42

    # RAM budget in GB.
    # NOTE(review): not referenced by any function visible in this notebook.
    MAX_RAM_GB = 25.0

    # Time-series cross-validation layout: number of folds and the number of
    # days passed as the `gap` argument of the fold builder.
    N_FOLDS = 3
    FOLD_GAP = 28

    # LightGBM training parameters (Tweedie objective, RMSE reported as metric).
    LGBM_PARAMS = dict(
        objective='tweedie',
        tweedie_variance_power=1.1,
        metric='rmse',
        boosting_type='gbdt',
        learning_rate=0.05,
        num_leaves=127,
        max_depth=8,
        min_data_in_leaf=50,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l1=0.1,
        lambda_l2=0.1,
        verbose=-1,
        seed=RANDOM_STATE,
    )
    
# ============================================================================
# MEMORY UTILITIES
# ============================================================================


In [4]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in GB (1024**3 bytes)."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024**3

def force_cleanup():
    """Run the garbage collector three times back to back."""
    passes_left = 3
    while passes_left > 0:
        gc.collect()
        passes_left -= 1

def reduce_memory(df):
    """
    Downcast numeric columns of ``df`` in place and print the memory saving.

    Rules, decided per column from its NumPy dtype kind:
      * signed integers   -> smallest of int8/int16/int32 that holds [min, max]
      * unsigned integers -> smallest of uint8/uint16/uint32 that holds [min, max]
      * floats wider than 32 bits -> float32
      * everything else (bool, datetime, object, category and other pandas
        extension dtypes) is left untouched.

    Bounds are inclusive, so a column whose extreme value sits exactly on a
    dtype limit (e.g. 127 for int8) still gets the smaller dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are not
        # np.dtype instances; casting them to a NumPy type could fail or lose
        # information, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            c_min = df[col].min()
            c_max = df[col].max()

            for candidate in candidates:
                # Candidates are ordered by width; once one is as wide as the
                # current dtype there is nothing left to gain.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, both comparisons are
                # False and the dtype is kept.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'   Memory: {start_mem:.0f}MB â†’ {end_mem:.0f}MB ({saved_pct:.1f}% â†“)')

    return df

In [5]:
# WRMSSE METRIC

def calculate_wrmsse(y_true, y_pred, weights=None):
    """
    Mean-scaled, optionally weighted RMSE.

    This is a lightweight stand-in, NOT the official M5 WRMSSE: the error is
    scaled by the mean of ``y_true`` rather than by per-series naive-forecast
    errors, and no hierarchy aggregation is performed.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values of equal length. Both are converted to
        float NumPy arrays, so pandas indexes are ignored (values are compared
        positionally).
    weights : array-like, optional
        Per-observation weights applied to the squared errors. ``None``
        (default) weights every observation equally.

    Returns
    -------
    float
        ``sqrt(weighted mean squared error) / mean(y_true)``; the divisor is
        replaced by 1.0 when ``mean(y_true)`` is exactly 0.

    Raises
    ------
    ZeroDivisionError
        From ``numpy.average`` when the supplied weights sum to zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    squared_error = (actual - predicted) ** 2

    if weights is None:
        mse = np.mean(squared_error)
    else:
        mse = np.average(squared_error, weights=np.asarray(weights, dtype=float))
    rmse = np.sqrt(mse)

    # Scale by the average level of the observed values.
    scale = np.mean(actual)

    # Avoid division by zero for an all-zero target.
    if scale == 0:
        scale = 1.0

    return rmse / scale



In [6]:
def create_advanced_features(grid, is_train=True):
    """
    Add lag, rolling-window and exponentially-weighted features of the target.

    The frame is sorted by ``item_id`` and ``d`` and re-indexed; features are
    then computed per (``item_id``, ``store_id``) series:

      * ``lag_{1,7,14,28}``            - target shifted by that many rows
      * ``rolling_mean_/rolling_std_`` - 7/14/28-row windows over the target
                                         shifted by one row
      * ``ewm_7`` / ``ewm_28``         - exponentially weighted means (span 7/28)
                                         over the target shifted by one row

    Target-derived features are only built when ``is_train`` is true and the
    target column is present. Afterwards NaNs in float32/float64 columns are
    replaced with 0 - except in the target column itself, so rows without a
    known target stay identifiable by later ``notna()`` filters.

    Parameters
    ----------
    grid : pandas.DataFrame
        Must contain ``item_id`` and ``d``; ``store_id`` and the target column
        are needed for the target-derived features.
    is_train : bool, default True
        Whether to build the target-derived features.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the extra columns.
    """
    print('\nðŸ”§ Creating advanced features...')

    group_keys = ['item_id', 'store_id']

    # Sort by item and date so rows of each (item, store) series are in day order.
    grid = grid.sort_values(['item_id', 'd']).reset_index(drop=True)

    def target_by_series():
        # observed=True only skips category combinations that never occur;
        # shift/transform results stay aligned to the original rows.
        return grid.groupby(group_keys, observed=True)[Config.TARGET]

    if is_train and Config.TARGET in grid.columns:
        # NOTE(review): rows whose recent target values are missing get NaN
        # lags/windows, which are turned into 0 below - confirm that is the
        # intended input for rows more than one day past the last known target.
        print('   Creating lag features...')
        for lag in [1, 7, 14, 28]:
            grid[f'lag_{lag}'] = target_by_series().shift(lag)

        print('   Creating rolling features...')
        for window in [7, 14, 28]:
            # shift(1) keeps the current row's own target out of its window.
            grid[f'rolling_mean_{window}'] = target_by_series().transform(
                lambda x: x.shift(1).rolling(window).mean()
            )
            grid[f'rolling_std_{window}'] = target_by_series().transform(
                lambda x: x.shift(1).rolling(window).std()
            )

        print('   Creating exponential smoothing features...')
        for span in [7, 28]:
            grid[f'ewm_{span}'] = target_by_series().transform(
                lambda x: x.shift(1).ewm(span=span).mean()
            )

    # Fill NaNs in float feature columns. The target is skipped on purpose:
    # filling it would turn "unknown" into a real 0 label.
    for col in grid.columns:
        if col == Config.TARGET:
            continue
        if grid[col].dtype in [np.float32, np.float64]:
            grid[col] = grid[col].fillna(0)

    print(f'   âœ“ Features: {grid.shape[1]} columns')
    return grid

In [7]:
# TARGET ENCODING

def apply_target_encoding(train, valid, test, categorical_cols, target_col, smoothing=10):
    """
    Add a smoothed mean-target encoding column for each categorical column.

    For every column ``c`` in ``categorical_cols`` that exists in ``train``, a
    float32 column ``{c}_target_enc`` is added to all three frames:

        enc(k) = (mean_k * count_k + global_mean * smoothing) / (count_k + smoothing)

    where ``mean_k`` / ``count_k`` are computed on ``train`` only. Keys that
    have no entry in the mapping receive the global train mean.

    Columns with ``category`` dtype are keyed by their integer codes, so
    missing values (code -1) form a group of their own.

    The statistics are in-sample: each train row's encoding includes that
    row's own target value.

    Parameters
    ----------
    train, valid, test : pandas.DataFrame
        Frames to extend; all three are modified in place.
    categorical_cols : iterable of str
        Candidate columns; names absent from ``train`` are skipped.
    target_col : str
        Target column in ``train``.
    smoothing : number, default 10
        Pseudo-count pulling small categories towards the global mean.

    Returns
    -------
    tuple
        ``(train, valid, test, encoded_cols)`` where ``encoded_cols`` lists
        the names of the columns that were added.
    """
    print('\n Applying target encoding...')

    encoded_cols = []

    # The global mean does not depend on the column being encoded, so it is
    # computed once instead of once per categorical column.
    global_mean = train[target_col].mean()

    for col in categorical_cols:
        if col not in train.columns:
            continue

        print(f'   Encoding: {col}')

        # Category columns are keyed by their integer codes.
        if train[col].dtype.name == 'category':
            train_col = train[col].cat.codes
            valid_col = valid[col].cat.codes
            test_col = test[col].cat.codes
        else:
            train_col = train[col]
            valid_col = valid[col]
            test_col = test[col]

        # Per-category mean and count of the target on the training frame.
        agg = train.groupby(train_col)[target_col].agg(['mean', 'count'])

        # Shrink each category mean towards the global mean.
        smoothed_means = (agg['mean'] * agg['count'] + global_mean * smoothing) / (agg['count'] + smoothing)
        encoding_map = smoothed_means.to_dict()

        # Write the encoding into a NEW float32 column on every frame.
        encoded_name = f'{col}_target_enc'
        train[encoded_name] = train_col.map(encoding_map).fillna(global_mean).astype(np.float32)
        valid[encoded_name] = valid_col.map(encoding_map).fillna(global_mean).astype(np.float32)
        test[encoded_name] = test_col.map(encoding_map).fillna(global_mean).astype(np.float32)

        encoded_cols.append(encoded_name)

    print(f'   Created {len(encoded_cols)} target-encoded features')
    return train, valid, test, encoded_cols

In [8]:
# TIME SERIES CROSS-VALIDATION
def time_series_cv_split(grid, n_folds=3, gap=28):
    """
    Build expanding-window time-series CV boundaries from the ``d`` column.

    The last fold validates on the final ``gap`` days up to the largest day in
    ``grid``; each earlier fold's validation window sits ``gap`` days before
    the next one. Every fold trains on all days before its validation window,
    so ``gap`` is both the validation-window length and the step between folds.

    Parameters
    ----------
    grid : pandas.DataFrame
        Frame with a numeric day column ``d``. Only that column is read.
    n_folds : int, default 3
        Number of folds to produce.
    gap : int, default 28
        Validation-window length in days.

    Returns
    -------
    list of dict
        One dict per fold, oldest first, with the inclusive day boundaries
        ``train_end``, ``valid_start`` and ``valid_end``.
    """
    print('\nðŸ“… Creating Time Series CV splits...')

    # Only the latest day is needed, so take it directly instead of sorting
    # every unique day first.
    max_day = grid['d'].max()

    folds = []

    for fold in range(n_folds):
        # Walk backwards from the last day: fold 0 is the oldest window.
        valid_end = max_day - (n_folds - fold - 1) * gap
        valid_start = valid_end - gap + 1
        train_end = valid_start - 1

        print(f'   Fold {fold+1}: Train up to day {train_end}, Valid days {valid_start}-{valid_end}')

        folds.append({
            'train_end': train_end,
            'valid_start': valid_start,
            'valid_end': valid_end
        })

    return folds



In [9]:
# SIMPLE FORECASTING

def simple_forecast(models, X_test):
    """
    Predict with the model stored under ``models['lgb']`` and floor the result at zero.

    The model's ``predict`` is called with ``num_iteration`` set to its
    ``best_iteration``; negative predictions are replaced by 0.
    """
    print(f'\n Generating predictions...')

    booster = models['lgb']
    raw_output = booster.predict(X_test, num_iteration=booster.best_iteration)

    # Sales cannot be negative.
    predictions = np.maximum(raw_output, 0)

    print(f'    Generated {len(predictions)} predictions')

    return predictions

def load_data():
    """
    Load the pickled feature grid, shrink its dtypes and decide the feature lists.

    The grid is read from ``Config.OUTPUT_FILE`` and passed through
    ``reduce_memory``. The feature and categorical-feature names are taken
    from ``feature_info.pkl`` when that file can be read and contains the keys
    ``'features'`` and ``'categorical'``. Otherwise they are derived from the
    grid: every column except the id/date/target columns is a feature, and
    features with ``category`` or ``object`` dtype are categorical. The reason
    for falling back is printed rather than silently discarded.

    Returns
    -------
    tuple
        ``(grid, features, categorical)``.
    """
    print('\n' + '='*80)
    print('STEP 1: LOADING DATA')
    print('='*80)

    print(f'\nInitial RAM: {get_memory_usage():.2f}GB')

    print('\nLoading processed_data.pkl...')
    grid = pd.read_pickle(Config.OUTPUT_FILE)
    print(f' Loaded: {grid.shape}')

    print('\nOptimizing memory...')
    grid = reduce_memory(grid)
    force_cleanup()

    # Extract features
    print('\nExtracting features...')
    try:
        # NOTE(review): relative path, resolved against the current working
        # directory - confirm the file is actually there, otherwise the
        # fallback below is always taken.
        # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
        with open('feature_info.pkl', 'rb') as f:
            feature_info = pickle.load(f)
        features = feature_info['features']
        categorical = feature_info['categorical']
    except Exception as exc:
        # Best-effort: a missing or unusable metadata file must not stop the
        # run, but say why the fallback is used. Catching Exception (not a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        print(f' feature_info.pkl not used ({type(exc).__name__}: {exc}); deriving features from columns')
        exclude = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', Config.TARGET]
        features = [col for col in grid.columns if col not in exclude]
        categorical = [col for col in features if grid[col].dtype.name == 'category' or grid[col].dtype == 'object']

    print(f' Features: {len(features)} ({len(categorical)} categorical)')
    print(f'  RAM: {get_memory_usage():.2f}GB')

    return grid, features, categorical



In [10]:
def prepare_enhanced_data(grid, features, categorical, end_train=1913, validation_days=28):
    """
    Build features, split by day into train/valid/test and encode categoricals.

    Steps:
      1. ``create_advanced_features`` adds the target-derived features.
      2. The feature list is rebuilt as every column except the id/date/target
         columns (the ``features`` argument is therefore only a placeholder).
      3. Categorical columns that are neither numeric nor ``category`` dtype
         are cast to ``category`` on the whole grid, so all splits share one
         code table and the model receives numeric input.
      4. Rows are split on ``d``: train ``<= end_train - validation_days``,
         valid up to ``end_train``, test after ``end_train``. Train/valid rows
         with a missing target are dropped.
      5. Target-encoded columns are added and appended to the feature list;
         categorical columns are replaced by their integer codes.

    Parameters
    ----------
    grid : pandas.DataFrame
        Full feature grid containing ``d``, the id columns and the target.
    features : list of str
        Ignored; the list is recomputed from the columns (kept for interface
        compatibility).
    categorical : list of str
        Names of categorical columns.
    end_train : int, default 1913
        Last day with a known target.
    validation_days : int, default 28
        Number of days, ending at ``end_train``, held out for validation.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, test, features,
        categorical)`` where ``categorical`` is restricted to names present in
        ``features``.
    """
    print('\n' + '='*80)
    print('STEP 2: ENHANCED DATA PREPARATION')
    print('='*80)

    # Create advanced features
    grid = create_advanced_features(grid, is_train=True)

    # Update features list
    exclude = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', Config.TARGET]
    features = [col for col in grid.columns if col not in exclude]

    # String-like categoricals have no `.cat` accessor and would otherwise
    # reach the model as raw strings; casting before the split keeps the codes
    # consistent across train/valid/test.
    for col in categorical:
        if col in grid.columns and grid[col].dtype.name != 'category' \
                and not pd.api.types.is_numeric_dtype(grid[col]):
            grid[col] = grid[col].astype('category')

    # Define splits
    train_end = end_train - validation_days

    # Create masks
    train_mask = grid['d'] <= train_end
    valid_mask = (grid['d'] > train_end) & (grid['d'] <= end_train)
    test_mask = grid['d'] > end_train

    # Split
    print('\nSplitting data...')
    train = grid[train_mask].copy()
    valid = grid[valid_mask].copy()
    test = grid[test_mask].copy()

    # Remove NaN targets
    train = train[train[Config.TARGET].notna()].reset_index(drop=True)
    valid = valid[valid[Config.TARGET].notna()].reset_index(drop=True)

    # Apply target encoding
    cat_for_encoding = [c for c in categorical if c in train.columns]
    train, valid, test, encoded_cols = apply_target_encoding(
        train, valid, test, cat_for_encoding, Config.TARGET
    )

    # Update features
    features = [f for f in features if f in train.columns] + encoded_cols

    # Encode categorical columns as integer codes
    print('\nEncoding categorical variables...')
    for col in categorical:
        if col in train.columns:
            train[col] = train[col].cat.codes if hasattr(train[col], 'cat') else train[col]
            valid[col] = valid[col].cat.codes if hasattr(valid[col], 'cat') else valid[col]
            test[col] = test[col].cat.codes if hasattr(test[col], 'cat') else test[col]

    # Extract X and y
    X_train = train[features]
    y_train = train[Config.TARGET]
    X_valid = valid[features]
    y_valid = valid[Config.TARGET]
    X_test = test[features]

    print(f'\n Train: {X_train.shape}')
    print(f' Valid: {X_valid.shape}')
    print(f' Test: {X_test.shape}')
    print(f'  RAM: {get_memory_usage():.2f}GB')

    # Only hand back categorical names that are real feature columns; a name
    # missing from the feature matrix is rejected by lgb.Dataset.
    categorical = [c for c in categorical if c in features]

    # Drops the local references only; the caller's own `grid` is unaffected.
    del grid, train, valid
    force_cleanup()

    return X_train, y_train, X_valid, y_valid, X_test, test, features, categorical



In [11]:
# TRAIN WITH TIME SERIES CV

def train_with_ts_cv(grid, features, categorical):
    """
    Train one LightGBM model per time-series CV fold and report the scores.

    Fold boundaries come from ``time_series_cv_split`` and are computed from
    the days that actually have a target, so no fold validates on rows whose
    target is missing. A fold whose train or validation part is empty is
    skipped with a message instead of failing inside LightGBM.

    Parameters
    ----------
    grid : pandas.DataFrame
        Feature grid with ``d``, the target column and all ``features``.
    features : list of str
        Columns used as model input.
    categorical : list of str
        Categorical column names; only those also in ``features`` are passed
        to LightGBM.

    Returns
    -------
    tuple
        ``(all_models, fold_scores)`` - the trained boosters and, per trained
        fold, a dict with ``fold``, ``rmse`` and ``wrmsse``.
    """
    print('\n' + '='*80)
    print('STEP 3: TRAINING WITH TIME SERIES CV')
    print('='*80)

    has_target = grid[Config.TARGET].notna()

    # Create CV folds from labelled days only; a single-column slice keeps
    # this cheap on a large grid.
    folds = time_series_cv_split(
        grid.loc[has_target, ['d']], n_folds=Config.N_FOLDS, gap=Config.FOLD_GAP
    )

    # lgb.Dataset rejects categorical names that are not feature columns.
    cat_features = [c for c in categorical if c in features]

    all_models = []
    fold_scores = []

    for fold_idx, fold_info in enumerate(folds):
        print(f'\n{"="*80}')
        print(f'FOLD {fold_idx + 1}/{Config.N_FOLDS}')
        print(f'{"="*80}')

        # Split data
        train_mask = grid['d'] <= fold_info['train_end']
        valid_mask = (grid['d'] >= fold_info['valid_start']) & (grid['d'] <= fold_info['valid_end'])

        train = grid[train_mask & has_target].copy()
        valid = grid[valid_mask & has_target].copy()

        if train.empty or valid.empty:
            print(f'   Skipping fold: {len(train)} train rows, {len(valid)} valid rows')
            del train, valid
            continue

        X_train = train[features]
        y_train = train[Config.TARGET]
        X_valid = valid[features]
        y_valid = valid[Config.TARGET]

        # Train LightGBM
        print('\nTraining LightGBM with Tweedie loss...')
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
        valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)

        lgb_model = lgb.train(
            Config.LGBM_PARAMS,
            train_data,
            num_boost_round=500,
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        lgb_pred = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
        lgb_rmse = np.sqrt(mean_squared_error(y_valid, lgb_pred))
        lgb_wrmsse = calculate_wrmsse(y_valid, lgb_pred)

        print(f'   RMSE: {lgb_rmse:.6f} | WRMSSE: {lgb_wrmsse:.6f}')

        fold_scores.append({
            'fold': fold_idx + 1,
            'rmse': lgb_rmse,
            'wrmsse': lgb_wrmsse
        })

        all_models.append(lgb_model)

        # Free the per-fold frames before the next fold is materialised.
        del train, valid, X_train, y_train, X_valid, y_valid, train_data, valid_data
        force_cleanup()

    # Print CV results
    print(f'\n{"="*80}')
    print('TIME SERIES CV RESULTS')
    print(f'{"="*80}')

    if not fold_scores:
        # Averaging an empty list would only yield NaN plus a warning.
        print('\nNo fold could be trained.')
        return all_models, fold_scores

    avg_rmse = np.mean([s['rmse'] for s in fold_scores])
    avg_wrmsse = np.mean([s['wrmsse'] for s in fold_scores])

    print(f'\nAverage RMSE: {avg_rmse:.6f}')
    print(f'Average WRMSSE: {avg_wrmsse:.6f}')

    for score in fold_scores:
        print(f"  Fold {score['fold']}: RMSE={score['rmse']:.6f}, WRMSSE={score['wrmsse']:.6f}")

    return all_models, fold_scores



In [12]:
# MAIN ENHANCED PIPELINE

def run_enhanced_pipeline():
    """
    Run the whole workflow: load, prepare, train, predict and save.

    Loads the grid, prepares the train/valid/test matrices, trains a single
    LightGBM model with early stopping on the validation split, prints the
    validation RMSE and scaled error, predicts the test rows and writes
    ``enhanced_predictions.csv`` and ``enhanced_models.pkl`` to the current
    working directory.

    Returns
    -------
    tuple
        ``(final_pred, models)`` - the test predictions and a dict holding the
        trained booster under ``'lgb'``.
    """
    # Training budget: upper bound on boosting rounds, patience of the early
    # stopping callback and how often the evaluation metric is logged.
    num_boost_round = 500
    early_stopping_rounds = 50
    log_every = 100

    # NOTE(review): the banner and the closing summary advertise
    # "Time Series CV", but this function never calls train_with_ts_cv -
    # only a single train/validation split is used.
    print('\n' + '='*80)
    print(' M5 WALMART - ENHANCED PIPELINE')
    print('    Tweedie Loss |  Time Series CV |  Target Encoding')
    print('    WRMSSE Metric |  Advanced Features |  LightGBM Only')
    print('='*80)

    start_time = time.time()

    # Load data
    grid, features, categorical = load_data()

    # Prepare with enhancements
    X_train, y_train, X_valid, y_valid, X_test, test, features, categorical = prepare_enhanced_data(
        grid, features, categorical
    )

    # The raw grid is not needed past this point; dropping this reference
    # lets it be reclaimed before the memory-hungry training step.
    del grid
    force_cleanup()

    # Train final models
    print('\n' + '='*80)
    print('TRAINING FINAL MODEL')
    print('='*80)

    models = {}

    # LightGBM with Tweedie
    print('\nTraining final LightGBM...')
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical)
    valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical)

    models['lgb'] = lgb.train(
        Config.LGBM_PARAMS,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(log_every)]
    )

    # Validation metrics
    valid_pred = models['lgb'].predict(X_valid, num_iteration=models['lgb'].best_iteration)
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    valid_wrmsse = calculate_wrmsse(y_valid, valid_pred)

    print(f'\n Validation RMSE: {valid_rmse:.6f}')
    print(f' Validation WRMSSE: {valid_wrmsse:.6f}')

    # Simple forecasting
    final_pred = simple_forecast(models, X_test)

    # Save results
    print('\n' + '='*80)
    print('SAVING RESULTS')
    print('='*80)

    submission = pd.DataFrame({
        'id': test['id'],
        'prediction': final_pred
    })
    submission.to_csv('enhanced_predictions.csv', index=False)
    print('âœ“ Saved: enhanced_predictions.csv')

    with open('enhanced_models.pkl', 'wb') as f:
        pickle.dump(models, f)
    print('âœ“ Saved: enhanced_models.pkl')

    elapsed = time.time() - start_time
    print('\n' + '='*80)
    print(' ENHANCED PIPELINE COMPLETE!')
    print('='*80)
    print(f'\nTotal time: {elapsed/60:.1f} minutes')
    print(f'Final RAM: {get_memory_usage():.2f}GB')
    print(f'\n Enhancements Applied:')
    print(f'    Tweedie loss function')
    print(f'    Time Series Cross-Validation')
    print(f'    Advanced lag & rolling features')
    print(f'    Target encoding')
    print(f'    WRMSSE metric')
    print(f'    Simple direct forecasting')
    print(f'    LightGBM only (faster & cleaner)')
    print(f'\nExpected RMSE: ~1.95-2.00 (improved!)')

    return final_pred, models

In [13]:
# Entry point: run the full pipeline and keep the test predictions and the
# trained model dict as notebook-level variables.
if __name__ == '__main__':
    final_predictions, trained_models = run_enhanced_pipeline()


 M5 WALMART - ENHANCED PIPELINE
    Tweedie Loss |  Time Series CV |  Target Encoding
    WRMSSE Metric |  Advanced Features |  LightGBM Only

STEP 1: LOADING DATA

Initial RAM: 0.32GB

Loading processed_data.pkl...
 Loaded: (28721580, 48)

Optimizing memory...
   Memory: 5999MB â†’ 3342MB (44.3% â†“)

Extracting features...
 Features: 40 (4 categorical)
  RAM: 3.61GB

STEP 2: ENHANCED DATA PREPARATION

ðŸ”§ Creating advanced features...
   Creating lag features...
   Creating rolling features...
   Creating exponential smoothing features...
   âœ“ Features: 54 columns

Splitting data...

 Applying target encoding...
   Encoding: event_name_1
   Encoding: event_type_1
   Encoding: event_name_2
   Encoding: event_type_2
   Created 4 target-encoded features

Encoding categorical variables...

 Train: (27014140, 50)
 Valid: (853720, 50)
 Test: (853720, 50)
  RAM: 18.17GB

TRAINING FINAL MODEL

Training final LightGBM...
Training until validation scores don't improve for 50 rounds
[100]	v