In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Create processed directory
processed_dir = "../data/processed/"
os.makedirs(processed_dir, exist_ok=True)

# Load data
data_path = "../data/raw data/dambulla_daily_vegetable_prices_2010_2025 (1).csv"
df = pd.read_csv(data_path)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"Unique vegetables: {df['Vegetable_Name'].nunique()}")

Dataset shape: (35166, 7)
Date range: 2010-01-01 00:00:00 to 2026-01-17 00:00:00
Unique vegetables: 6


In [2]:
def create_weekly_features_corrected(df):
    """
    Convert daily data to weekly with correct ISO week handling
    """
    df = df.copy()

    # CORRECTED: Use ISO year and week consistently
    iso_calendar = df['Date'].dt.isocalendar()
    df['ISO_Year'] = iso_calendar['year']
    df['ISO_Week'] = iso_calendar['week']

    # Create consistent week identifier
    df['Week_ID'] = df['ISO_Year'].astype(str) + '-W' + df['ISO_Week'].astype(str).str.zfill(2)

    # Use Monday as start of week (consistent with ISO)
    df['Week_Start'] = df['Date'] - pd.to_timedelta(df['Date'].dt.dayofweek, unit='D')

    return df

df = create_weekly_features_corrected(df)
print(f"Corrected week features created")
print(f"Sample Week_ID: {df['Week_ID'].iloc[0]}")
print(f"Week_Start sample: {df['Week_Start'].iloc[0]}")

Corrected week features created
Sample Week_ID: 2009-W53
Week_Start sample: 2009-12-28 00:00:00


In [3]:
def aggregate_weekly_safe(df, test_split_date='2023-01-01'):
    """
    Aggregate with leak-proof statistics
    """
    weekly_features = []
    test_split_date = pd.Timestamp(test_split_date)

    for veg_name in df['Vegetable_Name'].unique():
        veg_df = df[df['Vegetable_Name'] == veg_name].copy()

        # Separate train/test before aggregation
        veg_train = veg_df[veg_df['Date'] < test_split_date]
        veg_test = veg_df[veg_df['Date'] >= test_split_date]

        # Function to aggregate weekly
        def aggregate_group(group_df):
            return pd.Series({
                'Weekly_Arrival_Sum': group_df['Daily_Arrival_MT'].sum(),
                'Avg_Daily_Arrival': group_df['Daily_Arrival_MT'].mean(),
                'Std_Daily_Arrival': group_df['Daily_Arrival_MT'].std(),
                'Min_Daily_Arrival': group_df['Daily_Arrival_MT'].min(),
                'Max_Daily_Arrival': group_df['Daily_Arrival_MT'].max(),
                'Weekly_Sales_Sum': group_df['Estimated_Sales_MT'].sum(),
                'Avg_Daily_Sales': group_df['Estimated_Sales_MT'].mean(),
                'Std_Daily_Sales': group_df['Estimated_Sales_MT'].std(),
                'Min_Daily_Sales': group_df['Estimated_Sales_MT'].min(),
                'Max_Daily_Sales': group_df['Estimated_Sales_MT'].max(),
                'Avg_Weekly_Price': group_df['Wholesale_Price_Rs_kg'].mean(),
                'Std_Weekly_Price': group_df['Wholesale_Price_Rs_kg'].std(),
                'Min_Weekly_Price': group_df['Wholesale_Price_Rs_kg'].min(),
                'Max_Weekly_Price': group_df['Wholesale_Price_Rs_kg'].max(),
                'Season': group_df['Season'].iloc[0],
                'Weekly_Supply_Status': group_df['Supply_Status'].mode()[0] if not group_df['Supply_Status'].mode().empty else 'Stable'
            })

        # Aggregate train and test separately
        train_weekly = veg_train.groupby(['Week_ID', 'Week_Start', 'ISO_Year', 'ISO_Week']).apply(aggregate_group).reset_index()
        test_weekly = veg_test.groupby(['Week_ID', 'Week_Start', 'ISO_Year', 'ISO_Week']).apply(aggregate_group).reset_index()

        # Combine and add vegetable name
        veg_weekly = pd.concat([train_weekly, test_weekly])
        veg_weekly['Vegetable_Name'] = veg_name
        veg_weekly['Is_Train'] = veg_weekly['Week_Start'] < test_split_date

        weekly_features.append(veg_weekly)

    # Combine all vegetables
    weekly_df = pd.concat(weekly_features, ignore_index=True)
    weekly_df = weekly_df.sort_values(['Vegetable_Name', 'Week_Start'])

    return weekly_df

weekly_df = aggregate_weekly_safe(df)
print(f"\nWeekly dataset shape: {weekly_df.shape}")
print(f"Train samples: {weekly_df['Is_Train'].sum()}")
print(f"Test samples: {(~weekly_df['Is_Train']).sum()}")


Weekly dataset shape: (5034, 22)
Train samples: 4080
Test samples: 954


In [4]:
def create_lag_features_safe(weekly_df, lags=[1, 2, 4, 8, 12, 26, 52]):
    """
    Create lag features without train-test leakage
    """
    df = weekly_df.copy()

    # Sort to ensure proper lag calculation
    df = df.sort_values(['Vegetable_Name', 'Week_Start'])

    # Create lag features for each vegetable separately
    for veg_name in df['Vegetable_Name'].unique():
        veg_mask = df['Vegetable_Name'] == veg_name

        # Separate train and test indices
        train_mask = veg_mask & df['Is_Train']
        test_mask = veg_mask & (~df['Is_Train'])

        # Key variables to create lags for
        target_vars = ['Weekly_Arrival_Sum', 'Weekly_Sales_Sum', 'Avg_Weekly_Price']

        for var in target_vars:
            for lag in lags:
                col_name = f'{var}_lag_{lag}w'

                # Create lag for entire series
                lag_values = df.loc[veg_mask, var].shift(lag)
                df.loc[veg_mask, col_name] = lag_values

                # For test data, we can optionally forward-fill from last train value
                # But better to leave as NaN initially
                # We'll handle NaN filling separately

    # Rolling statistics - compute within train/test separately
    for veg_name in df['Vegetable_Name'].unique():
        veg_mask = df['Vegetable_Name'] == veg_name

        # Separate train and test
        veg_df = df[veg_mask].copy()
        train_idx = veg_df[veg_df['Is_Train']].index
        test_idx = veg_df[~veg_df['Is_Train']].index

        for window in [4, 8, 12, 26]:
            # Compute rolling stats separately for train and test
            # For train: use only train data
            if len(train_idx) >= window:
                train_arrival_ma = veg_df.loc[train_idx, 'Weekly_Arrival_Sum'].rolling(window=window, min_periods=1).mean()
                df.loc[train_idx, f'Weekly_Arrival_Sum_MA_{window}w'] = train_arrival_ma

                train_price_ma = veg_df.loc[train_idx, 'Avg_Weekly_Price'].rolling(window=window, min_periods=1).mean()
                df.loc[train_idx, f'Avg_Weekly_Price_MA_{window}w'] = train_price_ma

                train_arrival_std = veg_df.loc[train_idx, 'Weekly_Arrival_Sum'].rolling(window=window, min_periods=1).std()
                df.loc[train_idx, f'Weekly_Arrival_Sum_Std_{window}w'] = train_arrival_std

            # For test: use expanding window from last train values
            if len(test_idx) > 0:
                # Use all available data up to each point, but don't look into future
                for idx in test_idx:
                    current_date = df.loc[idx, 'Week_Start']
                    historical_mask = (df['Vegetable_Name'] == veg_name) & (df['Week_Start'] < current_date)

                    if historical_mask.sum() >= window:
                        historical_arrival = df.loc[historical_mask, 'Weekly_Arrival_Sum'].tail(window)
                        df.loc[idx, f'Weekly_Arrival_Sum_MA_{window}w'] = historical_arrival.mean()
                        df.loc[idx, f'Avg_Weekly_Price_MA_{window}w'] = df.loc[historical_mask, 'Avg_Weekly_Price'].tail(window).mean()
                        df.loc[idx, f'Weekly_Arrival_Sum_Std_{window}w'] = historical_arrival.std()

    return df

weekly_df = create_lag_features_safe(weekly_df)
print(f"\nLag features created safely")
print(f"Missing values in lag features: {weekly_df[[col for col in weekly_df.columns if 'lag' in col]].isnull().sum().sum()}")


Lag features created safely
Missing values in lag features: 1890


In [5]:
def create_statistical_features_safe(weekly_df):
    """
    Create statistical features with clipping for stability
    """
    df = weekly_df.copy()

    # 1. Supply-Demand Ratio (safe)
    df['Supply_Demand_Ratio'] = df['Weekly_Arrival_Sum'] / (df['Weekly_Sales_Sum'] + 1e-6)

    # 2. Price-Arrival Elasticity with clipping
    df['Price_Change_Pct'] = df.groupby('Vegetable_Name')['Avg_Weekly_Price'].pct_change() * 100
    df['Arrival_Change_Pct'] = df.groupby('Vegetable_Name')['Weekly_Arrival_Sum'].pct_change() * 100

    # CORRECTED: Handle division by zero safely
    epsilon = 1e-6
    df['Price_Arrival_Elasticity'] = df['Price_Change_Pct'] / (df['Arrival_Change_Pct'] + epsilon * np.sign(df['Arrival_Change_Pct']))

    # Clip extreme values
    df['Price_Arrival_Elasticity'] = np.clip(df['Price_Arrival_Elasticity'], -10, 10)

    # Alternative: Use log elasticity for better distribution
    df['Log_Price_Arrival_Elasticity'] = np.log1p(np.abs(df['Price_Arrival_Elasticity'])) * np.sign(df['Price_Arrival_Elasticity'])

    # 3. Volatility measures
    df['Price_Volatility'] = df['Std_Weekly_Price'] / (df['Avg_Weekly_Price'] + epsilon)
    df['Arrival_Volatility'] = df['Std_Daily_Arrival'] / (df['Avg_Daily_Arrival'] + epsilon)

    # 4. Range features
    df['Price_Range'] = df['Max_Weekly_Price'] - df['Min_Weekly_Price']
    df['Arrival_Range'] = df['Max_Daily_Arrival'] - df['Min_Daily_Arrival']

    # 5. Utilization rate (clipped)
    df['Utilization_Rate'] = df['Weekly_Sales_Sum'] / (df['Weekly_Arrival_Sum'] + epsilon)
    df['Utilization_Rate'] = np.clip(df['Utilization_Rate'], 0, 2)  # Rarely exceeds 1, but safe

    # 6. Price position within week's range
    df['Price_Position'] = (df['Avg_Weekly_Price'] - df['Min_Weekly_Price']) / (df['Price_Range'] + epsilon)
    df['Price_Position'] = np.clip(df['Price_Position'], 0, 1)

    # 7. Train-safe trend features
    for veg_name in df['Vegetable_Name'].unique():
        veg_mask = df['Vegetable_Name'] == veg_name
        train_mask = veg_mask & df['Is_Train']
        test_mask = veg_mask & (~df['Is_Train'])

        # For training data
        train_idx = df[train_mask].index
        if len(train_idx) >= 4:
            for i, idx in enumerate(train_idx):
                if i >= 3:
                    window_idx = train_idx[i-3:i+1]
                    x = np.arange(4)
                    y_price = df.loc[window_idx, 'Avg_Weekly_Price'].values
                    y_arrival = df.loc[window_idx, 'Weekly_Arrival_Sum'].values

                    # Linear regression slope
                    slope_price = np.polyfit(x, y_price, 1)[0]
                    slope_arrival = np.polyfit(x, y_arrival, 1)[0]

                    df.loc[idx, 'Price_Trend_4w'] = slope_price
                    df.loc[idx, 'Arrival_Trend_4w'] = slope_arrival

        # For test data - only use past information
        test_idx = df[test_mask].index
        for idx in test_idx:
            current_date = df.loc[idx, 'Week_Start']
            past_mask = (df['Vegetable_Name'] == veg_name) & (df['Week_Start'] < current_date)
            past_idx = df[past_mask].index

            if len(past_idx) >= 4:
                window_idx = past_idx[-4:]
                x = np.arange(4)
                y_price = df.loc[window_idx, 'Avg_Weekly_Price'].values
                y_arrival = df.loc[window_idx, 'Weekly_Arrival_Sum'].values

                slope_price = np.polyfit(x, y_price, 1)[0]
                slope_arrival = np.polyfit(x, y_arrival, 1)[0]

                df.loc[idx, 'Price_Trend_4w'] = slope_price
                df.loc[idx, 'Arrival_Trend_4w'] = slope_arrival

    return df

weekly_df = create_statistical_features_safe(weekly_df)
print(f"\nStatistical features created safely")
print(f"Price_Arrival_Elasticity range: [{weekly_df['Price_Arrival_Elasticity'].min():.2f}, {weekly_df['Price_Arrival_Elasticity'].max():.2f}]")


Statistical features created safely
Price_Arrival_Elasticity range: [-10.00, 10.00]


In [6]:
def create_features_leak_proof(weekly_df):
    """
    Create features without leaking test information
    """
    df = weekly_df.copy()

    # Ensure we have the Month column from Week_Start
    if 'Month' not in df.columns:
        df['Month'] = df['Week_Start'].dt.month

    # 1. Train-only statistics for encoding
    train_df = df[df['Is_Train']].copy()

    # Vegetable-specific statistics from TRAIN ONLY
    veg_stats_train = train_df.groupby('Vegetable_Name').agg({
        'Weekly_Arrival_Sum': ['mean', 'std'],
        'Avg_Weekly_Price': ['mean', 'std'],
        'Weekly_Sales_Sum': 'mean'
    }).reset_index()

    # Flatten column names
    veg_stats_train.columns = ['Vegetable_Name',
                                'Veg_Train_Avg_Arrival', 'Veg_Train_Std_Arrival',
                                'Veg_Train_Avg_Price', 'Veg_Train_Std_Price',
                                'Veg_Train_Avg_Sales']

    # Merge stats back (from train only)
    df = pd.merge(df, veg_stats_train, on='Vegetable_Name', how='left')

    # 2. Z-scores using train statistics
    epsilon = 1e-6
    df['Arrival_Z_Score'] = (df['Weekly_Arrival_Sum'] - df['Veg_Train_Avg_Arrival']) / (df['Veg_Train_Std_Arrival'] + epsilon)
    df['Price_Z_Score'] = (df['Avg_Weekly_Price'] - df['Veg_Train_Avg_Price']) / (df['Veg_Train_Std_Price'] + epsilon)

    # 3. Month-year statistics from TRAIN ONLY
    # Ensure Month column exists in train_df
    if 'Month' not in train_df.columns:
        train_df['Month'] = train_df['Week_Start'].dt.month

    month_year_stats_train = train_df.groupby(['ISO_Year', 'Month']).agg({
        'Weekly_Arrival_Sum': 'mean',
        'Avg_Weekly_Price': 'mean'
    }).reset_index()

    month_year_stats_train.columns = ['ISO_Year', 'Month', 'MonthYear_Train_Avg_Arrival', 'MonthYear_Train_Avg_Price']

    # Merge with main dataframe
    df = pd.merge(df, month_year_stats_train, on=['ISO_Year', 'Month'], how='left')

    # 4. Deviation from train monthly averages
    df['Arrival_Dev_From_TrainMonthAvg'] = df['Weekly_Arrival_Sum'] - df['MonthYear_Train_Avg_Arrival']
    df['Price_Dev_From_TrainMonthAvg'] = df['Avg_Weekly_Price'] - df['MonthYear_Train_Avg_Price']

    # 5. Simple categorical encoding (no leakage)
    supply_status_map = {'Low': 0, 'Stable': 1, 'High': 2}
    df['Supply_Status_Encoded'] = df['Weekly_Supply_Status'].map(supply_status_map)

    season_map = {'Maha': 0, 'Yala': 1}
    df['Season_Encoded'] = df['Season'].map(season_map)

    # 6. Cyclical encoding (no leakage risk)
    # Ensure Month column exists
    if 'Month' not in df.columns:
        df['Month'] = df['Week_Start'].dt.month

    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

    # Ensure WeekOfYear exists
    if 'WeekOfYear' not in df.columns:
        df['WeekOfYear'] = df['Week_Start'].dt.isocalendar().week

    df['Week_sin'] = np.sin(2 * np.pi * df['WeekOfYear'] / 52.1429)  # Average weeks per year
    df['Week_cos'] = np.cos(2 * np.pi * df['WeekOfYear'] / 52.1429)

    # 7. Safe interaction terms
    df['Arrival_Price_Interaction'] = df['Weekly_Arrival_Sum'] * df['Avg_Weekly_Price']
    df['Season_Arrival_Interaction'] = df['Season_Encoded'] * df['Weekly_Arrival_Sum']

    return df

# Now run the function
weekly_df = create_features_leak_proof(weekly_df)
print(f"\nLeak-proof features created")
print(f"Z-score range (should be centered around 0): [{weekly_df['Price_Z_Score'].min():.2f}, {weekly_df['Price_Z_Score'].max():.2f}]")


Leak-proof features created
Z-score range (should be centered around 0): [-0.97, 2.89]


In [7]:
def handle_missing_values_robust(weekly_df):
    """
    Handle NaN values with train-test separation
    """
    df = weekly_df.copy()

    print(f"\nMissing values before handling:")
    missing_counts = df.isnull().sum()
    missing_cols = missing_counts[missing_counts > 0]
    print(f"Columns with missing values: {len(missing_cols)}")

    # Separate train and test
    train_mask = df['Is_Train']
    test_mask = ~df['Is_Train']

    # 1. Forward fill for lag features (train only propagates to test start)
    lag_cols = [col for col in df.columns if 'lag' in col or 'MA_' in col or 'Trend' in col]

    for col in lag_cols:
        if col in df.columns:
            # Forward fill within each vegetable
            df[col] = df.groupby('Vegetable_Name')[col].transform(lambda x: x.ffill())

    # 2. For each vegetable, fill train with median, test with last train value
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col not in ['ISO_Year', 'ISO_Week', 'Month', 'WeekOfYear']]

    for veg_name in df['Vegetable_Name'].unique():
        veg_mask = df['Vegetable_Name'] == veg_name
        veg_train_mask = veg_mask & train_mask
        veg_test_mask = veg_mask & test_mask

        for col in numeric_cols:
            if col in df.columns and df[col].isnull().any():
                # Train: fill with train median
                train_median = df.loc[veg_train_mask, col].median()
                df.loc[veg_train_mask, col] = df.loc[veg_train_mask, col].fillna(train_median)

                # Test: fill with train median (not test median!)
                df.loc[veg_test_mask, col] = df.loc[veg_test_mask, col].fillna(train_median)

    # 3. Handle categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    categorical_cols = [col for col in categorical_cols if col not in ['Week_ID', 'Vegetable_Name']]

    for col in categorical_cols:
        if col in df.columns and df[col].isnull().any():
            # Use mode from training data
            train_mode = df.loc[train_mask, col].mode()
            mode_value = train_mode[0] if not train_mode.empty else 'Unknown'
            df[col] = df[col].fillna(mode_value)

    print(f"\nMissing values after handling: {df.isnull().sum().sum()}")

    # 4. Remove early weeks with insufficient history for lag features
    # Keep only weeks where we have enough history for the longest lag (52 weeks)
    print(f"\nRemoving early weeks with insufficient history...")
    initial_count = len(df)

    # For each vegetable, remove first 52 weeks (or keep them with NaN for short lags)
    # Actually, we already forward-filled, so we can keep them

    print(f"Kept {len(df)} of {initial_count} rows")

    return df

weekly_df = handle_missing_values_robust(weekly_df)


Missing values before handling:
Columns with missing values: 42

Missing values after handling: 0

Removing early weeks with insufficient history...
Kept 5034 of 5034 rows


In [8]:
def create_targets_with_split(weekly_df, forecast_horizons=[1, 4]):
    """
    Create targets and prepare for time-series cross-validation
    """
    df = weekly_df.copy()

    # Sort chronologically
    df = df.sort_values(['Vegetable_Name', 'Week_Start'])

    # Create target variables
    print(f"\nCreating forecast targets for horizons: {forecast_horizons} weeks")

    for horizon in forecast_horizons:
        for veg_name in df['Vegetable_Name'].unique():
            veg_mask = df['Vegetable_Name'] == veg_name

            # Price target
            df.loc[veg_mask, f'Target_{horizon}w_Price'] = df.loc[veg_mask, 'Avg_Weekly_Price'].shift(-horizon)

            # Arrival target
            df.loc[veg_mask, f'Target_{horizon}w_Arrival'] = df.loc[veg_mask, 'Weekly_Arrival_Sum'].shift(-horizon)

            # Sales target
            df.loc[veg_mask, f'Target_{horizon}w_Sales'] = df.loc[veg_mask, 'Weekly_Sales_Sum'].shift(-horizon)

    # Remove rows without future targets (last few weeks of each vegetable)
    initial_len = len(df)
    df = df.dropna(subset=['Target_1w_Price', 'Target_1w_Arrival'])
    removed = initial_len - len(df)

    print(f"Removed {removed} rows without future targets")
    print(f"Final dataset shape: {df.shape}")

    # Create time index for each vegetable
    df['Time_Index'] = df.groupby('Vegetable_Name').cumcount()

    return df

weekly_df = create_targets_with_split(weekly_df)


Creating forecast targets for horizons: [1, 4] weeks
Removed 6 rows without future targets
Final dataset shape: (5028, 95)


In [9]:
def select_final_features(weekly_df):
    """
    Select final features for modeling, removing leakage-prone columns
    """
    df = weekly_df.copy()

    print(f"Initial columns: {len(df.columns)}")

    # List of columns to REMOVE (leakage-prone, IDs, redundant)
    drop_cols = [
        # Identifiers and dates (keep for reference but not for modeling)
        'Week_ID', 'Week_Start', 'MonthYear', 'Month_Vegetable',

        # Future targets (except the main ones we're predicting)
        'Target_1w_Sales', 'Target_4w_Sales', 'Target_4w_Arrival',
        'Target_NextWeek_Arrival', 'Target_NextWeek_Sales',  # Legacy names

        # Raw columns that are redundant with derived features
        'Min_Daily_Arrival', 'Max_Daily_Arrival',
        'Min_Daily_Sales', 'Max_Daily_Sales',
        'Min_Weekly_Price', 'Max_Weekly_Price',

        # Train-only statistics (leakage risk if used incorrectly)
        'Veg_Train_Avg_Arrival', 'Veg_Train_Std_Arrival',
        'Veg_Train_Avg_Price', 'Veg_Train_Std_Price', 'Veg_Train_Avg_Sales',
        'MonthYear_Train_Avg_Arrival', 'MonthYear_Train_Avg_Price',

        # Highly correlated with target (perfect predictors)
        'Avg_Weekly_Price',  # Too correlated with target
        'Weekly_Arrival_Sum',  # Use lagged versions instead
        'Weekly_Sales_Sum',  # Use lagged versions instead

        # Alternative versions (keep best one)
        'Price_Arrival_Elasticity',  # Keep log version
        'WeekOfYear',  # Keep sin/cos versions
        'Month',  # Keep sin/cos versions
    ]

    # Only drop columns that exist
    drop_cols = [col for col in drop_cols if col in df.columns]

    print(f"\nDropping {len(drop_cols)} columns:")
    for col in drop_cols[:20]:  # Show first 20
        print(f"  - {col}")
    if len(drop_cols) > 20:
        print(f"  ... and {len(drop_cols) - 20} more")

    df_reduced = df.drop(columns=drop_cols)

    # Keep essential reference columns separately
    reference_cols = ['Vegetable_Name', 'Week_Start', 'ISO_Year', 'ISO_Week', 'Is_Train', 'Time_Index']
    reference_df = df[reference_cols].copy()

    # Separate features and targets
    feature_cols = [col for col in df_reduced.columns
                    if not col.startswith('Target_')
                    and col not in reference_cols]

    target_cols = [col for col in df_reduced.columns if col.startswith('Target_')]

    print(f"\nFinal feature selection:")
    print(f"  - Reference columns: {len(reference_cols)}")
    print(f"  - Feature columns: {len(feature_cols)}")
    print(f"  - Target columns: {len(target_cols)}")
    print(f"  - Total columns kept: {len(reference_cols) + len(feature_cols) + len(target_cols)}")

    # Create feature categories for documentation
    feature_categories = {
        'Lag Features': [col for col in feature_cols if 'lag' in col],
        'Moving Averages': [col for col in feature_cols if 'MA_' in col],
        'Seasonal Features': [col for col in feature_cols if 'sin' in col or 'cos' in col or 'Season' in col],
        'Volatility Features': [col for col in feature_cols if 'Volatility' in col or 'Std_' in col],
        'Ratio Features': [col for col in feature_cols if 'Ratio' in col or 'Rate' in col],
        'Trend Features': [col for col in feature_cols if 'Trend' in col],
        'Deviation Features': [col for col in feature_cols if 'Dev_' in col or 'Z_Score' in col],
        'Interaction Features': [col for col in feature_cols if 'Interaction' in col],
    }

    print(f"\nFeature categories:")
    for category, features in feature_categories.items():
        if features:
            print(f"  {category}: {len(features)} features")

    return df, feature_cols, target_cols, reference_df

final_df, feature_cols, target_cols, reference_df = select_final_features(weekly_df)

Initial columns: 96

Dropping 24 columns:
  - Week_ID
  - Week_Start
  - Target_1w_Sales
  - Target_4w_Sales
  - Target_4w_Arrival
  - Min_Daily_Arrival
  - Max_Daily_Arrival
  - Min_Daily_Sales
  - Max_Daily_Sales
  - Min_Weekly_Price
  - Max_Weekly_Price
  - Veg_Train_Avg_Arrival
  - Veg_Train_Std_Arrival
  - Veg_Train_Avg_Price
  - Veg_Train_Std_Price
  - Veg_Train_Avg_Sales
  - MonthYear_Train_Avg_Arrival
  - MonthYear_Train_Avg_Price
  - Avg_Weekly_Price
  - Weekly_Arrival_Sum
  ... and 4 more

Final feature selection:
  - Reference columns: 6
  - Feature columns: 64
  - Target columns: 3
  - Total columns kept: 73

Feature categories:
  Lag Features: 21 features
  Moving Averages: 8 features
  Seasonal Features: 7 features
  Volatility Features: 9 features
  Ratio Features: 2 features
  Trend Features: 2 features
  Deviation Features: 4 features
  Interaction Features: 2 features


In [10]:
def save_final_datasets(final_df, feature_cols, target_cols, reference_df, processed_dir):
    """
    Save processed datasets with clear separation
    """
    # 1. Save full dataset
    full_path = os.path.join(processed_dir, "processed_weekly_full.csv")
    final_df.to_csv(full_path, index=False)

    # 2. Save modeling-ready dataset (features + targets)
    model_cols = feature_cols + target_cols
    model_df = final_df[model_cols].copy()
    model_path = os.path.join(processed_dir, "processed_weekly_modeling.csv")
    model_df.to_csv(model_path, index=False)

    # 3. Save reference data separately
    ref_path = os.path.join(processed_dir, "processed_weekly_reference.csv")
    reference_df.to_csv(ref_path, index=False)

    # 4. Save feature list
    feature_list_path = os.path.join(processed_dir, "feature_list.txt")
    with open(feature_list_path, 'w') as f:
        f.write("FINAL FEATURE LIST FOR MODELING\n")
        f.write("=" * 80 + "\n\n")

        f.write(f"Total features: {len(feature_cols)}\n")
        f.write(f"Total targets: {len(target_cols)}\n\n")

        f.write("FEATURE CATEGORIES:\n")
        f.write("-" * 80 + "\n\n")

        categories = {
            'Lag Features': [col for col in feature_cols if 'lag' in col],
            'Moving Averages': [col for col in feature_cols if 'MA_' in col],
            'Seasonal': [col for col in feature_cols if 'sin' in col or 'cos' in col],
            'Statistical': [col for col in feature_cols if 'Volatility' in col or 'Trend' in col or 'Ratio' in col],
            'Encoded': [col for col in feature_cols if 'Encoded' in col],
            'Other': [col for col in feature_cols if all(keyword not in col for keyword in
                     ['lag', 'MA_', 'sin', 'cos', 'Volatility', 'Trend', 'Ratio', 'Encoded'])]
        }

        for category, features in categories.items():
            if features:
                f.write(f"\n{category.upper()} ({len(features)}):\n")
                for feat in sorted(features):
                    f.write(f"  {feat}\n")

        f.write("\n\nTARGET VARIABLES:\n")
        f.write("-" * 80 + "\n")
        for target in sorted(target_cols):
            f.write(f"{target}\n")

        f.write("\n\nDATA SPLIT:\n")
        f.write("-" * 80 + "\n")
        train_count = reference_df['Is_Train'].sum()
        test_count = (~reference_df['Is_Train']).sum()
        f.write(f"Training samples: {train_count} ({train_count/len(reference_df)*100:.1f}%)\n")
        f.write(f"Testing samples: {test_count} ({test_count/len(reference_df)*100:.1f}%)\n")
        f.write(f"Split date: 2023-01-01\n")

    print(f"\nðŸ’¾ FILES SAVED:")
    print(f"  1. Full dataset: {full_path}")
    print(f"  2. Modeling dataset: {model_path}")
    print(f"  3. Reference data: {ref_path}")
    print(f"  4. Feature list: {feature_list_path}")

    return full_path, model_path

full_path, model_path = save_final_datasets(final_df, feature_cols, target_cols, reference_df, processed_dir)


ðŸ’¾ FILES SAVED:
  1. Full dataset: ../data/processed/processed_weekly_full.csv
  2. Modeling dataset: ../data/processed/processed_weekly_modeling.csv
  3. Reference data: ../data/processed/processed_weekly_reference.csv
  4. Feature list: ../data/processed/feature_list.txt
