In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

# -------------------------------
# Step 1: Load prepared data
# -------------------------------
# Read the prepared daily sales table and turn the 'date' column into real
# datetimes so later cells can sort and do date arithmetic on it.
daily_sales_feat = pd.read_csv("prepared_sales.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# -------------------------------
# Step 2: Feature Engineering & Data Preparation
# -------------------------------
# Columns fed to the model. event_name_1 is left out on purpose: it is mostly
# NaN and causes issues with XGBoost.
features = [
    'day_of_week',
    'month',
    'week_of_year',
    'year',
    'event_flag',
    'rolling_7',
    'rolling_14',
]

# Where a rolling average is missing, fall back to that row's own sales value.
for rolling_col in ('rolling_7', 'rolling_14'):
    daily_sales_feat[rolling_col] = daily_sales_feat[rolling_col].fillna(
        daily_sales_feat['sales']
    )

# Quick sanity report: chosen features, table size, remaining gaps per feature.
print("Features to use for forecasting:", features)
print("Data shape:", daily_sales_feat.shape)
print("Missing values in features:")
print(daily_sales_feat[features].isna().sum())

In [None]:
# -------------------------------
# Step 3: Recursive Forecasting Function
# -------------------------------
import pandas as pd
import xgboost as xgb

def recursive_forecast(df, forecast_horizon=28, features=None, model=None):
    """
    Perform recursive multi-step forecasting for a single store.

    The model is fitted once on the observed history. Each future day is then
    predicted from a feature row built for *that* day (calendar fields plus
    rolling means of the most recent sales, including earlier forecasts), and
    the clipped prediction is fed back into the sales history for the next step.

    Parameters:
    - df: DataFrame with time series data for one store. Must contain 'date'
      (datetime-like), 'store_id', 'sales' and every column named in `features`.
    - forecast_horizon: number of days to forecast
    - features: list of feature column names. Defaults to the calendar,
      event-flag and rolling-mean columns. Features this function does not know
      how to generate for future dates are passed to the model as NaN.
    - model: optional unfitted regressor exposing fit(X, y) and predict(X).
      It is fitted in place. Defaults to an XGBRegressor with
      n_estimators=300, learning_rate=0.05, random_state=42.

    Returns:
    - forecast_df: DataFrame with columns 'date', 'store_id', 'forecast',
      one row per forecast day. Forecasts are clipped to be non-negative.

    Raises:
    - ValueError: if `df` has no rows.
    """
    if features is None:
        features = ['day_of_week', 'month', 'week_of_year', 'year', 'event_flag', 'rolling_7', 'rolling_14']

    if len(df) == 0:
        raise ValueError("recursive_forecast requires at least one row of history")

    # Sort by date so "most recent" really means the tail of the frame
    history = df.sort_values('date').reset_index(drop=True)

    # Fit once, on observed data only. Refitting every step would train the
    # model on its own predictions and multiply the cost by the horizon.
    if model is None:
        model = xgb.XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            random_state=42
        )
    model.fit(history[features], history['sales'])

    store_id = history['store_id'].iloc[0]
    last_date = history['date'].max()

    # Running list of sales (observed, then forecast) used for the rolling means
    sales_history = history['sales'].tolist()

    forecast_dates = []
    forecasts = []

    for step in range(1, forecast_horizon + 1):
        next_date = last_date + pd.Timedelta(days=step)

        # Feature row for the day being forecast. The rolling means cover the
        # days strictly before next_date.
        # NOTE(review): assumes the historical rolling_7/rolling_14 columns are
        # built the same way -- confirm against the data preparation step.
        next_features = {
            'day_of_week': next_date.dayofweek,
            'month': next_date.month,
            'week_of_year': int(next_date.isocalendar()[1]),
            'year': next_date.year,
            'event_flag': 0,  # Assume no events in forecast period
            # Series.mean() skips NaN and copes with fewer than 7/14 values
            'rolling_7': pd.Series(sales_history[-7:], dtype=float).mean(),
            'rolling_14': pd.Series(sales_history[-14:], dtype=float).mean(),
        }

        # Keep the column order the model was trained with; features missing
        # from next_features become NaN.
        X_next = pd.DataFrame([next_features], columns=features)

        # Ensure non-negative sales, both in the output and in the fed-back history
        yhat = max(0.0, float(model.predict(X_next)[0]))

        sales_history.append(yhat)
        forecast_dates.append(next_date)
        forecasts.append(yhat)

    # Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'date': forecast_dates,
        'store_id': store_id,
        'forecast': forecasts
    })

    return forecast_df

# Confirmation message so running this cell produces visible output once the
# function definition above has been executed.
print("Recursive forecasting function defined successfully!")

In [None]:
# -------------------------------
# Step 4: Generate Forecasts for All Stores
# -------------------------------
# Each store is forecast independently from its own slice of the data; the
# per-store results are stacked into one long table afterwards.
stores = daily_sales_feat['store_id'].unique()
forecast_results = []

for store in stores:
    print(f"Forecasting {store}...")
    is_this_store = daily_sales_feat['store_id'].eq(store)
    store_df = daily_sales_feat.loc[is_this_store]
    forecast = recursive_forecast(store_df, forecast_horizon=28, features=features)
    forecast_results.append(forecast)

# Stack the per-store frames under a fresh 0..n-1 index
all_forecasts = pd.concat(forecast_results, ignore_index=True)

# Short run report followed by a peek at the first rows
print("\nForecasting complete!")
print(f"Generated forecasts for {len(stores)} stores")
print(f"Total forecast records: {len(all_forecasts)}")
print("\nSample forecasts:")
print(all_forecasts.head(10))

In [None]:
# -------------------------------
# Step 5: Visualization
# -------------------------------
# One panel per store, two panels per row. The grid is sized from the number
# of stores, so the cell does not fail when there are more than four of them
# (a fixed 2x2 grid only has room for four panels).
n_cols = 2
n_rows = max(1, (len(stores) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(7.5 * n_cols, 5 * n_rows),  # 15x10 for the 2x2 case
    squeeze=False,                       # always a 2-D array, even for one row
)
axes = axes.ravel()

for i, store in enumerate(stores):
    ax = axes[i]

    # Historical data, ordered by date so tail() really is the most recent period
    store_historical = daily_sales_feat[daily_sales_feat['store_id'] == store].sort_values('date')
    store_forecast = all_forecasts[all_forecasts['store_id'] == store]

    # Plot last 60 days of historical data
    recent_data = store_historical.tail(60)
    ax.plot(recent_data['date'], recent_data['sales'],
            label='Historical', color='blue', linewidth=2)

    # Plot forecasts
    ax.plot(store_forecast['date'], store_forecast['forecast'],
            label='Forecast', color='red', linewidth=2, linestyle='--')

    ax.set_title(f'Sales Forecast - {store}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales')
    ax.legend()
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Hide panels left over when the store count does not fill the grid
for unused_ax in axes[len(stores):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

# Summary statistics
print("\nForecast Summary Statistics:")
print(all_forecasts.groupby('store_id')['forecast'].agg(['mean', 'std', 'min', 'max']).round(2))

In [None]:
# -------------------------------
# Step 6: Save Results
# -------------------------------
# Write the full per-day forecast table (no index column).
all_forecasts.to_csv('sales_forecasts.csv', index=False)
print("\nForecasts saved to 'sales_forecasts.csv'")

# Per-store summary report: aggregate, round to 2 decimals, then give the
# statistic columns report-friendly names.
summary_stats = (
    all_forecasts
    .groupby('store_id')['forecast']
    .agg(['count', 'mean', 'std', 'min', 'max'])
    .round(2)
    .rename(columns={
        'count': 'Days_Forecasted',
        'mean': 'Avg_Daily_Sales',
        'std': 'Std_Dev',
        'min': 'Min_Sales',
        'max': 'Max_Sales',
    })
)
summary_stats.to_csv('forecast_summary.csv')
print("Summary statistics saved to 'forecast_summary.csv'")

print("\n03_forecasting complete. 28-day sales forecasts generated successfully!")