# Seasonal Demand Prediction - Future Forecast (Jan 1, 2016)

This notebook loads the groceries dataset, trains a Gradient Boosting classifier on daily sales counts, and predicts whether demand on **January 1, 2016** will be High or Low.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Preprocess Data

In [None]:
print("Loading data...")
df = pd.read_csv('Groceries_dataset.csv')
# Ambiguous day/month strings are read day-first.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# One row per date; Sales_Count is the number of dataset rows on that date.
daily_sales = (
    df.groupby('Date')
    .size()
    .reset_index(name='Sales_Count')
    .sort_values('Date')
)

first_date = daily_sales['Date'].min()
last_date = daily_sales['Date'].max()
print(f"Data loaded. Date range: {first_date} to {last_date}")

## 2. Feature Engineering

In [None]:
# Calendar features derived from each row's date
date_parts = daily_sales['Date'].dt
daily_sales['Month'] = date_parts.month
daily_sales['Day'] = date_parts.day
daily_sales['DayOfWeek'] = date_parts.dayofweek
daily_sales['Year'] = date_parts.year
daily_sales['Quarter'] = date_parts.quarter
# dayofweek is 0 for Monday, so 5 and 6 (Saturday, Sunday) are flagged as 1.
daily_sales['Is_Weekend'] = (daily_sales['DayOfWeek'] >= 5).astype('int64')

def get_season(month):
    """Map a month number to its season code.

    Codes: 3 = Winter (Dec-Feb), 2 = Spring (Mar-May), 1 = Summer (Jun-Aug),
    0 = Fall (every other value).
    """
    season_months = (
        (3, (12, 1, 2)),  # Winter
        (2, (3, 4, 5)),   # Spring
        (1, (6, 7, 8)),   # Summer
    )
    for code, months in season_months:
        if month in months:
            return code
    return 0  # Fall

daily_sales['Season_Code'] = daily_sales['Month'].apply(get_season)

# Lag features: Sales_Count from `lag` rows earlier (rows are one per date
# present in the data, so a lag is a row offset, not a calendar offset).
for lag in [1, 2, 3, 4, 5, 6, 7, 14]:
    daily_sales[f'Sales_Lag_{lag}'] = daily_sales['Sales_Count'].shift(lag)

# Rolling stats are computed over *previous* rows only. Rolling directly over
# Sales_Count would include the current row, i.e. the very value the target
# below is derived from (label leakage), and would not match prediction time,
# where the current day's sales are unknown.
prior_sales = daily_sales['Sales_Count'].shift(1)
for window in [3, 7, 14, 30]:
    past_window = prior_sales.rolling(window=window)
    daily_sales[f'Rolling_Mean_{window}'] = past_window.mean()
    daily_sales[f'Rolling_Std_{window}'] = past_window.std()

# Drop the leading rows whose lag/rolling windows are incomplete (NaN)
daily_sales = daily_sales.dropna()

# Demand Level Target: 'High' when the day's count is strictly above the
# median of the remaining rows, otherwise 'Low'.
threshold = daily_sales['Sales_Count'].median()
daily_sales['Demand_Level'] = np.where(daily_sales['Sales_Count'] > threshold, 'High', 'Low')
# LabelEncoder assigns codes in sorted label order: 'High' -> 0, 'Low' -> 1.
le = LabelEncoder()
daily_sales['Target'] = le.fit_transform(daily_sales['Demand_Level'])

print("Features created.")

## 3. Train Model

In [None]:
# Feature columns, grouped by kind; the concatenation order is the column
# order seen by the scaler and the model.
calendar_features = ['Month', 'Day', 'DayOfWeek', 'Year', 'Quarter', 'Is_Weekend', 'Season_Code']
lag_features = [f'Sales_Lag_{lag}' for lag in (1, 2, 3, 4, 5, 6, 7, 14)]
rolling_mean_features = [f'Rolling_Mean_{window}' for window in (3, 7, 14, 30)]
rolling_std_features = [f'Rolling_Std_{window}' for window in (3, 7, 14, 30)]
feature_cols = calendar_features + lag_features + rolling_mean_features + rolling_std_features

X = daily_sales[feature_cols]
y = daily_sales['Target']

# Standardise the features; the same fitted scaler is reused at prediction time.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Training Gradient Boosting Classifier...")
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_scaled, y)
# Accuracy on the data the model was fitted on, not a held-out estimate.
train_accuracy = gb_model.score(X_scaled, y)
print(f"Model trained. Training Accuracy: {train_accuracy:.4f}")

## 4. Predict for 1 January 2016

In [None]:
def predict_future_demand(target_date_str, daily_sales_df, scaler, gb_model, feature_cols):
    """Print the predicted demand level (High/Low) for a future date.

    Calendar features are derived from the target date. Lag and rolling
    features are taken from the most recent rows of ``daily_sales_df``, so the
    latest observed sales stand in for the days preceding the target date.

    Args:
        target_date_str: Date to predict, in a form ``pd.to_datetime`` accepts.
        daily_sales_df: Frame with a ``Date`` column and a ``Sales_Count`` column.
        scaler: Fitted transformer exposing ``transform``.
        gb_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        feature_cols: Feature column names, in the order used for fitting.

    Raises:
        ValueError: If ``daily_sales_df`` has no rows.
    """
    if daily_sales_df.empty:
        raise ValueError("daily_sales_df is empty; cannot build lag/rolling features")

    print(f"\n{'='*60}")
    print(f"FUTURE DEMAND PREDICTION")
    print(f"{'='*60}\n")

    target_date = pd.to_datetime(target_date_str)
    print(f"Target Date: {target_date.strftime('%A, %B %d, %Y')}")

    latest_date = daily_sales_df['Date'].max()
    print(f"Latest data available: {latest_date.strftime('%Y-%m-%d')}")

    # Calendar features for the target date
    month = target_date.month
    day = target_date.day
    day_of_week = target_date.dayofweek
    year = target_date.year
    quarter = target_date.quarter
    is_weekend = 1 if day_of_week >= 5 else 0

    # Same month -> season coding as used for the training features.
    if month in (12, 1, 2):
        season_code = 3  # Winter
    elif month in (3, 4, 5):
        season_code = 2  # Spring
    elif month in (6, 7, 8):
        season_code = 1  # Summer
    else:
        season_code = 0  # Fall

    # Order by date explicitly so "most recent" does not depend on how the
    # caller happened to order the frame.
    history = daily_sales_df.sort_values('Date', kind='stable')
    sales_values = history['Sales_Count'].tail(30).to_numpy()

    # Lag k = k-th most recent observation; fall back to the latest value
    # when the history is shorter than the lag.
    lags = {}
    for lag in [1, 2, 3, 4, 5, 6, 7, 14]:
        lags[f'Sales_Lag_{lag}'] = sales_values[-lag] if len(sales_values) >= lag else sales_values[-1]

    rolling_stats = {}
    for window in [3, 7, 14, 30]:
        recent = sales_values[-window:]
        rolling_stats[f'Rolling_Mean_{window}'] = np.mean(recent)
        # ddof=1 (sample std) matches pandas' rolling(...).std() used to build
        # the training features; np.std defaults to ddof=0. A single
        # observation has no sample std, so use 0.0 instead of NaN.
        rolling_stats[f'Rolling_Std_{window}'] = np.std(recent, ddof=1) if len(recent) > 1 else 0.0

    features = {
        'Month': month,
        'Day': day,
        'DayOfWeek': day_of_week,
        'Year': year,
        'Quarter': quarter,
        'Is_Weekend': is_weekend,
        'Season_Code': season_code,
        **lags,
        **rolling_stats
    }

    # Select columns in the fitted order before scaling.
    input_df = pd.DataFrame([features])[feature_cols]
    input_scaled = scaler.transform(input_df)

    prediction_code = gb_model.predict(input_scaled)[0]
    prediction_proba = gb_model.predict_proba(input_scaled)[0]

    # The target was encoded with LabelEncoder over 'High'/'Low'; it assigns
    # codes in sorted label order, so 0 = High and 1 = Low.
    predicted_label = "High" if prediction_code == 0 else "Low"

    print(f"\nPREDICTION RESULTS")
    print(f"{'='*60}")
    print(f"Predicted Demand Level: {predicted_label.upper()}")
    print(f"Confidence (High/Low): {prediction_proba}")
    print(f"{'='*60}\n")

# Run the forecast for 1 January 2016 with the scaler and model fitted above.
predict_future_demand('2016-01-01', daily_sales, scaler, gb_model, feature_cols)