<div style="
    background: linear-gradient(90deg, #B07CFF, #2D0A47, #000000);
    color:white;
    padding:25px 30px;
    border-radius:12px;
    font-size:32px;
    font-weight:800;
    box-shadow: 0 0 15px rgba(128,0,255,0.6);
    width: 95%;        /* Change this to control width */
    height: 400;     
    line-height: 1.4; 
">
    <div>DEPI Final Data Science Project</div>
    <div>Project: Sales Forecasting and Optimization</div>
</div>



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.stattools import adfuller

##### displaying some info 

In [None]:
# Load the raw inputs: per-store daily sales records and the static store metadata.
train = pd.read_csv('data/train.csv', low_memory=False, parse_dates=['Date'])
store = pd.read_csv('data/store.csv')

# Report the size of both tables and the period covered by the sales records.
train_rows, train_cols = train.shape
store_rows, store_cols = store.shape
first_date, last_date = train['Date'].min(), train['Date'].max()
print(f"Train dataset loaded: {train_rows:,} rows, {train_cols} columns")
print(f"Store dataset loaded: {store_rows:,} stores, {store_cols} columns")
print(f"Date range: {first_date} to {last_date}")

##### Preprocessing step: add all store-related attributes to the training data (for max efficiency)

In [None]:
# Left join on the store id: every sales record is kept and gains the store's attributes.
df = pd.merge(train, store, how='left', on='Store')
merged_rows, merged_cols = df.shape
print(f" Merged dataset: {merged_rows:,} rows, {merged_cols} columns")


In [None]:
# Quick overview of the merged frame: size, covered period, store count and column dtypes.
first_day = df['Date'].min()
last_day = df['Date'].max()
span_days = (last_day - first_day).days
print(f"Total Records: {df.shape[0]:,}")
print(f"Date Range: {first_day} to {last_day}")
print(f"Number of Stores: {df['Store'].nunique()}")
print(f"Time Period: {span_days} days")
print(df.dtypes)

In [None]:
# Per-column missing-value counts and percentages; only columns with at least one gap are shown.
missing = df.isna().sum()
missing_pct = (missing / len(df)) * 100
has_missing = missing > 0
missing_report = pd.DataFrame({
    'Missing_Count': missing[has_missing],
    'Percentage': missing_pct[has_missing],
})
print(missing_report)

In [None]:
# Summary statistics for the key numeric columns.
summary_columns = ['Sales', 'Customers', 'CompetitionDistance']
print(df[summary_columns].describe())

In [None]:
# Count fully identical rows in the merged frame.
duplicate_count = df.duplicated().sum()
print(f"Duplicates: {duplicate_count}")

##### outlier detection for "Sales" column

In [None]:
# Tukey-style fences on Sales. A 3*IQR multiplier (instead of the usual 1.5) is used,
# so only values far outside the interquartile range are flagged.
FENCE_MULTIPLIER = 3
Q1, Q3 = (df['Sales'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound = Q1 - FENCE_MULTIPLIER * IQR
upper_bound = Q3 + FENCE_MULTIPLIER * IQR

below_fence = df['Sales'].lt(lower_bound)
above_fence = df['Sales'].gt(upper_bound)
outliers = df.loc[below_fence | above_fence]
outlier_share = len(outliers) / len(df) * 100
print(f"Sales outliers: {len(outliers):,} ({outlier_share:.2f}%)")

In [None]:
# Shape of the merged frame before the Sales outliers are removed in the next cell.
df.shape

In [None]:
# Keep only rows whose Sales lie inside the fences (both ends inclusive).
within_fences = df['Sales'].between(lower_bound, upper_bound)
df = df.loc[within_fences].copy()

print(f"Cleaned dataset shape: {df.shape}")

In [None]:
# Initial EDA dashboard: six panels on a 3x2 grid.
fig = make_subplots(rows=3, cols=2,
    subplot_titles=(
        'Daily Sales Trend', 'Sales Distribution', 'Sales by Day of Week',
        'Promo vs No Promo', 'Sales by Store Type', 'Competition Distance Impact'
    ),
    specs=[
        [{'type': 'scatter'}, {'type': 'histogram'}],
        [{'type': 'bar'}, {'type': 'box'}],
        [{'type': 'box'}, {'type': 'scatter'}]
    ]
)

# Panel (1,1): average sales per calendar day across all stores.
daily_sales = df.groupby('Date')['Sales'].mean().reset_index()
fig.add_trace(go.Scatter(x=daily_sales['Date'], y=daily_sales['Sales'], mode='markers', name='Avg Daily Sales'), row=1, col=1)

# Panel (1,2): distribution of individual sales records.
fig.add_trace(go.Histogram(x=df['Sales'], nbinsx=50, name='Sales Dist'), row=1, col=2)

# Panel (2,1): mean sales per weekday. Labels are derived from the group keys (1=Mon ... 7=Sun)
# instead of being assigned positionally, so they stay correct if a weekday is absent.
day_names = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}
dow_sales = df.groupby('DayOfWeek')['Sales'].mean()
dow_labels = [day_names.get(day, str(day)) for day in dow_sales.index]
fig.add_trace(go.Bar(x=dow_labels, y=dow_sales.values, name='DoW Sales'), row=2, col=1)

# Panel (2,2): sales with and without a promotion.
fig.add_trace(go.Box(x=df['Promo'].map({0: 'No Promo', 1: 'Promo'}), y=df['Sales'], name='Promo'), row=2, col=2)

# Panels (3,1) and (3,2) depend on columns that come from the store table.
if 'StoreType' in df.columns:
    fig.add_trace(go.Box(x=df['StoreType'], y=df['Sales'], name='Store Type'), row=3, col=1)
if 'CompetitionDistance' in df.columns:
    # Fixed seed so the scatter shows the same 5,000 points on every run.
    sample = df.sample(min(5000, len(df)), random_state=42)
    fig.add_trace(go.Scatter(x=sample['CompetitionDistance'], y=sample['Sales'], mode='markers', marker=dict(size=3, opacity=0.5)), row=3, col=2)

fig.update_layout(height=1200,showlegend=False,title_text="Rossmann Sales - Initial EDA Dashboard")
fig.show()

##### Data Preprocessing, cleaning and Feature Engineering


In [None]:
# Remove exact duplicate rows, then fill the gaps in the competition columns.
# The filled frame is assigned back instead of calling `df[col].fillna(..., inplace=True)`:
# that chained in-place form acts on a temporary column object and does not update `df`
# when pandas Copy-on-Write is active.
df = df.drop_duplicates()
df = df.fillna({
    'CompetitionDistance': df['CompetitionDistance'].median(),  # unknown distance -> typical distance
    'CompetitionOpenSinceMonth': 0,                              # 0 = opening month unknown
    'CompetitionOpenSinceYear': 0,                               # 0 = opening year unknown
})


In [None]:
# Fill the Promo2 columns with explicit sentinels by assigning the result back to `df`.
# `df[col].fillna(..., inplace=True)` is chained in-place assignment and does not
# update `df` when pandas Copy-on-Write is active.
df = df.fillna({
    'Promo2SinceWeek': 0,     # 0 = no Promo2 start week recorded
    'Promo2SinceYear': 0,     # 0 = no Promo2 start year recorded
    'PromoInterval': 'None',  # literal string marks "no interval"
})

In [None]:
# Recompute the quartiles on the current (already filtered) frame and flag rows that
# fall outside the 3*IQR fences with a 0/1 indicator column.
sales = df['Sales']
Q1 = sales.quantile(0.25)
Q3 = sales.quantile(0.75)
IQR = Q3 - Q1
too_low = sales < (Q1 - 3 * IQR)
too_high = sales > (Q3 + 3 * IQR)
df['Is_Outlier'] = (too_low | too_high).astype(int)

In [None]:
# Remaining missing values per column after the fill steps above.
df.isna().sum()


##### Preparing a cleaned dataset for feature engineering

In [None]:
# Work on a separate frame ordered by store and date with a fresh 0..n-1 index;
# sort_values returns a new object, so `df` itself is left untouched.
df_feat = df.sort_values(by=['Store', 'Date'], ignore_index=True)

##### extracts calendar components from date column and store them in new columns

In [None]:
# Break the date into calendar components, one new column each.
date_parts = df_feat['Date'].dt
df_feat['Year'] = date_parts.year
df_feat['Month'] = date_parts.month
df_feat['Day'] = date_parts.day
df_feat['WeekOfYear'] = date_parts.isocalendar().week  # ISO week number
df_feat['Quarter'] = date_parts.quarter

In [None]:
# 0/1 flags. DayOfWeek is coded Monday=1 ... Saturday=6, Sunday=7.
df_feat['IsWeekend'] = df_feat['DayOfWeek'].ge(6).astype(int)
date_parts = df_feat['Date'].dt
df_feat['IsMonthStart'] = date_parts.is_month_start.astype(int)
df_feat['IsMonthEnd'] = date_parts.is_month_end.astype(int)

##### This helps the machine learning model understand cyclical time patterns
##### (e.g. that after December comes January, and after Saturday comes Sunday)

In [None]:
# Map month (period 12) and weekday (period 7) onto the unit circle so that the
# last and first values of each cycle end up next to each other.
month_angle = 2 * np.pi * df_feat['Month'] / 12
weekday_angle = 2 * np.pi * df_feat['DayOfWeek'] / 7
df_feat['Month_sin'] = np.sin(month_angle)
df_feat['Month_cos'] = np.cos(month_angle)
df_feat['DayOfWeek_sin'] = np.sin(weekday_angle)
df_feat['DayOfWeek_cos'] = np.cos(weekday_angle)

In [None]:
# Illustration of the sin/cos month encoding: the twelve months placed on the unit circle.
months = np.arange(1, 13)
month_sin = np.sin(2 * np.pi * months / 12)
month_cos = np.cos(2 * np.pi * months / 12)
theta = np.linspace(0, 2 * np.pi, 100)

# One labelled marker per month.
month_points = go.Scatter(
    x=month_cos,
    y=month_sin,
    mode='markers+text',
    text=[f'Month {m}' for m in months],
    textposition='top center',
    marker=dict(size=12, color=months, colorscale='Viridis')
)
# Dotted reference circle.
circle_outline = go.Scatter(
    x=np.cos(theta),
    y=np.sin(theta),
    mode='lines',
    line=dict(color='lightgray', dash='dot'),
    showlegend=False
)

fig = go.Figure()
for trace in (month_points, circle_outline):
    fig.add_trace(trace)

fig.update_layout(
    title='Cyclical Encoding of Months (sin/cos)',
    xaxis_title='cos(2π * month / 12)',
    yaxis_title='sin(2π * month / 12)',
    width=600, height=600,
    xaxis=dict(scaleanchor='y', scaleratio=1),  # equal axis scale keeps the circle round
    yaxis=dict(showgrid=False)
)
fig.show()

##### represent past values of “Sales” and “Customers” for each store

In [None]:
# Per-store lagged copies of Sales and Customers (1, 7, 14 and 30 rows back).
# Shifting inside each store group keeps one store's history out of another's.
by_store = df_feat.groupby('Store')
for lag in (1, 7, 14, 30):
    for column in ('Sales', 'Customers'):
        df_feat[f'{column}_Lag_{lag}'] = by_store[column].shift(lag)


In [None]:
# Per-store trailing-window mean and standard deviation of Sales.
# min_periods=1 yields a value from the first row of each store
# (the std of a single observation is NaN).
sales_by_store = df_feat.groupby('Store')['Sales']
for window in [7, 14, 30]:
    def _trailing(series, size=window):
        """Rolling view over the last `size` rows of one store's sales."""
        return series.rolling(window=size, min_periods=1)

    df_feat[f'Sales_Rolling_Mean_{window}'] = sales_by_store.transform(lambda s: _trailing(s).mean())
    df_feat[f'Sales_Rolling_Std_{window}'] = sales_by_store.transform(lambda s: _trailing(s).std())

# Average basket value per record.
df_feat['SalesPerCustomer'] = df_feat['Sales'].div(df_feat['Customers'])

##### These features help a machine learning model recognize patterns over time and show the recent average performance of a store

In [None]:
# The per-store rolling mean/std columns are built by the window loop in the previous cell.
# They are not recomputed here: the old recomputation relied on the loop variable `window`
# leaking out of that loop and only rewrote the 30-day columns with identical values.

# Average basket value per record. Records with no customers are set to 0:
#   * 0 sales / 0 customers is NaN (not inf), so replacing only +/-inf left these rows as NaN
#     and the final dropna() discarded them;
#   * sales > 0 with 0 customers would be +inf.
# Both cases are covered by masking on Customers == 0. The assignment form is used because
# `df_feat[col].replace(..., inplace=True)` is chained in-place assignment and does not update
# the frame when pandas Copy-on-Write is active.
customers = df_feat['Customers']
sales_per_customer = df_feat['Sales'] / customers
df_feat['SalesPerCustomer'] = sales_per_customer.where(customers != 0, 0.0)

##### this block creates a new feature called CompetitionMonthsOpen, which measures how many months a store has had competition nearby up to the current date (based on the record’s year and month)

In [None]:
if 'CompetitionOpenSinceYear' in df_feat.columns:
    # Months elapsed between the competitor's opening (year, month) and the record's (year, month).
    months_open = (
        12 * (df_feat['Year'] - df_feat['CompetitionOpenSinceYear'])
        + (df_feat['Month'] - df_feat['CompetitionOpenSinceMonth'])
    )
    # Missing opening years were filled with 0 earlier in the notebook. Treating that sentinel
    # as calendar year 0 would give roughly 24,000 "months open", so rows without a known
    # opening year get 0 instead.
    opening_known = df_feat['CompetitionOpenSinceYear'] > 0
    # Negative values mean the competitor opened after the record date -> clip to 0.
    df_feat['CompetitionMonthsOpen'] = months_open.where(opening_known, 0).clip(lower=0)

##### it measures how long a store’s continuous promotion program (“Promo2”) has been running

In [None]:
if 'Promo2SinceYear' in df_feat.columns:
    # Weeks elapsed since the store joined Promo2 (a year is approximated as 52 weeks).
    promo2_weeks = (
        52 * (df_feat['Year'] - df_feat['Promo2SinceYear'])
        + (df_feat['WeekOfYear'] - df_feat['Promo2SinceWeek'])
    )
    # Missing Promo2 start years were filled with 0 earlier in the notebook. Treating that
    # sentinel as calendar year 0 would give roughly 104,000 weeks, so rows without a
    # recorded start year get 0 instead.
    promo2_started = df_feat['Promo2SinceYear'] > 0
    # Negative values mean Promo2 starts after the record date -> clip to 0.
    df_feat['Promo2Weeks'] = promo2_weeks.where(promo2_started, 0).clip(lower=0)

##### finally ensuring that the final dataset has no missing values before passing it into a machine learning model

In [None]:
# Discard every row that still has at least one missing value
# (for example the first rows of each store, where the lag columns have no history).
df_feat = df_feat.dropna(axis=0, how='any')
feature_shape = df_feat.shape
print(f"Feature engineered shape: {feature_shape}")


## Statistical Analysis

##### The Augmented Dickey-Fuller test for stationarity on the daily aggregated Rossmann sales data yields an ADF statistic of -70.56 and a p-value below 0.0001. We conclusively reject the null hypothesis of a unit root, indicating that the time series is stationary and suitable for time series modeling without further differencing.
- We are ready to proceed to training time series models (AR, ARIMA, SARIMAX, Prophet, etc.)

In [None]:
# Augmented Dickey-Fuller test on the daily aggregated sales series, in chronological order.
# The test is about dependence between consecutive observations, so it must see an ordered
# time series. A random sample of store-level rows has no time order and looks trivially
# stationary, whatever the underlying dynamics are.
# NOTE(review): statistics quoted in the surrounding markdown came from the earlier
# row-sampled input and should be re-checked against this output.
adf_series = df_feat.groupby('Date')['Sales'].sum().sort_index()
adf_result = adfuller(adf_series)
print(f"ADF Statistic: {adf_result[0]:.4f}")
print(f"p-value: {adf_result[1]:.4f}")


In [None]:
# Absolute Pearson correlation of every numeric column with Sales, strongest first.
numeric_cols = df_feat.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df_feat[numeric_cols].corr()
sales_corr = correlation_matrix['Sales'].abs().sort_values(ascending=False)

# Show exactly the number of features the heading announces (the old `.head(48)[1:]`
# printed 47 rows under a "Top 15" title). Sales itself is dropped by label rather than
# by assuming it sits in the first position.
top_n = 15
print(f"Top {top_n} features correlated with Sales:")
print(sales_corr.drop('Sales').head(top_n))

##### measures how much promotions increase sales on average

In [None]:
# Sales statistics split by the Promo flag, then the relative uplift of the mean in percent.
promo_impact = df_feat.groupby('Promo')['Sales'].agg(['mean', 'median', 'std'])
promo_impact.index = ['No Promo', 'With Promo']  # group keys are ordered 0, 1
mean_sales = promo_impact['mean']
promo_lift = ((mean_sales['With Promo'] / mean_sales['No Promo']) - 1) * 100
print(f"Avg sales lift from promotions: {promo_lift:.2f}%")

In [None]:
# Sales statistics per store type (only when the store attributes were merged in).
if 'StoreType' in df_feat.columns:
    store_type_stats = df_feat.groupby('StoreType')['Sales'].agg(['mean', 'median', 'std', 'count'])
    print(store_type_stats)

# Mean and median sales per weekday, relabelled Mon..Sun.
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
dow_stats = df_feat.groupby('DayOfWeek')['Sales'].agg(['mean', 'median'])
dow_stats.index = weekday_labels
print(dow_stats)


In [None]:
# Heatmap of the pairwise correlations among the 20 features most correlated with Sales.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
top_features = correlation_matrix['Sales'].abs().sort_values(ascending=False).head(20).index
top_corr = correlation_matrix.loc[top_features, top_features]
sns.heatmap(top_corr, annot=True, cmap='coolwarm', center=0, fmt='.2f', square=True, ax=heatmap_ax)
heatmap_ax.set_title('Top 20 Features - Correlation Heatmap', fontsize=16, pad=20)
plt.tight_layout()
plt.show()


In [None]:
# Daily mean sales in chronological order. ACF/PACF measure dependence between observations
# a fixed number of steps apart, so the series must not be shuffled or randomly sub-sampled;
# the previous `.sample(...)` call randomised the order and made the plots meaningless.
sample_sales = df_feat.groupby('Date')['Sales'].mean().dropna().sort_index()

# Up to 50 lags, reduced for short series because the PACF needs fewer than nobs // 2 lags.
lags = min(50, len(sample_sales) // 2 - 1)
acf_values = acf(sample_sales, nlags=lags)
pacf_values = pacf(sample_sales, nlags=lags)
lag_axis = list(range(lags + 1))

# 1x2 layout: ACF on the left, PACF on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Autocorrelation Function (ACF)", "Partial Autocorrelation (PACF)"))
fig.add_trace(
    go.Bar(x=lag_axis, y=acf_values, name='ACF', marker_color='skyblue'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=lag_axis, y=pacf_values, name='PACF', marker_color='lightgreen'),
    row=1, col=2
)
# Horizontal zero line in both panels.
for i in range(1, 3):
    fig.add_shape(type="line", x0=0, x1=lags, y0=0, y1=0, line=dict(color="black", width=1), row=1, col=i)

fig.update_layout(
    title_text="ACF and PACF Plots (Interactive)",
    showlegend=False,
    height=500,
    width=1000,
    template="plotly_white"
)
fig.show()

In [None]:
# Persist the engineered feature table (row index not written) for the modelling section.
features_path = 'data/cleaned_sales_features.csv'
df_feat.to_csv(features_path, index=False)
print(f"✅ Cleaned feature dataset saved to: {features_path}")


## Forecasting Model Development and Optimization Objectives

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
#how to fix pmdarima installation issues?
#import pmdarima as pm

In [None]:
# machine learning libraries
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
#Deep learning libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [None]:
# Reload the saved feature table; the Date column is parsed to datetime while reading.
df = pd.read_csv('data/cleaned_sales_features.csv', parse_dates=['Date'])

In [None]:
# Collapse the store-level records into one row per calendar day.
daily_sales = df.groupby('Date').agg({
    'Sales': 'sum',            # total sales over all stores
    'Customers': 'sum',        # total customers over all stores
    'Promo': 'mean',           # share of records running a promotion
    'SchoolHoliday': 'max',    # 1 if any record is on a school holiday
    # 1 if any record is on a state holiday. After the CSV round trip the column can hold
    # both the integer 0 and the string '0'; comparing the raw values with '0' would flag
    # the integer zeros as holidays, so everything is compared as text.
    'StateHoliday': lambda x: int(x.astype(str).ne('0').any())
}).reset_index()
daily_sales = daily_sales.sort_values('Date')
print(f"Time Series Data: {len(daily_sales)} days")
print(f"Date Range: {daily_sales['Date'].min()} to {daily_sales['Date'].max()}")
print(f"Total Sales Range: {daily_sales['Sales'].min():,.0f} to {daily_sales['Sales'].max():,.0f}")

In [None]:
# Line chart of the total sales per day.
daily_sales_line = go.Scatter(
    x=daily_sales['Date'],
    y=daily_sales['Sales'],
    mode='lines',
    name='Daily Sales',
    line=dict(color='blue', width=1),
)
fig = go.Figure()
fig.add_trace(daily_sales_line)
fig.update_layout(
    title='Rossmann Daily Sales Time Series',
    xaxis_title='Date',
    yaxis_title='Total Daily Sales',
    height=400
)
fig.show()

In [None]:
# Chronological hold-out: the last `test_weeks` weeks of the daily series form the test set.
test_weeks = 23
test_size = test_weeks * 7
train_size = len(daily_sales) - test_size

train_data = daily_sales.iloc[:train_size].copy()
test_data = daily_sales.iloc[train_size:].copy()

for split_label, split_frame in (("Training Data", train_data), ("Test Data", test_data)):
    split_start = split_frame['Date'].min()
    split_end = split_frame['Date'].max()
    print(f"{split_label}: {len(split_frame)} days ({split_start} to {split_end})")

In [None]:
# Show where the series is cut: training period in blue, test period in red.
fig = go.Figure()
split_traces = (
    ('Training Data', train_data, 'blue'),
    ('Test Data', test_data, 'red'),
)
for trace_name, part, colour in split_traces:
    fig.add_trace(go.Scatter(x=part['Date'], y=part['Sales'],
                             mode='lines', name=trace_name, line=dict(color=colour)))
fig.update_layout(title='Train-Test Split Visualization', height=400)
fig.show()

In [None]:
def train_auto_arima(train_series, seasonal=True):
    """Pick an ARIMA(p, d, q) order by exhaustive AIC search and return the fitted model.

    Every combination of p in 0..2, d in 0..1 and q in 0..2 is fitted on
    ``train_series``; the fit with the lowest AIC wins.

    Parameters
    ----------
    train_series : array-like or pandas.Series
        Training observations in chronological order.
    seasonal : bool, default True
        Accepted for interface compatibility only. The search fits plain
        (non-seasonal) ARIMA models, so this flag currently has no effect.

    Returns
    -------
    Fitted statsmodels ARIMA results object of the best order.

    Raises
    ------
    RuntimeError
        If none of the candidate orders could be fitted.
    """
    # Same search order as three nested loops over p, d, q.
    candidate_orders = [(p, d, q) for p in range(0, 3) for d in range(0, 2) for q in range(0, 3)]
    best_aic = np.inf
    best_order = None
    best_model = None

    for order in candidate_orders:
        try:
            results = ARIMA(train_series, order=order).fit()
        except Exception as exc:
            # A single order failing to fit is expected now and then; report it instead of
            # hiding it so that systematic failures are visible.
            print(f"Skipped order {order}: {type(exc).__name__}: {exc}")
            continue
        if results.aic < best_aic:
            best_aic = results.aic
            best_order = order
            best_model = results
        print(f"Tested order {order}, AIC: {results.aic:.2f}")

    if best_model is None:
        # Without this check the caller would fail later with an opaque AttributeError on None.
        raise RuntimeError("No ARIMA order could be fitted on the training series")

    print(f"✅ Selected ARIMA order: {best_order} (AIC={best_aic:.2f})")
    return best_model
# Train "Auto-ARIMA"
arima_model = train_auto_arima(train_data['Sales'])
# Forecast one value per day of the test period
arima_forecast = arima_model.forecast(steps=len(test_data))
# (statsmodels forecast doesn't return conf int by default, but you can add if needed)
print("🎯 ARIMA predictions completed!")

In [None]:
def train_prophet_model(train_data):
    """Fit a Prophet model with promotion and school-holiday regressors.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Daily training rows with the columns ``Date``, ``Sales``, ``Promo``
        and ``SchoolHoliday``.

    Returns
    -------
    The fitted ``Prophet`` model.
    """
    # Prophet expects the time column to be called `ds` and the target `y`.
    prophet_train = train_data[['Date', 'Sales']].rename(columns={'Date': 'ds', 'Sales': 'y'})

    # External regressors, copied positionally from the training rows.
    prophet_train['promo'] = train_data['Promo'].values
    prophet_train['school_holiday'] = train_data['SchoolHoliday'].values

    prophet_model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False,
        changepoint_prior_scale=0.05,
        seasonality_prior_scale=10.0
    )
    prophet_model.add_regressor('promo')
    prophet_model.add_regressor('school_holiday')
    prophet_model.fit(prophet_train)
    print("✅ Prophet model trained successfully!")
    return prophet_model

# Train Prophet
prophet_model = train_prophet_model(train_data)

# Build the prediction frame from the dates that are actually present in train and test.
# `make_future_dataframe` would generate consecutive calendar days after the last training
# date; if the daily series has gaps, those generated dates no longer match the test rows
# and the positionally assigned regressors end up on the wrong days.
all_days = pd.concat([train_data, test_data], ignore_index=True)
future_prophet = pd.DataFrame({
    'ds': all_days['Date'],
    'promo': all_days['Promo'],
    'school_holiday': all_days['SchoolHoliday'],
})
prophet_forecast = prophet_model.predict(future_prophet)
# The last len(test_data) rows are the test period.
prophet_predictions = prophet_forecast.iloc[-len(test_data):]['yhat'].values

print("🎯 Prophet predictions completed!")

In [None]:
def create_time_features(df):
    """Add calendar, cyclical, lag and rolling-window features to a daily sales frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order with a datetime ``Date`` column (no missing
        dates values) and a numeric ``Sales`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra feature columns; the input is not modified.
        The first rows contain NaN in the lag/rolling columns because they have
        no sufficient history.
    """
    df_features = df.copy()
    dates = df_features['Date'].dt

    # Calendar components (dayofweek: Monday=0 ... Sunday=6).
    df_features['year'] = dates.year
    df_features['month'] = dates.month
    df_features['day'] = dates.day
    df_features['dayofweek'] = dates.dayofweek
    df_features['dayofyear'] = dates.dayofyear
    # isocalendar() returns a nullable UInt32 column; cast to a plain integer so every
    # feature has an ordinary NumPy dtype.
    df_features['weekofyear'] = dates.isocalendar().week.astype('int64')
    df_features['quarter'] = dates.quarter
    df_features['is_weekend'] = (df_features['dayofweek'] >= 5).astype(int)

    # Cyclical encoding
    df_features['month_sin'] = np.sin(2 * np.pi * df_features['month'] / 12)
    df_features['month_cos'] = np.cos(2 * np.pi * df_features['month'] / 12)
    df_features['day_sin'] = np.sin(2 * np.pi * df_features['dayofweek'] / 7)
    df_features['day_cos'] = np.cos(2 * np.pi * df_features['dayofweek'] / 7)

    # Lag features
    for lag in [1, 7, 14, 21]:
        df_features[f'sales_lag_{lag}'] = df_features['Sales'].shift(lag)

    # Rolling statistics over the days BEFORE the current row. Without the one-day shift
    # the window would contain the current day's Sales, i.e. the value the model is asked
    # to predict (target leakage).
    previous_sales = df_features['Sales'].shift(1)
    for window in [7, 14, 28]:
        df_features[f'sales_rolling_mean_{window}'] = previous_sales.rolling(window).mean()
        df_features[f'sales_rolling_std_{window}'] = previous_sales.rolling(window).std()

    return df_features

def train_xgboost_model(train_data, test_data):
    """Train an XGBoost regressor with Bayesian hyper-parameter search and predict the test period.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Consecutive chronological slices of the daily series; both must contain
        ``Date`` and ``Sales``. All other columns are used as features.

    Returns
    -------
    tuple
        ``(best_estimator, predictions)`` where ``predictions`` has one value per test row.

    Notes
    -----
    Lag/rolling features of the test rows are derived from observed sales (including
    earlier test days), so the evaluation corresponds to one-step-ahead forecasting.
    NOTE(review): same-day columns such as ``Customers`` are part of the features although
    they would not be known ahead of time - confirm this is intended.
    """

    print("🚀 Training XGBoost Model with Bayesian Optimization...")

    # Build the features on the joined series so that the lag/rolling columns of the first
    # test rows are computed from the end of the training period. Building them on the test
    # slice alone leaves those rows without history (NaN that forward-fill cannot repair).
    n_train = len(train_data)
    full_series = pd.concat([train_data, test_data], ignore_index=True)
    all_features = create_time_features(full_series)
    train_features = all_features.iloc[:n_train]
    test_features = all_features.iloc[n_train:]

    # Every column except the date and the target is a feature.
    feature_cols = [col for col in train_features.columns if col not in ['Date', 'Sales']]

    # Drop the warm-up rows whose lag features have no history.
    train_clean = train_features.dropna()

    X_train = train_clean[feature_cols]
    y_train = train_clean['Sales']
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    X_test = test_features[feature_cols].ffill()

    # Bayesian optimization for hyperparameters
    search_spaces = {
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(0.01, 0.3),
        'subsample': Real(0.6, 1.0),
        'colsample_bytree': Real(0.6, 1.0)
    }

    xgb_model = xgb.XGBRegressor(random_state=42)

    # Time series cross-validation: each fold validates on data later than its training part.
    tscv = TimeSeriesSplit(n_splits=3)

    bayes_search = BayesSearchCV(
        xgb_model,
        search_spaces,
        n_iter=20,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42
    )
    bayes_search.fit(X_train, y_train)
    print(f"✅ Best XGBoost Parameters: {bayes_search.best_params_}")
    # Make predictions
    xgb_predictions = bayes_search.predict(X_test)

    return bayes_search.best_estimator_, xgb_predictions
# Train XGBoost
xgb_model, xgb_predictions = train_xgboost_model(train_data, test_data)

print("🎯 XGBoost predictions completed!")

In [None]:
def create_lstm_sequences(data, lookback=14):
    """Slice a 1-D series into overlapping input windows and their next-step targets.

    For every position that has ``lookback`` predecessors, the window holds those
    ``lookback`` values and the target is the value right after them.

    Returns a pair of NumPy arrays ``(windows, targets)``.
    """
    window_starts = range(len(data) - lookback)
    windows = [data[start:start + lookback] for start in window_starts]
    targets = [data[start + lookback] for start in window_starts]
    return np.array(windows), np.array(targets)

def train_lstm_model(train_data, test_data, lookback=14, seed=42):
    """Train a two-layer LSTM on scaled daily sales and forecast the test period recursively.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows with a ``Sales`` column, in chronological order.
    test_data : pandas.DataFrame
        Only its length is used: one forecast is produced per test row.
    lookback : int, default 14
        Number of past days fed to the network for each prediction.
    seed : int or None, default 42
        Seed for Python, NumPy and TensorFlow random generators so that repeated runs
        give comparable results. Pass ``None`` to leave the generators untouched.

    Returns
    -------
    tuple
        ``(model, predictions, scaler)`` with predictions on the original sales scale.

    Raises
    ------
    ValueError
        If the training data has no more than ``lookback`` rows.
    """
    if len(train_data) <= lookback:
        raise ValueError(
            f"Need more than {lookback} training rows to build LSTM sequences, got {len(train_data)}"
        )
    if seed is not None:
        # Weight initialisation, dropout and batch shuffling are all random; the other
        # models in this notebook are seeded as well.
        tf.keras.utils.set_random_seed(seed)

    # Scale the data (statistics come from the training period only).
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_data[['Sales']])

    # Create sequences
    X_train, y_train = create_lstm_sequences(train_scaled.flatten(), lookback)

    # Reshape for LSTM [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))

    # Build LSTM model
    lstm_model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(lookback, 1)),
        Dropout(0.2),
        LSTM(50, return_sequences=False),
        Dropout(0.2),
        Dense(25),
        Dense(1)
    ])

    lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # Train model
    lstm_model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=50,
        validation_split=0.2,
        verbose=0
    )

    # Recursive multi-step forecast: start from the last `lookback` training values and
    # feed every prediction back in as the newest observation.
    last_sequence = train_scaled[-lookback:].flatten()
    lstm_predictions = []

    for _ in range(len(test_data)):
        # Predict next value
        pred_input = last_sequence[-lookback:].reshape(1, lookback, 1)
        pred = lstm_model.predict(pred_input, verbose=0)[0, 0]
        lstm_predictions.append(pred)

        # Slide the window: drop the oldest value, append the prediction.
        last_sequence = np.append(last_sequence[1:], pred)

    # Inverse scale predictions
    lstm_predictions = scaler.inverse_transform(np.array(lstm_predictions).reshape(-1, 1)).flatten()

    print("✅ LSTM model trained successfully!")
    return lstm_model, lstm_predictions, scaler

# Train LSTM
lstm_model, lstm_predictions, lstm_scaler = train_lstm_model(train_data, test_data)

print("🎯 LSTM predictions completed!")

In [None]:
def train_random_forest(train_data, test_data):
    """Train a random forest with Bayesian hyper-parameter search and predict the test period.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Consecutive chronological slices of the daily series; both must contain
        ``Date`` and ``Sales``. All other columns are used as features.

    Returns
    -------
    tuple
        ``(best_estimator, predictions)`` where ``predictions`` has one value per test row.

    Notes
    -----
    Lag/rolling features of the test rows are derived from observed sales (including
    earlier test days), so the evaluation corresponds to one-step-ahead forecasting.
    """
    # Build the features on the joined series so that the lag/rolling columns of the first
    # test rows are computed from the end of the training period. Built on the test slice
    # alone, those rows start with NaN that forward-fill cannot repair, which can make the
    # forest's predict() fail.
    n_train = len(train_data)
    full_series = pd.concat([train_data, test_data], ignore_index=True)
    all_features = create_time_features(full_series)
    train_features = all_features.iloc[:n_train]
    test_features = all_features.iloc[n_train:]

    feature_cols = [col for col in train_features.columns if col not in ['Date', 'Sales']]

    # Drop the warm-up rows whose lag features have no history.
    train_clean = train_features.dropna()
    X_train = train_clean[feature_cols]
    y_train = train_clean['Sales']
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    X_test = test_features[feature_cols].ffill()

    # Hyperparameter search
    search_spaces = {
        'n_estimators': Integer(100, 500),
        'max_depth': Integer(5, 20),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 5)
    }

    rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
    # Each fold validates on data later than its training part.
    tscv = TimeSeriesSplit(n_splits=3)

    bayes_search = BayesSearchCV(
        rf_model,
        search_spaces,
        n_iter=15,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42
    )

    bayes_search.fit(X_train, y_train)

    rf_predictions = bayes_search.predict(X_test)

    print(f"✅ Best Random Forest Parameters: {bayes_search.best_params_}")
    return bayes_search.best_estimator_, rf_predictions
# Train Random Forest
rf_model, rf_predictions = train_random_forest(train_data, test_data)

print("🎯 Random Forest predictions completed!")

In [None]:
def calculate_metrics(y_true, y_pred, model_name):
    """Return RMSE, MAE and MAPE (in percent) of one model as a labelled dict.

    The dict has the keys ``Model``, ``RMSE``, ``MAE`` and ``MAPE``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    percentage_error = mean_absolute_percentage_error(y_true, y_pred)

    summary = {'Model': model_name}
    summary['RMSE'] = np.sqrt(squared_error)
    summary['MAE'] = absolute_error
    summary['MAPE'] = percentage_error * 100  # fraction -> percent
    return summary

# Test-period forecasts of every model, keyed by display name.
predictions = {
    'Auto-ARIMA': arima_forecast,
    'Facebook Prophet': prophet_predictions,
    'XGBoost': xgb_predictions,
    'LSTM': lstm_predictions,
    'Random Forest': rf_predictions
}

# Score every model against the observed test sales.
y_true = test_data['Sales'].values
results = []
for model_name, y_pred in predictions.items():
    results.append(calculate_metrics(y_true, y_pred, model_name))

# Best model (lowest RMSE) first.
results_df = pd.DataFrame(results).sort_values('RMSE')

print("📊 MODEL PERFORMANCE COMPARISON")
print("=" * 50)
print(results_df.to_string(index=False, float_format='%.2f'))

# 2x2 dashboard: one bar chart per metric plus actual vs. best forecast.
fig = make_subplots(rows=2, cols=2,
                   subplot_titles=['RMSE Comparison', 'MAE Comparison',
                                 'MAPE Comparison', 'Actual vs Predictions'])

# Round the metrics for display, then draw them in the first three panels.
metric_cells = {'RMSE': (1, 1), 'MAE': (1, 2), 'MAPE': (2, 1)}
for metric in metric_cells:
    results_df[metric] = results_df[metric].round(2)
for metric, (panel_row, panel_col) in metric_cells.items():
    fig.add_trace(go.Bar(x=results_df['Model'], y=results_df[metric], name=metric),
                  row=panel_row, col=panel_col)

# Actual vs Predictions (Best Model)
best_model = results_df['Model'].iloc[0]
best_predictions = predictions[best_model]

comparison_lines = (
    ('Actual', y_true, 'blue'),
    (f'{best_model} (Best)', best_predictions, 'red'),
)
for line_name, line_values, line_colour in comparison_lines:
    fig.add_trace(go.Scatter(x=test_data['Date'], y=line_values, mode='lines',
                             name=line_name, line=dict(color=line_colour)), row=2, col=2)

fig.update_layout(height=800, title_text="Comprehensive Model Performance Analysis")
fig.show()