# Sales Forecasting Project — Colab-ready Notebook

**Purpose:** Predict future sales using synthetic time-series data. This notebook trains ARIMA, Prophet, and LSTM models and compares forecasts.

**Notes:**
- Uses built-in synthetic dataset (daily sales + promotions) so it runs without uploads.
- Long-running cells (LSTM training) are kept modest so it finishes comfortably in Colab.

---


In [None]:
# Install the notebook's third-party dependencies (run this cell first).
# In Colab this may take a few minutes; -q keeps pip's output quiet.
!pip install -q numpy pandas matplotlib seaborn statsmodels prophet tensorflow scikit-learn nbformat
# Only the 'prophet' package is installed here; the legacy 'fbprophet'
# package is not installed as a fallback.


In [None]:
# Imports and folder setup
import os, warnings, math, pickle
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Prophet import (will use 'prophet' package)
try:
    from prophet import Prophet
except Exception as e:
    Prophet = None
    print('Prophet not available:', e)

# TensorFlow/Keras
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
except Exception as e:
    tf = None
    print('TensorFlow not available:', e)

# Ensure the artifact directories exist (plots/CSV in outputs, fitted models in models).
# exist_ok=True makes re-running this cell harmless.
for folder in ('/content/outputs', '/content/models'):
    os.makedirs(folder, exist_ok=True)
print('Folders created: /content/outputs and /content/models')

In [None]:
# Build the synthetic daily sales dataset (2020-01-01 to 2023-12-31).
np.random.seed(42)  # fixed seed so every run produces the same data
rng = pd.date_range(start='2020-01-01', end='2023-12-31', freq='D')
n_days = len(rng)

# Deterministic components of the signal.
base = 200 + 0.05 * np.arange(n_days)  # slight upward trend
seasonal = 20 * np.sin(2 * np.pi * rng.dayofyear / 365.25)  # yearly seasonality
dow = 10 * ((rng.dayofweek >= 5).astype(int))  # weekend (Sat/Sun) uplift

# Random components. The noise is drawn before the promo weeks so the
# sequence of random numbers consumed stays the same.
noise = np.random.normal(0, 8, n_days)
promo_weeks = np.random.choice(np.arange(n_days // 7), size=20, replace=False)

# Promotions: flag all seven days of each selected week, worth +50 sales each.
promo = np.zeros(n_days, dtype=int)
promo_days = (promo_weeks[:, None] * 7 + np.arange(7)).ravel()
promo[promo_days] = 1
promo_effect = promo * 50

sales = base + seasonal + dow + promo_effect + noise
df = pd.DataFrame({'date': rng, 'sales': sales, 'promo': promo}).set_index('date')
df.head()

In [None]:
# Quick EDA: plot the raw series, decompose it with STL, and run an ADF test.
# Each figure is written to /content/outputs before it is shown.
plt.figure(figsize=(12, 4))
plt.plot(df.index, df['sales'])
plt.title('Sales over time (synthetic)')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.tight_layout()
plt.savefig('/content/outputs/sales_time_series.png')
plt.show()

# STL decomposition using a 365-day seasonal period
res = STL(df['sales'], period=365).fit()
fig = res.plot()
plt.suptitle('STL Decomposition')
plt.savefig('/content/outputs/stl_decomposition.png')
plt.show()

# Augmented Dickey-Fuller: report the test statistic and its p-value
adf_result = adfuller(df['sales'])
adf_stat, adf_pvalue = adf_result[0], adf_result[1]
print('ADF Statistic:', adf_stat)
print('p-value:', adf_pvalue)

In [None]:
# Hold out the final 90 days as the test window; everything before it is training data.
forecast_horizon = 90
train = df.iloc[:-forecast_horizon].copy()
test = df.iloc[-forecast_horizon:].copy()

# Report the date span covered by each partition.
train_start, train_end = train.index.min().date(), train.index.max().date()
test_start, test_end = test.index.min().date(), test.index.max().date()
print('Train:', train_start, 'to', train_end)
print('Test:', test_start, 'to', test_end)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
def evaluate(y_true, y_pred, label='model'):
    """Print and return MAE, RMSE and MAPE for a forecast.

    Both inputs are converted once to flat float NumPy arrays and compared
    position by position, so all three metrics are computed over exactly
    the same (actual, forecast) pairs even when pandas inputs carry
    different index labels.

    Args:
        y_true: Observed values (list, NumPy array or pandas Series).
        y_pred: Forecast values, same length as ``y_true``.
        label: Name shown in the printed summary line.

    Returns:
        dict with float entries ``'mae'``, ``'rmse'`` and ``'mape'``
        (MAPE in percent). MAPE skips observations whose actual value is
        zero and is ``nan`` when every actual value is zero.

    Raises:
        ValueError: If the inputs are empty, differ in length, or contain
            NaN/inf values.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError('evaluate() needs at least one observation')
    if actual.shape != forecast.shape:
        raise ValueError(
            f'y_true and y_pred differ in length: {actual.size} vs {forecast.size}'
        )
    if not (np.isfinite(actual).all() and np.isfinite(forecast).all()):
        raise ValueError('y_true and y_pred must not contain NaN or inf')

    errors = actual - forecast
    mae = float(np.mean(np.abs(errors)))
    rmse = math.sqrt(float(np.mean(errors ** 2)))

    # A percentage error is undefined where the actual value is zero, so
    # those points are left out instead of producing inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        mape = float(np.mean(np.abs(errors[nonzero] / actual[nonzero])) * 100)
    else:
        mape = float('nan')

    print(f"{label} -> MAE: {mae:.3f}, RMSE: {rmse:.3f}, MAPE: {mape:.2f}%")
    return {'mae': mae, 'rmse': rmse, 'mape': mape}

In [None]:
# Seasonal ARIMA (weekly period) on sales, with the promo flag as an exogenous variable.
p, d, q = 5, 1, 2
seasonal_order = (1, 1, 1, 7)  # last entry is the 7-day seasonal period
exog_train, exog_test = train[['promo']], test[['promo']]

arima_model = ARIMA(
    train['sales'],
    exog=exog_train,
    order=(p, d, q),
    seasonal_order=seasonal_order,
)
arima_res = arima_model.fit()
print(arima_res.summary())

# Predict over the held-out window; the test-period promo flags are passed as exog.
first_test_day, last_test_day = test.index[0], test.index[-1]
arima_fore = arima_res.predict(start=first_test_day, end=last_test_day, exog=exog_test)
evaluate(test['sales'], arima_fore, label='ARIMA')

# Plot the tail of the training data next to actuals and the forecast, then save the figure.
recent_train = train['sales'].iloc[-180:]
plt.figure(figsize=(10, 4))
plt.plot(recent_train.index, recent_train, label='train (last 180 days)')
plt.plot(test.index, test['sales'], label='actual')
plt.plot(arima_fore.index, arima_fore, label='arima_pred')
plt.legend()
plt.title('ARIMA Forecast vs Actual')
plt.savefig('/content/outputs/arima_forecast.png')
plt.show()

# Persist the fitted results object for later reuse.
import pickle
with open('/content/models/arima_model.pkl', 'wb') as model_file:
    pickle.dump(arima_res, model_file)

In [None]:
# Prophet model (if available). Leaves prophet_pred as None when Prophet is missing
# so later cells can tell that this model was skipped.
prophet_pred = None
if Prophet is None:
    print('Prophet not installed; skipping Prophet model.')
else:
    # Prophet expects the timestamp column to be named 'ds' and the target 'y'.
    prophet_df = train[['sales']].reset_index().rename(columns={'date':'ds','sales':'y'})
    prophet_df['promo'] = train['promo'].values
    m = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
    m.add_regressor('promo')
    m.fit(prophet_df)

    # The future frame covers the training dates plus the forecast horizon; the
    # promo regressor must be filled in for every one of those rows.
    future = m.make_future_dataframe(periods=forecast_horizon)
    future = future.set_index('ds')
    future['promo'] = 0
    future.loc[train.index,'promo'] = train['promo']
    future.loc[test.index,'promo'] = test['promo']
    future = future.reset_index()
    fcst = m.predict(future)

    # Keep only the last forecast_horizon rows, i.e. the test window.
    prophet_pred = fcst.set_index('ds')['yhat'].iloc[-forecast_horizon:]
    evaluate(test['sales'], prophet_pred, label='Prophet')

    plt.figure(figsize=(10,4))
    plt.plot(test.index, test['sales'], label='actual')
    plt.plot(prophet_pred.index, prophet_pred, label='prophet_pred')
    plt.legend()
    plt.title('Prophet Forecast vs Actual')
    plt.savefig('/content/outputs/prophet_forecast.png')
    plt.show()

    # Save with Prophet's own JSON serializer rather than pickle: the Prophet
    # documentation advises against pickling fitted models because the attached
    # Stan backend does not pickle reliably. Reload with
    # prophet.serialize.model_from_json.
    from prophet.serialize import model_to_json
    with open('/content/models/prophet_model.json', 'w') as f:
        f.write(model_to_json(m))

In [None]:
# LSTM forecasting (simple sliding-window). Smaller model/epochs for Colab runtime.
# Leaves lstm_pred_series as None when TensorFlow is missing so later cells can
# tell that this model was skipped.
lstm_pred_series = None
if tf is None:
    print('TensorFlow not installed; skipping LSTM.')
else:
    from sklearn.preprocessing import MinMaxScaler

    # Scale both features on the training data only; column 0 is sales, column 1 is promo.
    data_lstm = train[['sales','promo']].copy()
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data_lstm)

    # Supervised windows: `lookback` days of (sales, promo) -> next day's scaled sales.
    lookback = 30
    X, y = [], []
    for i in range(lookback, len(scaled)):
        X.append(scaled[i-lookback:i])
        y.append(scaled[i,0])
    X, y = np.array(X), np.array(y)

    # Chronological 90/10 split of the windows into fit and validation sets.
    split = int(0.9 * len(X))
    X_train_l, X_val_l = X[:split], X[split:]
    y_train_l, y_val_l = y[:split], y[split:]

    model = keras.Sequential([
        layers.Input(shape=(lookback, X.shape[2])),
        layers.LSTM(64, return_sequences=False),
        layers.Dense(16, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    # Train for few epochs to keep runtime reasonable
    history = model.fit(X_train_l, y_train_l, validation_data=(X_val_l, y_val_l), epochs=8, batch_size=32)

    # Rolling forecast for the test period: each prediction is fed back in as the
    # newest sales value, paired with that day's known promo flag.
    last_seq = scaled[-lookback:]
    preds = []
    for i in range(forecast_horizon):
        x_in = last_seq.reshape(1, lookback, X.shape[2])
        # verbose=0: otherwise every one of the single-sample predict calls
        # prints its own progress bar.
        next_sales_scaled = model.predict(x_in, verbose=0)[0,0]
        preds.append(next_sales_scaled)
        promo_val = test['promo'].iloc[i]
        # Only the promo column of the transformed row is used; the 0 is a placeholder for sales.
        promo_scaled = scaler.transform(np.array([[0, promo_val]]))[0,1]
        next_scaled = np.array([next_sales_scaled, promo_scaled])
        last_seq = np.vstack([last_seq[1:], next_scaled])

    # convert scaled sales preds back to original units using the sales column's fitted min/max
    sales_min, sales_max = scaler.data_min_[0], scaler.data_max_[0]
    preds_unscaled = np.array(preds) * (sales_max - sales_min) + sales_min
    lstm_pred_series = pd.Series(preds_unscaled, index=test.index)
    evaluate(test['sales'], lstm_pred_series, label='LSTM')

    plt.figure(figsize=(10,4))
    plt.plot(test.index, test['sales'], label='actual')
    plt.plot(lstm_pred_series.index, lstm_pred_series, label='lstm_pred')
    plt.legend()
    plt.title('LSTM Forecast vs Actual')
    plt.savefig('/content/outputs/lstm_forecast.png')
    plt.show()

    # Keras 3 rejects save paths without a '.keras' or '.h5' extension, so the
    # model is written in the native '.keras' format.
    model.save('/content/models/lstm_model.keras')

In [None]:
# Ensemble: equal-weight mean of whichever model forecasts are available.
preds = pd.DataFrame({'actual': test['sales']})
preds['arima'] = arima_fore

# Prophet and LSTM are optional: their variables may be undefined (cell not run)
# or None (library missing), and in both cases the column is simply left out.
optional_forecasts = {'prophet': 'prophet_pred', 'lstm': 'lstm_pred_series'}
for column, variable_name in optional_forecasts.items():
    forecast = globals().get(variable_name)
    if forecast is not None:
        preds[column] = forecast

model_cols = [name for name in preds.columns if name != 'actual']
if model_cols:
    preds['ensemble'] = preds[model_cols].mean(axis=1)
    evaluate(preds['actual'], preds['ensemble'], label='Ensemble')

    # Overlay the actuals, each individual model (faded) and the ensemble (bold).
    plt.figure(figsize=(10, 4))
    plt.plot(preds.index, preds['actual'], label='actual')
    for name in model_cols:
        plt.plot(preds.index, preds[name], alpha=0.6, label=name)
    plt.plot(preds.index, preds['ensemble'], linewidth=2, label='ensemble')
    plt.legend()
    plt.title('Model comparisons')
    plt.savefig('/content/outputs/model_comparison.png')
    plt.show()

# Save the side-by-side forecast table as CSV
preds.to_csv('/content/outputs/forecast_comparison.csv')
print('Saved forecast table to /content/outputs/forecast_comparison.csv')

---
## Summary & Next steps
- Outputs saved under `/content/outputs` (plots + forecast CSV) and models under `/content/models`.
- Next steps you might run:
  - Increase LSTM epochs or tune hyperparameters
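  - Upload your real dataset and load it in place of the synthetic-data cell (for example, add a `DATA_PATH` variable and read it with `pd.read_csv`; the cells shown here do not define `DATA_PATH`)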
  - Add additional regressors (price, holidays, marketing spend)

Download the outputs folder as a zip (optional):
```python
!zip -r /content/outputs.zip /content/outputs
```
