# 05 — Forecasting Models
Trains baseline models (naive, random walk, SMA, ARIMA) and advanced models (LSTM, cluster-informed ARIMA), and evaluates each forecast over the held-out test window.

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from src.data_loader import fetch_multiple_stocks
from src.feature_engineering import engineer_features
from src.baselines import NaiveForecast, RandomWalkForecast, SMAForecast, ARIMAForecast, run_all_baselines
from src.evaluation import compute_all_metrics, plot_forecast_vs_actual

warnings.filterwarnings('ignore')

## 1. Load Data

In [None]:
# Universe of tickers to download for this notebook.
# The previous list contained 'TESLA', which is not a ticker symbol (Tesla is
# 'TSLA', already listed below), and 'BERKB', which is not a valid symbol for
# Berkshire Hathaway class B shares.
# NOTE(review): 'BRK-B' assumes a Yahoo-Finance-style data source behind
# fetch_multiple_stocks — confirm against src.data_loader, and keep this list
# in sync with whatever produced cluster_labels.parquet.
SELECTED_TICKERS = [
    'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA',
    'META', 'BRK-B', 'JPM', 'JNJ', 'XOM',
    'WMT', 'PG', 'MA', 'V', 'HD',
    'DIS', 'PYPL', 'NFLX', 'TSLA', 'ADBE',
    'CRM', 'INTC', 'AMD', 'CSCO', 'IBM',
    'BA', 'GE', 'CAT', 'MMM',
]

# Two years of history per ticker; the result is indexed by ticker symbol in
# the cells below (stock_data[ticker]["Close"]).
stock_data = fetch_multiple_stocks(SELECTED_TICKERS, period="2y")

# Cluster assignments read from the processed-data directory
# (presumably written by an earlier notebook — verify).
cluster_labels = pd.read_parquet("../data/processed/cluster_labels.parquet")

print(f"Stock data shape: {len(stock_data)}")
print(f"Cluster labels shape: {cluster_labels.shape}")

## 2. Prepare Training/Test Split
We use the most recent 20% of the data as the test set and the earlier 80% for training.

In [None]:
# Work with the closing prices of a single ticker for sections 3-6.
sample_ticker = "AAPL"
series = stock_data[sample_ticker]["Close"]

# Cut point: the first 80% of rows are used for training, the rest for testing
# (rows are assumed to be in chronological order — verify in the data loader).
train_size = int(0.8 * len(series))
train, test = series.iloc[:train_size], series.iloc[train_size:]

print(f"Train shape: {train.shape}, date range: {train.index[0]} to {train.index[-1]}")
print(f"Test shape: {test.shape}, date range: {test.index[0]} to {test.index[-1]}")

## 3. Baseline Models

In [None]:
# One metrics dict per baseline model, in the order the models are run.
baseline_results = []


def _fit_and_score(model, label):
    """Fit ``model`` on the training values and score its test-window forecast.

    The model is fitted on ``train.values``, asked for ``len(test)`` steps, and
    scored against ``test.values`` with ``compute_all_metrics``. The resulting
    metrics dict is tagged with ``label`` under the 'model' key and appended to
    ``baseline_results``.

    Returns:
        A ``(predictions, metrics)`` tuple.
    """
    model.fit(train.values)
    preds = model.predict(len(test))
    scores = compute_all_metrics(test.values, preds)
    scores['model'] = label
    baseline_results.append(scores)
    return preds, scores


naive = NaiveForecast()
naive_preds, naive_metrics = _fit_and_score(naive, 'Naive')

rw = RandomWalkForecast()
rw_preds, rw_metrics = _fit_and_score(rw, 'Random Walk')

sma = SMAForecast(window=20)
sma_preds, sma_metrics = _fit_and_score(sma, 'SMA(20)')

arima = ARIMAForecast()
arima_preds, arima_metrics = _fit_and_score(arima, 'ARIMA')

# Side-by-side comparison table of all baseline metrics.
baseline_df = pd.DataFrame(baseline_results)
print(baseline_df.to_string())

## 4. Visualize Baseline Forecasts

In [None]:
# Map each display label to the forecast produced in the baseline cell.
predictions_dict = dict(
    zip(
        ('Naive', 'Random Walk', 'SMA(20)', 'ARIMA'),
        (naive_preds, rw_preds, sma_preds, arima_preds),
    )
)

# Overlay every baseline forecast on the actual test series and save the figure.
plot_forecast_vs_actual(
    test,
    predictions_dict,
    save=True,
    filename="05_baseline_forecasts.png",
    title=f"{sample_ticker} - Baseline Forecasts",
)

## 5. LSTM Forecasting
**Note:** The LSTM model requires TensorFlow. If TensorFlow is not installed, this section is skipped gracefully.

In [None]:
# Train an LSTM on the training values and score its forecast over the test
# window. lstm_preds stays None unless prediction succeeds, so later code can
# check whether an LSTM forecast is available.
lstm_preds = None
try:
    # Imported here (not at the top of the notebook) so that a missing
    # dependency only disables this section.
    from src.forecasters import LSTMForecaster
    lstm = LSTMForecaster(epochs=30)
    lstm.fit(train.values)
    lstm_preds = lstm.predict(len(test))
    lstm_metrics = compute_all_metrics(test.values, lstm_preds)
    print(f"LSTM Metrics: {lstm_metrics}")
except ImportError:
    # Any ImportError raised inside the try body (the import itself or a later
    # call) is reported as the LSTM stack being unavailable.
    print("TensorFlow/LSTM not available - skipping LSTM forecasting")
except Exception as e:
    # Best-effort section: report the failure and let the notebook continue.
    print(f"Error training LSTM: {e}")

## 6. Cluster-Informed ARIMA

In [None]:
# Fit a cluster-informed ARIMA on all loaded stocks and score its forecast for
# the sample ticker over the test window.
# cluster_preds is initialised up front (mirroring lstm_preds in the LSTM
# section) so the name is always defined, even when this best-effort section
# fails.
cluster_preds = None
try:
    from src.forecasters import ClusterInformedForecaster
    cluster_informed = ClusterInformedForecaster(model_type="arima")
    # Uses the "kmeans_cluster" column of the cluster-label table as the
    # grouping passed to the forecaster.
    cluster_informed.fit(stock_data, cluster_labels["kmeans_cluster"])
    cluster_preds = cluster_informed.predict(sample_ticker, steps=len(test))
    cluster_metrics = compute_all_metrics(test.values, cluster_preds)
    print(f"Cluster-Informed ARIMA Metrics: {cluster_metrics}")
except Exception as e:
    # Best-effort section: report the failure and let the notebook continue.
    print(f"Error with Cluster-Informed ARIMA: {e}")

## 7. Multi-Stock Forecasting
Run baselines on multiple stocks to build comparison data.

In [None]:
# Run every baseline on several tickers and collect one metrics row per
# (ticker, model) pair.
all_results = []
sample_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA']

for ticker in sample_tickers:
    # Loop-local names are deliberately distinct from the notebook-level
    # `series`, `train_size`, `train`, `test` and `baseline_results` set in the
    # earlier single-ticker cells. Reusing those names here would silently
    # change what those cells see if they are re-run after this one.
    ticker_series = stock_data[ticker]["Close"]
    split_at = int(len(ticker_series) * 0.8)
    ticker_train = ticker_series.iloc[:split_at]
    ticker_test = ticker_series.iloc[split_at:]

    # Iterated below as a mapping of model name -> predictions.
    ticker_forecasts = run_all_baselines(ticker_train.values, horizon=len(ticker_test))

    for model_name, preds in ticker_forecasts.items():
        metrics = compute_all_metrics(ticker_test.values, preds)
        metrics['ticker'] = ticker
        metrics['model'] = model_name
        all_results.append(metrics)

multi_stock_df = pd.DataFrame(all_results)
print(multi_stock_df.groupby(['ticker', 'model']).agg({'RMSE': 'mean', 'MAE': 'mean'}).to_string())

## 8. Save Forecasting Results

In [None]:
# Persist the multi-stock metrics table next to the other processed artifacts.
results_path = "../data/processed/forecast_results.parquet"
multi_stock_df.to_parquet(results_path)
print(f"Forecast results saved. Shape: {multi_stock_df.shape}")