<a href="https://colab.research.google.com/github/AMAYA-RAJAN-K/Stock-Market-forecasting--ARIMA-SARIMA-LSTM/blob/main/Stock_market_forcasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import math
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# TensorFlow (LSTM)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
import joblib
from datetime import datetime

## LOAD DATA

In [3]:
TICKER = "AAPL"
START = "2023-01-01"
END = "2024-12-31"
HOLDOUT_DAYS = 30        # final forecast horizon
RESULTS_DIR = "results"
DATA_DIR = "data"
MODELS_DIR = "models"
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)


In [5]:
def mape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    denom = np.where(y_true == 0, 1e-8, y_true)
    return np.mean(np.abs((y_true - y_pred) / denom)) * 100.0

In [6]:
def fix_close_value(x):
    """Robust fix for Close values that might be nested types in Colab."""
    import pandas as _pd, numpy as _np
    try:
        if isinstance(x, _pd.Series):
            return float(x.iloc[0])
        if isinstance(x, _np.ndarray):
            return float(x.flatten()[0])
        if isinstance(x, list):
            return float(x[0])
        return float(x)
    except Exception:
        # Last resort: convert via str then float
        return float(str(x).strip().strip("[]"))

In [7]:
def save_plot(fig, path):
    fig.savefig(path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("Saved plot ->", path)

In [8]:
print("Downloading data for", TICKER)
ticker = yf.Ticker(TICKER)
hist = ticker.history(start=START, end=END)

Downloading data for AAPL


In [9]:
if hist is None or hist.empty:
    raise RuntimeError("No data returned. Check network or ticker.")

df = hist.reset_index()[['Date', 'Close']].copy()

In [10]:
df['Date'] = pd.to_datetime(df['Date'])
df['Close'] = df['Close'].apply(fix_close_value).astype(float)

In [11]:
raw_path = os.path.join(DATA_DIR, f"{TICKER}_raw_{START}_to_{END}.csv")
df.to_csv(raw_path, index=False)
print("Saved raw data ->", raw_path)
print(df.head())

Saved raw data -> data/AAPL_raw_2023-01-01_to_2024-12-31.csv
                       Date       Close
0 2023-01-03 00:00:00-05:00  123.211205
1 2023-01-04 00:00:00-05:00  124.482033
2 2023-01-05 00:00:00-05:00  123.161934
3 2023-01-06 00:00:00-05:00  127.693573
4 2023-01-09 00:00:00-05:00  128.215714


## EDA & feature engineering

In [12]:
df.sort_values('Date', inplace=True)
df['lag_1'] = df['Close'].shift(1)
df['rolling7'] = df['Close'].rolling(window=7).mean()
df['rolling30'] = df['Close'].rolling(window=30).mean()

In [13]:
df.dropna(inplace=True)
proc_path = os.path.join(DATA_DIR, f"{TICKER}_processed.csv")
df.to_csv(proc_path, index=False)
print("Saved processed data ->", proc_path)

Saved processed data -> data/AAPL_processed.csv


In [14]:
# plot price + MAs
fig = plt.figure(figsize=(12,5))
plt.plot(df['Date'], df['Close'], label='Close', alpha=0.7)
plt.plot(df['Date'], df['rolling7'], label='7-day MA', linewidth=1.2)
plt.plot(df['Date'], df['rolling30'], label='30-day MA', linewidth=1.2)
plt.title(f"{TICKER} Close Price with Moving Averages")
plt.xlabel("Date"); plt.ylabel("Price (USD)")
plt.legend(); plt.grid(alpha=0.2)
save_plot(fig, os.path.join(RESULTS_DIR, "aapl_close_with_ma.png"))


Saved plot -> results/aapl_close_with_ma.png


In [15]:
# correlation heatmap
corr = df[['Close','lag_1','rolling7','rolling30']].corr()
fig = plt.figure(figsize=(5,4))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation")
save_plot(fig, os.path.join(RESULTS_DIR, "aapl_correlation_heatmap.png"))


Saved plot -> results/aapl_correlation_heatmap.png


## Train/Test split

In [16]:
series = df.set_index('Date')['Close']
train = series[:-HOLDOUT_DAYS]
test = series[-HOLDOUT_DAYS:]
print(f"Train length: {len(train)}, Test length: {len(test)}")

Train length: 442, Test length: 30


In [17]:
# containers for forecasts and metrics
forecasts = pd.DataFrame({'Date': test.index})
metrics = []

## 1. ARIMA (baseline)

In [18]:
print("\nFitting ARIMA (order=(5,1,0)) ...")
try:
    arima_order = (5,1,0)
    arima_model = SARIMAX(train, order=arima_order, enforce_stationarity=False, enforce_invertibility=False)
    arima_res = arima_model.fit(disp=False)
    arima_forecast = arima_res.forecast(steps=HOLDOUT_DAYS)
    forecasts['ARIMA'] = arima_forecast.values
    rmse = math.sqrt(mean_squared_error(test, arima_forecast))
    mae = mean_absolute_error(test, arima_forecast)
    mape_v = mape(test, arima_forecast)
    metrics.append(('ARIMA(5,1,0)', rmse, mae, mape_v))
    # save model & forecast
    joblib.dump(arima_res, os.path.join(MODELS_DIR, "arima_res.pkl"))
    arima_forecast.to_csv(os.path.join(RESULTS_DIR, "ARIMA_forecast.csv"))
    print("ARIMA done — RMSE:", round(rmse,4))
except Exception as e:
    print("ARIMA failed:", e)


Fitting ARIMA (order=(5,1,0)) ...
ARIMA done — RMSE: 17.8836


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


## 2. SARIMA (seasonal) with fallback attempts

In [19]:
print("\nFitting SARIMA with fallback seasonal orders ...")
sarima_done = False
sarima_attempts = [((1,1,1),(1,1,1,5)), ((1,1,1),(0,1,1,5)), ((2,1,0),(1,1,1,5)), ((1,1,1),(1,1,1,7))]
for non_seasonal, seasonal in sarima_attempts:
    try:
        sarima_model = SARIMAX(train, order=non_seasonal, seasonal_order=seasonal,
                               enforce_stationarity=False, enforce_invertibility=False)
        sarima_res = sarima_model.fit(disp=False)
        sarima_forecast = sarima_res.forecast(steps=HOLDOUT_DAYS)
        forecasts['SARIMA'] = sarima_forecast.values
        rmse = math.sqrt(mean_squared_error(test, sarima_forecast))
        mae = mean_absolute_error(test, sarima_forecast)
        mape_v = mape(test, sarima_forecast)
        metrics.append((f"SARIMA{non_seasonal}x{seasonal}", rmse, mae, mape_v))
        joblib.dump(sarima_res, os.path.join(MODELS_DIR, "sarima_res.pkl"))
        sarima_forecast.to_csv(os.path.join(RESULTS_DIR, "SARIMA_forecast.csv"))
        print(f"SARIMA success — order={non_seasonal}, seasonal={seasonal}, RMSE={round(rmse,4)}")
        sarima_done = True
        break
    except Exception as e:
        print("SARIMA attempt failed for", non_seasonal, seasonal, " — error:", e)

if not sarima_done:
    print("SARIMA unsuccessful for all tried orders. Continuing without SARIMA results.")



Fitting SARIMA with fallback seasonal orders ...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


SARIMA success — order=(1, 1, 1), seasonal=(1, 1, 1, 5), RMSE=14.5732


  return get_prediction_index(


## 3. LSTM (TensorFlow)

In [20]:
print("\nPreparing LSTM model")
try:
    # Use business-frequency filled series for LSTM input
    s = series.asfreq('B').fillna(method='ffill')
    values = s.values.reshape(-1,1)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(values)

    LOOKBACK = 30
    X, y = [], []
    for i in range(LOOKBACK, len(scaled)):
        X.append(scaled[i-LOOKBACK:i, 0])
        y.append(scaled[i, 0])
    X, y = np.array(X), np.array(y)
    X = X.reshape((X.shape[0], X.shape[1], 1))
# align train/test
    X_train, X_test = X[:-HOLDOUT_DAYS], X[-HOLDOUT_DAYS:]
    y_train, y_test = y[:-HOLDOUT_DAYS], y[-HOLDOUT_DAYS:]

    lstm_model = Sequential([
        LSTM(64, return_sequences=True, input_shape=(LOOKBACK, 1)),
        Dropout(0.2),
        LSTM(32),
        Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mse')
    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
    lstm_model.fit(X_train, y_train, validation_split=0.1, epochs=20, batch_size=32, callbacks=[es], verbose=1)

    pred_scaled = lstm_model.predict(X_test)
    pred = scaler.inverse_transform(pred_scaled)
    forecasts['LSTM'] = pred.flatten()
    rmse = math.sqrt(mean_squared_error(s[-HOLDOUT_DAYS:].values, pred.flatten()))
    mae = mean_absolute_error(s[-HOLDOUT_DAYS:].values, pred.flatten())
    mape_v = mape(s[-HOLDOUT_DAYS:].values, pred.flatten())
    metrics.append(('LSTM', rmse, mae, mape_v))
    # save model & forecast
    lstm_model.save(os.path.join(MODELS_DIR, "lstm_model.keras"))
    pd.DataFrame(pred.flatten(), columns=["Pred"]).to_csv(os.path.join(RESULTS_DIR, "LSTM_forecast.csv"), index=False)
    print("LSTM done — RMSE:", round(rmse,4))
except Exception as e:
    print("LSTM failed:", e)


Preparing LSTM model
Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 110ms/step - loss: 0.0832 - val_loss: 0.0014
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 0.0078 - val_loss: 0.0124
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.0044 - val_loss: 0.0030
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 0.0040 - val_loss: 0.0026
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.0037 - val_loss: 0.0020
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.0033 - val_loss: 0.0015
Epoch 6: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304ms/step
LSTM done — RMSE: 11.548


## Save combined forecasts & comparison

In [21]:
forecasts_path = os.path.join(RESULTS_DIR, "forecasts_combined.csv")
forecasts.to_csv(forecasts_path, index=False)
print("\nSaved combined forecasts ->", forecasts_path)

cmp_df = pd.DataFrame(metrics, columns=["Model","RMSE","MAE","MAPE"])
cmp_df.to_csv(os.path.join(RESULTS_DIR, "model_comparison.csv"), index=False)
print("Saved model comparison ->", os.path.join(RESULTS_DIR, "model_comparison.csv"))
print("\nModel comparison:\n", cmp_df)


Saved combined forecasts -> results/forecasts_combined.csv
Saved model comparison -> results/model_comparison.csv

Model comparison:
                           Model       RMSE        MAE      MAPE
0                  ARIMA(5,1,0)  17.883593  15.112181  6.091422
1  SARIMA(1, 1, 1)x(1, 1, 1, 5)  14.573194  12.324157  4.968673
2                          LSTM  11.547964  10.504968  4.252597


## Plot Actual vs Forecasts

In [22]:
fig = plt.figure(figsize=(12,6))
plt.plot(test.index, test.values, label='Actual', marker='o')
for col in ['ARIMA','SARIMA','LSTM']:
    if col in forecasts.columns:
        plt.plot(forecasts['Date'], forecasts[col], label=col, marker='x')
plt.title(f"{TICKER} - Actual vs Forecasts (last {HOLDOUT_DAYS} days)")
plt.xlabel("Date"); plt.ylabel("Price")
plt.legend(); plt.grid(alpha=0.2)
save_plot(fig, os.path.join(RESULTS_DIR, "actual_vs_forecasts.png"))

Saved plot -> results/actual_vs_forecasts.png
