<a href="https://colab.research.google.com/github/Aurora-Fund-Analytics/forecast-model/blob/main/notebooks/test_full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prediction model

In [None]:
!pip install torch yfinance pandas numpy scikit-learn ta

In [4]:
import os
import math
import json
import time
from dataclasses import dataclass, asdict
from typing import List, Tuple

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import yfinance as yf
from sklearn.preprocessing import StandardScaler

try:
    import ta
except ImportError:
    ta = None

# Fix random seed
def set_seed(seed: int = 42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
def download_ohlcv(tickers: List[str], start: str, end: str) -> pd.DataFrame:
    df = yf.download(
        tickers=tickers,
        start=start,
        end=end,
        auto_adjust=True,
        group_by='ticker',
        threads=True,
        progress=False,
        interval='1d'
    )
    if isinstance(df.columns, pd.MultiIndex):
        df = df.sort_index(axis=1)
    else:
        df = pd.concat({tickers[0]: df}, axis=1)
    return df

def add_features_for_ticker(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d['ret_1'] = d['Close'].pct_change()
    d['log_ret_1'] = np.log1p(d['ret_1'])

    for w in (5, 10, 20, 60):
        d[f'roll_mean_{w}'] = d['Close'].pct_change().rolling(w).mean()
        d[f'roll_std_{w}'] = d['Close'].pct_change().rolling(w).std()
        d[f'roll_min_{w}'] = d['Close'].rolling(w).min() / d['Close'] - 1.0
        d[f'roll_max_{w}'] = d['Close'].rolling(w).max() / d['Close'] - 1.0
        d[f'price_sma_{w}'] = d['Close'] / d['Close'].rolling(w).mean() - 1.0
        d[f'vol_sma_{w}'] = d['Volume'] / d['Volume'].rolling(w).mean() - 1.0

    if ta is not None:
        try:
            d['rsi_14'] = ta.momentum.RSIIndicator(close=d['Close'], window=14).rsi()
            macd = ta.trend.MACD(close=d['Close'])
            d['macd'] = macd.macd()
            d['macd_signal'] = macd.macd_signal()
            d['macd_hist'] = macd.macd_diff()
        except Exception:
            d[['rsi_14','macd','macd_signal','macd_hist']] = np.nan
    else:
        d[['rsi_14','macd','macd_signal','macd_hist']] = np.nan

    d['vol_ret_1'] = d['Volume'].pct_change().replace([np.inf, -np.inf], np.nan)
    d['hl_spread'] = (d['High'] - d['Low']) / d['Close']
    d['y_target'] = d['log_ret_1'].shift(-1)

    return d.dropna()

def build_feature_matrix(df_multi, tickers):
    rows = []
    for t in tickers:
        one = df_multi[t][['Open','High','Low','Close','Volume']].dropna()
        f = add_features_for_ticker(one)
        f['ticker'] = t
        rows.append(f)
    data = pd.concat(rows)
    data['date'] = data.index
    data = data.sort_values(['ticker','date']).reset_index(drop=True)
    feature_cols = [c for c in data.columns if c not in ('y_target','ticker','date')]
    return data, feature_cols


In [6]:
def make_sequences(data, feature_cols, window, horizon=1):
    X_list, y_list = [], []
    for t, grp in data.groupby('ticker', sort=False):
        vals = grp[feature_cols + ['y_target']].values
        for i in range(window, len(vals) - horizon + 1):
            X_list.append(vals[i-window:i, :-1])
            y_list.append(vals[i + horizon - 1, -1])
    return np.array(X_list, dtype=np.float32), np.array(y_list, dtype=np.float32)

class SeqDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float().view(-1, 1)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

In [7]:
class LSTMRegressor(nn.Module):
    def __init__(self, n_features, hidden=96, num_layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden, num_layers=num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
        self.head = nn.Sequential(
            nn.Linear(hidden, hidden//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden//2, 1)
        )

    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.head(last)


In [8]:
def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    total = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total += loss.item() * xb.size(0)
    return total / len(loader.dataset)

@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    preds, trues = [], []
    total = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = criterion(pred, yb)
        total += loss.item() * xb.size(0)
        preds.append(pred.cpu().numpy())
        trues.append(yb.cpu().numpy())
    preds = np.concatenate(preds).flatten()
    trues = np.concatenate(trues).flatten()
    mse = np.mean((preds - trues)**2)
    mae = np.mean(np.abs(preds - trues))
    dir_acc = np.mean(np.sign(preds) == np.sign(trues))
    return total / len(loader.dataset), mse, mae, dir_acc


In [9]:
# Config
tickers = ["SPY","QQQ","IWM","EEM","TLT","GLD"]
start = "2005-01-01"
end = "2025-08-25"
window = 60
horizon = 1
batch_size = 128
hidden = 96
layers = 2
dropout = 0.2
epochs = 35
lr = 1e-3

set_seed(42)

# Download & prepare data
ohlcv = download_ohlcv(tickers, start, end)
data, feature_cols = build_feature_matrix(ohlcv, tickers)

# Train/Val/Test split
data_sorted = data.sort_values('date')
n = len(data_sorted)
n_test = int(0.1*n)
n_val = int(0.1*n)
train_df = data_sorted.iloc[:n-n_val-n_test]
val_df = data_sorted.iloc[n-n_val-n_test:n-n_test]
test_df = data_sorted.iloc[n-n_test:]

# Scale
scaler = StandardScaler().fit(train_df[feature_cols])
train_df[feature_cols] = scaler.transform(train_df[feature_cols])
val_df[feature_cols] = scaler.transform(val_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# Build sequences
X_train, y_train = make_sequences(train_df, feature_cols, window)
X_val, y_val = make_sequences(val_df, feature_cols, window)
X_test, y_test = make_sequences(test_df, feature_cols, window)

train_loader = DataLoader(SeqDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(SeqDataset(X_val, y_val), batch_size=batch_size)
test_loader = DataLoader(SeqDataset(X_test, y_test), batch_size=batch_size)

# Model
model = LSTMRegressor(len(feature_cols), hidden=hidden, num_layers=layers, dropout=dropout).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
criterion = nn.MSELoss()

best_val = float('inf')
wait = 0
patience = 7

for epoch in range(1, epochs+1):
    tr_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_mse, val_mae, val_da = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch:03d} | Train {tr_loss:.6f} | Val {val_loss:.6f} | MSE {val_mse:.6f} | MAE {val_mae:.6f} | DirAcc {val_da:.3f}")
    if val_loss < best_val:
        best_val = val_loss
        best_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break

model.load_state_dict(best_state)

# Test
test_loss, test_mse, test_mae, test_da = evaluate(model, test_loader, criterion)
print(f"Test -> MSE: {test_mse:.6f}, MAE: {test_mae:.6f}, DirAcc: {test_da:.3f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[feature_cols] = scaler.transform(train_df[feature_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df[feature_cols] = scaler.transform(val_df[feature_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[feature_cols] = scaler.transform(test_df[feature_cols])


Epoch 001 | Train 0.000647 | Val 0.000191 | MSE 0.000191 | MAE 0.010494 | DirAcc 0.508
Epoch 002 | Train 0.000310 | Val 0.000188 | MSE 0.000188 | MAE 0.010457 | DirAcc 0.498
Epoch 003 | Train 0.000260 | Val 0.000186 | MSE 0.000186 | MAE 0.010463 | DirAcc 0.500
Epoch 004 | Train 0.000230 | Val 0.000186 | MSE 0.000186 | MAE 0.010431 | DirAcc 0.490
Epoch 005 | Train 0.000209 | Val 0.000183 | MSE 0.000183 | MAE 0.010351 | DirAcc 0.510
Epoch 006 | Train 0.000201 | Val 0.000191 | MSE 0.000191 | MAE 0.010567 | DirAcc 0.499
Epoch 007 | Train 0.000194 | Val 0.000191 | MSE 0.000191 | MAE 0.010497 | DirAcc 0.495
Epoch 008 | Train 0.000192 | Val 0.000187 | MSE 0.000187 | MAE 0.010411 | DirAcc 0.494
Epoch 009 | Train 0.000191 | Val 0.000183 | MSE 0.000183 | MAE 0.010321 | DirAcc 0.500
Epoch 010 | Train 0.000191 | Val 0.000183 | MSE 0.000183 | MAE 0.010323 | DirAcc 0.512
Epoch 011 | Train 0.000189 | Val 0.000193 | MSE 0.000193 | MAE 0.010515 | DirAcc 0.495
Epoch 012 | Train 0.000189 | Val 0.000182 |

In [10]:
os.makedirs("artifacts", exist_ok=True)
torch.save({
    'state_dict': model.state_dict(),
    'feature_cols': feature_cols,
    'window': window,
    'scaler_mean_': scaler.mean_.tolist(),
    'scaler_scale_': scaler.scale_.tolist()
}, "artifacts/lstm_stock_model.pt")

# TorchScript version
example = torch.from_numpy(X_test[:1]).float().to(device)
traced = torch.jit.trace(model, example)
traced.save("artifacts/lstm_stock_model_traced.pt")
print("Model saved in artifacts/")

Model saved in artifacts/


In [16]:
import torch
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import StandardScaler

# Load artifacts
artifacts = torch.load("artifacts/lstm_stock_model.pt", map_location=device)
feature_cols = artifacts['feature_cols']
window = artifacts['window']
scaler_mean = np.array(artifacts['scaler_mean_'])
scaler_scale = np.array(artifacts['scaler_scale_'])

# Load model
model = LSTMRegressor(n_features=len(feature_cols), hidden=96, num_layers=2, dropout=0.2)
model.load_state_dict(artifacts['state_dict'])
model.to(device)
model.eval()

# Step A: Download new data for E1VFVN30
ticker = "E1VFVN30.VN"
start = "2024-01-01"
end = "2025-08-27"
df_new = yf.download(ticker, start=start, end=end, auto_adjust=True, progress=False)
df_new = df_new[['Open','High','Low','Close','Volume']].dropna()

In [15]:


# Step B: Feature engineering
def add_features_for_ticker(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d['ret_1'] = d['Close'].pct_change()
    d['log_ret_1'] = np.log1p(d['ret_1'])

    for w in (5, 10, 20, 60):
        d[f'roll_mean_{w}'] = d['Close'].pct_change().rolling(w).mean()
        d[f'roll_std_{w}'] = d['Close'].pct_change().rolling(w).std()
        d[f'roll_min_{w}'] = d['Close'].rolling(w).min() / d['Close'] - 1.0
        d[f'roll_max_{w}'] = d['Close'].rolling(w).max() / d['Close'] - 1.0
        d[f'price_sma_{w}'] = d['Close'] / d['Close'].rolling(w).mean() - 1.0
        d[f'vol_sma_{w}'] = d['Volume'] / d['Volume'].rolling(w).mean() - 1.0

    if ta is not None:
        try:
            d['rsi_14'] = ta.momentum.RSIIndicator(close=d['Close'], window=14).rsi()
            macd = ta.trend.MACD(close=d['Close'])
            d['macd'] = macd.macd()
            d['macd_signal'] = macd.macd_signal()
            d['macd_hist'] = macd.macd_diff()
        except Exception:
            d[['rsi_14','macd','macd_signal','macd_hist']] = np.nan
    else:
        d[['rsi_14','macd','macd_signal','macd_hist']] = np.nan

    d['vol_ret_1'] = d['Volume'].pct_change().replace([np.inf, -np.inf], np.nan)
    d['hl_spread'] = (d['High'] - d['Low']) / d['Close']
    d['y_target'] = d['log_ret_1'].shift(-1)

    return d.dropna()

df_feat = add_features_for_ticker(df_new)

# Step C: Apply scaler
X_full = df_feat[feature_cols].values
X_scaled = (X_full - scaler_mean) / scaler_scale

# Step D: Get last 'window' sequence
if len(X_scaled) < window:
    raise ValueError("Not enough data for window size")
last_seq = X_scaled[-window:].reshape(1, window, -1)

# Step E: Predict next 5 days (simulate)
predicted_prices = []
current_close = df_feat['Close'].iloc[-1]

seq = last_seq.copy()
for _ in range(5):  # next 5 days
    x_tensor = torch.from_numpy(seq).float().to(device)
    with torch.no_grad():
        pred_log_ret = model(x_tensor).item()  # predicted log return
    next_close = current_close * np.exp(pred_log_ret)  # convert log return to price
    predicted_prices.append(next_close)

    # Update seq for next prediction: shift left, append predicted feature row
    # For simplicity, append same features except 'Close' updated
    new_row = seq[0, -1, :].copy()
    # Replace normalized close-related features:
    new_row[feature_cols.index('log_ret_1')] = pred_log_ret  # naive approximation
    seq = np.roll(seq, -1, axis=1)
    seq[0, -1, :] = new_row

    current_close = next_close

predicted_prices

ValueError: Not enough data for window size