In [29]:
# This script installs all required libraries for data analysis, plotting, LLM workflows, and notebook imports.
# Note: The installation command is commented out to prevent accidental execution.
# --------------------------------------------------------------------------------

# Required Libraries:
# pandas: Data manipulation and analysis
# numpy: Numerical computations
# matplotlib: Data visualization
# yfinance: Downloading financial data from Yahoo Finance
# langchain: Building LLM-powered applications and chains
# import_ipynb: Importing Jupyter notebooks as Python modules
# scipy: Scientific computing (e.g., signal processing)
# statsmodels: Statistical modeling and time series analysis
# xgboost: Gradient boosting for machine learning
# selenium: Web scraping and browser automation
# webdriver_manager: Managing browser drivers for Selenium
# transformers: State-of-the-art NLP models
# peft: Parameter-efficient fine-tuning for transformers
# accelerate: Optimizing training and inference of models
# bitsandbytes: Efficient training of large models with 8-bit optimizers
# tensorflow: Deep learning framework
# torch: PyTorch deep learning framework
# tensorboard: Visualization tool for TensorFlow and PyTorch
# scikit-learn: Machine learning library for Python (version 1.6.1)

# Install all required libraries
#%pip install -U tensorflow pandas torch tensorboard numpy matplotlib yfinance langchain import_ipynb scipy statsmodels xgboost selenium webdriver_manager transformers peft accelerate bitsandbytes
#%pip install scikit-learn==1.6.1

In [None]:
import os
# -------------------------------------------------------------------------
#  LangChain Imports
# -------------------------------------------------------------------------
import datetime
#from langchain.chains import SequentialChain, LLMChain
#from langchain.prompts import PromptTemplate
#from langchain.llms import OpenAI  # Replace with any LLM provider
#from langchain.output_parsers import RegexParser
# -------------------------------------------------------------------------
# Other Imports
# -------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from datetime import datetime, timedelta
from torch.utils.data import Dataset
import statsmodels.api as sm
import torch.nn as nn
import torch.optim as optim
import torch
# -------------------------------------------------------------------------
#  Custom Imports
from modules.modules import SetTransformer, VariableSetDataset
from modules.functions import *
# -------------------------------------------------------------------------
#  Web Scraping Imports
# -------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
# -------------------------------------------------------------------------

In [31]:
class GoldPriceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])

In [32]:
def load_checkpoint(filepath, model, device='cpu'):
    checkpoint = torch.load(filepath, map_location=device, weights_only=False)  # Set weights_only=False
    model.load_state_dict(checkpoint['model_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return epoch, loss

In [33]:
def predict_next_day_gold_price_arimax(df: pd.DataFrame, model) -> float:
    """
    Predict next day's gold price using ARIMAX with technical indicators.

    Returns:
        tuple: (next_day_price)
    """
    exog_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    for col in exog_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    df = df[['Close'] + exog_cols].dropna()
    df = df.asfreq('B')
    df.ffill(inplace=True)

    y = df['Close']
    exog = df[exog_cols]

    # -------------------------------
    # Forecast Next Price
    # -------------------------------
    next_exog = exog.iloc[[-1]].values
    predicted_price = model.forecast(steps=1, exog=next_exog).iloc[0]

    return predicted_price


In [34]:
def predict_next_day_gold_price_xgboost(gold: pd.DataFrame, model) -> float:
    """
    Predict next day's gold price using XGBoost with technical indicators.
    Returns:
        tuple: (next_day_price)
    """

    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    gold_clean = gold[['Close'] + feature_cols].copy().dropna()
    gold_clean['Close_pct_change_1'] = gold_clean['Close'].pct_change(1)
    gold_clean['Close_pct_change_2'] = gold_clean['Close'].pct_change(2)
    gold_clean['Close_pct_change_3'] = gold_clean['Close'].pct_change(3)
    gold_clean['Close_rolling_std_5'] = gold_clean['Close'].rolling(5).std()
    gold_clean['Close_rolling_std_10'] = gold_clean['Close'].rolling(10).std()
    gold_clean['Close_vs_MA5'] = (gold_clean['Close'] - gold_clean['MA_5']) / gold_clean['MA_5']
    gold_clean['Close_vs_MA20'] = (gold_clean['Close'] - gold_clean['MA_20']) / gold_clean['MA_20']
    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20']
    gold_clean = gold_clean.dropna()
    gold_clean['Target_pct_change'] = gold_clean['Close'].pct_change().shift(-1)
    gold_clean['Target_price'] = gold_clean['Close'].shift(-1)
    gold_clean = gold_clean.dropna()
    gold_clean = gold_clean[
        (np.isfinite(gold_clean['Target_pct_change'])) &
        (np.abs(gold_clean['Target_pct_change']) < 1.0)
    ]

    # Predict next day
    latest_features = gold_clean[feature_cols_extended].iloc[[-1]]
    latest_price = gold_clean['Close'].iloc[-1]
    scaler = RobustScaler().fit(gold_clean[feature_cols_extended])
    latest_scaled = scaler.transform(latest_features)
    next_day_pct_change = model.predict(latest_scaled)[0]
    next_day_price = latest_price * (1 + next_day_pct_change)

    return next_day_price

In [35]:
def predict_next_day_gold_price_rf(gold: pd.DataFrame, model) -> float:
    """
    Predict next day's gold price using Random Forest with enhanced features.
    Saves model daily and loads if already exists. Returns price, model, and percentage change.
    """

    # Feature Engineering
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    gold_clean = gold[['Close'] + feature_cols].copy()
    gold_clean = gold_clean.dropna()
    gold_clean['Close_pct_change_1'] = gold_clean['Close'].pct_change(1)
    gold_clean['Close_pct_change_2'] = gold_clean['Close'].pct_change(2)
    gold_clean['Close_pct_change_3'] = gold_clean['Close'].pct_change(3)
    gold_clean['Close_rolling_std_5'] = gold_clean['Close'].rolling(5).std()
    gold_clean['Close_rolling_std_10'] = gold_clean['Close'].rolling(10).std()
    gold_clean['Close_vs_MA5'] = (gold_clean['Close'] - gold_clean['MA_5']) / gold_clean['MA_5']
    gold_clean['Close_vs_MA20'] = (gold_clean['Close'] - gold_clean['MA_20']) / gold_clean['MA_20']
    gold_clean['Price_momentum_3'] = gold_clean['Close'] / gold_clean['Close'].shift(3) - 1
    gold_clean['Price_momentum_5'] = gold_clean['Close'] / gold_clean['Close'].shift(5) - 1

    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20',
        'Price_momentum_3', 'Price_momentum_5']

    gold_clean.dropna(inplace=True)
    gold_clean['Target_pct_change'] = gold_clean['Close'].pct_change().shift(-1)
    gold_clean['Target_price'] = gold_clean['Close'].shift(-1)
    gold_clean.dropna(inplace=True)
    gold_clean = gold_clean[(np.abs(gold_clean['Target_pct_change']) < 1.0)]

    # Predict next day
    latest_features = gold_clean[feature_cols_extended].iloc[[-1]]
    latest_price = gold_clean['Close'].iloc[-1]
    scaler = RobustScaler().fit(gold_clean[feature_cols_extended])
    latest_scaled = scaler.transform(latest_features)
    next_day_pct_change = model.predict(latest_scaled)[0]
    next_day_price = latest_price * (1 + next_day_pct_change)

    return next_day_price


In [36]:
def predict_next_day_gold_price_lstm(gold: pd.DataFrame, model=None) -> float:
    
    sequence_length = 10 ## Based on the model's training sequence length
    
    # -------------------------------
    # Feature Setup
    # -------------------------------
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    gold = gold[['Close'] + feature_cols].dropna()
    gold = gold.asfreq('B')
    gold.ffill(inplace=True)
    gold['Target'] = gold['Close'].shift(-1)
    gold.dropna(inplace=True)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(gold[feature_cols])
    y_scaled = scaler.fit_transform(gold[['Target']])

    X_seq, y_seq = [], []
    for i in range(len(X_scaled) - sequence_length):
        X_seq.append(X_scaled[i:i + sequence_length])
        y_seq.append(y_scaled[i + sequence_length])

    X_seq = np.array(X_seq)
    y_seq = np.array(y_seq)

    # -------------------------------
    # Forecast
    # -------------------------------
    model.eval()
    last_seq = torch.tensor(X_scaled[-sequence_length:], dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        next_pred = model(last_seq).cpu().numpy()

    predicted_price = scaler.inverse_transform(
        np.concatenate([np.zeros((1, len(feature_cols))), next_pred], axis=1)
    )[:, -1][0]

    return predicted_price

In [37]:
def predict_next_day_gold_price_ensemble(
    ensemble_model: dict,
    arimax_pred: float,
    xgb_pred: float,
    rf_pred: float,
    lstm_pred: float,
    llm_pred: float
):
    """
    Use a pre-trained ensemble model (results hash) and new predictions to create a results hash.
    Only updates the 'individual_predictions' and recalculates all ensemble outputs.
    """
    # Extract weights and metadata from the loaded ensemble model
    weights_used = ensemble_model.get('weights_used', {})
    meta_weights = weights_used.get('meta_weights', {
        'simple': 0.1, 'weighted': 0.25, 'sentiment': 0.2, 'volatility': 0.25, 'trend': 0.2
    })
    norm_weights = weights_used.get('normalized_weights', {
        'arimax': 0.15, 'xgboost': 0.25, 'rf': 0.20, 'lstm': 0.20, 'llm': 0.20
    })
    vol_weights = weights_used.get('volatility_weights', {
        'arimax': 0.25, 'xgboost': 0.20, 'rf': 0.20, 'lstm': 0.15, 'llm': 0.20
    })
    trend_weights = weights_used.get('trend_weights', {
        'arimax': 0.15, 'xgboost': 0.25, 'rf': 0.20, 'lstm': 0.20, 'llm': 0.20
    })

    # Extract metadata for calculation
    metadata = ensemble_model.get('metadata', {})
    current_price = metadata.get('current_price', 0)
    current_sentiment = metadata.get('current_sentiment', 0)

    model_names = ['arimax', 'xgboost', 'rf', 'lstm', 'llm']
    model_preds = [arimax_pred, xgb_pred, rf_pred, lstm_pred, llm_pred]

    # 1. Simple average
    simple_avg = np.mean(model_preds)

    # 2. Weighted average (sentiment-boosted)
    weighted_avg = sum(norm_weights[k] * p for k, p in zip(model_names, model_preds))

    # 3. Sentiment-adjusted
    sentiment_factor = 1 + 0.02 * current_sentiment if abs(current_sentiment) > 0.1 else 1.0
    sentiment_adjusted = weighted_avg * sentiment_factor

    # 4. Volatility-weighted
    volatility_weighted = sum(vol_weights[k] * p for k, p in zip(model_names, model_preds))

    # 5. Trend-following
    trend_following = sum(trend_weights[k] * p for k, p in zip(model_names, model_preds))

    # 6. Meta-Ensemble
    meta_ensemble = (
        meta_weights['simple'] * simple_avg +
        meta_weights['weighted'] * weighted_avg +
        meta_weights['sentiment'] * sentiment_adjusted +
        meta_weights['volatility'] * volatility_weighted +
        meta_weights['trend'] * trend_following
    )

    pct_changes = {
        'simple_avg': (simple_avg - current_price) / current_price * 100 if current_price else 0,
        'weighted_avg': (weighted_avg - current_price) / current_price * 100 if current_price else 0,
        'sentiment_adjusted': (sentiment_adjusted - current_price) / current_price * 100 if current_price else 0,
        'volatility_weighted': (volatility_weighted - current_price) / current_price * 100 if current_price else 0,
        'trend_following': (trend_following - current_price) / current_price * 100 if current_price else 0,
        'meta_ensemble': (meta_ensemble - current_price) / current_price * 100 if current_price else 0
    }

    results = {
        'predictions': {
            'simple_average': simple_avg,
            'weighted_average': weighted_avg,
            'sentiment_adjusted': sentiment_adjusted,
            'volatility_weighted': volatility_weighted,
            'trend_following': trend_following,
            'meta_ensemble': meta_ensemble
        },
        'percentage_changes': pct_changes,
        'weights_used': weights_used,
        'metadata': metadata,
        'model_info': {
            **ensemble_model.get('model_info', {}),
            'created_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'individual_predictions': {
                'arimax': arimax_pred,
                'xgboost': xgb_pred,
                'random_forest': rf_pred,
                'lstm': lstm_pred,
                'llm': llm_pred
            }
        }
    }
    return results


In [None]:
def get_latest_bullionvault_articles(URL="https://www.bullionvault.com/gold-news"):
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, 'html.parser')
    latest=soup.find(id='views-bootstrap-grid-1').find_all(class_='field-content')
    list_data = []
    for item in latest:
        date=item.find(class_='views-field-created')
        if not date:
            continue
        link=item.find(class_='views-field-title').find('a')['href']
        page_response = requests.get(link)
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        content = page_soup.find('div', class_='field field-name-body field-type-text-with-summary field-label-hidden')
        title = page_soup.find('h1').text.strip()
        content_text = content.text.strip() if content else ''
        data_point = {'Date': date.text.strip() if date else 'N/A', 'Content': title + ':' + content_text}
        list_data.append(data_point)
    list_df=pd.DataFrame(list_data)
    list_df['Date']= pd.to_datetime(list_df['Date'],errors='coerce').dt.date
    return list_df

def yf_extract_info(item):
    link=item.find('a',class_='subtle-link')['href']
    title=item.find('a',class_='subtle-link')['title']

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration

    page_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    page_driver.get(link)

    page_soup = BeautifulSoup(page_driver.page_source, 'html.parser')
    content = page_soup.find('div', class_='body')
    content_text = content.text.strip() if content else ''
    date= page_soup.find('div', class_= lambda c: c and c.startswith("byline")).find('time')
    data_point = {'Date': date.text.strip() if date else 'N/A', 'Content': title + ':' + content_text}
    page_driver.quit()
    return data_point

def get_latest_yf_articles(URL="https://finance.yahoo.com/news/"):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get(URL)
    time.sleep(2)
    last_height = driver.execute_script("return document.body.scrollHeight")
    count = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        count += 1
        if count > 1:
            break
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all('li', class_='story-item')
    list_data = []
    #print(f"Found {len(articles)} articles on Yahoo Finance.")
    for article in articles:
        try:
            list_data.append(yf_extract_info(article))
        except Exception as e:
            continue
    driver.quit()
    list_df=pd.DataFrame(list_data)
    list_df['Date'].dropna(inplace=True)
    #list_df['Date']= pd.to_datetime(list_df['Date'],errors='coerce').dt.date
    return list_df

def get_reuters_article_text(item, base_URL="https://www.reuters.com"):
    title = item.get_text(strip=True)
    link = item.find('a', href=True)['href']
    if not link.startswith('http'):
        link = base_URL + link
    chrome_options = Options()
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(link)
    time.sleep(2)
    page_soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    article = page_soup.find_all("div", class_=lambda c: c and c.startswith("article-body__paragraph"))
    article_text = ""
    for para in article:
        paragraph_text = para.get_text(strip=True)
        article_text = article_text + "." + paragraph_text
    date = page_soup.find("span", class_=lambda c: c and c.startswith("date-line__date")).get_text(strip=True)
    data_point = {'Date': date, 'Content': title + ':' + article_text}
    return data_point

def get_reuters_articles_list(URL):
    chrome_options = Options()
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(URL)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    articles=soup.find_all("div", class_=lambda c: c and c.startswith("story-card__area-headline"))
    list_data = []
    for article in articles:
        try:
            list_data.append(get_reuters_article_text(article))
        except Exception as e:
            continue
    list_df=pd.DataFrame(list_data)
    list_df['Date']= pd.to_datetime(list_df['Date'],errors='coerce').dt.date
    return list_df

def get_reuters_articles():
    base_URL="https://www.reuters.com"
    search_query="/site-search/?query=gold"
    df = pd.DataFrame(columns=['Date', 'Content'])
    for section_val in ['all']:
        for offset_nb in range(0, 40, 20):
            offset =f"&offset={offset_nb}"
            section=f"&section={section_val}"
            URL = base_URL + search_query + offset + section
            try:
                df_latest=get_reuters_articles_list(URL)
                df = pd.concat([df, df_latest], ignore_index=True)
            except Exception as e:
                print(f"Error fetching articles from {URL}: {e}")
                continue
    return df

In [38]:
# =========================================================================
# Step 1: Check if today's model exists
# =========================================================================
today = datetime.now().strftime("%Y-%m-%d")
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
today = yesterday
next_day = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")

In [None]:
## TODO [Jaison]: Replace this with today's gold price Scraping. 


# Load Datasets for Time Series Models and News LLM Model
# ---------------------------------------------------------------------------

input_data = "data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv"
df = pd.read_csv(input_data, index_col=0, parse_dates=True)
gold_today = df.iloc[[-1]]
print(f"Data \n\n{gold_today.head()}\n\n")

# ---------------------------------------------------------------------------


Data 

                 Open       High        Low      Close  Volume   Returns  \
Date                                                                       
2025-06-20  82.790001  82.790001  82.029999  82.230003  550293 -0.006524   

              MA_5   MA_20    MA_50  Volatility  ...   BB_lower  BB_width  \
Date                                             ...                        
2025-06-20  82.692  81.167  79.9768    0.007811  ...  78.625844  5.082312   

            BB_position      MACD  MACD_Signal  MACD_Hist  Momentum_10  \
Date                                                                     
2025-06-20     0.709157  0.943162     0.861658   0.081504     1.030006   

              ROC_10  Sentiment  Sentiment_Label  
Date                                              
2025-06-20  0.012685      -0.75         negative  

[1 rows x 22 columns]


Number of rows in df: (6491, 6)
Max articles per day: 22
Encodings shape: (1648, 22, 512), Price changes shape: (1648, 1), Masks sh

In [None]:
## TODO [Yaswanth] : Replace this with today's news articles scraping.

def extract_news_data():
    bullion_df = get_latest_bullionvault_articles()
    yf_df=get_latest_yf_articles()
    yf_df['Date']=pd.to_datetime(yf_df['Date'],errors='coerce').dt.date
    reuters_df = get_reuters_articles()
    three_days_ago = pd.to_datetime('today').date() - timedelta(days=3)

    df_combined = pd.concat([bullion_df, yf_df, reuters_df], ignore_index=True)
    df_combined = df_combined.sort_values(by='Date')
    df_combined=df_combined[df_combined['Date'] >= three_days_ago]
    # Placeholder for actual news data extraction logic
    return df_combined

## TODO [Tejashwini] : cleaning script for scraped news data.


In [None]:
## TODO [Adithya] : Insert Topic extraction model here.
## Extract the encodings.
import tensorflow_hub as hub

## Load the Universal Sentence Encoder model
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #This is around 1 GB in size, it took a while for me to run this.
embed = hub.load(model_url)

# Generate embeddings
embeddings = embed(sentences)


In [None]:
## TODO [Mohan]: Integrate sentiment extraction model here



In [None]:

## TODO [Tarun] : Replace the file read with values produced by previous members in the chain.

news_llm_model_data = pd.read_pickle('data/combined_dataset_with_price_change.pkl')
print("Number of rows in df:",news_llm_model_data.shape)
news_llm_model_data.head()

#Group input data into sets for use in model.
encodings, price_changes, masks = group_into_variable_sets(news_llm_model_data)
print(f"Encodings shape: {encodings.shape}, Price changes shape: {price_changes.shape}, Masks shape: {masks.shape}")

# Create the dataset
dataset = VariableSetDataset(encodings, price_changes, masks)
print(f"Dataset {dataset}")

In [40]:
# Get device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [41]:
# Run Time Series Models and get predictions
# --------------------------------------------------------------------
# Load pre-trained models
arimax_model = sm.load_pickle(f'models/arimax_2025-06-20.pkl')
random_forest_model = sm.load_pickle(f'models/random_forest_2025-06-20.pkl')
xgboost_model = sm.load_pickle(f'models/xgboost_2025-06-20.pkl')

lstw_model = LSTMModel(input_size=11).to(device)
lstw_model.load_state_dict(torch.load('models/lstm_2025-06-20.pt', map_location=device))

# --------------------------------------------------------------------
# Predict gold price using all four models with gold_price_prediction where possible

# ARIMAX
predicted_price_arimax = predict_next_day_gold_price_arimax(df, arimax_model)

# Random Forest
predicted_price_rf = predict_next_day_gold_price_rf(df, random_forest_model)

# XGBoost
predicted_price_xgb = predict_next_day_gold_price_xgboost(df, xgboost_model)

# LSTM
predicted_price_lstw = predict_next_day_gold_price_lstm(df, lstw_model)

print(f"ARIMAX: Predicted gold price for {next_day}: {predicted_price_arimax}")

print(f"Random Forest: Predicted gold price for {next_day}: {predicted_price_rf}")

print(f"XGBoost: Predicted gold price for {next_day}: {predicted_price_xgb}")

print(f"LSTM: Predicted gold price for {next_day}: {predicted_price_lstw}")
# --------------------------------------------------------------------

ARIMAX: Predicted gold price for 2025-06-23: 82.2040508858499
Random Forest: Predicted gold price for 2025-06-23: 82.63219047144733
XGBoost: Predicted gold price for 2025-06-23: 83.2262346595262
LSTM: Predicted gold price for 2025-06-23: 82.97583439662782


In [42]:
# Load the pre-trained News LLM model & get predictions
# --------------------------------------------------------------------

news_model = SetTransformer(
    dim_input = 512, 
    num_outputs = 1, #One final prediction
    dim_output = 1, #1D output for price change
    num_inds=32, 
    dim_hidden=128, 
    num_heads=4, 
    ln=True #Layer normalization
    ).to(device)

# --------------------------------------------------------------------
# Load the pre-trained model weights
checkpoint_path = 'models/final_model.pth'
if os.path.exists(checkpoint_path):
    start_epoch, start_loss = load_checkpoint(checkpoint_path, news_model, device)
    print(f"Model loaded from {checkpoint_path} at epoch {start_epoch} with loss {start_loss:.4f}")
else:
    start_epoch, start_loss = 0, float('inf')
    print(f"No checkpoint found at {checkpoint_path}. Starting from scratch.")
# --------------------------------------------------------------------
predicted_price_news_llm = 80 # Placeholder for the actual prediction logic
#predicted_price_news_llm = news_model(news_llm_model_data)

Model loaded from models/final_model.pth at epoch 100 with loss 0.4647


In [43]:
ensemble_model = sm.load_pickle(f'models/ensemble_model_2025-06-20.pkl')
results = predict_next_day_gold_price_ensemble(
    ensemble_model,
    predicted_price_arimax,
    predicted_price_xgb,
    predicted_price_rf,
    predicted_price_lstw,
    predicted_price_news_llm,  # Placeholder for LLM prediction
)

print(f"Ensemble Model Results for {next_day}:")
print(f"Predicted Price: {results['predictions']['meta_ensemble']}")
print(f"Percentage Change: {results['percentage_changes']['meta_ensemble']:.2f}%")

Ensemble Model Results for 2025-06-23:
Predicted Price: 81.8896819508526
Percentage Change: -0.41%
