In [1]:
# This script installs all required libraries for data analysis, plotting, LLM workflows, and notebook imports.
# Note: The installation command is commented out to prevent accidental execution.
# --------------------------------------------------------------------------------

# Required Libraries:
# pandas: Data manipulation and analysis
# numpy: Numerical computations
# matplotlib: Data visualization
# yfinance: Downloading financial data from Yahoo Finance
# langchain: Building LLM-powered applications and chains
# import_ipynb: Importing Jupyter notebooks as Python modules
# scipy: Scientific computing (e.g., signal processing)
# statsmodels: Statistical modeling and time series analysis
# xgboost: Gradient boosting for machine learning
# selenium: Web scraping and browser automation
# webdriver_manager: Managing browser drivers for Selenium
# transformers: State-of-the-art NLP models
# peft: Parameter-efficient fine-tuning for transformers
# accelerate: Optimizing training and inference of models
# bitsandbytes: Efficient training of large models with 8-bit optimizers
# tensorflow: Deep learning framework
# torch: PyTorch deep learning framework
# tensorboard: Visualization tool for TensorFlow and PyTorch
# scikit-learn: Machine learning library for Python (version 1.6.1)

# Install all required libraries
#%pip install -U tensorflow pandas torch tensorboard numpy matplotlib yfinance langchain import_ipynb scipy statsmodels xgboost selenium webdriver_manager transformers peft accelerate bitsandbytes
#%pip install scikit-learn==1.6.1

In [2]:
import os
# -------------------------------------------------------------------------
#  LangChain Imports
# -------------------------------------------------------------------------
from langchain.chains import SequentialChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SequentialChain, TransformChain
# -------------------------------------------------------------------------
# Other Imports
# -------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from datetime import datetime, timedelta
import statsmodels.api as sm
import torch
# -------------------------------------------------------------------------
#  Custom Imports
from modules.modules import SetTransformer, LSTMModel, VariableSetDataset
from modules.functions import *




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
WORKAREA = os.getenv("WORKAREA", "D:/CAREER/IISC_B/Academics/Courses\SEM_3\DA_225o\Project\DL-7-25\Final")
# =========================================================================
# Get today's date and the next day in YYYY-MM-DD format
# =========================================================================
today = datetime.now().strftime("%Y-%m-%d")
next_day = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")
print(f"Today's date: {today}")
print(f"Next day's date: {next_day}")   

Today's date: 2025-06-23
Next day's date: 2025-06-24


In [4]:
def predict_next_day_gold_price_arimax(df: pd.DataFrame, model) -> float:
    """
    Predict next day's gold price using ARIMAX with technical indicators.

    Returns:
        tuple: (next_day_price)
    """
    exog_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    for col in exog_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    df = df[['Close'] + exog_cols].dropna()
    df = df.asfreq('B')
    df.ffill(inplace=True)

    y = df['Close']
    exog = df[exog_cols]

    # -------------------------------
    # Forecast Next Price
    # -------------------------------
    next_exog = exog.iloc[[-1]].values
    predicted_price = model.forecast(steps=1, exog=next_exog).iloc[0]

    return predicted_price


In [5]:
def predict_next_day_gold_price_xgboost(gold: pd.DataFrame, model) -> float:
    """
    Predict next day's gold price using XGBoost with technical indicators.
    Returns:
        tuple: (next_day_price)
    """

    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    gold_clean = gold[['Close'] + feature_cols].copy().dropna()
    gold_clean['Close_pct_change_1'] = gold_clean['Close'].pct_change(1)
    gold_clean['Close_pct_change_2'] = gold_clean['Close'].pct_change(2)
    gold_clean['Close_pct_change_3'] = gold_clean['Close'].pct_change(3)
    gold_clean['Close_rolling_std_5'] = gold_clean['Close'].rolling(5).std()
    gold_clean['Close_rolling_std_10'] = gold_clean['Close'].rolling(10).std()
    gold_clean['Close_vs_MA5'] = (gold_clean['Close'] - gold_clean['MA_5']) / gold_clean['MA_5']
    gold_clean['Close_vs_MA20'] = (gold_clean['Close'] - gold_clean['MA_20']) / gold_clean['MA_20']
    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20']
    gold_clean = gold_clean.dropna()
    gold_clean['Target_pct_change'] = gold_clean['Close'].pct_change().shift(-1)
    gold_clean['Target_price'] = gold_clean['Close'].shift(-1)
    gold_clean = gold_clean.dropna()
    gold_clean = gold_clean[
        (np.isfinite(gold_clean['Target_pct_change'])) &
        (np.abs(gold_clean['Target_pct_change']) < 1.0)
    ]

    # Predict next day
    latest_features = gold_clean[feature_cols_extended].iloc[[-1]]
    latest_price = gold_clean['Close'].iloc[-1]
    scaler = RobustScaler().fit(gold_clean[feature_cols_extended])
    latest_scaled = scaler.transform(latest_features)
    next_day_pct_change = model.predict(latest_scaled)[0]
    next_day_price = latest_price * (1 + next_day_pct_change)

    return next_day_price

In [6]:
def predict_next_day_gold_price_rf(gold: pd.DataFrame, model) -> float:
    """
    Predict next day's gold price using Random Forest with enhanced features.
    Saves model daily and loads if already exists. Returns price, model, and percentage change.
    """

    # Feature Engineering
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    gold_clean = gold[['Close'] + feature_cols].copy()
    gold_clean = gold_clean.dropna()
    gold_clean['Close_pct_change_1'] = gold_clean['Close'].pct_change(1)
    gold_clean['Close_pct_change_2'] = gold_clean['Close'].pct_change(2)
    gold_clean['Close_pct_change_3'] = gold_clean['Close'].pct_change(3)
    gold_clean['Close_rolling_std_5'] = gold_clean['Close'].rolling(5).std()
    gold_clean['Close_rolling_std_10'] = gold_clean['Close'].rolling(10).std()
    gold_clean['Close_vs_MA5'] = (gold_clean['Close'] - gold_clean['MA_5']) / gold_clean['MA_5']
    gold_clean['Close_vs_MA20'] = (gold_clean['Close'] - gold_clean['MA_20']) / gold_clean['MA_20']
    gold_clean['Price_momentum_3'] = gold_clean['Close'] / gold_clean['Close'].shift(3) - 1
    gold_clean['Price_momentum_5'] = gold_clean['Close'] / gold_clean['Close'].shift(5) - 1

    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20',
        'Price_momentum_3', 'Price_momentum_5']

    gold_clean.dropna(inplace=True)
    gold_clean['Target_pct_change'] = gold_clean['Close'].pct_change().shift(-1)
    gold_clean['Target_price'] = gold_clean['Close'].shift(-1)
    gold_clean.dropna(inplace=True)
    gold_clean = gold_clean[(np.abs(gold_clean['Target_pct_change']) < 1.0)]

    # Predict next day
    latest_features = gold_clean[feature_cols_extended].iloc[[-1]]
    latest_price = gold_clean['Close'].iloc[-1]
    scaler = RobustScaler().fit(gold_clean[feature_cols_extended])
    latest_scaled = scaler.transform(latest_features)
    next_day_pct_change = model.predict(latest_scaled)[0]
    next_day_price = latest_price * (1 + next_day_pct_change)

    return next_day_price


In [7]:
def predict_next_day_gold_price_lstm(gold: pd.DataFrame, model=None) -> float:
    
    sequence_length = 10 ## Based on the model's training sequence length
    
    # -------------------------------
    # Feature Setup
    # -------------------------------
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    gold = gold[['Close'] + feature_cols].dropna()
    gold = gold.asfreq('B')
    gold.ffill(inplace=True)
    gold['Target'] = gold['Close'].shift(-1)
    gold.dropna(inplace=True)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(gold[feature_cols])
    y_scaled = scaler.fit_transform(gold[['Target']])

    X_seq, y_seq = [], []
    for i in range(len(X_scaled) - sequence_length):
        X_seq.append(X_scaled[i:i + sequence_length])
        y_seq.append(y_scaled[i + sequence_length])

    X_seq = np.array(X_seq)
    y_seq = np.array(y_seq)

    # -------------------------------
    # Forecast
    # -------------------------------
    model.eval()
    last_seq = torch.tensor(X_scaled[-sequence_length:], dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        next_pred = model(last_seq).cpu().numpy()

    predicted_price = scaler.inverse_transform(
        np.concatenate([np.zeros((1, len(feature_cols))), next_pred], axis=1)
    )[:, -1][0]

    return predicted_price

In [8]:
def predict_next_day_gold_price_ensemble(
    ensemble_model: dict,
    arimax_pred: float,
    xgb_pred: float,
    rf_pred: float,
    lstm_pred: float,
    llm_pred: float
):
    """
    Use a pre-trained ensemble model (results hash) and new predictions to create a results hash.
    Only updates the 'individual_predictions' and recalculates all ensemble outputs.
    """
    # Extract weights and metadata from the loaded ensemble model
    weights_used = ensemble_model.get('weights_used', {})
    meta_weights = weights_used.get('meta_weights', {
        'simple': 0.1, 'weighted': 0.25, 'sentiment': 0.2, 'volatility': 0.25, 'trend': 0.2
    })
    norm_weights = weights_used.get('normalized_weights', {
        'arimax': 0.15, 'xgboost': 0.25, 'rf': 0.20, 'lstm': 0.20, 'llm': 0.20
    })
    vol_weights = weights_used.get('volatility_weights', {
        'arimax': 0.25, 'xgboost': 0.20, 'rf': 0.20, 'lstm': 0.15, 'llm': 0.20
    })
    trend_weights = weights_used.get('trend_weights', {
        'arimax': 0.15, 'xgboost': 0.25, 'rf': 0.20, 'lstm': 0.20, 'llm': 0.20
    })

    # Extract metadata for calculation
    metadata = ensemble_model.get('metadata', {})
    current_price = metadata.get('current_price', 0)
    current_sentiment = metadata.get('current_sentiment', 0)

    model_names = ['arimax', 'xgboost', 'rf', 'lstm', 'llm']
    model_preds = [arimax_pred, xgb_pred, rf_pred, lstm_pred, llm_pred]

    # 1. Simple average
    simple_avg = np.mean(model_preds)

    # 2. Weighted average (sentiment-boosted)
    weighted_avg = sum(norm_weights[k] * p for k, p in zip(model_names, model_preds))

    # 3. Sentiment-adjusted
    sentiment_factor = 1 + 0.02 * current_sentiment if abs(current_sentiment) > 0.1 else 1.0
    sentiment_adjusted = weighted_avg * sentiment_factor

    # 4. Volatility-weighted
    volatility_weighted = sum(vol_weights[k] * p for k, p in zip(model_names, model_preds))

    # 5. Trend-following
    trend_following = sum(trend_weights[k] * p for k, p in zip(model_names, model_preds))

    # 6. Meta-Ensemble
    meta_ensemble = (
        meta_weights['simple'] * simple_avg +
        meta_weights['weighted'] * weighted_avg +
        meta_weights['sentiment'] * sentiment_adjusted +
        meta_weights['volatility'] * volatility_weighted +
        meta_weights['trend'] * trend_following
    )

    pct_changes = {
        'simple_avg': (simple_avg - current_price) / current_price * 100 if current_price else 0,
        'weighted_avg': (weighted_avg - current_price) / current_price * 100 if current_price else 0,
        'sentiment_adjusted': (sentiment_adjusted - current_price) / current_price * 100 if current_price else 0,
        'volatility_weighted': (volatility_weighted - current_price) / current_price * 100 if current_price else 0,
        'trend_following': (trend_following - current_price) / current_price * 100 if current_price else 0,
        'meta_ensemble': (meta_ensemble - current_price) / current_price * 100 if current_price else 0
    }

    results = {
        'predictions': {
            'simple_average': simple_avg,
            'weighted_average': weighted_avg,
            'sentiment_adjusted': sentiment_adjusted,
            'volatility_weighted': volatility_weighted,
            'trend_following': trend_following,
            'meta_ensemble': meta_ensemble
        },
        'percentage_changes': pct_changes,
        'weights_used': weights_used,
        'metadata': metadata,
        'model_info': {
            **ensemble_model.get('model_info', {}),
            'created_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'individual_predictions': {
                'arimax': arimax_pred,
                'xgboost': xgb_pred,
                'random_forest': rf_pred,
                'lstm': lstm_pred,
                'llm': llm_pred
            }
        }
    }
    return results


In [19]:
start = datetime(2010, 1, 1)
end = datetime(2026, 1, 1)

# Download gold prices
gold_prices_csv = os.path.join(WORKAREA, "Tarun/data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv")
if os.path.exists(gold_prices_csv):
    gold = pd.read_csv(gold_prices_csv, parse_dates=['Date'], index_col='Date')
else:
    gold = generate_sentiment_from_trend_with_labels(add_technical_indicators(download_gold_prices(start, end)))

current_price = gold['Close'].iloc[-1]
print(f"Current Gold Price: {current_price}")

# Prepare input dictionary (ensure these variables are defined in your notebook)
# --------------------------------------------------------------------
# Load pre-trained models
# --------------------------------------------------------------------
# # Get device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

lstw_model           = LSTMModel(input_size=11).to(device)
lstw_model.load_state_dict(torch.load(f'{WORKAREA}/Jaison/Main_Code/Model/LSTM/lstm_{today}.pt', map_location=device))
arimax_model         = sm.load_pickle(f'{WORKAREA}/Jaison/Main_Code/Model/Arimax/arimax_{today}.pkl')
random_forest_model  = sm.load_pickle(f'{WORKAREA}/Jaison/Main_Code/Model/RandomForest/random_forest_{today}.pkl')
xgboost_model        = sm.load_pickle(f'{WORKAREA}/Jaison/Main_Code/Model/XGBoost/xgboost_{today}.pkl')
ensemble_model       = sm.load_pickle(f'{WORKAREA}/Tarun/Model/Final_Ensemble/ensemble_model_{today}.pkl')
news_model_path           = os.path.join(WORKAREA, "/Tarun/Model/final_model.pth")

Current Gold Price: 82.2300033569336
Using device: cpu


In [9]:
## TODO [Yaswanth] : Replace this with today's news articles scraping.

## TODO [Tejashwini] : cleaning script for scraped news data.


In [12]:
## TODO [Mohan]: Integrate sentiment extraction model here
# UI Based inputs + 



In [None]:
## TODO [Adithya] : Insert Topic extraction model here.


# 2. Define a function to generate dummy input (replace with real input later)
def generate_dummy_news_input(device):
    encodings = torch.tensor(np.random.rand(1, 10, 512).astype(np.float32), dtype=torch.float32).to(device)
    mask = torch.tensor(np.ones((1, 10)).astype(np.float32), dtype=torch.float32).to(device)
    return encodings, mask

In [None]:
# 1. Define a function to load the SetTransformer model and weights
def load_news_llm_model( device, model_path):
    news_model = SetTransformer(
        dim_input=512,
        num_outputs=1,
        dim_output=1,
        num_inds=32,
        dim_hidden=128,
        num_heads=4,
        ln=True
    ).to(device)
    if os.path.exists(model_path):
        news_model, _ = load_checkpoint(model_path, news_model, device)
    return news_model

# 3. Define the transform function for the TransformChain
def news_llm_transform(inputs):
    news_model = load_news_llm_model(inputs["device"],inputs["model_path"])
    encodings, mask = generate_dummy_news_input(inputs["device"])
    with torch.no_grad():
        pred = news_model(encodings, mask=mask)
        if hasattr(pred, "item"):
            pred = pred.item()
    predicted_price = inputs["current_price"] * (1 + pred)
    return {"predicted_price_news_llm": predicted_price}

In [21]:
from langchain.chains import SequentialChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SequentialChain, TransformChain

# --------------------------------------------------------------------
# Define prompt templates for each model prediction step
# Define TransformChains for each model prediction step using the existing functions
arimax_chain = TransformChain(
    input_variables=["df", "arimax_model"],
    output_variables=["predicted_price_arimax"],
    transform=lambda inputs: {
        "predicted_price_arimax": predict_next_day_gold_price_arimax(inputs["df"], inputs["arimax_model"])
    }
)

rf_chain = TransformChain(
    input_variables=["df", "random_forest_model"],
    output_variables=["predicted_price_rf"],
    transform=lambda inputs: {
        "predicted_price_rf": predict_next_day_gold_price_rf(inputs["df"], inputs["random_forest_model"])
    }
)

xgb_chain = TransformChain(
    input_variables=["df", "xgboost_model"],
    output_variables=["predicted_price_xgb"],
    transform=lambda inputs: {
        "predicted_price_xgb": predict_next_day_gold_price_xgboost(inputs["df"], inputs["xgboost_model"])
    }
)

lstm_chain = TransformChain(
    input_variables=["df", "lstw_model"],
    output_variables=["predicted_price_lstw"],
    transform=lambda inputs: {
        "predicted_price_lstw": predict_next_day_gold_price_lstm(inputs["df"], inputs["lstw_model"])
    }
)

news_llm_chain = TransformChain(
    input_variables=["current_price", "device", "model_path"],
    output_variables=["predicted_price_news_llm"],
    transform=news_llm_transform
)

# Define the ensemble prediction as a TransformChain
ensemble_chain = TransformChain(
    input_variables=[
        "ensemble_model",
        "predicted_price_arimax",
        "predicted_price_xgb",
        "predicted_price_rf",
        "predicted_price_lstw",
        "predicted_price_news_llm"
    ],
    output_variables=["ensemble_results"],
    transform=lambda inputs: {
        "ensemble_results": predict_next_day_gold_price_ensemble(
            inputs["ensemble_model"],
            inputs["predicted_price_arimax"],
            inputs["predicted_price_xgb"],
            inputs["predicted_price_rf"],
            inputs["predicted_price_lstw"],
            inputs["predicted_price_news_llm"]
        )
    }
)

# Orchestrate the workflow with SequentialChain



# General inputs for the sequence
general_inputs = {
    "df": gold,
    "current_price": current_price,
    "device": device
}

# Prepare input dictionary for the time series models
ts_inputs = {
    "arimax_model": arimax_model,
    "random_forest_model": random_forest_model,
    "xgboost_model": xgboost_model,
    "lstw_model": lstw_model
}

# Prepare input dictionary for the news LLM chain
news_llm_inputs = {
    "model_path": news_model_path
}

# ensemble inputs
emsemble_inputs = {
    "ensemble_model": ensemble_model,
}

full_inputs = {
    **general_inputs,
    **ts_inputs,
    **news_llm_inputs,
    **emsemble_inputs
}

# Compose the full sequence
full_seq_chain = SequentialChain(
    chains=[arimax_chain, rf_chain, xgb_chain, lstm_chain, news_llm_chain, ensemble_chain],
    input_variables=[
        "current_price", "device", "df",
        "model_path", "arimax_model", "random_forest_model", "xgboost_model", "lstw_model",
        "ensemble_model", 
    ],
    output_variables=[
        "predicted_price_arimax", "predicted_price_rf", "predicted_price_xgb", "predicted_price_lstw",
        "predicted_price_news_llm", 
        "ensemble_results"
    ]
)

# Run the orchestrated sequence
results = full_seq_chain(full_inputs)
ensemble_results = final_results["ensemble_results"]

print("---------------------------------------------------")
print(f"Current Gold Price: {current_price}")
print("---------------------------------------------------")
print(f"Predictions for next day: {next_day}")
print("---------------------------------------------------")
print(f"ARIMAX: Predicted gold price: {results['predicted_price_arimax']}")
print(f"Random Forest: Predicted gold price: {results['predicted_price_rf']}")
print(f"XGBoost: Predicted gold price: {results['predicted_price_xgb']}")
print(f"LSTM: Predicted gold price: {results['predicted_price_lstw']}")
print(f"News LLM: Predicted gold price: {results['predicted_price_news_llm']}")
print("---------------------------------------------------")
print("Ensemble Model Results:")
print("---------------------------------------------------")
print(f"Predicted Price: {ensemble_results['predictions']['meta_ensemble']}")
print(f"Percentage Change: {ensemble_results['percentage_changes']['meta_ensemble']:.2f}%")

---------------------------------------------------
Current Gold Price: 82.2300033569336
---------------------------------------------------
Predictions for next day: 2025-06-24
---------------------------------------------------
ARIMAX: Predicted gold price: 82.31355078193063
Random Forest: Predicted gold price: 82.63219047144733
XGBoost: Predicted gold price: 83.17499557521478
LSTM: Predicted gold price: 81.82770381879871
News LLM: Predicted gold price: 78.56566835144264
---------------------------------------------------
Ensemble Model Results:
---------------------------------------------------
Predicted Price: 88.95519132780538
Percentage Change: 8.18%
