In [1]:
# This script installs all required libraries for data analysis, plotting, LLM workflows, and notebook imports.
# Note: The installation command is commented out to prevent accidental execution.
# --------------------------------------------------------------------------------

# Required Libraries:
# pandas: Data manipulation and analysis
# numpy: Numerical computations
# matplotlib: Data visualization
# yfinance: Downloading financial data from Yahoo Finance
# langchain: Building LLM-powered applications and chains
# import_ipynb: Importing Jupyter notebooks as Python modules
# scipy: Scientific computing (e.g., signal processing)
# statsmodels: Statistical modeling and time series analysis
# xgboost: Gradient boosting for machine learning
# selenium: Web scraping and browser automation
# webdriver_manager: Managing browser drivers for Selenium
# transformers: State-of-the-art NLP models
# peft: Parameter-efficient fine-tuning for transformers
# accelerate: Optimizing training and inference of models
# bitsandbytes: Efficient training of large models with 8-bit optimizers
# tensorflow: Deep learning framework
# torch: PyTorch deep learning framework
# tensorboard: Visualization tool for TensorFlow and PyTorch
# scikit-learn: Machine learning library for Python (version 1.6.1)

# Install all required libraries
# %pip install -U tensorflow pandas torch tensorboard numpy matplotlib yfinance langchain import_ipynb scipy statsmodels xgboost selenium webdriver_manager transformers peft accelerate bitsandbytes
# %pip install scikit-learn==1.6.1
# %pip install tensorflow-hub
# %pip install "numpy<2.0"
# %pip install --upgrade numpy
# %pip install gradio

In [2]:
import os
# -------------------------------------------------------------------------
#  LangChain Imports
# -------------------------------------------------------------------------
from langchain.chains import SequentialChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SequentialChain, TransformChain
# -------------------------------------------------------------------------
# Other Imports
# -------------------------------------------------------------------------
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from datetime import datetime, timedelta
import statsmodels.api as sm
import torch
# -------------------------------------------------------------------------
#  Custom Imports ## It internally imports the modules & functions
# -------------------------------------------------------------------------
from modules.model_run_functions_old import *




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
WORKAREA = "D:/CAREER/IISC_B/Academics/Courses/SEM_3/DA_225o/Project/DL-7-25/Final"
# =========================================================================
# Get today's date and the next day in YYYY-MM-DD format
# =========================================================================
today = datetime.now().strftime("%Y-%m-%d")
next_day = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")
print(f"Today's date: {today}")
print(f"Next day's date: {next_day}")

Today's date: 2025-06-23
Next day's date: 2025-06-24


In [4]:
start = datetime(2010, 1, 1)
end = datetime(2026, 1, 1)

## TODO: update news_data_raw to correct path
news_data_raw       = f"{WORKAREA}/Tarun/data/bullionvault_articles.csv"
news_data_csv       = f"{WORKAREA}/Tarun/data/news_data_{today}.csv"
gold_prices_csv     = f"{WORKAREA}/Tarun/data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv"
gold_data_plain_csv = f"{WORKAREA}/Tarun/data/GOLDBEES_ETF_price_data.csv"
news_data_with_sentiment_csv = f"{WORKAREA}/Tarun/data/news_data_with_sentiment_{today}.csv"
finbert_model       = f"{WORKAREA}/Tarun/Model/finbert_best_model_merged"

if os.path.exists(gold_prices_csv):
    gold = pd.read_csv(gold_prices_csv, parse_dates=['Date'], index_col='Date')
else:
    # Download gold prices and save to CSV
    gold = generate_sentiment_from_trend_with_labels(add_technical_indicators(download_gold_prices(start, end)))

current_price = gold['Close'].iloc[-1]
print(f"Current Gold Price: {current_price}")



# Prepare input dictionary (ensure these variables are defined in your notebook)
# --------------------------------------------------------------------
# Load pre-trained models
# --------------------------------------------------------------------
# # Get device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Current Gold Price: 82.44999694824219
Using device: cpu


In [5]:
#Get models if available, else train them.
def get_base_models (path,today,device):

    #If today's wights exists, then load them.
    if os.path.exists(f'{path}/LSTM/lstm_{today}.pt'):
        lstw_model           = LSTMModel(input_size=11).to(device)
        lstw_model.load_state_dict(torch.load(f'{path}/LSTM/lstm_{today}.pt', map_location=device))
        arimax_model         = sm.load_pickle(f'{path}/Arimax/arimax_{today}.pkl')
        random_forest_model  = sm.load_pickle(f'{path}/RandomForest/random_forest_{today}.pkl')
        xgboost_model        = sm.load_pickle(f'{path}/XGBoost/xgboost_{today}.pkl')
        ensemble_model       = sm.load_pickle(f'{path}/Final_Ensemble/ensemble_model_{today}.pkl')
    
    #If not then train models.
    else:

        ## TODO! : Replace with model training script!!!
        print(f"Could not load from {path}")
        
        # Loading latest by default until above is done.
        path = '../..//Final/Jaison/Main_Code/Model'
        today = "2025-06-20"
        lstw_model           = LSTMModel(input_size=11).to(device)
        lstw_model.load_state_dict(torch.load(f'{path}/LSTM/lstm_{today}.pt', map_location=device))
        arimax_model         = sm.load_pickle(f'{path}/Arimax/arimax_{today}.pkl')
        random_forest_model  = sm.load_pickle(f'{path}/RandomForest/random_forest_{today}.pkl')
        xgboost_model        = sm.load_pickle(f'{path}/XGBoost/xgboost_{today}.pkl')
        ensemble_model       = sm.load_pickle(f'{path}/Final_Ensemble/ensemble_model_{today}.pkl')    

    news_model_path      = os.path.join(path, "/final_model.pth")

    return lstw_model,arimax_model,random_forest_model,xgboost_model,ensemble_model,news_model_path

In [6]:
lstw_model,arimax_model,random_forest_model,xgboost_model,ensemble_model,news_model_path = get_base_models(f'{WORKAREA}/Tarun/Model/',today,device)

In [7]:
## TODO [Yaswanth] : Replace this with today's news articles scraping.
# Extract news data
#news_df = extract_news_data(local_news=False)

In [8]:
## TODO [Deepak, Tejashwini] : cleaning script for scraped news data. Mention expected input and output formats.
#clean_and_prepare_articles(news_data_raw,news_data_csv)

In [9]:
# 3. Define the transform function for the TransformChain
def news_llm_transform(inputs):
    news_model = load_news_llm_model(inputs["device"], inputs["model_path"])
    encodings, mask = generate_news_input(
        inputs["device"],
        inputs["news_data_csv"],
        inputs["gold_data_plain_csv"],
        inputs["finbert_model"],
        inputs["news_data_with_sentiment_csv"]
    )
    with torch.no_grad():
        pred = news_model(encodings, mask=mask)
        if hasattr(pred, "item"):
            pred = pred.item()
    predicted_price = inputs["current_price"] * (1 + pred)
    return {"predicted_price_news_llm": predicted_price}

In [10]:
# Orchestrate the workflow with SequentialChain
# --------------------------------------------------------------------
# Define prompt templates for each model prediction step
# Define TransformChains for each model prediction step using the existing functions
arimax_chain = TransformChain(
    input_variables=["df", "arimax_model"],
    output_variables=["predicted_price_arimax"],
    transform=lambda inputs: {
        "predicted_price_arimax": predict_next_day_gold_price_arimax(inputs["df"], inputs["arimax_model"])
    }
)

rf_chain = TransformChain(
    input_variables=["df", "random_forest_model"],
    output_variables=["predicted_price_rf"],
    transform=lambda inputs: {
        "predicted_price_rf": predict_next_day_gold_price_rf(inputs["df"], inputs["random_forest_model"])
    }
)

xgb_chain = TransformChain(
    input_variables=["df", "xgboost_model"],
    output_variables=["predicted_price_xgb"],
    transform=lambda inputs: {
        "predicted_price_xgb": predict_next_day_gold_price_xgboost(inputs["df"], inputs["xgboost_model"])
    }
)

lstm_chain = TransformChain(
    input_variables=["df", "device", "lstw_model"],
    output_variables=["predicted_price_lstw"],
    transform=lambda inputs: {
        "predicted_price_lstw": predict_next_day_gold_price_lstm(inputs["df"], inputs["device"], inputs["lstw_model"])
    }
)

news_llm_chain = TransformChain(
    input_variables=["current_price", "device", "model_path", "finbert_model", "news_data_csv", "gold_data_plain_csv", "news_data_with_sentiment_csv"],
    output_variables=["predicted_price_news_llm"],
    transform=news_llm_transform
)

# Define the ensemble prediction as a TransformChain
ensemble_chain = TransformChain(
    input_variables=[
        "ensemble_model",
        "predicted_price_arimax",
        "predicted_price_xgb",
        "predicted_price_rf",
        "predicted_price_lstw",
        "predicted_price_news_llm"
    ],
    output_variables=["ensemble_results"],
    transform=lambda inputs: {
        "ensemble_results": predict_next_day_gold_price_ensemble(
            inputs["ensemble_model"],
            inputs["predicted_price_arimax"],
            inputs["predicted_price_xgb"],
            inputs["predicted_price_rf"],
            inputs["predicted_price_lstw"],
            inputs["predicted_price_news_llm"]
        )
    }
)

# General inputs for the sequence
general_inputs = {
    "df": gold,
    "current_price": current_price,
    "device": device
}

# Prepare input dictionary for the time series models
ts_inputs = {
    "arimax_model": arimax_model,
    "random_forest_model": random_forest_model,
    "xgboost_model": xgboost_model,
    "lstw_model": lstw_model
}

# Prepare input dictionary for the news LLM chain
news_llm_inputs = {
    "device": device,
    "current_price": current_price,
    "model_path": news_model_path,
    "finbert_model": finbert_model,
    "news_data_csv": news_data_csv,
    "gold_data_plain_csv": gold_data_plain_csv,
    "news_data_with_sentiment_csv": news_data_with_sentiment_csv
}

# ensemble inputs
emsemble_inputs = {
    "ensemble_model": ensemble_model,
}

full_inputs = {
    **general_inputs,
    **ts_inputs,
    **news_llm_inputs,
    **emsemble_inputs
}

# Compose the full sequence
full_seq_chain = SequentialChain(
    chains=[arimax_chain, rf_chain, xgb_chain, lstm_chain, news_llm_chain, ensemble_chain],
    input_variables=[
        "current_price", "device", "df",
        "finbert_model", 
        "news_data_csv", "gold_data_plain_csv", "news_data_with_sentiment_csv",
        "model_path", "arimax_model", "random_forest_model", "xgboost_model", "lstw_model",
        "ensemble_model"        
    ],
    output_variables=[
        "predicted_price_arimax", "predicted_price_rf", "predicted_price_xgb", "predicted_price_lstw",
        "predicted_price_news_llm", 
        "ensemble_results"
    ]
)

# Run the orchestrated sequence
results = full_seq_chain.invoke(full_inputs)
ensemble_results = results["ensemble_results"]

print("---------------------------------------------------")
print(f"Current Gold Price: {current_price}")
print("---------------------------------------------------")
print(f"Predictions for next day: {next_day}")
print("---------------------------------------------------")
print(f"ARIMAX: Predicted gold price: {results['predicted_price_arimax']}")
print(f"Random Forest: Predicted gold price: {results['predicted_price_rf']}")
print(f"XGBoost: Predicted gold price: {results['predicted_price_xgb']}")
print(f"LSTM: Predicted gold price: {results['predicted_price_lstw']}")
print(f"News LLM: Predicted gold price: {results['predicted_price_news_llm']}")
print("---------------------------------------------------")
print("Ensemble Model Results:")
print("---------------------------------------------------")
print(f"Predicted Price: {ensemble_results['predictions']['meta_ensemble']}")
print(f"Percentage Change: {ensemble_results['percentage_changes']['meta_ensemble']:.2f}%")

Updated file saved to D:/CAREER/IISC_B/Academics/Courses/SEM_3/DA_225o/Project/DL-7-25/Final/Tarun/data/news_data_with_sentiment_2025-06-23.csv












---------------------------------------------------
Current Gold Price: 82.44999694824219
---------------------------------------------------
Predictions for next day: 2025-06-24
---------------------------------------------------
ARIMAX: Predicted gold price: 82.86905617926536
Random Forest: Predicted gold price: 82.12877083738854
XGBoost: Predicted gold price: 82.47662647419656
LSTM: Predicted gold price: 81.17937989204205
News LLM: Predicted gold price: 85.87733830491788
---------------------------------------------------
Ensemble Model Results:
---------------------------------------------------
Predicted Price: 82.7178386187372
Percentage Change: 0.59%
