In [None]:
# This script installs all required libraries for data analysis, plotting, LLM workflows, and notebook imports.
# Note: The installation command is commented out to prevent accidental execution.
# --------------------------------------------------------------------------------

# Required Libraries:
# pandas: Data manipulation and analysis
# numpy: Numerical computations
# matplotlib: Data visualization
# yfinance: Downloading financial data from Yahoo Finance
# langchain: Building LLM-powered applications and chains
# import_ipynb: Importing Jupyter notebooks as Python modules
# scipy: Scientific computing (e.g., signal processing)
# statsmodels: Statistical modeling and time series analysis
# xgboost: Gradient boosting for machine learning
# selenium: Web scraping and browser automation
# webdriver_manager: Managing browser drivers for Selenium
# transformers: State-of-the-art NLP models
# peft: Parameter-efficient fine-tuning for transformers
# accelerate: Optimizing training and inference of models
# bitsandbytes: Efficient training of large models with 8-bit optimizers
# tensorflow: Deep learning framework
# torch: PyTorch deep learning framework
# tensorboard: Visualization tool for TensorFlow and PyTorch
# scikit-learn: Machine learning library for Python (version 1.6.1)

# Install all required libraries
#%pip install -U tensorflow pandas torch tensorboard numpy matplotlib yfinance langchain import_ipynb scipy statsmodels xgboost selenium webdriver_manager transformers peft accelerate bitsandbytes
#%pip install scikit-learn==1.6.1

In [None]:
import os
# -------------------------------------------------------------------------
#  LangChain Imports
# -------------------------------------------------------------------------
import datetime
#from langchain.chains import SequentialChain, LLMChain
#from langchain.prompts import PromptTemplate
#from langchain.llms import OpenAI  # Replace with any LLM provider
#from langchain.output_parsers import RegexParser
# -------------------------------------------------------------------------
# Other Imports
# -------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from datetime import datetime, timedelta
from torch.utils.data import Dataset
import statsmodels.api as sm
import torch.nn as nn
import torch.optim as optim
import torch
# -------------------------------------------------------------------------
#  Custom Imports
from modules.modules import SetTransformer, VariableSetDataset
from modules.functions import *
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Import predict_sentiment from your finbert_finetune_refactored.py
#from finbert_finetune_refactored import predict_sentiment
# -------------------------------------------------------------------------
#  Web Scraping Imports
# -------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
# -------------------------------------------------------------------------
import tensorflow_hub as hub

In [None]:
# ---------------------------------------------------------------------------
# Load Datasets for Time Series Models
# ---------------------------------------------------------------------------

input_data = f"{WORKAREA}/Tarun/data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv"
df = pd.read_csv(input_data, index_col=0, parse_dates=True)
gold_today = df.iloc[[-1]]
print(f"Data \n\n{gold_today.head()}\n\n")
current_price = gold_today['Close'].values[0]
print(f"Current Gold Price: {current_price}")

# ---------------------------------------------------------------------------
# Load News Data and Predict Sentiment
# ---------------------------------------------------------------------------
gold_data_plain = f"{WORKAREA}/Tarun/data/GOLDBEES_ETF_price_data.csv"
news_data_path = f"{WORKAREA}/Tarun/data/news_data_{today}.csv"
news_data_with_sentiment_path = f'{WORKAREA}/Tarun/data/news_data_with_sentiment_{today}.csv'
finbert_model_path = f'{WORKAREA}/Tarun/Model/finbert_best_model_merged'

batch_predict_and_update_csv(news_data_path, finbert_model_path, news_data_with_sentiment_path)

df_gold = pd.read_csv(gold_data_plain)
df_raw = pd.read_csv(news_data_with_sentiment_path)
print(f"Raw Data \n\n{df_raw.head()}\n\n")
df_processed = preprocess_dataset(df_raw)
print(f"Raw Data \n\n{df_raw.head()}\n\n")
df_processed = generate_topic_encodings(df_processed)  ## model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
print(f"Processed Data \n\n{df_processed.head()}\n\n")
#final_df = add_gold_price_change(df_processed,df_gold)  ## final_df = merged_df[['Date','text','sentiment','topic_encodings','sentiment_combined_encodings','price_percentage_change']].copy()
final_df = add_gold_price_change_with_weekend_handling(df_processed,df_gold)
print(f"Final Data \n\n{final_df.head()}\n\n")

In [None]:
## TODO [Tarun] : Replace the file read with values produced by previous members in the chain.

#news_llm_model_data = pd.read_pickle('data/combined_dataset_with_price_change.pkl')
#print("Number of rows in df:",news_llm_model_data.shape)
#news_llm_model_data.head()

## Group input data into sets for use in model.
#encodings, price_changes, masks = group_into_variable_sets(news_llm_model_data)
#print(f"Encodings shape: {encodings.shape}, Price changes shape: {price_changes.shape}, Masks shape: {masks.shape}")

## Create the dataset
#dataset = VariableSetDataset(encodings, price_changes, masks)
#print(f"Dataset {dataset}")

In [None]:
## TODO [Mohan]: Integrate sentiment extraction model here [Done this, please check]
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#Download the model from here and paste it inside Models/
#> https://indianinstituteofscience-my.sharepoint.com/:f:/g/personal/mohanpanakam_iisc_ac_in/Etg-B99_anJCk2jGh4Cy3vABWPLR3brtxxlMZAPxf9kDgQ?e=uGe18T

def predict_sentiments(model_path, inp_text):
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    results = []
    for text in inp_text:
        prediction = predict_sentiment(text, model, tokenizer, device)
        if prediction:
            results.append(prediction)

    return results

#Quick sanity check
inp_text = ["Test_text","Test_text2"]
result = predict_sentiments("Model/finbert_best_model_merged",inp_text)
df = pd.DataFrame(result)
df['final_sentiment'] = df.probabilities.apply(lambda x: float(x['positive']) - float(x['negative'])) * df.confidence.astype(float)
df

Unnamed: 0,text,sentiment,confidence,logits,probabilities,final_sentiment
0,Test_text,neutral,0.9977,"[-2.3928005695343018, 4.305459976196289, -2.57...","{'positive': '0.0012', 'neutral': '0.9977', 'n...",0.0002
1,Test_text2,neutral,0.9979,"[-2.380605697631836, 4.358696460723877, -2.613...","{'positive': '0.0012', 'neutral': '0.9979', 'n...",0.000299


In [None]:
## TODO [Adithya] : Insert Topic extraction model here.
def embed_sentences(sentences):
    if(type(sentences) == str):
        sentences = [sentences]
    model_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #This is around 1 GB in size, it took a while for me to run this.
    embed = hub.load(model_url)

    # Generate embeddings
    embeddings = embed(sentences)
    return embeddings

In [None]:
#TODO: rename this to get__inputs or somehting like that once scraper is integrated.

# 1. Define a function to generate dummy input (replace with real input later)
def generate_dummy_news_input(device):

    #TODO:This should be replaced with scraping script later.
    dummy_input_text = ["Test input 1", "Test_input 2", "Test input 3"]

    #Get topic and sentiment
    encodings = embed_sentences(dummy_input_text)
    raw_sentiments = predict_sentiments("Model/finbert_best_model_merged",dummy_input_text)

    #Combine them
    sentiment = pd.DataFrame(raw_sentiments)
    sentiment['final_sentiment'] = sentiment.probabilities.apply(lambda x: float(x['positive']) - float(x['negative'])) * sentiment.confidence.astype(float)
    sentiment['encodings'] = list(encodings)
    sentiment['final_encodings'] = sentiment.encodings * sentiment.final_sentiment

    #Convert embeddings to required dimension.
    encodings = np.array(list(sentiment.final_encodings), dtype=np.float32)
    encodings = torch.tensor(encodings.reshape(1,*encodings.shape), dtype=torch.float32).to(device)
    # encodings = torch.tensor(np.random.rand(1, 10, 512).astype(np.float32), dtype=torch.float32).to(device)
    
    #Define placeholder mask
    mask = torch.tensor(np.ones((1, len(raw_sentiments))).astype(np.float32), dtype=torch.float32).to(device)

    return encodings, mask

#Sanity check to make sure function works.
enc,msk = generate_dummy_news_input(device)
print(enc.shape,msk.shape)














torch.Size([1, 3, 512]) torch.Size([1, 3])


In [3]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def test_model(model_path="./finbert_best_model_merged"):
    """
    Test the trained model on sample texts and print sentiment, logits, and probabilities.
    """
    try:
        # Load model and tokenizer
        device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Sample texts for testing
        test_texts = [
            "Dec. gold climbs $9.40, or 0.7%, to settle at $1,356.90/oz",
            "gold prices rebound rs 350 on global cues, weak rupee",
            "Gold futures down at Rs 30,244 ",
            "gold, oil trade lower as jobs data weigh"
        ]

        # Make predictions
        results = []
        for text in test_texts:
            prediction = predict_sentiment(text, model, tokenizer, device)
            if prediction:
                results.append(prediction)
                print("\nText:", text)
                print("Sentiment:", prediction["sentiment"])
                print("Confidence:", prediction["confidence"])
                print("Logits:", prediction["logits"])
                print("Class Probabilities:", prediction["probabilities"])

        return results

    except Exception as e:
        print(f"Error in testing: {e}")
        return None

# Example usage:
if __name__ == "__main__":
    test_model()

Error in testing: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './finbert_best_model_merged'.


In [None]:
# Get device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Run Time Series Models and get predictions
# --------------------------------------------------------------------
# Load pre-trained models
arimax_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/Arimax/arimax_{today}.pkl')
random_forest_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/RandomForest/random_forest_{today}.pkl')
xgboost_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/XGBoost/xgboost_{today}.pkl')

lstw_model = LSTMModel(input_size=11).to(device)
lstw_model.load_state_dict(torch.load(f'{WORKAREA}/Tarun/Model/LSTM/lstm_{today}.pt', map_location=device))

# --------------------------------------------------------------------
# Predict gold price using all four models with gold_price_prediction where possible

# ARIMAX
predicted_price_arimax = predict_next_day_gold_price_arimax(df, arimax_model)

# Random Forest
predicted_price_rf = predict_next_day_gold_price_rf(df, random_forest_model)

# XGBoost
predicted_price_xgb = predict_next_day_gold_price_xgboost(df, xgboost_model)

# LSTM
predicted_price_lstw = predict_next_day_gold_price_lstm(df, lstw_model)

print(f"ARIMAX: Predicted gold price for {next_day}: {predicted_price_arimax}")

print(f"Random Forest: Predicted gold price for {next_day}: {predicted_price_rf}")

print(f"XGBoost: Predicted gold price for {next_day}: {predicted_price_xgb}")

print(f"LSTM: Predicted gold price for {next_day}: {predicted_price_lstw}")
# --------------------------------------------------------------------

In [None]:
# Prepare input dictionary for the news LLM chain
news_llm_inputs = {
    "device": device,
    "current_price": current_price,
    "model_path": news_model_path,
    "finbert_model": finbert_model,
    "news_data_csv": news_data_csv,
    "gold_data_plain_csv": gold_data_plain_csv,
    "news_data_with_sentiment_csv": news_data_with_sentiment_csv
}

news_llm_chain = TransformChain(
    input_variables=["current_price", "device", "model_path", "finbert_model", "news_data_csv", "gold_data_plain_csv", "news_data_with_sentiment_csv"],
    output_variables=["predicted_price_news_llm"],
    transform=news_llm_transform
)


# Compose the full sequence
news_llm_seq_chain = SequentialChain(
    chains=[news_llm_chain],
    input_variables=[
        "current_price", "device", "model_path", "finbert_model", "news_data_csv", "gold_data_plain_csv", "news_data_with_sentiment_csv"      
    ],
    output_variables=[
        "predicted_price_news_llm"
    ]
)

# Run the news LLM chain sequence
news_llm_results = news_llm_seq_chain.invoke(news_llm_inputs)
predicted_price_news_llm = news_llm_results["predicted_price_news_llm"]

print("---------------------------------------------------")
print(f"Current Gold Price: {current_price}")
print("---------------------------------------------------")
print(f"Predictions for next day: {next_day}")
print("---------------------------------------------------")
print(f"News LLM: Predicted gold price: {predicted_price_news_llm}")
print("---------------------------------------------------")

Updated file saved to D:/CAREER/IISC_B/Academics/Courses/SEM_3/DA_225o/Project/DL-7-25/Final/Tarun/data/news_data_with_sentiment_2025-06-23.csv
date          object
text          object
sentiment    float64
dtype: object
date                object
text                object
sentiment          float64
topic_encodings     object
dtype: object
---------------------------------------------------
Current Gold Price: 82.44999694824219
---------------------------------------------------
Predictions for next day: 2025-06-24
---------------------------------------------------
News LLM: Predicted gold price: 95.92861764982422
---------------------------------------------------


In [None]:
# TODO [Tarun]: Replace the input data with the actual news data for prediction.
# Generate random inputs and masks for testing
inputs = np.random.rand(1, 10, 512).astype(np.float32)
masks = np.ones((1,10)).astype(np.float32)

inputs = torch.tensor(inputs, dtype=torch.float32).to(device=device)
masks = torch.tensor(masks, dtype=torch.float32).to(device=device)

print(inputs.shape, masks.shape)
news_llm_change_precentage = news_model(inputs,mask= masks).item() * 100  # Convert to percentage
predicted_price_news_llm = current_price*(1 + news_llm_change_precentage / 100)
print(f"News LLM: Predicted gold price for {next_day}: {predicted_price_news_llm:.2f}")

In [None]:
# Load the pre-trained News LLM model & get predictions
# --------------------------------------------------------------------

news_model = SetTransformer(
    dim_input = 512, 
    num_outputs = 1, #One final prediction
    dim_output = 1, #1D output for price change
    num_inds=32, 
    dim_hidden=128, 
    num_heads=4, 
    ln=True #Layer normalization
    ).to(device)

# --------------------------------------------------------------------
# Load the pre-trained model weights
checkpoint_path = f'{WORKAREA}/Tarun/Model/final_model.pth'
if os.path.exists(checkpoint_path):
    start_epoch, start_loss = load_checkpoint(checkpoint_path, news_model, device)
    print(f"Model loaded from {checkpoint_path} at epoch {start_epoch} with loss {start_loss:.4f}")
else:
    start_epoch, start_loss = 0, float('inf')
    print(f"No checkpoint found at {checkpoint_path}. Starting from scratch.")

In [None]:
ensemble_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/Final_Ensemble/ensemble_model_{today}.pkl')
results = predict_next_day_gold_price_ensemble(
    ensemble_model,
    predicted_price_arimax,
    predicted_price_xgb,
    predicted_price_rf,
    predicted_price_lstw,
    predicted_price_news_llm,
)

print(f"Ensemble Model Results for {next_day}:")
print(f"Predicted Price: {results['predictions']['meta_ensemble']}")
print(f"Percentage Change: {results['percentage_changes']['meta_ensemble']:.2f}%")