In [None]:
# This script installs all required libraries for data analysis, plotting, LLM workflows, and notebook imports.
# Note: The installation command is commented out to prevent accidental execution.
# --------------------------------------------------------------------------------

# Required Libraries:
# pandas: Data manipulation and analysis
# numpy: Numerical computations
# matplotlib: Data visualization
# yfinance: Downloading financial data from Yahoo Finance
# langchain: Building LLM-powered applications and chains
# import_ipynb: Importing Jupyter notebooks as Python modules
# scipy: Scientific computing (e.g., signal processing)
# statsmodels: Statistical modeling and time series analysis
# xgboost: Gradient boosting for machine learning
# selenium: Web scraping and browser automation
# webdriver_manager: Managing browser drivers for Selenium
# transformers: State-of-the-art NLP models
# peft: Parameter-efficient fine-tuning for transformers
# accelerate: Optimizing training and inference of models
# bitsandbytes: Efficient training of large models with 8-bit optimizers
# tensorflow: Deep learning framework
# torch: PyTorch deep learning framework
# tensorboard: Visualization tool for TensorFlow and PyTorch
# scikit-learn: Machine learning library for Python (version 1.6.1)

# Install all required libraries
#%pip install -U tensorflow pandas torch tensorboard numpy matplotlib yfinance langchain import_ipynb scipy statsmodels xgboost selenium webdriver_manager transformers peft accelerate bitsandbytes
#%pip install scikit-learn==1.6.1

In [None]:
import os
# -------------------------------------------------------------------------
#  LangChain Imports
# -------------------------------------------------------------------------
import datetime
#from langchain.chains import SequentialChain, LLMChain
#from langchain.prompts import PromptTemplate
#from langchain.llms import OpenAI  # Replace with any LLM provider
#from langchain.output_parsers import RegexParser
# -------------------------------------------------------------------------
# Other Imports
# -------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from datetime import datetime, timedelta
import statsmodels.api as sm
import torch
# -------------------------------------------------------------------------
#  Custom Imports
from modules.modules import SetTransformer, LSTMModel, VariableSetDataset
from modules.functions import *

In [None]:
def predict_next_day_gold_price_arimax(df: pd.DataFrame, model) -> float:
    """
    Predict the next day's gold price with a pre-fitted ARIMAX model.

    The most recent row of technical indicators (after re-indexing to
    business days and forward-filling the gaps) is handed to the model as the
    exogenous input of a one-step-ahead forecast.

    Args:
        df: Price history containing 'Close' plus every column listed in
            ``exog_cols``. ``asfreq('B')`` is applied, so the index is
            expected to be a DatetimeIndex.
        model: Fitted results object exposing
            ``forecast(steps=..., exog=...)`` whose return value supports
            ``.iloc[0]``.

    Returns:
        float: The forecast price for the next step.

    Raises:
        ValueError: If a required column is missing or no complete row is
            available.
    """
    exog_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    # 'Close' is validated together with the indicators so that a missing
    # price column is reported the same way as a missing indicator.
    for col in exog_cols + ['Close']:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    complete_rows = df[['Close'] + exog_cols].dropna()
    if complete_rows.empty:
        raise ValueError("No complete rows available to build the exogenous input")

    # Business-day frequency with forward fill, without mutating the caller's frame.
    business_days = complete_rows.asfreq('B').ffill()

    # -------------------------------
    # Forecast Next Price
    # -------------------------------
    # Keep the 2-D shape (1, n_features) expected for a single forecast step.
    next_exog = business_days[exog_cols].iloc[[-1]].values
    predicted_price = model.forecast(steps=1, exog=next_exog).iloc[0]

    return predicted_price


In [None]:
def predict_next_day_gold_price_xgboost(gold: pd.DataFrame, model) -> float:
    """
    Predict the next day's gold price using a pre-trained XGBoost model.

    The model output is treated as the fractional change of 'Close' for the
    following day and applied as ``latest_price * (1 + prediction)``.

    Args:
        gold: DataFrame with a 'Close' column and the technical-indicator
            columns listed in ``feature_cols``.
        model: Fitted regressor exposing ``predict(X)``; the first element of
            its output is used.

    Returns:
        float: Latest available close scaled by the predicted change.

    Raises:
        ValueError: If there are not enough complete rows to build the
            features and fit the scaler.
    """

    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    features = gold[['Close'] + feature_cols].copy().dropna()
    close = features['Close']
    features['Close_pct_change_1'] = close.pct_change(1)
    features['Close_pct_change_2'] = close.pct_change(2)
    features['Close_pct_change_3'] = close.pct_change(3)
    features['Close_rolling_std_5'] = close.rolling(5).std()
    features['Close_rolling_std_10'] = close.rolling(10).std()
    features['Close_vs_MA5'] = (close - features['MA_5']) / features['MA_5']
    features['Close_vs_MA20'] = (close - features['MA_20']) / features['MA_20']
    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20']
    features = features.dropna()
    if features.empty:
        raise ValueError("Not enough complete rows to build XGBoost features")

    # Rows whose next-day change is known, finite and below 100% in magnitude
    # are used to fit the scaler. The newest row has no next-day value, so it
    # is excluded from fitting only - it must remain the prediction input.
    target_pct_change = features['Close'].pct_change().shift(-1)
    has_valid_target = np.isfinite(target_pct_change) & (target_pct_change.abs() < 1.0)
    scaler_rows = features.loc[has_valid_target, feature_cols_extended]
    if scaler_rows.empty:
        raise ValueError("Not enough rows with a known next-day change to fit the scaler")

    # Predict next day from the most recent observation.
    latest_features = features[feature_cols_extended].iloc[[-1]]
    latest_price = features['Close'].iloc[-1]
    # NOTE(review): the scaler is re-fit here on the supplied history; confirm
    # this matches the scaling applied when the model was trained.
    scaler = RobustScaler().fit(scaler_rows)
    latest_scaled = scaler.transform(latest_features)
    next_day_pct_change = model.predict(latest_scaled)[0]
    next_day_price = latest_price * (1 + next_day_pct_change)

    return next_day_price

In [None]:
def predict_next_day_gold_price_rf(gold: pd.DataFrame, model) -> float:
    """
    Predict the next day's gold price using a pre-trained Random Forest model.

    The model output is treated as the fractional change of 'Close' for the
    following day and applied as ``latest_price * (1 + prediction)``. The
    model is only used for prediction here; nothing is trained, saved or
    loaded by this function.

    Args:
        gold: DataFrame with a 'Close' column and the technical-indicator
            columns listed in ``feature_cols``.
        model: Fitted regressor exposing ``predict(X)``; the first element of
            its output is used.

    Returns:
        float: Latest available close scaled by the predicted change.

    Raises:
        ValueError: If there are not enough complete rows to build the
            features and fit the scaler.
    """

    # Feature Engineering
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    features = gold[['Close'] + feature_cols].copy().dropna()
    close = features['Close']
    features['Close_pct_change_1'] = close.pct_change(1)
    features['Close_pct_change_2'] = close.pct_change(2)
    features['Close_pct_change_3'] = close.pct_change(3)
    features['Close_rolling_std_5'] = close.rolling(5).std()
    features['Close_rolling_std_10'] = close.rolling(10).std()
    features['Close_vs_MA5'] = (close - features['MA_5']) / features['MA_5']
    features['Close_vs_MA20'] = (close - features['MA_20']) / features['MA_20']
    features['Price_momentum_3'] = close / close.shift(3) - 1
    features['Price_momentum_5'] = close / close.shift(5) - 1

    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20',
        'Price_momentum_3', 'Price_momentum_5']

    features = features.dropna()
    if features.empty:
        raise ValueError("Not enough complete rows to build Random Forest features")

    # Rows whose next-day change is known and below 100% in magnitude are used
    # to fit the scaler (NaN and infinite values fail the comparison). The
    # newest row has no next-day value, so it is excluded from fitting only -
    # it must remain the prediction input.
    target_pct_change = features['Close'].pct_change().shift(-1)
    has_valid_target = target_pct_change.abs() < 1.0
    scaler_rows = features.loc[has_valid_target, feature_cols_extended]
    if scaler_rows.empty:
        raise ValueError("Not enough rows with a known next-day change to fit the scaler")

    # Predict next day from the most recent observation.
    latest_features = features[feature_cols_extended].iloc[[-1]]
    latest_price = features['Close'].iloc[-1]
    # NOTE(review): the scaler is re-fit here on the supplied history; confirm
    # this matches the scaling applied when the model was trained.
    scaler = RobustScaler().fit(scaler_rows)
    latest_scaled = scaler.transform(latest_features)
    next_day_pct_change = model.predict(latest_scaled)[0]
    next_day_price = latest_price * (1 + next_day_pct_change)

    return next_day_price


In [None]:
def predict_next_day_gold_price_lstm(gold: pd.DataFrame, model=None) -> float:
    """
    Predict the next day's gold price using a pre-trained LSTM model.

    Features are min-max scaled, the last ``sequence_length`` business days
    are fed to the model as one batch, and the model output is mapped back to
    a price with a scaler fitted on the next-day close.

    Args:
        gold: DataFrame with a 'Close' column and the columns listed in
            ``feature_cols``. ``asfreq('B')`` is applied, so the index is
            expected to be a DatetimeIndex.
        model: Trained PyTorch module. It is called with a float32 tensor of
            shape (1, sequence_length, n_features) and must return a 2-D
            tensor whose last column is the scaled prediction.

    Returns:
        float: Predicted price in the original price scale.

    Raises:
        ValueError: If no model is supplied or fewer than two complete rows
            are available.
    """
    if model is None:
        raise ValueError("A trained LSTM model is required for prediction")

    sequence_length = 10 ## Based on the model's training sequence length

    # -------------------------------
    # Feature Setup
    # -------------------------------
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    complete_rows = gold[['Close'] + feature_cols].dropna()
    if len(complete_rows) < 2:
        raise ValueError("At least two complete rows are required for the LSTM forecast")
    history = complete_rows.asfreq('B').ffill()

    # Next-day close; undefined for the newest row, which is therefore left
    # out of scaler fitting but kept as part of the model input window.
    target = history['Close'].shift(-1)
    has_target = target.notna()

    # Separate scalers for features and target so the prediction can be
    # inverse-transformed directly, without zero padding.
    feature_scaler = MinMaxScaler().fit(history.loc[has_target, feature_cols])
    target_scaler = MinMaxScaler().fit(target[has_target].to_frame())
    X_scaled = feature_scaler.transform(history[feature_cols])

    # -------------------------------
    # Forecast
    # -------------------------------
    model.eval()
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable being defined before this call.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    last_seq = torch.tensor(
        X_scaled[-sequence_length:], dtype=torch.float32
    ).unsqueeze(0).to(model_device)
    with torch.no_grad():
        next_pred = model(last_seq).cpu().numpy()

    # Cast to float64 so the inverse transform is computed in double precision.
    predicted_price = target_scaler.inverse_transform(
        next_pred[:, -1:].astype(np.float64)
    )[0, 0]

    return predicted_price

In [None]:
def predict_next_day_gold_price_ensemble(
    ensemble_model: dict,
    arimax_pred: float,
    xgb_pred: float,
    rf_pred: float,
    lstm_pred: float,
    llm_pred: float
):
    """
    Combine the individual model predictions with the weights of a stored ensemble.

    Args:
        ensemble_model: Results dict of a previously built ensemble. The keys
            read are 'weights_used' (with optional 'meta_weights',
            'normalized_weights', 'volatility_weights', 'trend_weights'),
            'metadata' (with optional 'current_price', 'current_sentiment')
            and 'model_info'. Built-in defaults are used for any weight group
            that is absent.
        arimax_pred, xgb_pred, rf_pred, lstm_pred, llm_pred: Price predictions
            of the individual models.

    Returns:
        dict with the keys 'predictions', 'percentage_changes',
        'weights_used' (the weights actually applied, including defaults),
        'metadata' and 'model_info' (stored info plus 'created_date' and
        'individual_predictions').

    Raises:
        KeyError: If a supplied weight group lacks one of the expected keys.
    """
    # Extract weights from the loaded ensemble model, falling back to defaults
    stored_weights = ensemble_model.get('weights_used', {})
    meta_weights = stored_weights.get('meta_weights', {
        'simple': 0.1, 'weighted': 0.25, 'sentiment': 0.2, 'volatility': 0.25, 'trend': 0.2
    })
    norm_weights = stored_weights.get('normalized_weights', {
        'arimax': 0.15, 'xgboost': 0.25, 'rf': 0.20, 'lstm': 0.20, 'llm': 0.20
    })
    vol_weights = stored_weights.get('volatility_weights', {
        'arimax': 0.25, 'xgboost': 0.20, 'rf': 0.20, 'lstm': 0.15, 'llm': 0.20
    })
    trend_weights = stored_weights.get('trend_weights', {
        'arimax': 0.15, 'xgboost': 0.25, 'rf': 0.20, 'lstm': 0.20, 'llm': 0.20
    })
    # Report the weights that were really applied, so a fallback to defaults
    # is visible in the result instead of showing up as an empty dict.
    weights_used = {
        **stored_weights,
        'meta_weights': meta_weights,
        'normalized_weights': norm_weights,
        'volatility_weights': vol_weights,
        'trend_weights': trend_weights,
    }

    # Extract metadata for calculation
    metadata = ensemble_model.get('metadata', {})
    current_price = metadata.get('current_price', 0)
    current_sentiment = metadata.get('current_sentiment', 0)

    model_names = ['arimax', 'xgboost', 'rf', 'lstm', 'llm']
    model_preds = [arimax_pred, xgb_pred, rf_pred, lstm_pred, llm_pred]

    def _blend(weights):
        """Weighted sum of the individual predictions."""
        return sum(weights[k] * p for k, p in zip(model_names, model_preds))

    def _pct_change(price):
        """Percentage change versus the stored current price (0 when unknown)."""
        return (price - current_price) / current_price * 100 if current_price else 0

    # 1. Simple average
    simple_avg = np.mean(model_preds)

    # 2. Weighted average
    weighted_avg = _blend(norm_weights)

    # 3. Sentiment-adjusted: only sentiment stronger than 0.1 in magnitude moves the price
    sentiment_factor = 1 + 0.02 * current_sentiment if abs(current_sentiment) > 0.1 else 1.0
    sentiment_adjusted = weighted_avg * sentiment_factor

    # 4. Volatility-weighted
    volatility_weighted = _blend(vol_weights)

    # 5. Trend-following
    trend_following = _blend(trend_weights)

    # 6. Meta-Ensemble
    meta_ensemble = (
        meta_weights['simple'] * simple_avg +
        meta_weights['weighted'] * weighted_avg +
        meta_weights['sentiment'] * sentiment_adjusted +
        meta_weights['volatility'] * volatility_weighted +
        meta_weights['trend'] * trend_following
    )

    pct_changes = {
        'simple_avg': _pct_change(simple_avg),
        'weighted_avg': _pct_change(weighted_avg),
        'sentiment_adjusted': _pct_change(sentiment_adjusted),
        'volatility_weighted': _pct_change(volatility_weighted),
        'trend_following': _pct_change(trend_following),
        'meta_ensemble': _pct_change(meta_ensemble)
    }

    results = {
        'predictions': {
            'simple_average': simple_avg,
            'weighted_average': weighted_avg,
            'sentiment_adjusted': sentiment_adjusted,
            'volatility_weighted': volatility_weighted,
            'trend_following': trend_following,
            'meta_ensemble': meta_ensemble
        },
        'percentage_changes': pct_changes,
        'weights_used': weights_used,
        'metadata': metadata,
        'model_info': {
            **ensemble_model.get('model_info', {}),
            'created_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'individual_predictions': {
                'arimax': arimax_pred,
                'xgboost': xgb_pred,
                'random_forest': rf_pred,
                'lstm': lstm_pred,
                'llm': llm_pred
            }
        }
    }
    return results


In [None]:
## TODO [Yaswanth] : Replace this with today's news articles scraping.

def extract_news_data(local_news=False):
    """
    Collect recent news articles from the configured sources.

    Articles are fetched from BullionVault, Yahoo Finance and Reuters (and
    BBC Telugu when ``local_news`` is true), concatenated, sorted by 'Date'
    and restricted to rows dated within the last three days.

    Args:
        local_news: When true, BBC Telugu articles are included as well.

    Returns:
        Combined DataFrame of the retained articles, ordered by 'Date'.
    """
    bullion_articles = get_latest_bullionvault_articles()

    # Yahoo Finance dates are converted to plain date objects; values that
    # cannot be parsed are coerced to missing.
    yahoo_articles = get_latest_yf_articles()
    yahoo_articles['Date'] = pd.to_datetime(yahoo_articles['Date'], errors='coerce').dt.date

    sources = [bullion_articles, yahoo_articles, get_reuters_articles()]
    if local_news:
        sources.append(fetch_bbc_telugu_news())

    cutoff_date = pd.to_datetime('today').date() - timedelta(days=3)

    all_articles = pd.concat(sources, ignore_index=True).sort_values(by='Date')
    return all_articles[all_articles['Date'] >= cutoff_date]

## TODO [Tejashwini] : cleaning script for scraped news data.


In [None]:
## TODO [Adithya] : Insert Topic extraction model here.


In [None]:
# Project root folder; override it with the WORKAREA environment variable.
# The default is a raw string because the path contains backslashes ("\S",
# "\D", "\P", "\F") that are invalid escape sequences in a normal literal.
WORKAREA = os.getenv("WORKAREA", r"D:/CAREER/IISC_B/Academics/Courses\SEM_3\DA_225o\Project\DL-7-25\Final")
# =========================================================================
# Get today's date and the next day in YYYY-MM-DD format
# =========================================================================
today = datetime.now().strftime("%Y-%m-%d")
# Derived from `today` (not a second clock read) so both dates stay
# consistent even when the cell runs across midnight.
next_day = (datetime.strptime(today, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
print(f"Today's date: {today}")
print(f"Next day's date: {next_day}")

# Download window for the gold price history; indicators and sentiment
# labels are added by the project helpers.
start = datetime(2010, 1, 1)
end = datetime(2026, 1, 1)
gold = generate_sentiment_from_trend_with_labels(add_technical_indicators(download_gold_prices(start, end)))

In [None]:
# ---------------------------------------------------------------------------
# Load Datasets for Time Series Models
# ---------------------------------------------------------------------------

# Price history with technical indicators and sentiment, indexed by date.
input_data = f"{WORKAREA}/Tarun/data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv"
df = pd.read_csv(input_data, index_col=0, parse_dates=True)
# The last row of the file is treated as the current day.
gold_today = df.iloc[[-1]]
print(f"Data \n\n{gold_today.head()}\n\n")
current_price = gold_today['Close'].values[0]
print(f"Current Gold Price: {current_price}")

# ---------------------------------------------------------------------------
# Load News Data and Predict Sentiment
# ---------------------------------------------------------------------------
gold_data_plain = f"{WORKAREA}/Tarun/data/GOLDBEES_ETF_price_data.csv"
news_data_path = f"{WORKAREA}/Tarun/data/news_data_{today}.csv"
news_data_with_sentiment_path = f'{WORKAREA}/Tarun/data/news_data_with_sentiment_{today}.csv'
finbert_model_path = f'{WORKAREA}/Tarun/Model/finbert_best_model_merged'

# Presumably writes the sentiment-annotated CSV that is read back below - TODO confirm.
batch_predict_and_update_csv(news_data_path, finbert_model_path, news_data_with_sentiment_path)

df_gold = pd.read_csv(gold_data_plain)
df_raw = pd.read_csv(news_data_with_sentiment_path)
print(f"Raw Data \n\n{df_raw.head()}\n\n")
df_processed = preprocess_dataset(df_raw)
# Show the result of preprocessing (the raw frame was already printed above).
print(f"Preprocessed Data \n\n{df_processed.head()}\n\n")
df_processed = generate_topic_encodings(df_processed)  ## model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
print(f"Processed Data \n\n{df_processed.head()}\n\n")
#final_df = add_gold_price_change(df_processed,df_gold)  ## final_df = merged_df[['Date','text','sentiment','topic_encodings','sentiment_combined_encodings','price_percentage_change']].copy()
final_df = add_gold_price_change_with_weekend_handling(df_processed,df_gold)
print(f"Final Data \n\n{final_df.head()}\n\n")

In [None]:
## TODO [Mohan]: Integrate sentiment extraction model here
# UI Based inputs + 



In [None]:
## TODO [Tarun] : Replace the file read with values produced by previous members in the chain.

#news_llm_model_data = pd.read_pickle('data/combined_dataset_with_price_change.pkl')
#print("Number of rows in df:",news_llm_model_data.shape)
#news_llm_model_data.head()

## Group input data into sets for use in model.
#encodings, price_changes, masks = group_into_variable_sets(news_llm_model_data)
#print(f"Encodings shape: {encodings.shape}, Price changes shape: {price_changes.shape}, Masks shape: {masks.shape}")

## Create the dataset
#dataset = VariableSetDataset(encodings, price_changes, masks)
#print(f"Dataset {dataset}")

In [None]:
# Select the PyTorch device: CUDA when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# --------------------------------------------------------------------
# Run the time series models and collect their predictions
# --------------------------------------------------------------------
# Pre-trained models are stored per day, so files dated `today` must exist.
# NOTE: load_pickle unpickles the file; only load model files from a trusted source.
arimax_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/Arimax/arimax_{today}.pkl')
random_forest_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/RandomForest/random_forest_{today}.pkl')
xgboost_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/XGBoost/xgboost_{today}.pkl')

# The LSTM is rebuilt and its saved weights are loaded onto the active device.
lstw_model = LSTMModel(input_size=11).to(device)
lstw_model.load_state_dict(torch.load(f'{WORKAREA}/Tarun/Model/LSTM/lstm_{today}.pt', map_location=device))

# --------------------------------------------------------------------
# One prediction per model, all based on the same price/indicator frame
predicted_price_arimax = predict_next_day_gold_price_arimax(df, arimax_model)  # ARIMAX
predicted_price_rf = predict_next_day_gold_price_rf(df, random_forest_model)  # Random Forest
predicted_price_xgb = predict_next_day_gold_price_xgboost(df, xgboost_model)  # XGBoost
predicted_price_lstw = predict_next_day_gold_price_lstm(df, lstw_model)  # LSTM

# Report all four predictions, one per line.
print(
    f"ARIMAX: Predicted gold price for {next_day}: {predicted_price_arimax}",
    f"Random Forest: Predicted gold price for {next_day}: {predicted_price_rf}",
    f"XGBoost: Predicted gold price for {next_day}: {predicted_price_xgb}",
    f"LSTM: Predicted gold price for {next_day}: {predicted_price_lstw}",
    sep="\n",
)
# --------------------------------------------------------------------

In [None]:
# --------------------------------------------------------------------
# Build the News LLM model and restore its pre-trained weights
# --------------------------------------------------------------------

news_model = SetTransformer(
    dim_input=512,
    num_outputs=1,   # one final prediction
    dim_output=1,    # 1D output for price change
    num_inds=32,
    dim_hidden=128,
    num_heads=4,
    ln=True,         # layer normalization
).to(device)

# --------------------------------------------------------------------
# Restore the checkpoint when it exists; otherwise the model keeps its
# freshly initialised weights.
checkpoint_path = f'{WORKAREA}/Tarun/Model/final_model.pth'
if not os.path.exists(checkpoint_path):
    start_epoch, start_loss = 0, float('inf')
    print(f"No checkpoint found at {checkpoint_path}. Starting from scratch.")
else:
    start_epoch, start_loss = load_checkpoint(checkpoint_path, news_model, device)
    print(f"Model loaded from {checkpoint_path} at epoch {start_epoch} with loss {start_loss:.4f}")

In [None]:
# TODO [Tarun]: Replace the input data with the actual news data for prediction.
# Generate random inputs and masks for testing:
# one set of 10 items with 512-dimensional encodings, all items unmasked.
inputs = np.random.rand(1, 10, 512).astype(np.float32)
masks = np.ones((1,10)).astype(np.float32)

inputs = torch.tensor(inputs, dtype=torch.float32).to(device=device)
masks = torch.tensor(masks, dtype=torch.float32).to(device=device)

print(inputs.shape, masks.shape)
# Inference only: switch to evaluation mode and skip gradient tracking.
news_model.eval()
with torch.no_grad():
    news_llm_change_precentage = news_model(inputs,mask= masks).item() * 100  # Convert to percentage
# Apply the predicted percentage change to the latest known price.
predicted_price_news_llm = current_price*(1 + news_llm_change_precentage / 100)
print(f"News LLM: Predicted gold price for {next_day}: {predicted_price_news_llm:.2f}")

In [None]:
# Load today's stored ensemble and combine the five individual predictions.
# NOTE: load_pickle unpickles the file; only load model files from a trusted source.
ensemble_model = sm.load_pickle(f'{WORKAREA}/Tarun/Model/Final_Ensemble/ensemble_model_{today}.pkl')
results = predict_next_day_gold_price_ensemble(
    ensemble_model=ensemble_model,
    arimax_pred=predicted_price_arimax,
    xgb_pred=predicted_price_xgb,
    rf_pred=predicted_price_rf,
    lstm_pred=predicted_price_lstw,
    llm_pred=predicted_price_news_llm,
)

# Report the meta-ensemble price and its change versus the stored current price.
print(
    f"Ensemble Model Results for {next_day}:",
    f"Predicted Price: {results['predictions']['meta_ensemble']}",
    f"Percentage Change: {results['percentage_changes']['meta_ensemble']:.2f}%",
    sep="\n",
)