## Model for price prediction from sentiment data.
> We need it to take a dictionary of sentence : sentiment pairs.
> This size of the dictionary is variable/flexible.

We could potentially design this module as an Transformer encoder model.


References:
> https://towardsdatascience.com/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1/

> [Set Transformer: A Framework for Attention-based
 Permutation-Invariant Neural Networks](https://arxiv.org/pdf/1810.00825)

 > https://github.com/juho-lee/set_transformer

 > [Deep sets](https://papers.nips.cc/paper_files/paper/2017/hash/f22e4747da1aa27e363d86d40ff442fe-Abstract.html)

Since we want our model to be invarient to permutations in the order in which we feed the news articles and associated sentiments, we can proceed using a set-transformer model.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MAB(nn.Module):
    def __init__(self, dim_Q, dim_K, dim_V, num_heads, ln=False):
        super(MAB, self).__init__()
        self.dim_V = dim_V
        self.num_heads = num_heads
        self.fc_q = nn.Linear(dim_Q, dim_V)
        self.fc_k = nn.Linear(dim_K, dim_V)
        self.fc_v = nn.Linear(dim_K, dim_V)
        if ln:
            self.ln0 = nn.LayerNorm(dim_V)
            self.ln1 = nn.LayerNorm(dim_V)
        self.fc_o = nn.Linear(dim_V, dim_V)

    def forward(self, Q, K):
        Q = self.fc_q(Q)
        K, V = self.fc_k(K), self.fc_v(K)

        dim_split = self.dim_V // self.num_heads
        Q_ = torch.cat(Q.split(dim_split, 2), 0)
        K_ = torch.cat(K.split(dim_split, 2), 0)
        V_ = torch.cat(V.split(dim_split, 2), 0)

        A = torch.softmax(Q_.bmm(K_.transpose(1,2))/math.sqrt(self.dim_V), 2)
        O = torch.cat((Q_ + A.bmm(V_)).split(Q.size(0), 0), 2)
        O = O if getattr(self, 'ln0', None) is None else self.ln0(O)
        O = O + F.relu(self.fc_o(O))
        O = O if getattr(self, 'ln1', None) is None else self.ln1(O)
        return O

class SAB(nn.Module):
    def __init__(self, dim_in, dim_out, num_heads, ln=False):
        super(SAB, self).__init__()
        self.mab = MAB(dim_in, dim_in, dim_out, num_heads, ln=ln)

    def forward(self, X):
        return self.mab(X, X)

class ISAB(nn.Module):
    def __init__(self, dim_in, dim_out, num_heads, num_inds, ln=False):
        super(ISAB, self).__init__()
        self.I = nn.Parameter(torch.Tensor(1, num_inds, dim_out))
        nn.init.xavier_uniform_(self.I)
        self.mab0 = MAB(dim_out, dim_in, dim_out, num_heads, ln=ln)
        self.mab1 = MAB(dim_in, dim_out, dim_out, num_heads, ln=ln)

    def forward(self, X):
        H = self.mab0(self.I.repeat(X.size(0), 1, 1), X)
        return self.mab1(X, H)

class PMA(nn.Module):
    def __init__(self, dim, num_heads, num_seeds, ln=False):
        super(PMA, self).__init__()
        self.S = nn.Parameter(torch.Tensor(1, num_seeds, dim))
        nn.init.xavier_uniform_(self.S)
        self.mab = MAB(dim, dim, dim, num_heads, ln=ln)

    def forward(self, X):
        return self.mab(self.S.repeat(X.size(0), 1, 1), X)

In [2]:
class DeepSet(nn.Module):
    def __init__(self, dim_input, num_outputs, dim_output, dim_hidden=128):
        super(DeepSet, self).__init__()
        self.num_outputs = num_outputs
        self.dim_output = dim_output
        self.enc = nn.Sequential(
                nn.Linear(dim_input, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden))
        self.dec = nn.Sequential(
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, num_outputs*dim_output))

    def forward(self, X):
        X = self.enc(X).mean(-2)
        X = self.dec(X).reshape(-1, self.num_outputs, self.dim_output)
        return X

class SetTransformer(nn.Module):
    def __init__(self, dim_input, num_outputs, dim_output,
            num_inds=32, dim_hidden=128, num_heads=4, ln=False):
        super(SetTransformer, self).__init__()
        self.enc = nn.Sequential(
                ISAB(dim_input, dim_hidden, num_heads, num_inds, ln=ln),
                ISAB(dim_hidden, dim_hidden, num_heads, num_inds, ln=ln))
        self.dec = nn.Sequential(
                PMA(dim_hidden, num_heads, num_outputs, ln=ln),
                SAB(dim_hidden, dim_hidden, num_heads, ln=ln),
                SAB(dim_hidden, dim_hidden, num_heads, ln=ln),
                nn.Linear(dim_hidden, dim_output))

    def forward(self, X):
        return self.dec(self.enc(X))

In [3]:
#Generate some Dummy data to test out the model using
import pandas as pd
import numpy as np

dataset_size = 100
embedding_dim = 256
encodings = np.random.rand(dataset_size, 10, embedding_dim).astype(np.float32)
print(encodings.shape,encodings[:5])

sentiments = np.random.rand(dataset_size, 1).astype(np.float32)
print(sentiments.shape, sentiments[:5])

price_percentage_changes = np.random.rand(dataset_size, 1).astype(np.float32)* 0.2;
print(price_percentage_changes.shape, price_percentage_changes[:5])


(100, 10, 256) [[[0.66429764 0.21788633 0.4673405  ... 0.7302728  0.9655143  0.19107188]
  [0.040458   0.67781246 0.6482353  ... 0.22029753 0.55051154 0.14246999]
  [0.2919994  0.67028606 0.04316776 ... 0.68322104 0.8101939  0.49121815]
  ...
  [0.10633    0.6714629  0.435911   ... 0.5686238  0.10436377 0.34665284]
  [0.92168087 0.7500064  0.45432103 ... 0.8899092  0.48369777 0.69192153]
  [0.04986018 0.19598113 0.8125859  ... 0.68314415 0.10458023 0.85667735]]

 [[0.23552139 0.7084505  0.61391175 ... 0.5516386  0.9953389  0.66800064]
  [0.3906856  0.6991774  0.8284985  ... 0.00889463 0.05936708 0.75884825]
  [0.46545276 0.87437534 0.82551455 ... 0.5051445  0.1157366  0.590836  ]
  ...
  [0.92636377 0.48613355 0.27276507 ... 0.9934282  0.4235375  0.23768935]
  [0.4663848  0.8582081  0.36346737 ... 0.06327558 0.83185446 0.8150008 ]
  [0.20311692 0.35876063 0.22717673 ... 0.05755747 0.40707418 0.5703019 ]]

 [[0.11754234 0.48071656 0.691093   ... 0.30063748 0.6573472  0.20621148]
  [0.08

In [4]:
dummy_df = pd.DataFrame({
    'encodings': list(encodings),
    'sentiments': list(sentiments),
    'price_percentage_changes': list(price_percentage_changes)
})    
dummy_df.head()

Unnamed: 0,encodings,sentiments,price_percentage_changes
0,"[[0.66429764, 0.21788633, 0.4673405, 0.5913993...",[0.5859587],[0.15434435]
1,"[[0.23552139, 0.7084505, 0.61391175, 0.7748798...",[0.3131849],[0.18284075]
2,"[[0.11754234, 0.48071656, 0.691093, 0.7109679,...",[0.23932096],[0.13994107]
3,"[[0.5986723, 0.8277822, 0.79720736, 0.1465592,...",[0.8917242],[0.12949806]
4,"[[0.6071242, 0.49942872, 0.91955084, 0.1184203...",[0.734044],[0.17175135]


In [5]:
from torch.utils.data import Dataset, DataLoader
# Create proper dataset class instead of generator
class NewsDataset(Dataset):
    def __init__(self, encodings, sentiments, price_changes):
        self.encodings = encodings
        self.sentiments = sentiments
        self.price_changes = price_changes
    
    def __len__(self):
        return len(self.encodings)
    
    def __getitem__(self, idx):
        # Multiply encodings by sentiment (broadcasting)
        input_data = self.encodings[idx] * self.sentiments[idx][0]
        target = self.price_changes[idx][0]  # Single value, not array
        
        return torch.tensor(input_data, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

# Create dataset and dataloader
dataset = NewsDataset(encodings, sentiments, price_percentage_changes)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [6]:
model = SetTransformer(
    dim_input = embedding_dim, 
    num_outputs = 1, #One final prediction
    dim_output = 1, #1D output for price change
    num_inds=32, 
    dim_hidden=128, 
    num_heads=4, 
    ln=True #Layer normalization
    )
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [7]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    losses = []
    
    for batch_idx, (inps, outs) in enumerate(train_loader):
        # Move to device
        inps = inps.to(device)
        outs = outs.to(device)
        
        # Forward pass
        preds = model(inps)
        
        # Ensure output shapes match
        if preds.dim() > 1: #Output will be 32*1*1 if batch size is 32
            preds = preds.squeeze(-1)  # Remove last dimension if it's 1
        
        loss = criterion(preds, outs)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
    
    avg_loss = np.mean(losses)
    print(f"Epoch {epoch}: train loss {avg_loss:.4f}")
    
    # Validation every 5 epochs
    if epoch % 5 == 0:
        model.eval()
        val_losses = []
        
        with torch.no_grad():
            for inps, outs in train_loader:  # Using same data for demo
                inps = inps.to(device)
                outs = outs.to(device)
                
                preds = model(inps)
                if preds.dim() > 1:
                    preds = preds.squeeze(-1)
                
                loss = criterion(preds, outs)
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        print(f"Epoch {epoch}: val loss {avg_val_loss:.4f}")
        
        # Print sample predictions
        # print(f"Sample predictions: {preds[:5].cpu().numpy()}")
        # print(f"Sample targets: {outs[:5].cpu().numpy()}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 0: train loss 2.7614
Epoch 0: val loss 0.5410
Epoch 1: train loss 0.2400
Epoch 2: train loss 0.0786
Epoch 3: train loss 0.0711
Epoch 4: train loss 0.0142
Epoch 5: train loss 0.0302
Epoch 5: val loss 0.0118
Epoch 6: train loss 0.0081
Epoch 7: train loss 0.0129
Epoch 8: train loss 0.0042
Epoch 9: train loss 0.0075
Epoch 10: train loss 0.0038
Epoch 10: val loss 0.0046
Epoch 11: train loss 0.0082
Epoch 12: train loss 0.0032
Epoch 13: train loss 0.0063
Epoch 14: train loss 0.0056
Epoch 15: train loss 0.0042
Epoch 15: val loss 0.0043
Epoch 16: train loss 0.0042
Epoch 17: train loss 0.0039
Epoch 18: train loss 0.0062
Epoch 19: train loss 0.0051


In [8]:
print (inps.shape,outs.shape)

torch.Size([4, 10, 256]) torch.Size([4])


# Try with real data

In [46]:
#Load gold price data
df_gold = pd.read_csv('..\Gold_Price_Prediction\Data\GOLDBEES_ETF_price_data.csv')
print("Number of rows in gold data:", len(df_gold))
df_gold.head()


Number of rows in gold data: 3786


Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2010-02-05,15.98,16.011,15.7385,15.765,2836000
1,2010-02-08,16.031,16.098,15.96,16.0625,445700
2,2010-02-09,16.065001,16.065001,15.96,15.9972,669100
3,2010-02-10,16.09,16.108999,16.0221,16.0609,335400
4,2010-02-11,16.099001,16.099001,16.0305,16.059299,385300


In [57]:
#Load cleaned news data
df_news = pd.read_csv('..\Sentiment_analysis\Cleaned_data_bullionvault_articles.csv')
print("Number of rows in news data:", len(df_news))
df_news.head()

Number of rows in news data: 252


Unnamed: 0,Date,Cleaned_Content
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...
1,2025-06-03,Gold Investing Trends Higher at Fresh Record P...
2,2025-05-29,Platinum Price 'Could Hit $1200' Amid 2025 Sup...
3,2025-05-14,Gold Volatility Tops Silver's the Most Since 9...
4,2025-05-12,Which Country Owns the Most Gold? Gold Reserve...


In [58]:
#Have a look at the data.
print(df_news.Cleaned_Content[0])
print("Number of words in scrape row0:",len(df_news.Cleaned_Content[0].split(" ")))
print("Number of rows in all:",len(df_news))

#Do some quick cleaning. We will replace this with data from Deepak and Tejashwini
length = 256
df_news['split_sentences'] = df_news['Cleaned_Content'].apply(lambda x: [x[i:i+length] for i in range(0, len(x), length)])
df_news.head()


Gold Price $3500 vs. the Investing Crowd Article:Did gold investing prices leap too far already in 2025...? DID GOLD $3500 mark a big top for the safe-haven metal back in April? asks Adrian Ash at BullionVault. History says maybe. Investment professionals think so, too. "According to Bank of America's latest fund managers survey, nearly half of the fund managers surveyed (49%) see long gold, or bets that gold prices will rise, as the most crowded trade in the market right now. "This," explained Yahoo! in April, "marks the first time in two years that fund managers did not see the Magnificent Seven [of giant US tech stocks] as Wall Street's most crowded trade." Gold's over-crowding then got worse in May, or so the 208 institutional fund managers replying to BoA's monthly survey said. A massive 58% of them labelled gold "the most crowded trade"...! But really? Sure, physical gold investing has picked up in 2025. March and April each brought BullionVault more first-time users than any mon

Unnamed: 0,Date,Cleaned_Content,split_sentences
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,[Gold Price $3500 vs. the Investing Crowd Arti...
1,2025-06-03,Gold Investing Trends Higher at Fresh Record P...,[Gold Investing Trends Higher at Fresh Record ...
2,2025-05-29,Platinum Price 'Could Hit $1200' Amid 2025 Sup...,[Platinum Price 'Could Hit $1200' Amid 2025 Su...
3,2025-05-14,Gold Volatility Tops Silver's the Most Since 9...,[Gold Volatility Tops Silver's the Most Since ...
4,2025-05-12,Which Country Owns the Most Gold? Gold Reserve...,[Which Country Owns the Most Gold? Gold Reserv...


In [59]:
#New dataframe with usable encodings
split_data_dict = df_news.explode('split_sentences').reset_index(drop=True)
print("Number of rows after splitting sentences:", len(split_data_dict))
split_data_dict.head()

Number of rows after splitting sentences: 7327


Unnamed: 0,Date,Cleaned_Content,split_sentences
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Gold Price $3500 vs. the Investing Crowd Artic...
1,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"think so, too. ""According to Bank of America's..."
2,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"ril, ""marks the first time in two years that f..."
3,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,oA's monthly survey said. A massive 58% of the...
4,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Most of them have chosen to buy gold first. Bu...


In [60]:
#Temporary model. We will replace this with what Mohan is working on.

# Import libraries
import transformers
import torch

#"DistilBERT is a smaller, faster and cheaper version of BERT. [Around 268 Mb]
sentiment_pipeline = transformers.pipeline(
    task="text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    torch_dtype=torch.float16,
    device=0
) 


In [61]:
# Get sentiment scores for the news content
data = list(split_data_dict.split_sentences)
result = sentiment_pipeline(data)
print(result)


[{'label': 'NEGATIVE', 'score': 0.9938787817955017}, {'label': 'NEGATIVE', 'score': 0.9698996543884277}, {'label': 'NEGATIVE', 'score': 0.997065007686615}, {'label': 'POSITIVE', 'score': 0.9930423498153687}, {'label': 'NEGATIVE', 'score': 0.9952816367149353}, {'label': 'NEGATIVE', 'score': 0.9851003289222717}, {'label': 'NEGATIVE', 'score': 0.9988120794296265}, {'label': 'NEGATIVE', 'score': 0.999321460723877}, {'label': 'NEGATIVE', 'score': 0.9956600069999695}, {'label': 'NEGATIVE', 'score': 0.9947697520256042}, {'label': 'NEGATIVE', 'score': 0.9925393462181091}, {'label': 'POSITIVE', 'score': 0.5716919302940369}, {'label': 'NEGATIVE', 'score': 0.9970190525054932}, {'label': 'NEGATIVE', 'score': 0.9564718008041382}, {'label': 'NEGATIVE', 'score': 0.6514380574226379}, {'label': 'NEGATIVE', 'score': 0.9764232039451599}, {'label': 'NEGATIVE', 'score': 0.9993588328361511}, {'label': 'NEGATIVE', 'score': 0.9886682629585266}, {'label': 'NEGATIVE', 'score': 0.9995793700218201}, {'label': 'NE

In [None]:
#Add the sentiment scores to the dataframe
df_sentiment = pd.DataFrame(result)
df_combined = pd.concat([split_data_dict, df_sentiment], axis=1)
df_combined.head()

Unnamed: 0,Date,Cleaned_Content,split_sentences,label,score
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Gold Price $3500 vs. the Investing Crowd Artic...,NEGATIVE,0.993879
1,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"think so, too. ""According to Bank of America's...",NEGATIVE,0.9699
2,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"ril, ""marks the first time in two years that f...",NEGATIVE,0.997065
3,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,oA's monthly survey said. A massive 58% of the...,POSITIVE,0.993042
4,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Most of them have chosen to buy gold first. Bu...,NEGATIVE,0.995282


In [None]:
print(df_combined.label.unique())  # Check unique labels
print(df_combined.score.describe())  # Check score distribution
#Looks like score is confidence score, label is positive or negative sentiment.


['NEGATIVE' 'POSITIVE']
count    7327.000000
mean        0.950882
std         0.097278
min         0.500000
25%         0.963643
50%         0.991748
75%         0.997697
max         0.999853
Name: score, dtype: float64


In [None]:
#Combine the sentiment and score to get a single representative value.
df_combined['sentiment'] = df_combined['label'].apply(lambda x: 1 if x == 'POSITIVE' else -1) * df_combined['score']
df_combined.head()

Unnamed: 0,Date,Cleaned_Content,split_sentences,label,score,sentiment
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Gold Price $3500 vs. the Investing Crowd Artic...,NEGATIVE,0.993879,-0.993879
1,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"think so, too. ""According to Bank of America's...",NEGATIVE,0.9699,-0.9699
2,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"ril, ""marks the first time in two years that f...",NEGATIVE,0.997065,-0.997065
3,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,oA's monthly survey said. A massive 58% of the...,POSITIVE,0.993042,0.993042
4,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Most of them have chosen to buy gold first. Bu...,NEGATIVE,0.995282,-0.995282


In [78]:
#Remove all unneccsary columns.
df_data = df_combined.drop(columns=['Cleaned_Content', 'label', 'score']).rename(columns={'split_sentences': 'text'})
print("Number of rows in final data:", len(df_data))
df_data.head()

Number of rows in final data: 7327


Unnamed: 0,Date,text,sentiment
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,-0.993879
1,2025-06-11,"think so, too. ""According to Bank of America's...",-0.9699
2,2025-06-11,"ril, ""marks the first time in two years that f...",-0.997065
3,2025-06-11,oA's monthly survey said. A massive 58% of the...,0.993042
4,2025-06-11,Most of them have chosen to buy gold first. Bu...,-0.995282


In [80]:
#Generate topic encodings for text using our model.

#Import necessary packages.
import tensorflow_hub as hub
import tensorflow as tf

# Load the Universal Sentence Encoder model
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #This is around 1 GB in size, it took a while for me to run this.
embed = hub.load(model_url)

In [None]:
# Generate embeddings
embeddings = embed(list(df_data['text']))

In [101]:
#Add it to our Df
df_data['topic_encodings'] = list(np.array(embeddings))
df_data.head()

Unnamed: 0,Date,text,sentiment,topic_encodings
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,-0.993879,"[-0.023777742, -0.069628574, 0.016904347, -0.0..."
1,2025-06-11,"think so, too. ""According to Bank of America's...",-0.9699,"[-0.046788722, -0.0713354, 0.04981599, 0.00485..."
2,2025-06-11,"ril, ""marks the first time in two years that f...",-0.997065,"[-0.044499613, -0.07782423, 0.00515722, -0.067..."
3,2025-06-11,oA's monthly survey said. A massive 58% of the...,0.993042,"[-0.06420384, -0.0801013, 0.016934335, 0.00675..."
4,2025-06-11,Most of them have chosen to buy gold first. Bu...,-0.995282,"[-0.06829211, -0.092043534, 0.029120624, -0.01..."
