## Model for price prediction from sentiment data.
> We need it to take a dictionary of sentence : sentiment pairs.
> This size of the dictionary is variable/flexible.

We could potentially design this module as an Transformer encoder model.


References:
> https://towardsdatascience.com/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1/

> [Set Transformer: A Framework for Attention-based
 Permutation-Invariant Neural Networks](https://arxiv.org/pdf/1810.00825)

 > https://github.com/juho-lee/set_transformer

 > [Deep sets](https://papers.nips.cc/paper_files/paper/2017/hash/f22e4747da1aa27e363d86d40ff442fe-Abstract.html)

Since we want our model to be invarient to permutations in the order in which we feed the news articles and associated sentiments, we can proceed using a set-transformer model.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MAB(nn.Module):
    def __init__(self, dim_Q, dim_K, dim_V, num_heads, ln=False):
        super(MAB, self).__init__()
        self.dim_V = dim_V
        self.num_heads = num_heads
        self.fc_q = nn.Linear(dim_Q, dim_V)
        self.fc_k = nn.Linear(dim_K, dim_V)
        self.fc_v = nn.Linear(dim_K, dim_V)
        if ln:
            self.ln0 = nn.LayerNorm(dim_V)
            self.ln1 = nn.LayerNorm(dim_V)
        self.fc_o = nn.Linear(dim_V, dim_V)

    def forward(self, Q, K):
        Q = self.fc_q(Q)
        K, V = self.fc_k(K), self.fc_v(K)

        dim_split = self.dim_V // self.num_heads
        Q_ = torch.cat(Q.split(dim_split, 2), 0)
        K_ = torch.cat(K.split(dim_split, 2), 0)
        V_ = torch.cat(V.split(dim_split, 2), 0)

        A = torch.softmax(Q_.bmm(K_.transpose(1,2))/math.sqrt(self.dim_V), 2)
        O = torch.cat((Q_ + A.bmm(V_)).split(Q.size(0), 0), 2)
        O = O if getattr(self, 'ln0', None) is None else self.ln0(O)
        O = O + F.relu(self.fc_o(O))
        O = O if getattr(self, 'ln1', None) is None else self.ln1(O)
        return O

class SAB(nn.Module):
    def __init__(self, dim_in, dim_out, num_heads, ln=False):
        super(SAB, self).__init__()
        self.mab = MAB(dim_in, dim_in, dim_out, num_heads, ln=ln)

    def forward(self, X):
        return self.mab(X, X)

class ISAB(nn.Module):
    def __init__(self, dim_in, dim_out, num_heads, num_inds, ln=False):
        super(ISAB, self).__init__()
        self.I = nn.Parameter(torch.Tensor(1, num_inds, dim_out))
        nn.init.xavier_uniform_(self.I)
        self.mab0 = MAB(dim_out, dim_in, dim_out, num_heads, ln=ln)
        self.mab1 = MAB(dim_in, dim_out, dim_out, num_heads, ln=ln)

    def forward(self, X):
        H = self.mab0(self.I.repeat(X.size(0), 1, 1), X)
        return self.mab1(X, H)

class PMA(nn.Module):
    def __init__(self, dim, num_heads, num_seeds, ln=False):
        super(PMA, self).__init__()
        self.S = nn.Parameter(torch.Tensor(1, num_seeds, dim))
        nn.init.xavier_uniform_(self.S)
        self.mab = MAB(dim, dim, dim, num_heads, ln=ln)

    def forward(self, X):
        return self.mab(self.S.repeat(X.size(0), 1, 1), X)

In [2]:
class DeepSet(nn.Module):
    def __init__(self, dim_input, num_outputs, dim_output, dim_hidden=128):
        super(DeepSet, self).__init__()
        self.num_outputs = num_outputs
        self.dim_output = dim_output
        self.enc = nn.Sequential(
                nn.Linear(dim_input, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden))
        self.dec = nn.Sequential(
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, dim_hidden),
                nn.ReLU(),
                nn.Linear(dim_hidden, num_outputs*dim_output))

    def forward(self, X):
        X = self.enc(X).mean(-2)
        X = self.dec(X).reshape(-1, self.num_outputs, self.dim_output)
        return X

class SetTransformer(nn.Module):
    def __init__(self, dim_input, num_outputs, dim_output,
            num_inds=32, dim_hidden=128, num_heads=4, ln=False):
        super(SetTransformer, self).__init__()
        self.enc = nn.Sequential(
                ISAB(dim_input, dim_hidden, num_heads, num_inds, ln=ln),
                ISAB(dim_hidden, dim_hidden, num_heads, num_inds, ln=ln))
        self.dec = nn.Sequential(
                PMA(dim_hidden, num_heads, num_outputs, ln=ln),
                SAB(dim_hidden, dim_hidden, num_heads, ln=ln),
                SAB(dim_hidden, dim_hidden, num_heads, ln=ln),
                nn.Linear(dim_hidden, dim_output))

    def forward(self, X):
        return self.dec(self.enc(X))

In [3]:
#Generate some Dummy data to test out the model using
import pandas as pd
import numpy as np

dataset_size = 100
embedding_dim = 256
encodings = np.random.rand(dataset_size, 10, embedding_dim).astype(np.float32)
print(encodings.shape,encodings[:5])

sentiments = np.random.rand(dataset_size, 1).astype(np.float32)
print(sentiments.shape, sentiments[:5])

price_percentage_changes = np.random.rand(dataset_size, 1).astype(np.float32)* 0.2;
print(price_percentage_changes.shape, price_percentage_changes[:5])


(100, 10, 256) [[[0.49589458 0.10688338 0.07089077 ... 0.2417602  0.13408759 0.3104174 ]
  [0.2830605  0.7448318  0.06072749 ... 0.9730455  0.63575613 0.5952416 ]
  [0.16305001 0.9752018  0.47348017 ... 0.684376   0.20311153 0.68559074]
  ...
  [0.8857465  0.9527168  0.08160522 ... 0.8719958  0.1463117  0.59270805]
  [0.9071859  0.5187692  0.01907815 ... 0.25754723 0.9248328  0.21163559]
  [0.3347137  0.9098806  0.94591063 ... 0.1439867  0.6698862  0.017032  ]]

 [[0.1597876  0.68441844 0.79044855 ... 0.75574255 0.31490383 0.88023686]
  [0.50667256 0.88767827 0.50290173 ... 0.90525234 0.29555482 0.45902964]
  [0.50102556 0.6491193  0.43429908 ... 0.1339961  0.56095845 0.13995658]
  ...
  [0.18016414 0.4612971  0.22606702 ... 0.06254128 0.5076931  0.4933054 ]
  [0.4924748  0.3433779  0.67802346 ... 0.9883423  0.8737031  0.46125203]
  [0.9302367  0.8404922  0.0609713  ... 0.9834517  0.3138393  0.96972144]]

 [[0.9989209  0.869532   0.674207   ... 0.46666616 0.1801449  0.78210807]
  [0.42

In [4]:
dummy_df = pd.DataFrame({
    'encodings': list(encodings),
    'sentiments': list(sentiments),
    'price_percentage_changes': list(price_percentage_changes)
})    
dummy_df.head()

Unnamed: 0,encodings,sentiments,price_percentage_changes
0,"[[0.49589458, 0.106883384, 0.07089077, 0.11005...",[0.46531165],[0.14830486]
1,"[[0.1597876, 0.68441844, 0.79044855, 0.390362,...",[0.06911514],[0.10886174]
2,"[[0.9989209, 0.869532, 0.674207, 0.054929234, ...",[0.11816343],[0.17394719]
3,"[[0.08408968, 0.9981818, 0.85595757, 0.660596,...",[0.6874855],[0.06424456]
4,"[[0.27143306, 0.4095051, 0.14273366, 0.2883808...",[0.06379012],[0.17689848]


In [5]:
from torch.utils.data import Dataset, DataLoader
# Create proper dataset class instead of generator
class NewsDataset(Dataset):
    def __init__(self, encodings, sentiments, price_changes):
        self.encodings = encodings
        self.sentiments = sentiments
        self.price_changes = price_changes
    
    def __len__(self):
        return len(self.encodings)
    
    def __getitem__(self, idx):
        # Multiply encodings by sentiment (broadcasting)
        input_data = self.encodings[idx] * self.sentiments[idx][0]
        target = self.price_changes[idx][0]  # Single value, not array
        
        return torch.tensor(input_data, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

# Create dataset and dataloader
dataset = NewsDataset(encodings, sentiments, price_percentage_changes)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [6]:
model = SetTransformer(
    dim_input = embedding_dim, 
    num_outputs = 1, #One final prediction
    dim_output = 1, #1D output for price change
    num_inds=32, 
    dim_hidden=128, 
    num_heads=4, 
    ln=True #Layer normalization
    )
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [7]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    losses = []
    
    for batch_idx, (inps, outs) in enumerate(train_loader):
        # Move to device
        inps = inps.to(device)
        outs = outs.to(device)
        
        # Forward pass
        preds = model(inps)
        
        # Ensure output shapes match
        if preds.dim() > 1: #Output will be 32*1*1 if batch size is 32
            preds = preds.squeeze(-1)  # Remove last dimension if it's 1
        
        loss = criterion(preds, outs)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
    
    avg_loss = np.mean(losses)
    print(f"Epoch {epoch}: train loss {avg_loss:.4f}")
    
    # Validation every 5 epochs
    if epoch % 5 == 0:
        model.eval()
        val_losses = []
        
        with torch.no_grad():
            for inps, outs in train_loader:  # Using same data for demo
                inps = inps.to(device)
                outs = outs.to(device)
                
                preds = model(inps)
                if preds.dim() > 1:
                    preds = preds.squeeze(-1)
                
                loss = criterion(preds, outs)
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        print(f"Epoch {epoch}: val loss {avg_val_loss:.4f}")
        
        # Print sample predictions
        # print(f"Sample predictions: {preds[:5].cpu().numpy()}")
        # print(f"Sample targets: {outs[:5].cpu().numpy()}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 0: train loss 2.3102
Epoch 0: val loss 0.3143
Epoch 1: train loss 0.1318
Epoch 2: train loss 0.0514
Epoch 3: train loss 0.0481
Epoch 4: train loss 0.0109
Epoch 5: train loss 0.0237
Epoch 5: val loss 0.0056
Epoch 6: train loss 0.0110
Epoch 7: train loss 0.0100
Epoch 8: train loss 0.0059
Epoch 9: train loss 0.0052
Epoch 10: train loss 0.0044
Epoch 10: val loss 0.0034
Epoch 11: train loss 0.0038
Epoch 12: train loss 0.0045
Epoch 13: train loss 0.0034
Epoch 14: train loss 0.0057
Epoch 15: train loss 0.0057
Epoch 15: val loss 0.0051
Epoch 16: train loss 0.0056
Epoch 17: train loss 0.0039
Epoch 18: train loss 0.0038
Epoch 19: train loss 0.0042


In [8]:
print (inps.shape,outs.shape)

torch.Size([4, 10, 256]) torch.Size([4])


# Try with real data

In [9]:
#Load gold price data
df_gold = pd.read_csv('..\Gold_Price_Prediction\Data\GOLDBEES_ETF_price_data.csv')
print("Number of rows in gold data:", len(df_gold))
df_gold.head()


Number of rows in gold data: 3786


Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2010-02-05,15.98,16.011,15.7385,15.765,2836000
1,2010-02-08,16.031,16.098,15.96,16.0625,445700
2,2010-02-09,16.065001,16.065001,15.96,15.9972,669100
3,2010-02-10,16.09,16.108999,16.0221,16.0609,335400
4,2010-02-11,16.099001,16.099001,16.0305,16.059299,385300


In [10]:
#Load cleaned news data
df_news = pd.read_csv('..\Sentiment_analysis\Cleaned_data_bullionvault_articles.csv')
print("Number of rows in news data:", len(df_news))
df_news.head()

Number of rows in news data: 252


Unnamed: 0,Date,Cleaned_Content
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...
1,2025-06-03,Gold Investing Trends Higher at Fresh Record P...
2,2025-05-29,Platinum Price 'Could Hit $1200' Amid 2025 Sup...
3,2025-05-14,Gold Volatility Tops Silver's the Most Since 9...
4,2025-05-12,Which Country Owns the Most Gold? Gold Reserve...


In [11]:
#Have a look at the data.
print(df_news.Cleaned_Content[0])
print("Number of words in scrape row0:",len(df_news.Cleaned_Content[0].split(" ")))
print("Number of rows in all:",len(df_news))

#Do some quick cleaning. We will replace this with data from Deepak and Tejashwini
length = 256
df_news['split_sentences'] = df_news['Cleaned_Content'].apply(lambda x: [x[i:i+length] for i in range(0, len(x), length)])
df_news.head()


Gold Price $3500 vs. the Investing Crowd Article:Did gold investing prices leap too far already in 2025...? DID GOLD $3500 mark a big top for the safe-haven metal back in April? asks Adrian Ash at BullionVault. History says maybe. Investment professionals think so, too. "According to Bank of America's latest fund managers survey, nearly half of the fund managers surveyed (49%) see long gold, or bets that gold prices will rise, as the most crowded trade in the market right now. "This," explained Yahoo! in April, "marks the first time in two years that fund managers did not see the Magnificent Seven [of giant US tech stocks] as Wall Street's most crowded trade." Gold's over-crowding then got worse in May, or so the 208 institutional fund managers replying to BoA's monthly survey said. A massive 58% of them labelled gold "the most crowded trade"...! But really? Sure, physical gold investing has picked up in 2025. March and April each brought BullionVault more first-time users than any mon

Unnamed: 0,Date,Cleaned_Content,split_sentences
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,[Gold Price $3500 vs. the Investing Crowd Arti...
1,2025-06-03,Gold Investing Trends Higher at Fresh Record P...,[Gold Investing Trends Higher at Fresh Record ...
2,2025-05-29,Platinum Price 'Could Hit $1200' Amid 2025 Sup...,[Platinum Price 'Could Hit $1200' Amid 2025 Su...
3,2025-05-14,Gold Volatility Tops Silver's the Most Since 9...,[Gold Volatility Tops Silver's the Most Since ...
4,2025-05-12,Which Country Owns the Most Gold? Gold Reserve...,[Which Country Owns the Most Gold? Gold Reserv...


In [12]:
#New dataframe with usable encodings
split_data_dict = df_news.explode('split_sentences').reset_index(drop=True)
print("Number of rows after splitting sentences:", len(split_data_dict))
split_data_dict.head()

Number of rows after splitting sentences: 7327


Unnamed: 0,Date,Cleaned_Content,split_sentences
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Gold Price $3500 vs. the Investing Crowd Artic...
1,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"think so, too. ""According to Bank of America's..."
2,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"ril, ""marks the first time in two years that f..."
3,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,oA's monthly survey said. A massive 58% of the...
4,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Most of them have chosen to buy gold first. Bu...


In [13]:
#Temporary model. We will replace this with what Mohan is working on.

# Import libraries
import transformers
import torch

#"DistilBERT is a smaller, faster and cheaper version of BERT. [Around 268 Mb]
sentiment_pipeline = transformers.pipeline(
    task="text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    torch_dtype=torch.float16,
    device=0
) 


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [14]:
# Get sentiment scores for the news content
data = list(split_data_dict.split_sentences)
result = sentiment_pipeline(data)
print(result)


[{'label': 'NEGATIVE', 'score': 0.9938787817955017}, {'label': 'NEGATIVE', 'score': 0.9698996543884277}, {'label': 'NEGATIVE', 'score': 0.997065007686615}, {'label': 'POSITIVE', 'score': 0.9930423498153687}, {'label': 'NEGATIVE', 'score': 0.9952816367149353}, {'label': 'NEGATIVE', 'score': 0.9851003289222717}, {'label': 'NEGATIVE', 'score': 0.9988120794296265}, {'label': 'NEGATIVE', 'score': 0.999321460723877}, {'label': 'NEGATIVE', 'score': 0.9956600069999695}, {'label': 'NEGATIVE', 'score': 0.9947697520256042}, {'label': 'NEGATIVE', 'score': 0.9925393462181091}, {'label': 'POSITIVE', 'score': 0.5716919302940369}, {'label': 'NEGATIVE', 'score': 0.9970190525054932}, {'label': 'NEGATIVE', 'score': 0.9564718008041382}, {'label': 'NEGATIVE', 'score': 0.6514380574226379}, {'label': 'NEGATIVE', 'score': 0.9764232039451599}, {'label': 'NEGATIVE', 'score': 0.9993588328361511}, {'label': 'NEGATIVE', 'score': 0.9886682629585266}, {'label': 'NEGATIVE', 'score': 0.9995793700218201}, {'label': 'NE

In [15]:
#Add the sentiment scores to the dataframe
df_sentiment = pd.DataFrame(result)
df_combined = pd.concat([split_data_dict, df_sentiment], axis=1)
print("Combined DataFrame shape:", df_combined.shape)
df_combined.head()

Combined DataFrame shape: (7327, 5)


Unnamed: 0,Date,Cleaned_Content,split_sentences,label,score
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Gold Price $3500 vs. the Investing Crowd Artic...,NEGATIVE,0.993879
1,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"think so, too. ""According to Bank of America's...",NEGATIVE,0.9699
2,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"ril, ""marks the first time in two years that f...",NEGATIVE,0.997065
3,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,oA's monthly survey said. A massive 58% of the...,POSITIVE,0.993042
4,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Most of them have chosen to buy gold first. Bu...,NEGATIVE,0.995282


In [16]:
print(df_combined.label.unique())  # Check unique labels
print(df_combined.score.describe())  # Check score distribution
#Looks like score is confidence score, label is positive or negative sentiment.


['NEGATIVE' 'POSITIVE']
count    7327.000000
mean        0.950882
std         0.097278
min         0.500000
25%         0.963643
50%         0.991748
75%         0.997697
max         0.999853
Name: score, dtype: float64


In [17]:
#Combine the sentiment and score to get a single representative value.
df_combined['sentiment'] = df_combined['label'].apply(lambda x: 1 if x == 'POSITIVE' else -1) * df_combined['score']
df_combined.head()

Unnamed: 0,Date,Cleaned_Content,split_sentences,label,score,sentiment
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Gold Price $3500 vs. the Investing Crowd Artic...,NEGATIVE,0.993879,-0.993879
1,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"think so, too. ""According to Bank of America's...",NEGATIVE,0.9699,-0.9699
2,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,"ril, ""marks the first time in two years that f...",NEGATIVE,0.997065,-0.997065
3,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,oA's monthly survey said. A massive 58% of the...,POSITIVE,0.993042,0.993042
4,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,Most of them have chosen to buy gold first. Bu...,NEGATIVE,0.995282,-0.995282


In [18]:
#Remove all unneccsary columns.
df_data = df_combined.drop(columns=['Cleaned_Content', 'label', 'score']).rename(columns={'split_sentences': 'text'})
print("\nNumber of rows in final data:", len(df_data))
print("\nNumber of unique dates", len(df_data['Date'].unique()))
print("\nNumber of rows per date:\n", df_data.groupby('Date').size().describe())
df_data.head()


Number of rows in final data: 7327

Number of unique dates 252

Number of rows per date:
 count    252.000000
mean      29.075397
std       17.429376
min        5.000000
25%       18.000000
50%       25.000000
75%       34.250000
max      118.000000
dtype: float64


Unnamed: 0,Date,text,sentiment
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,-0.993879
1,2025-06-11,"think so, too. ""According to Bank of America's...",-0.9699
2,2025-06-11,"ril, ""marks the first time in two years that f...",-0.997065
3,2025-06-11,oA's monthly survey said. A massive 58% of the...,0.993042
4,2025-06-11,Most of them have chosen to buy gold first. Bu...,-0.995282


In [19]:
#Generate topic encodings for text using our model.

#Import necessary packages.
import tensorflow_hub as hub
import tensorflow as tf

# Load the Universal Sentence Encoder model
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #This is around 1 GB in size, it took a while for me to run this.
embed = hub.load(model_url)

In [20]:
# Generate embeddings
embeddings = embed(list(df_data['text']))

In [21]:
#Add it to our Df
df_data['topic_encodings'] = list(np.array(embeddings))
df_data.head()

Unnamed: 0,Date,text,sentiment,topic_encodings
0,2025-06-11,Gold Price $3500 vs. the Investing Crowd Artic...,-0.993879,"[-0.023777742, -0.069628574, 0.016904347, -0.0..."
1,2025-06-11,"think so, too. ""According to Bank of America's...",-0.9699,"[-0.046788722, -0.0713354, 0.04981599, 0.00485..."
2,2025-06-11,"ril, ""marks the first time in two years that f...",-0.997065,"[-0.044499613, -0.07782423, 0.00515722, -0.067..."
3,2025-06-11,oA's monthly survey said. A massive 58% of the...,0.993042,"[-0.06420384, -0.0801013, 0.016934335, 0.00675..."
4,2025-06-11,Most of them have chosen to buy gold first. Bu...,-0.995282,"[-0.06829211, -0.092043534, 0.029120624, -0.01..."


In [22]:
#Merge above dataframe with our gold data.

df_gold_premerge = df_gold.copy()
df_data_premerge = df_data.copy()

df_gold_premerge['Date'] = pd.to_datetime(df_gold_premerge['Date'])
df_data_premerge['Date'] = pd.to_datetime(df_data_premerge['Date'])

# Compute the relative change in price from one day to the next before merging.
df_gold_premerge = df_gold_premerge.sort_values(by='Date').reset_index(drop=True)
df_gold_premerge['next_day_price'] = df_gold_premerge['Close'].shift(-1)
df_gold_premerge['next_day'] = df_gold_premerge['Date'].shift(-1)
df_gold_premerge['day_gap'] = (df_gold_premerge['next_day'] - df_gold_premerge['Date']).dt.days
df_gold_premerge['relative_change'] = (df_gold_premerge['next_day_price'] - df_gold_premerge['Close']) / df_gold_premerge['day_gap']

#Make it a percentage change
df_gold_premerge['relative_change'] = 100 * (df_gold_premerge['relative_change'] / df_gold_premerge['Close'])

## Commenting out since we already have the change to the next day.
# # We want to predict gold price for the next day. Data to use for prediction is the day before. ]
# df_gold_premerge['Date'] = df_gold_premerge['Date'] - pd.Timedelta(days=1)

# Perform the merge on the adjusted date
merged_df = pd.merge(df_data_premerge, df_gold_premerge, on='Date', how='inner')
merged_df['sentiment_combined_encodings'] =  merged_df['topic_encodings'] * merged_df['sentiment']

merged_df.groupby('Date').first().sort_values(by='Date').head()

Unnamed: 0_level_0,text,sentiment,topic_encodings,Open,High,Low,Close,Volume,next_day_price,next_day,day_gap,relative_change,sentiment_combined_encodings
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-10-03,Gold Bullion Finds Fewest Buyers in 4 Years Ar...,-0.996489,"[-0.05636743, -0.07095238, 0.006274609, -0.017...",48.639999,49.18,48.200001,48.84,1986307,48.389999,2023-10-04,1.0,-0.921377,"[0.056169543, 0.07070329, -0.006252581, 0.0176..."
2023-10-09,"Why China's Buying Gold Article:Stocks, Yuan, ...",-0.999599,"[-0.042531792, -0.07121902, -0.0014956162, 0.0...",49.5,49.59,48.779999,48.900002,1497206,48.93,2023-10-10,1.0,0.061347,"[0.042514723, 0.07119044, 0.0014950159, -0.024..."
2023-10-11,Weaponizing Migration Article:The strategic ma...,0.995372,"[-0.009264145, -0.026787553, 0.031075692, 0.04...",48.93,49.200001,48.91,49.169998,696759,49.389999,2023-10-12,1.0,0.44743,"[-0.009221275, -0.026663592, 0.030931888, 0.04..."
2023-10-12,US Stocks Rose After Pearl Harbor Article:But ...,-0.997717,"[-0.059025448, -0.05793509, 0.026720842, 0.035...",49.389999,49.459999,49.209999,49.389999,716608,49.48,2023-10-13,1.0,0.182223,"[0.058890708, 0.057802837, -0.026659846, -0.03..."
2023-10-19,Zombie Credit to Die At Last? Article:The Fed ...,-0.99869,"[-0.021228043, 0.03637999, 0.021637974, -0.024...",50.740002,50.990002,50.720001,50.93,216275,51.580002,2023-10-20,1.0,1.276265,"[0.021200243, -0.036332346, -0.021609638, 0.02..."


In [23]:
## Visual sanity check
# df_gold_premerge[['Date','next_day','day_gap','Close','next_day_price','relative_change']].head()

In [24]:
#Remove the extra columns we don't need.
final_df = merged_df.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume', 'next_day_price', 'day_gap'])

#View it
final_df.groupby('Date').first().sort_values(by='Date').head()

Unnamed: 0_level_0,text,sentiment,topic_encodings,next_day,relative_change,sentiment_combined_encodings
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-03,Gold Bullion Finds Fewest Buyers in 4 Years Ar...,-0.996489,"[-0.05636743, -0.07095238, 0.006274609, -0.017...",2023-10-04,-0.921377,"[0.056169543, 0.07070329, -0.006252581, 0.0176..."
2023-10-09,"Why China's Buying Gold Article:Stocks, Yuan, ...",-0.999599,"[-0.042531792, -0.07121902, -0.0014956162, 0.0...",2023-10-10,0.061347,"[0.042514723, 0.07119044, 0.0014950159, -0.024..."
2023-10-11,Weaponizing Migration Article:The strategic ma...,0.995372,"[-0.009264145, -0.026787553, 0.031075692, 0.04...",2023-10-12,0.44743,"[-0.009221275, -0.026663592, 0.030931888, 0.04..."
2023-10-12,US Stocks Rose After Pearl Harbor Article:But ...,-0.997717,"[-0.059025448, -0.05793509, 0.026720842, 0.035...",2023-10-13,0.182223,"[0.058890708, 0.057802837, -0.026659846, -0.03..."
2023-10-19,Zombie Credit to Die At Last? Article:The Fed ...,-0.99869,"[-0.021228043, 0.03637999, 0.021637974, -0.024...",2023-10-20,1.276265,"[0.021200243, -0.036332346, -0.021609638, 0.02..."


In [None]:
# final_df.to_csv('temporary_gold_news_data.csv', index=False)

# #Load the same file again
# import pandas as pd
# final_df = pd.read_csv('temporary_gold_news_data.csv')

In [25]:
final_df.groupby('Date').count().sort_values(by='text')['text'].describe()
#Date with lowest number of articles is 5 articles. Date with highest number of articles is 118 articles. Mean is around 30.

count    203.000000
mean      29.655172
std       18.478495
min        5.000000
25%       18.000000
50%       25.000000
75%       35.500000
max      118.000000
Name: text, dtype: float64

In [26]:
## Finally create the datagenerator and train our model!
articles_per_day = int(np.round(final_df.groupby('Date')['text'].count().mean()))
print("Number or articles model will expect for each prediction",articles_per_day)


# Create dataset and dataloader
encodings = final_df.groupby('Date')['sentiment_combined_encodings'].apply(list).apply(lambda x: [x[i%len(x)] for i in range(articles_per_day)])
sentiments = final_df.groupby('Date').first()['relative_change'].values
price_percentage_changes = final_df.groupby('Date').first()['relative_change'].values

encodings = np.array(encodings.tolist(), dtype=np.float32)
sentiments = np.array(sentiments, dtype=np.float32).reshape(-1, 1)  # Ensure shape is (N, 1)
price_percentage_changes = np.array(price_percentage_changes, dtype=np.float32).reshape(-1, 1)  # Ensure shape is (N, 1)

print(encodings.shape, sentiments.shape, price_percentage_changes.shape)
# print(encodings[:5], sentiments[:5], price_percentage_changes[:5])

Number or articles model will expect for each prediction 30
(203, 30, 512) (203, 1) (203, 1)


In [27]:
dataset = NewsDataset(encodings, sentiments, price_percentage_changes)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
#Define the model.
model = SetTransformer(
    dim_input=encodings.shape[2],  # Embedding dimension
    num_outputs=1,  # One final prediction
    dim_output=1,  # 1D output for price change
    num_inds=32, 
    dim_hidden=512, 
    num_heads=32, 
    ln=True  # Layer normalization
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [29]:
from tqdm import notebook as tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter()

In [30]:
%load_ext tensorboard

#Run this in your terminal separately to visualize the training process live
%tensorboard --logdir runs

#You can manually reload this in the GUI after running the next cell. Or you can set it to auto reload.


Reusing TensorBoard on port 6006 (pid 34856), started 0:06:45 ago. (Use '!kill 34856' to kill it.)

In [31]:
# Training loop
num_epochs = 2000

for epoch in tqdm.tqdm(range(num_epochs)):
    model.train()
    losses = []
    
    for batch_idx, (inps, outs) in enumerate(train_loader):
        # Move to device
        inps = inps.to(device)
        outs = outs.to(device)
        
        # Forward pass
        preds = model(inps)
        
        # Ensure output shapes match
        if preds.dim() > 1: #Output will be 32*1*1 if batch size is 32
            preds = preds.squeeze(-1)  # Remove last dimension if it's 1
        
        loss = criterion(preds, outs)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
    
    avg_loss = np.mean(losses)
    writer.add_scalar('Loss/train', avg_loss, epoch)
    # print(f"Epoch {epoch}: train loss {avg_loss:.4f}")
    
    # Validation every 5 epochs
    if epoch % 5 == 0:
        model.eval()
        val_losses = []
        
        with torch.no_grad():
            for inps, outs in train_loader:  # Using same data for demo
                inps = inps.to(device)
                outs = outs.to(device)
                
                preds = model(inps)
                if preds.dim() > 1:
                    preds = preds.squeeze(-1)
                
                loss = criterion(preds, outs)
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        writer.add_scalar('Loss/validation', avg_val_loss, epoch)
        # print(f"Epoch {epoch}: val loss {avg_val_loss:.4f}")



  0%|          | 0/2000 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


KeyboardInterrupt: 