In [1]:
from gensim.models import Word2Vec, FastText
import pandas as pd
from nltk import word_tokenize
from data_processing import read_data
import torch
from model import *
from utils import *
import torch.nn.functional as F

In [2]:
embedding_size = 250
device='cpu'

## Training Word2Vec model on corpus

In [3]:
corpus = pd.read_csv("corpus.csv", sep='@', index_col=0)
corpus = corpus.loc[corpus['Content'] != '']
corpus.dropna(subset=['Content'], axis=0, inplace=True)
corpus.head()

Unnamed: 0,Title,URL,Content
0,Stocks making the biggest moves before the bel...,https://www.cnbc.com/2023/03/28/stocks-making-...,chinese technology stocks such as alibaba and ...
1,'Be very vigilant': Bank of England chief says...,https://www.cnbc.com/2023/03/28/bank-of-englan...,"andrew bailey, governor of the bank of england..."
2,"This is not another banking crisis, analysts s...",https://www.cnbc.com/2023/03/28/this-is-not-an...,the collapse of u.s.-based silicon valley bank...
3,Private equity deals in Asia plunged 44% in 20...,https://www.cnbc.com/2023/03/28/private-equity...,asia-pacific's private equity market plummeted...
4,Stocks making the biggest midday moves: Coinba...,https://www.cnbc.com/2023/03/27/stocks-making-...,check out the companies making the biggest mov...


In [4]:
# Adding training data in the corpus
training_data = read_data("FinancialPhraseBank/Sentences_50Agree.txt")
training_data['News'] = training_data['News'].str.lower()
training_data['News'] = training_data['News'].str.replace('\n', '')
training_data.dropna(subset=['News'], axis=0, inplace=True)
training_data = training_data.loc[training_data['News'] != '']

In [5]:
content = [word_tokenize(row[-1]) for _, row in corpus.iterrows()] + [word_tokenize(row[0]) for _, row in training_data.iterrows()]

In [6]:
training_content = [word_tokenize(row[0]) for _, row in training_data.iterrows()]

In [7]:
max_length = max([len(sentence) for sentence in training_content]) # Max headline token length, going to need to pad according to this number

In [8]:
model = FastText(sentences=content, vector_size=embedding_size, window=5, min_count=1, workers=4)

In [9]:
sims = model.wv.most_similar('increase', topn=10)
sims

[('increased.some', 0.9702471494674683),
 ('increase.as', 0.9593074321746826),
 ('incredibly', 0.9375899434089661),
 ('increases', 0.936568558216095),
 ('increased', 0.9365633726119995),
 ('increasingly', 0.9178104996681213),
 ('grease', 0.9151845574378967),
 ('outcomes.increased', 0.9145744442939758),
 ('ratify', 0.9096012115478516),
 ('raivv', 0.9074144959449768)]

# Creating Train/Test Dataset

In [10]:
dataset = read_data("FinancialPhraseBank/Sentences_50Agree.txt")
sentiment_dataset = NewsDataset(dataset, model.wv, embedding_size, max_length)

In [17]:
dataset

Unnamed: 0,News,Sentiment
0,"according to gran , the company has no plans t...",1
1,technopolis plans to develop in stages an area...,1
2,the international electronic industry company ...,0
3,with the new production plant the company woul...,2
4,according to the company 's updated strategy f...,2
...,...,...
4841,london marketwatch -- share prices ended lower...,0
4842,rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,operating profit fell to eur 35.4 mn from eur ...,0
4844,net sales of the paper segment decreased to eu...,0


In [12]:
train_set, val_set = torch.utils.data.random_split(sentiment_dataset, [round(0.7*len(sentiment_dataset)), round(0.3*len(sentiment_dataset))])

In [18]:
dataset.loc[train_set.indices, 'Sentiment'].value_counts(), dataset.loc[val_set.indices, 'Sentiment'].value_counts() / dataset.loc[val_set.indices, 'Sentiment'].shape[0]

(Sentiment
 1    2019
 2     960
 0     413
 Name: count, dtype: int64,
 Sentiment
 1    0.591472
 2    0.277166
 0    0.131362
 Name: count, dtype: float64)

### Creating LSTM model

In [14]:
input_size = embedding_size
batch_size = 64
num_layers = 1
hidden_size = 500
lstm_model = LSTMModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers).to(device)

In [15]:
train_lstm(lstm_model, train_set, val_set,  30, 0.01, batch_size, num_layers, hidden_size, device)

  x = torch.FloatTensor(sentence).reshape((n, self.embedding_size))


Best accuracy : 0.671251719394773 || Best confusion matrix : 
 [[  0.  90. 101.]
 [  0. 792.  68.]
 [  0. 219. 184.]]


([0.5450158057347784,
  0.5097310329383274,
  0.49624588017193777,
  0.47876273573569533,
  0.4667603441004483,
  0.46335391953306376,
  0.4501104995889484,
  0.44154885298800917,
  0.4265644786492834,
  0.42185696003572,
  0.4128099814900812,
  0.4064915095860103,
  0.40479370951652527,
  0.40048695341596063,
  0.37887172091682003,
  0.38355098580414393,
  0.3781532127902193,
  0.3635732132308888,
  0.357504332965275,
  0.3594315597471201,
  0.34081914852250295,
  0.35023650217731045,
  0.363102691353492,
  0.3417814573589361,
  0.32715759693451646,
  0.32970263513754,
  0.31801445725953803,
  0.31515604707429995,
  0.2954532765554932,
  0.3019534447845423],
 [0.5881485849056604,
  0.6317806603773585,
  0.6367924528301887,
  0.6535966981132075,
  0.6586084905660378,
  0.6659787735849056,
  0.6715801886792453,
  0.6724646226415094,
  0.6886792452830188,
  0.6875,
  0.6969339622641509,
  0.6931014150943396,
  0.7090212264150944,
  0.7045990566037735,
  0.7252358490566038,
  0.7211084905

In [30]:
test = 'Dogecoin price suddenly rose after Elon Musk tweet'
test = test.lower()
test = word_tokenize(test)
n = len(test)
print(test)
test = [model.wv[token] for token in test]
test = torch.Tensor(test)
test = F.pad(test, (0, 0, max_length - n, 0), 'constant', 0)
hidden = (torch.zeros(num_layers, hidden_size).to(device), torch.zeros(
                num_layers, hidden_size).to(device))
lstm_model(test, hidden)

['dogecoin', 'price', 'suddenly', 'rose', 'after', 'elon', 'musk', 'tweet']


(tensor([[9.5867e-02, 4.7276e-01, 4.3138e-01],
         [8.7565e-04, 6.4110e-01, 3.5802e-01],
         [2.6831e-06, 7.9604e-01, 2.0396e-01],
         [2.9390e-08, 9.9919e-01, 8.0943e-04],
         [8.1049e-09, 9.9996e-01, 3.5048e-05],
         [4.1709e-09, 1.0000e+00, 3.5744e-06],
         [4.0630e-09, 1.0000e+00, 2.7370e-06],
         [4.1289e-09, 1.0000e+00, 2.6553e-06],
         [4.1714e-09, 1.0000e+00, 2.6428e-06],
         [4.1922e-09, 1.0000e+00, 2.6509e-06],
         [4.1942e-09, 1.0000e+00, 2.6599e-06],
         [4.1842e-09, 1.0000e+00, 2.6680e-06],
         [4.1677e-09, 1.0000e+00, 2.6754e-06],
         [4.1482e-09, 1.0000e+00, 2.6827e-06],
         [4.1272e-09, 1.0000e+00, 2.6897e-06],
         [4.1057e-09, 1.0000e+00, 2.6967e-06],
         [4.0843e-09, 1.0000e+00, 2.7036e-06],
         [4.0630e-09, 1.0000e+00, 2.7102e-06],
         [4.0422e-09, 1.0000e+00, 2.7165e-06],
         [4.0218e-09, 1.0000e+00, 2.7226e-06],
         [4.0018e-09, 1.0000e+00, 2.7284e-06],
         [3.9