In [8]:
from gensim.models import Word2Vec, FastText
from gensim.models.fasttext import load_facebook_vectors
import fasttext
import pandas as pd
from nltk import word_tokenize
from data_processing import read_data
import torch
from model import *
from utils import *
import torch.nn.functional as F
from torch.utils.data import WeightedRandomSampler

In [9]:
# Notebook-wide configuration.
# Dimensionality of the FastText vectors trained on the local corpus further down.
trained_embedding_size = 250
# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA (the value stays 'cuda' wherever a GPU is available).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Loading Pretrained FastText embedding

In [10]:
# Location of the pretrained fastText binary, relative to the notebook's working directory.
# NOTE(review): the file name suggests a model trained on the Sogou News corpus —
# confirm it provides meaningful vectors for English financial headlines.
pretrained_model_path = './sogou_news.bin'

# Load the binary with the native `fasttext` package (not gensim).
pretrained_model = fasttext.load_model(pretrained_model_path)

# Vector dimensionality reported by the loaded model; used later as the LSTM input size
# when this model is selected as `embedding_model`.
pretrained_embedding_size = pretrained_model.get_dimension()



## Training FastText model on corpus

In [11]:
# Load the scraped news corpus ('@'-separated, first column is the row index) and
# keep only rows that actually have article text.
# Built as a single chain so the cell is idempotent and does not call
# dropna(inplace=True) on a filtered slice (which raises SettingWithCopyWarning).
corpus = (
    pd.read_csv("corpus.csv", sep='@', index_col=0)
    .dropna(subset=['Content'])
    .loc[lambda frame: frame['Content'] != '']
)
corpus.head()

Unnamed: 0,Title,URL,Content
0,Stocks making the biggest moves before the bel...,https://www.cnbc.com/2023/03/28/stocks-making-...,chinese technology stocks such as alibaba and ...
1,'Be very vigilant': Bank of England chief says...,https://www.cnbc.com/2023/03/28/bank-of-englan...,"andrew bailey, governor of the bank of england..."
2,"This is not another banking crisis, analysts s...",https://www.cnbc.com/2023/03/28/this-is-not-an...,the collapse of u.s.-based silicon valley bank...
3,Private equity deals in Asia plunged 44% in 20...,https://www.cnbc.com/2023/03/28/private-equity...,asia-pacific's private equity market plummeted...
4,Stocks making the biggest midday moves: Coinba...,https://www.cnbc.com/2023/03/27/stocks-making-...,check out the companies making the biggest mov...


In [12]:
# Load the labelled headlines so they can be added to the embedding-training corpus.
training_data = read_data("FinancialPhraseBank/Sentences_50Agree.txt")
# Normalise the text: lower-case it and strip embedded newlines in one pass.
training_data['News'] = training_data['News'].str.lower().str.replace('\n', '')
# Discard rows without usable text (missing first, then empty strings).
training_data = training_data.dropna(subset=['News'], axis=0)
training_data = training_data[training_data['News'] != '']

In [13]:
# Tokenised sentences used to train the embedding model: article titles, article
# bodies, and the labelled headlines.
# Columns are addressed by name instead of row[0] / row[-1]: integer keys on a
# label-indexed Series are deprecated positional access in pandas 2.x, and
# iterating the columns directly avoids building a Series per row (twice for corpus).
content = (
    [word_tokenize(title) for title in corpus['Title']]
    + [word_tokenize(body) for body in corpus['Content']]
    + [word_tokenize(news) for news in training_data['News']]
)

In [14]:
# Tokenised labelled headlines only (used below to derive the padding length).
# The 'News' column is read by name: row[0] on a label-indexed Series relies on
# deprecated positional fallback in pandas 2.x.
training_content = [word_tokenize(news) for news in training_data['News']]

In [15]:
# Longest headline, in tokens; every sequence is padded to this length later on.
max_length = max(map(len, training_content))

In [16]:
# Train a gensim FastText model on the combined tokenised corpus.
# min_count=1 keeps every token, including ones that occur only once.
model = FastText(
    sentences=content,
    vector_size=trained_embedding_size,
    window=5,
    min_count=1,
    workers=4,
)

In [17]:
# Qualitative sanity check of the trained embedding: the ten nearest neighbours of 'increase'.
# The recorded output contains tokens such as 'increased.some' and 'increase.as', i.e.
# words fused across a sentence boundary — the corpus text reaches the tokenizer with
# missing spaces after full stops.
sims = model.wv.most_similar('increase', topn=10)
sims

[('increased.some', 0.9761280417442322),
 ('increase.as', 0.9685688018798828),
 ('increased', 0.9491811394691467),
 ('incredibly', 0.9409798979759216),
 ('increases', 0.9407033920288086),
 ('grease', 0.9346209168434143),
 ('outcomes.increased', 0.9345734715461731),
 ('revenue.', 0.9249630570411682),
 ('revenue', 0.9235275983810425),
 ('ratify', 0.9174684286117554)]

# Creating Train/Validation Dataset

In [37]:
# Select which embedding feeds the classifier.
# Alternative (locally trained FastText vectors):
#   embedding_model, embedding_size = model.wv, trained_embedding_size
# Active choice: the pretrained fastText binary loaded above.
embedding_model = pretrained_model
embedding_size = pretrained_embedding_size

In [38]:
# Re-read the labelled headlines and wrap them in the project's NewsDataset, which is
# given the chosen embedding model, its vector size, and the padding length.
# NOTE(review): this frame is used exactly as read_data returns it, whereas max_length
# was computed from the lower-cased / de-newlined / filtered `training_data` copy —
# confirm NewsDataset tokenises the same way so no headline exceeds max_length.
dataset = read_data("FinancialPhraseBank/Sentences_50Agree.txt")
sentiment_dataset = NewsDataset(dataset, embedding_model, embedding_size, max_length)

In [39]:
# Display the labelled frame; the recorded output shows a 'News' text column and an
# integer 'Sentiment' column with values 0, 1 and 2.
dataset

Unnamed: 0,News,Sentiment
0,"according to gran , the company has no plans t...",1
1,technopolis plans to develop in stages an area...,1
2,the international electronic industry company ...,0
3,with the new production plant the company woul...,2
4,according to the company 's updated strategy f...,2
...,...,...
4841,london marketwatch -- share prices ended lower...,0
4842,rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,operating profit fell to eur 35.4 mn from eur ...,0
4844,net sales of the paper segment decreased to eu...,0


In [40]:
# Random 70/30 train/validation split.
# The validation length is taken as the remainder: round(0.7*n) + round(0.3*n) is not
# guaranteed to equal n (e.g. n=5 gives 4 + 2), and random_split raises if the lengths
# do not sum to the dataset size.
n_samples = len(sentiment_dataset)
n_train = round(0.7 * n_samples)
n_val = n_samples - n_train
train_set, val_set = torch.utils.data.random_split(sentiment_dataset, [n_train, n_val])

# Share of each sentiment class in the training subset, ordered by class id.
# Subset.indices are positions; using them with .loc assumes `dataset` has a default 0..n-1 index.
train_labels = dataset.loc[train_set.indices, 'Sentiment']
label_distribution = (train_labels.value_counts() / train_labels.shape[0]).sort_index().to_list()
label_distribution

[0.12382075471698113, 0.5958136792452831, 0.2803655660377358]

In [41]:
# Per-class sampling weights, hand-tuned to up-weight the rarer classes.
# Inverse-frequency alternative, currently disabled:
#   class_weights = [1 / w for w in label_distribution]
class_weights = [2.5, 0.4, 1.3]

# One weight per training example, chosen by the arg-max position of its label
# (presumably a one-hot / score vector — confirm against NewsDataset).
weights = [
    class_weights[torch.argmax(target)]
    for _features, target in train_set
]

# Draw len(train_set) examples per epoch, with replacement, in proportion to those weights.
train_sampler = WeightedRandomSampler(
    weights=weights,
    num_samples=len(train_set),
    replacement=True,
)

In [42]:
# Class proportions of the training and validation subsets, shown side by side
# to check that the random split kept the label balance comparable.
tuple(
    labels.value_counts() / labels.shape[0]
    for labels in (
        dataset.loc[subset.indices, 'Sentiment'] for subset in (train_set, val_set)
    )
)

(Sentiment
 1    0.595814
 2    0.280366
 0    0.123821
 Name: count, dtype: float64,
 Sentiment
 1    0.590096
 2    0.283356
 0    0.126547
 Name: count, dtype: float64)

### Creating LSTM model

In [43]:
# Training / architecture hyper-parameters.
batch_size = 64
hidden_size = 500
num_layers = 1
# Each time step of the LSTM receives one word vector.
input_size = embedding_size

# Build the project's LSTM classifier and move it to the configured device.
lstm_model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
).to(device)

In [45]:
# Train the classifier; the cell's output is whatever train_lstm returns.
# NOTE(review): 30 and 1e-4 are presumably the epoch count and learning rate —
# confirm against train_lstm's signature in utils/model.
train_lstm(
    lstm_model,
    train_set,
    val_set,
    30,
    1e-4,
    batch_size,
    num_layers,
    hidden_size,
    device,
    train_sampler,
)

Best accuracy : 0.43878954607977994 || Best confusion matrix : 
 [[ 78.  25.  81.]
 [167. 361. 330.]
 [119.  94. 199.]]


([0.6033651705058116,
  0.6002245905264368,
  0.6042841301774079,
  0.6120803311186017,
  0.6015701496376181,
  0.6004271597232459,
  0.595699242825778,
  0.5993701433235744,
  0.5914941666261205,
  0.5969755447135782,
  0.5973543243588142,
  0.5956359651853453,
  0.5896597853246724,
  0.5880235298624579,
  0.5908363573956039,
  0.5984627559499921,
  0.5955156335290873,
  0.5851468646301413,
  0.5959091355215829,
  0.5803304685736602,
  0.5778521749208558,
  0.58232200820491,
  0.5731808655666855,
  0.5789588127496108,
  0.579621987522773,
  0.5735313082641026,
  0.5851405476624111,
  0.5870079769278472,
  0.5752580728171006,
  0.574985830289013],
 [0.44339622641509435,
  0.4525353773584906,
  0.43307783018867924,
  0.43101415094339623,
  0.44752358490566035,
  0.4513561320754717,
  0.46285377358490565,
  0.44192216981132076,
  0.46639150943396224,
  0.4548938679245283,
  0.4481132075471698,
  0.4563679245283019,
  0.4684551886792453,
  0.48172169811320753,
  0.47877358490566035,
  0.4

In [52]:
# Smoke test: run one hand-written headline through the trained LSTM.
test_sentence = 'Airbus won case'
test_tokens = word_tokenize(test_sentence.lower())
n = len(test_tokens)
print(test_tokens)

# The native fastText model exposes get_word_vector(); the gensim alternative
# (model.wv) is indexed with [] instead. Dispatch explicitly rather than through a
# bare `except:`, which would also swallow KeyboardInterrupt and genuine lookup bugs.
if hasattr(embedding_model, 'get_word_vector'):
    test_vectors = [embedding_model.get_word_vector(token) for token in test_tokens]
else:
    test_vectors = [embedding_model[token] for token in test_tokens]

test = torch.Tensor(test_vectors).to(device)
# Left-pad the sequence dimension with zero vectors up to max_length; the embedding
# dimension is untouched. (A headline longer than max_length would give a negative pad.)
test = F.pad(test, (0, 0, max_length - n, 0), 'constant', 0)

# Zero initial (h, c) state for a single, unbatched sequence.
hidden = (
    torch.zeros(num_layers, hidden_size).to(device),
    torch.zeros(num_layers, hidden_size).to(device),
)

# Inference only: skip autograd bookkeeping.
# NOTE(review): if LSTMModel contains dropout, call lstm_model.eval() first — confirm.
with torch.no_grad():
    test_output = lstm_model(test, hidden)[0][-1]
test_output

['airbus', 'won', 'case']


tensor([0.2697, 0.3758, 0.3545], device='cuda:0', grad_fn=<SelectBackward0>)