In [35]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import pandas as pd
import numpy as np

from collections import Counter

#Using for training and testing data creation
import random
from torch.utils.data import Subset
from torch.utils.data import Dataset, DataLoader

import keras as keras

#Used for model creation 
import torch as torch
from torch import nn as nn
from torch import optim

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Denylson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Denylson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Denylson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
import torchtext

glove = torchtext.vocab.GloVe(name = '6B', dim = 300)

In [39]:
news1 = pd.read_csv('./data/djia_news copy.csv')
news2 = pd.read_csv('./data/nasdaq.csv')
# frames = [news1, news2]

# combined_news = pd.concat(frames)
# combined_news.drop(['Ticker'], axis = 1)

In [44]:
# class_count_0, class_count_1, class_count_2 = combined_news['Label'].value_counts()

news3 = pd.read_csv('./data/train.csv')
# news3 = news3[news3['Label'] == 'neutral']
# news3 = news3.sample(class_count_1)
print(news1['Label'].value_counts())

# frames  = [combined_news, news3]
# combined_news = pd.concat(frames)
combined_news = news1

0    1430
1     936
2      15
Name: Label, dtype: int64


In [45]:
combined_news['Headline'] = combined_news['Headline'].astype(str)

In [51]:
class Get_Modify():
    
    def __init__(self, dataframe, max_length):
        
        self.max_length = max_length
        self.dataframe = dataframe
        
        self.dataframe = self.dataframe[['Label', 'Headline']]
        self.label_dict = {'0' : 0, '1' : 1, '2' : 2}

        
        stop_words = set(stopwords.words("english"))
        self.dataframe = self.dataframe.apply(lambda x: x.astype(str).str.lower())
        self.dataframe.Headline = self.dataframe.Headline.str.replace('[@:]','')
        self.dataframe.Headline = self.dataframe.Headline.str.replace('totestravel', '')
    
        self.dataframe['Headline_tokens'] = self.dataframe['Headline'].apply(lambda x: word_tokenize(x))
        
        all_words = []
        for entry in self.dataframe['Headline_tokens']:
            all_words += entry
        frequency = Counter(all_words)
        
        self.vocab = torchtext.vocab.Vocab(counter = frequency, min_freq = 5, vectors = glove)
        
        self.dataframe = pd.DataFrame(self.dataframe)

        #print(self.dataframe.head())
    
    def __len__(self):
        return len(self.dataframe['Headline'])
    
    def back_to_text(self, tokens):
        text = ''
        for token in tokens:
            text += self.vocab.itos[token] + " "
        return text
    
    def __getitem__(self, index):
        label = self.label_dict[self.dataframe['Label'][index]]
        label = torch.tensor(label)
        
        int_tokens = []
        headline_tokens = self.dataframe['Headline_tokens'][index]
        for token in headline_tokens:
            int_tokens.append(self.vocab[token])
        
        if len(int_tokens) < self.max_length:
            num_to_pad = self.max_length - len(int_tokens)
            int_tokens += [0] * num_to_pad
        else:
            int_tokens = int_tokens[:self.max_length]
        int_tokens = torch.tensor(int_tokens)
        return(int_tokens, label)

In [52]:
#Retreving data from csv files and merge both datasets
news = Get_Modify(combined_news, 75)
vocab = news.vocab
len(vocab)

1477

In [53]:
#Determining training and test indices 

train_amount = int(0.70 * len(news))
train_indices = list(range(0, train_amount))
test_indices = list(range(train_amount, len(news)))

#Creating subsets using the indices determined above
training_data = Subset(news, train_indices)
testing_data = Subset(news, test_indices)

In [54]:
#Generating the data to feed into model 

training_generator = DataLoader(training_data, batch_size = 8, shuffle = True)
testing_generator = DataLoader(testing_data, batch_size = 8)


In [55]:
batched_data, batched_labels = next(iter(training_generator))
batched_data.shape, batched_labels.shape

(torch.Size([8, 75]), torch.Size([8]))

In [56]:
#Creationg of the NLP model
class News_NLP(nn.Module):
    def __init__(self, num_words, emb_size, num_classes):
        super().__init__()
        self.num_words = num_words
        self.emb_size = emb_size
        
        self.emb = nn.Embedding(self.num_words, self.emb_size)
        self.emb.from_pretrained(vocab.vectors)
        self.lstm = nn.LSTM(input_size = emb_size, hidden_size = 16, batch_first = True, num_layers = 2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(32, num_classes)
    
    def forward(self, batch_data):
        token_embs = self.emb(batch_data)
        outputs, (h_n, c_n) = self.lstm(token_embs)
        

        
        last_hidden_state = h_n
        last_hidden_state = last_hidden_state.permute(1, 0, 2)
        last_hidden_state = last_hidden_state.flatten(start_dim = 1)
        
        last_hidden_state = self.relu(last_hidden_state)
        logits = self.linear(last_hidden_state)
        
        return logits

In [57]:
model = News_NLP(num_words = len(vocab), emb_size = 300, num_classes = 3)
model

News_NLP(
  (emb): Embedding(1477, 300)
  (lstm): LSTM(300, 16, num_layers=2, batch_first=True)
  (relu): ReLU()
  (linear): Linear(in_features=32, out_features=3, bias=True)
)

In [58]:
optimizer = optim.Adam(model.parameters(), 0.1)

In [59]:
loss_func = nn.CrossEntropyLoss()

In [74]:
def cal_acc(preds, batched_labels):
    predicted_labels = torch.softmax(preds, dim = 1).argmax(dim = 1)
    count = 0
    for label in range(0, len(predicted_labels)):
            if( batched_labels[label] ==  predicted_labels[label]):
                count = count + 1
        
    acc = count/len(batched_labels)
    return acc

In [75]:
num_epochs = 5
for epoch in range(num_epochs):
    print("-" * 60)
    for index, (batched_data, batched_labels) in enumerate(training_generator):
        preds = model(batched_data)
        loss = loss_func(preds, batched_labels)
        accuracy = cal_acc(preds, batched_labels)
        if( index % 50 == 0):
            print("Train: loss : {0}, accuracy: {1}".format(loss, accuracy))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    for index, (batched_data, batched_labels) in enumerate(testing_generator):
        preds = model(batched_data)
        loss = loss_func(preds, batched_labels)
        accuracy = cal_acc(preds, batched_labels)
        if(index % 40 == 0):
            print("Test: loss: {0}, accuracy: {1}".format(loss, accuracy))

------------------------------------------------------------
Train: loss : 0.845587432384491, accuracy: 0.5
Train: loss : 0.6741369962692261, accuracy: 0.625
Train: loss : 0.9073151350021362, accuracy: 0.375
Train: loss : 0.6957859992980957, accuracy: 0.5
Train: loss : 0.5660184621810913, accuracy: 0.75
Test: loss: 0.6952836513519287, accuracy: 0.5
Test: loss: 0.7380596995353699, accuracy: 0.125
Test: loss: 1.5629606246948242, accuracy: 0.375
------------------------------------------------------------
Train: loss : 0.652507483959198, accuracy: 0.875
Train: loss : 0.7088024616241455, accuracy: 0.5
Train: loss : 0.6642118692398071, accuracy: 0.625
Train: loss : 0.5458628535270691, accuracy: 0.875
Train: loss : 1.1399377584457397, accuracy: 0.375
Test: loss: 0.7182468771934509, accuracy: 0.5
Test: loss: 0.5602640509605408, accuracy: 0.875
Test: loss: 1.3253257274627686, accuracy: 0.5
------------------------------------------------------------
Train: loss : 0.8235688209533691, accuracy: 