In [240]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import pandas as pd
import numpy as np

from collections import Counter

#Using for training and testing data creation
import random
from torch.utils.data import Subset
from torch.utils.data import Dataset, DataLoader

import keras as keras

#Used for model creation 
import torch as torch
from torch import nn as nn
from torch import optim

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Denylson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Denylson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Denylson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
import torchtext

glove = torchtext.vocab.GloVe(name = '42B', dim = 300)

In [308]:
news1 = pd.read_csv('./djia_news/djia_news copy.csv')
news2 = pd.read_csv('./nasdaq/nasdaq.csv')

# combined_news = news1.append(news2)
print(len(news1))
news2 = news2.replace([0,1,2], ['decrease', 'increase', 'constant'])

2381


In [313]:
class Get_Modify():
    
    def __init__(self, dataframe, max_length):
        
        self.max_length = max_length
        self.dataframe = dataframe
        
        self.dataframe = self.dataframe[['Label', 'Headline']]
        
        self.label_dict = {'decrease' : 0, 'increase' : 1, 'constant' : 2}
        stop_words = set(stopwords.words("english"))
        self.dataframe = self.dataframe.apply(lambda x: x.astype(str).str.lower())
        self.dataframe.Headline = self.dataframe.Headline.str.replace('[@:]','')
        self.dataframe.Headline = self.dataframe.Headline.str.replace('totestravel', '')
        self.dataframe['Headline_tokens'] = self.dataframe['Headline'].apply(lambda x: word_tokenize(x))
        
        all_words = []
        for entry in self.dataframe['Headline_tokens']:
            all_words += entry
        frequency = Counter(all_words)
        
        self.vocab = torchtext.vocab.Vocab(counter = frequency, min_freq = 5, vectors = glove)
        
        self.dataframe = pd.DataFrame(self.dataframe)

        #print(self.dataframe.head())
    
    def __len__(self):
        return len(self.dataframe['Headline'])
    
    def back_to_text(self, tokens):
        text = ''
        for token in tokens:
            text += self.vocab.itos[token] + " "
        return text
    
    def __getitem__(self, index):
        label = self.label_dict[self.dataframe['Label'][index]]
        label = torch.tensor(label)
        
        int_tokens = []
        headline_tokens = self.dataframe['Headline_tokens'][index]
        for token in headline_tokens:
            int_tokens.append(self.vocab[token])
        
        if len(int_tokens) < self.max_length:
            num_to_pad = self.max_length - len(int_tokens)
            int_tokens += [0] * num_to_pad
        else:
            int_tokens = int_tokens[:self.max_length]
        int_tokens = torch.tensor(int_tokens)
        return(int_tokens, label)

In [314]:
#Retreving data from csv files and merge both datasets
news = Get_Modify(news2, 75)
vocab = news.vocab
len(vocab)

  del sys.path[0]


6343

In [315]:
#Determining training and test indices 

train_amount = int(0.70 * len(news))
train_indices = list(range(0, train_amount))
test_indices = list(range(train_amount, len(news)))

#Creating subsets using the indices determined above
training_data = Subset(news, train_indices)
testing_data = Subset(news, test_indices)

In [316]:
#Generating the data to feed into model 

training_generator = DataLoader(training_data, batch_size = 8, shuffle = True)
testing_generator = DataLoader(testing_data, batch_size = 8)


In [317]:
batched_data, batched_labels = next(iter(training_generator))
# output = news.dataframe['Label'][1]
# news.label_dict[news.dataframe['Label'][1]]
batched_data.shape, batched_labels.shape

(torch.Size([8, 75]), torch.Size([8]))

In [420]:
#Creationg of the NLP model
class News_NLP(nn.Module):
    def __init__(self, num_words, emb_size, num_classes):
        super().__init__()
        self.num_words = num_words
        self.emb_size = emb_size
        
        self.emb = nn.Embedding(self.num_words, self.emb_size)
        self.emb.from_pretrained(vocab.vectors)
        self.gru = nn.GRU(input_size = emb_size, hidden_size = 32, batch_first = True, num_layers = 2)
        self.lstm = nn.LSTM(input_size = 32, hidden_size = 32, batch_first= True, num_layers = 2, dropout = 0.1)
        self.sig = nn.Sigmoid()
        self.linear = nn.Linear(64, num_classes)
    
    def forward(self, batch_data):
        token_embs = self.emb(batch_data)
        outputs, (h_n, c_n) = self.gru(token_embs)
        outputs, (h_n, c_n) = self.lstm(outputs)
        
        last_hidden_state = h_n
        last_hidden_state = last_hidden_state.permute(1, 0, 2)
        last_hidden_state = last_hidden_state.flatten(start_dim = 1)
        last_hidden_state = self.sig(last_hidden_state)
        
        logits = self.linear(last_hidden_state)
        
        return logits

In [421]:
model = News_NLP(num_words = len(vocab), emb_size = 300, num_classes = 3)
model

News_NLP(
  (emb): Embedding(6343, 300)
  (gru): GRU(300, 32, num_layers=2, batch_first=True)
  (lstm): LSTM(32, 32, num_layers=2, batch_first=True, dropout=0.1)
  (sig): Sigmoid()
  (linear): Linear(in_features=64, out_features=3, bias=True)
)

In [422]:
optimizer = optim.Adam(model.parameters(), 0.02)

In [423]:
loss_func = nn.CrossEntropyLoss()

In [424]:
def cal_acc(preds, batched_labels):
    predicted_labels = torch.softmax(preds, dim = 1).argmax(dim = 1)
    num_correct = (predicted_labels == batched_labels).sum()
    
    acc = num_correct/len(batched_labels)
    return acc

In [425]:
num_epochs = 3
for epoch in range(num_epochs):
    print("-" * 60)
    for index, (batched_data, batched_labels) in enumerate(training_generator):
        preds = model(batched_data)
        loss = loss_func(preds, batched_labels)
        accuracy = cal_acc(preds, batched_labels)
        if( index % 50 == 0):
            print("Train: loss : {0}, accuracy: {1}".format(loss, accuracy))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    for index, (batched_data, batched_labels) in enumerate(testing_generator):
        preds = model(batched_data)
        loss = loss_func(preds, batched_labels)
        accuracy = cal_acc(preds, batched_labels)
        if(index % 50 == 0):
            print("Test: loss: {0}, accuracy: {1}".format(loss, accuracy))

------------------------------------------------------------
Train: loss : 1.1169962882995605, accuracy: 0.125
Train: loss : 1.0466067790985107, accuracy: 0.5
Train: loss : 0.5740376710891724, accuracy: 0.875
Train: loss : 0.5023205280303955, accuracy: 0.875
Train: loss : 0.8633860945701599, accuracy: 0.75
Train: loss : 1.1283587217330933, accuracy: 0.375
Train: loss : 0.7542713284492493, accuracy: 0.5
Train: loss : 0.6130088567733765, accuracy: 0.75
Train: loss : 0.7151778936386108, accuracy: 0.5
Train: loss : 0.7484060525894165, accuracy: 0.375
Train: loss : 0.5744444131851196, accuracy: 0.75
Train: loss : 0.8306998014450073, accuracy: 0.375
Train: loss : 0.6781908869743347, accuracy: 0.625
Train: loss : 0.5825556516647339, accuracy: 0.75
Train: loss : 1.1113386154174805, accuracy: 0.5
Train: loss : 0.7053288817405701, accuracy: 0.625
Train: loss : 0.6712198257446289, accuracy: 0.75
Train: loss : 0.887102484703064, accuracy: 0.125
Train: loss : 0.6278265714645386, accuracy: 0.75
Trai