# Headline Sentiment Analysis using a BiLSTM RNN

# Imports and Misc

Imports:

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import pandas as pd



Downloading the tokenizer. Punkt is good for this use case: formal headlines from news articles.

In [3]:
# Fetch the Punkt tokenizer data used to tokenize headlines; NLTK reports
# the package as up-to-date and skips the download when it is already present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/avinav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Custom Dataset Class

The code below contains the custom class used to organize the dataset. The dataset consists of multiple data points, each containing a headline (string), a date (YYYY-MM-DD), and a tone (float in the range [-1, 1]). The class is used to load the data and to perform the preprocessing steps.

In [None]:
class HeadlineDataset(Dataset):
  """Dataset of news headlines paired with a sentiment (tone) label.

  Each item is a headline embedded token-by-token with a Word2Vec model,
  truncated or zero-padded to exactly ``max_length`` token positions,
  together with its label.

  Args:
    headlines: Sequence of headline strings.
    labels: Sequence of numeric sentiment labels aligned with ``headlines``.
    word2vec_model: Model exposing keyed vectors as ``.wv`` (supporting
      ``token in wv``, ``wv[token]`` and ``wv.vector_size``), e.g. a
      trained gensim ``Word2Vec``.
    max_length: Number of token positions in every returned sequence.
  """

  def __init__(self, headlines, labels, word2vec_model, max_length=128):
    # Materialize as lists so that ``idx`` in __getitem__ is always
    # positional. A pandas Series (e.g. straight out of train_test_split)
    # would otherwise be indexed by label and fail on shuffled indices.
    self.headlines = list(headlines)
    self.labels = list(labels)
    self.word2vec_model = word2vec_model
    # Embedding width; needed to build zero vectors for unknown tokens
    # and for padding.
    self.vector_size = word2vec_model.wv.vector_size
    self.max_length = max_length

  def __len__(self):
    """Return the number of headlines in the dataset."""
    return len(self.headlines)

  def __getitem__(self, idx):
    """Return ``(vectors, sentiment)`` for the headline at position ``idx``.

    Returns:
      vectors: float32 tensor of shape ``(max_length, vector_size)``;
        rows for out-of-vocabulary tokens and padding are all zeros.
      sentiment: float32 tensor of shape ``(1,)`` holding the label.
    """
    headline = self.headlines[idx]
    sentiment = self.labels[idx]

    # tokenize (lower-cased) and truncate to the fixed sequence length
    tokens = word_tokenize(headline.lower())[:self.max_length]

    # embed using word2vec: start from an all-zero float32 matrix so that
    # unknown tokens and padding positions need no further work
    keyed_vectors = self.word2vec_model.wv
    vectors = np.zeros((self.max_length, self.vector_size), dtype=np.float32)
    for position, token in enumerate(tokens):
      if token in keyed_vectors:
        vectors[position] = keyed_vectors[token]

    # convert to tensors
    vectors = torch.from_numpy(vectors)
    sentiment = torch.tensor([sentiment], dtype=torch.float32)

    return vectors, sentiment

# BiLSTM Neural Network Model

The choice here is to use a bi-directional LSTM model, which is well suited to NLP and to this task. It was chosen because it's a good balance between accuracy and computational cost. Something like a transformer model may have performed better, but it would have been a lot more complicated and computationally expensive.

In [3]:
class BiLSTMSentiment(nn.Module):
  """Bidirectional LSTM regressor producing a score in [-1, 1].

  Args:
    embedding_dim: Size of each input token embedding.
    hidden_dim: Hidden size of the LSTM in each direction.
    output_dim: Number of output values per example.
    num_layers: Number of stacked LSTM layers.
    dropout: Dropout probability, applied between LSTM layers (only when
      there is more than one layer) and before the output layer.
  """

  def __init__(self, embedding_dim, hidden_dim, output_dim=1, num_layers=2, dropout=0.5):
    super().__init__()

    # nn.LSTM applies dropout only between stacked layers, so it is
    # switched off for a single-layer network.
    inter_layer_dropout = dropout if num_layers > 1 else 0

    self.lstm = nn.LSTM(
      embedding_dim,
      hidden_dim,
      num_layers=num_layers,
      bidirectional=True,
      dropout=inter_layer_dropout,
      batch_first=True,
    )
    self.dropout = nn.Dropout(dropout)
    # the two directions are concatenated, hence twice the hidden size
    self.fc = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, text):
    """Score a batch shaped [batch size, sequence length, embedding dim]."""
    _, (final_hidden, _) = self.lstm(text)

    # the last two entries are the top layer's forward and backward states
    forward_state = final_hidden[-2, :, :]
    backward_state = final_hidden[-1, :, :]
    features = torch.cat((forward_state, backward_state), dim=1)
    features = self.dropout(features)

    # tanh squashes the output into the range [-1, 1]
    return torch.tanh(self.fc(features))

# Training Function 