<a href="https://colab.research.google.com/github/CharlesPoletowin/YCBS-273/blob/master/RNN_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import time
from sklearn.metrics import accuracy_score

This notebook was inspired from https://github.com/bentrevett/pytorch-sentiment-analysis. Great thanks to the authors!

# Data setup

In [0]:
import torch
from torchtext import data
from torchtext import datasets
import random


SEED = 1234
MAX_VOCAB_SIZE = 25_000
BATCH_SIZE = 64

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.6MB/s]


In [0]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of testing examples: 25000


In [0]:
print(vars(train_data.examples[0]))

{'text': ['Action', ',', 'violence', ',', 'sex', 'and', 'coarse', 'language', 'are', 'the', 'things', 'that', 'the', 'characters', 'do', 'during', 'the', 'whole', 'movie', '.', 'And', 'everything', 'they', 'do', 'is', 'done', 'without', 'reason', '.', 'Mark', 'L.', 'Lester', 'is', '(', 'un)known', 'for', 'his', 'violent', '(', 'without', 'reason)movies', '(', 'Commando', ',', 'The', 'Base', ')', '.', 'The', 'story', 'is', 'weird', 'but', 'stupid', '.', 'The', 'actors', 'play', 'their', 'stupid', 'characters', 'very', 'well', '...', "I'm", 'not', 'telling', 'they', 'are', 'stupid', 'but', 'I', 'mean', 'they', 'are', 'very', 'bad', 'actors', '.', 'It', "'s", 'another', 'low', '-', 'budget', 'unknown', 'B', 'series', 'action', 'movie', '.', 'If', 'you', 'saw', 'something', 'like', 'Operation', 'Delta', 'Force', ',', 'Drive', ',', 'The', 'Patriot', ',', 'Sanctuary', 'or', 'something', 'like', 'these', 'bad', 'movies', 'from', 'the', 'same', 'kind', 'than', 'Misbegotten', '...', "don't", 'r

In [0]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [0]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [0]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 202601), (',', 192854), ('.', 165199), ('and', 109602), ('a', 109052), ('of', 100879), ('to', 93512), ('is', 76323), ('in', 61437), ('I', 54316), ('it', 53278), ('that', 49351), ('"', 44443), ("'s", 43293), ('this', 42326), ('-', 36828), ('/><br', 35508), ('was', 35133), ('as', 30461), ('with', 29882)]


In [0]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [0]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fb19ee32400>, {'neg': 0, 'pos': 1})


# Model definition

## Exercise: RNN variants are given below. Write LSTM equivalents of these and observe their performance on the given task.

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell_state) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        return self.fc(hidden.squeeze(0))

class RNN_deep(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim, 
                           num_layers=n_layers)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):#, text_lengths):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
                
        #output = [sent len, batch size, hid dim]
        
        #hidden = [num layers, batch size, hid dim]
        #cell = [num layers, batch size, hid dim]
          
        return self.fc(hidden[-1])

class RNN_deep_bidir(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, 
                           num_layers=n_layers,
                           bidirectional=True)
        
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        
    def forward(self, text):#, text_lengths):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden,ct) = self.rnn(embedded)
                
        #output = [sent len, batch size, hid dim * num directions]
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
          
        return self.fc(hidden)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [0]:
def evaluate(model, data_iterator, loss_func):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in data_iterator:

            text, text_lengths = batch.text

            predictions = model(text)#, text_lengths)
            
            loss = loss_func(predictions, batch.label.long())
            
            acc = accuracy_score(torch.argmax(predictions, dim=1).cpu().detach().numpy(), batch.label.cpu().numpy())
#             acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc
        
    return epoch_acc / len(data_iterator), epoch_loss / len(data_iterator)

In [0]:
def train_model(model, train_data_iterator, valid_data_iterator, loss_func, optimizer, epochs=5):

  for epoch in range(epochs):
    
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    tic = time.time()
    for batch in train_data_iterator:
      
      text, text_lengths = batch.text
      
      predictions = model(text)#, text_lengths)
      loss = loss_func(predictions, batch.label.long())

      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      epoch_loss += loss.item()
    toc = time.time()
    
    acc, _ = evaluate(model, valid_data_iterator, loss_func)
    toe = time.time()

    print('Loss at epoch %d : %f, validation acc : %f | training time : %d sec, evaluation time : %d sec' % (epoch, epoch_loss / len(train_data_iterator), acc, toc-tic, toe - toc))

In [0]:
def get_model(model_type, n_layers=2):
  INPUT_DIM = len(TEXT.vocab)
  EMBEDDING_DIM = 100
  HIDDEN_DIM = 256
  OUTPUT_DIM = 2
  N_LAYERS = n_layers

  if model_type == 'rnn':
    model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
  elif model_type == 'rnn_deep':
    model = RNN_deep(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS)
  elif model_type == 'rnn_deep_bidir':
    model = RNN_deep_bidir(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS)

  return model

## run rnn

In [0]:
import torch.nn.functional as F
import torch.optim as optim

model = get_model('rnn')
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())

loss_func = F.cross_entropy

train_model(model, train_iterator, valid_iterator, loss_func, optimizer, epochs=10)

The model has 2,592,362 trainable parameters
Loss at epoch 0 : 0.701807, validation acc : 0.495012 | training time : 15 sec, evaluation time : 2 sec
Loss at epoch 1 : 0.700418, validation acc : 0.501059 | training time : 14 sec, evaluation time : 2 sec
Loss at epoch 2 : 0.701293, validation acc : 0.513683 | training time : 14 sec, evaluation time : 2 sec
Loss at epoch 3 : 0.698721, validation acc : 0.503134 | training time : 14 sec, evaluation time : 2 sec
Loss at epoch 4 : 0.698575, validation acc : 0.502383 | training time : 14 sec, evaluation time : 2 sec


KeyboardInterrupt: ignored

## run rnn_deep

In [0]:
model = get_model('rnn_deep')
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())

loss_func = F.cross_entropy

train_model(model, train_iterator, valid_iterator, loss_func, optimizer, epochs=10)

The model has 2,723,946 trainable parameters
Loss at epoch 0 : 0.701095, validation acc : 0.516199 | training time : 17 sec, evaluation time : 3 sec
Loss at epoch 1 : 0.698753, validation acc : 0.486714 | training time : 17 sec, evaluation time : 3 sec
Loss at epoch 2 : 0.696769, validation acc : 0.512933 | training time : 17 sec, evaluation time : 3 sec


KeyboardInterrupt: ignored

## run rnn deep bidir

In [0]:
model = get_model('rnn_deep_bidir')
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())

loss_func = F.cross_entropy

train_model(model, train_iterator, valid_iterator, loss_func, optimizer, epochs=10)

The model has 3,078,762 trainable parameters
Loss at epoch 0 : 0.697817, validation acc : 0.530764 | training time : 34 sec, evaluation time : 6 sec
Loss at epoch 1 : 0.698857, validation acc : 0.539504 | training time : 34 sec, evaluation time : 6 sec
Loss at epoch 2 : 0.680997, validation acc : 0.512712 | training time : 34 sec, evaluation time : 6 sec
Loss at epoch 3 : 0.695609, validation acc : 0.536679 | training time : 34 sec, evaluation time : 6 sec
Loss at epoch 4 : 0.672020, validation acc : 0.543388 | training time : 34 sec, evaluation time : 6 sec


KeyboardInterrupt: ignored

## run lstm

In [0]:
model = get_model('rnn')
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())

loss_func = F.cross_entropy

train_model(model, train_iterator, valid_iterator, loss_func, optimizer, epochs=10)

The model has 2,867,306 trainable parameters
Loss at epoch 0 : 0.691799, validation acc : 0.528867 | training time : 23 sec, evaluation time : 3 sec
Loss at epoch 1 : 0.682714, validation acc : 0.644200 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 2 : 0.517796, validation acc : 0.834304 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 3 : 0.317706, validation acc : 0.864627 | training time : 23 sec, evaluation time : 3 sec
Loss at epoch 4 : 0.208494, validation acc : 0.877295 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 5 : 0.136868, validation acc : 0.884490 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 6 : 0.088621, validation acc : 0.883960 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 7 : 0.059830, validation acc : 0.881179 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 8 : 0.043577, validation acc : 0.881444 | training time : 22 sec, evaluation time : 3 sec
Loss at epoch 9 : 0

## run lstm-biddir

In [0]:
model = get_model('rnn_deep_bidir')
model = model.to(device)
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())

loss_func = F.cross_entropy

train_model(model, train_iterator, valid_iterator, loss_func, optimizer, epochs=10)

The model has 4,811,370 trainable parameters
Loss at epoch 0 : 0.681243, validation acc : 0.614627 | training time : 69 sec, evaluation time : 10 sec
Loss at epoch 1 : 0.610965, validation acc : 0.706744 | training time : 68 sec, evaluation time : 11 sec
Loss at epoch 2 : 0.487641, validation acc : 0.742673 | training time : 70 sec, evaluation time : 10 sec
Loss at epoch 3 : 0.350620, validation acc : 0.845030 | training time : 69 sec, evaluation time : 10 sec
Loss at epoch 4 : 0.236802, validation acc : 0.868997 | training time : 69 sec, evaluation time : 10 sec
Loss at epoch 5 : 0.179124, validation acc : 0.878531 | training time : 69 sec, evaluation time : 10 sec
Loss at epoch 6 : 0.118053, validation acc : 0.826713 | training time : 69 sec, evaluation time : 11 sec
Loss at epoch 7 : 0.102422, validation acc : 0.885814 | training time : 69 sec, evaluation time : 10 sec
Loss at epoch 8 : 0.053716, validation acc : 0.854211 | training time : 69 sec, evaluation time : 11 sec
Loss at ep