# Sentiment classification with LSTM
In this notebook we will use LSTMs (and GRUs) to perform sentiment classification on the [IMDB movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

## Dataset

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [2]:
from pathlib import Path
# Root of the extracted aclImdb archive. This is a machine-specific absolute
# path: point it at your own copy of the dataset before running the notebook.
PATH = Path("/data2/yinterian/aclImdb/")
# List the top-level entries to confirm the dataset is where we expect it.
list(PATH.iterdir())

[PosixPath('/data2/yinterian/aclImdb/README'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-86.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-82.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-81.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-78.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-88.pth'),
 PosixPath('/data2/yinterian/aclImdb/test'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-87.pth'),
 PosixPath('/data2/yinterian/aclImdb/imdbEr.txt'),
 PosixPath('/data2/yinterian/aclImdb/train'),
 PosixPath('/data2/yinterian/aclImdb/imdb.vocab')]

In [3]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [4]:
# Run this once (first time only) to download the spaCy English model
#!python3 -m spacy download en

In [5]:
# Matches HTML line-break tags such as <br>, <br/> and <BR /> in any letter case.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML `<br>` tag replaced by a newline character."""
    without_breaks = re_br.sub("\n", x)
    return without_breaks

# Load the spaCy English pipeline once; spacy_tok only uses its tokenizer.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize the string `x` with spaCy and return the token texts as a list.

    HTML `<br>` tags are converted to newlines (via `sub_br`) before tokenizing.
    """
    cleaned = sub_br(x)
    return [token.text for token in my_tok.tokenizer(cleaned)]

In [6]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [7]:
# Collect every training review file; positive reviews first, then negative.
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
# Peek at the first few paths as a sanity check.
all_files[:5]

[PosixPath('/data2/yinterian/aclImdb/train/pos/8030_9.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/8819_10.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/6316_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/4781_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/10085_10.txt')]

In [8]:
# Count how often each token occurs across all training reviews.
# Every file is tokenized with spaCy here, so this cell is slow.
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [9]:
#counts

In [10]:
len(counts.keys())

103578

In [11]:
# Drop tokens seen fewer than 5 times; they will be encoded as "UNK" later.
# Iterate over a copy of the keys because entries are deleted inside the loop.
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [12]:
len(counts.keys())

33918

In [13]:
# Build the token -> integer id mapping and the inverse list `words`.
# Id 0 is reserved for the empty string (encode_sentence pads with zeros)
# and id 1 for the unknown-token marker "UNK".
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    # The next free id is the current length of `words`.
    vocab2index[word] = len(words)
    words.append(word)

In [14]:
#vocab2index

## Dataset

In [15]:
# Note: spacy_tok is slow, so encode each review only once and reuse the result.
def encode_sentence(path, vocab2index, N=400):
    """Encode the review stored at `path` as a fixed-length array of token ids.

    Args:
        path: file object exposing `read_text()` (e.g. a `pathlib.Path`).
        vocab2index: mapping from token text to integer id; must contain "UNK",
            which is used for tokens missing from the mapping.
        N: length of the returned array. Longer reviews are truncated and
            shorter ones are right-padded with zeros.

    Returns:
        A tuple `(ids, length)` where `ids` is an int32 array of size `N` and
        `length` is the number of real (non-padding) tokens, at most `N`.
    """
    tokens = spacy_tok(path.read_text())
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    length = min(N, len(token_ids))
    padded = np.zeros(N, dtype=np.int32)
    padded[:length] = token_ids[:length]
    return padded, length

In [16]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400)

(array([    1,   774,   101,  2247,   101,   239,    22,  3051,   106,
          455,   834,   123,    52,   940,   131,  1999,   276,  3050,
         1040,    94,   416,  4813,    94,  4814,    76,  2336,  1100,
           76, 31038,    47,   510,   145,  1661,    22,     1,    33,
           25, 18194,   376,   746,   931,    74,  1480,   205,  2770,
         3235,    52,     3,   392,  4605,    52, 11851,    29,  2879,
           12,   276,    99,    25,  1580,  1190,    62,     8,    67,
         6907,  2338,    47,   376,    58,    22,  2247,   376,  8076,
        28445,    74,  1108,   793,  1436,   145,   302,    62,  1999,
         1018,    47,   737,    74,    52,  1131,   847,  5916,    47,
         2090,    74,   283,    63,    72,    52,  6027,  4495,     3,
        18684,    74,   176,   518, 31038,    64, 14484,  8440,    47,
           62,    67,  2748,  4313,    58,     5,    74, 29624,   171,
          566,   176,   108,     1,   647,  4771,    72,    67,   166,
      

In [17]:
class ImdbDataset(Dataset):
    """Map-style dataset of encoded IMDB reviews with binary sentiment labels.

    All review files under `<PATH>/<train>/pos` and `<PATH>/<train>/neg` are
    encoded eagerly in the constructor with `encode_sentence` and the
    notebook-level `vocab2index`. Positive reviews get label 1, negative 0.
    """

    def __init__(self, PATH, train="train", N=400):
        """Encode every review of one split.

        Args:
            PATH: dataset root; must support the `/` operator and `iterdir()`.
            train: name of the split sub-directory, e.g. "train" or "test".
            N: fixed length every review is padded or truncated to.
        """
        self.N = N
        split_dir = PATH/train
        self.path_to_images = split_dir
        self.pos_files = list((split_dir/"pos").iterdir())
        self.neg_files = list((split_dir/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # Labels follow the file order: all positives (1) then all negatives (0).
        pos_labels = np.ones(len(self.pos_files), dtype=int)
        neg_labels = np.zeros(len(self.neg_files), dtype=int)
        self.y = np.concatenate([pos_labels, neg_labels], axis=0)
        # Each entry is the (ids, length) pair returned by encode_sentence.
        self.X = [encode_sentence(review, vocab2index, self.N) for review in self.files]

    def __len__(self):
        """Return the number of reviews in the split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(token_ids, length, label)` for the review at position `idx`."""
        token_ids, length = self.X[idx]
        label = self.y[idx]
        return token_ids, length, label

In [18]:
# Encode both splits up front (slow: every review is tokenized with spaCy).
# The "test" split of the archive serves as the validation set in this notebook.
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [19]:
# Wrap the datasets in loaders; only the training data is shuffled.
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [20]:
train_ds[1]

(array([  2,   8,  67,  69,  70,  71,  72,  73,  74,  10,  40,  75,  76,
         62,   8,  77,  67,  14,  78,  79,  80,  74,  81,  76,  25,  82,
         13,  83,  76,  18,  84,  69,  85,  86,  87,  88,  84,  89,  74,
         90,  91,  54,  16,  83,  76,  92,  76,  93,  94,  95,  96,  97,
         76,  40,  54,  98,  99,  62,  63,  29,  29,  52, 100,  85,  29,
         90,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

## Understanding LSTMs 

In [21]:
# Input dim is the dimension of the embedding for each word (2 in the example)
# Output dim is the dimension of the hidden layer (4 in this example)
# batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). 
lstm = nn.LSTM(2, 4, batch_first=True)  

In [22]:
# Build a toy batch: one sequence of 5 random 2-dimensional "embeddings".
inputs = [torch.randn(1, 2) for _ in range(5)] # make a sequence of length 5
# Stack the 5 steps and reshape to (batch=1, seq_len=5, feature=2).
inputs = torch.cat(inputs).view(1, len(inputs), -1)
inputs

tensor([[[-1.5285,  1.5959],
         [-0.1864, -1.2827],
         [-0.3314,  0.2441],
         [ 0.8824,  0.5509],
         [-0.4532,  0.2388]]])

In [23]:
# RNNs with batch_first=True assume this input shape
# input shape should be batch_size x seq_len x embedding dimension
inputs.shape

torch.Size([1, 5, 2])

In [24]:
out, hidden = lstm(inputs)

In [25]:
print(out.shape)
out

torch.Size([1, 5, 4])


tensor([[[-0.0850,  0.3282,  0.0319, -0.1222],
         [ 0.2700,  0.0893, -0.0398, -0.2218],
         [ 0.1886,  0.2395, -0.0205, -0.2601],
         [ 0.1625,  0.2563, -0.0132, -0.1012],
         [ 0.1969,  0.2833,  0.0016, -0.2270]]], grad_fn=<TransposeBackward0>)

In [26]:
hidden

(tensor([[[ 0.1969,  0.2833,  0.0016, -0.2270]]], grad_fn=<StackBackward>),
 tensor([[[ 0.3525,  0.6692,  0.0037, -0.4593]]], grad_fn=<StackBackward>))

### Debugging our model

In [27]:
# Use a tiny batch so the intermediate tensors are easy to inspect.
# Note: this rebinds the notebook-level `train_dl`.
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Grab one batch: x = padded token ids, s = true lengths, y = labels.
x,s,y = next(iter(train_dl)) # here s is the length of the sentences

In [29]:
x.shape, s.shape

(torch.Size([7, 400]), torch.Size([7]))

In [30]:
s

tensor([158, 206, 218, 154, 211, 160,  74])

In [31]:
y

tensor([0, 0, 1, 0, 0, 1, 0])

In [32]:
# sort by length so we can use pack_padded_sequence
s, index = s.sort(0, descending=True)
x = x[index]

In [33]:
s

tensor([218, 211, 206, 160, 158, 154,  74])

In [34]:
index

tensor([2, 4, 1, 5, 0, 3, 6])

In [35]:
y[index]

tensor([1, 0, 0, 1, 0, 0, 0])

In [36]:
vocab_size = len(words)
embedding_dim = 10
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

In [37]:
x = embed(x.long())
x.shape

torch.Size([7, 400, 10])

In [38]:
hidden_dim = 9
lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

In [39]:
# RNN will not perform calculation on pad elements if pack_padded_sequence is used
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [45]:
out_pack, (ht, ct) = lstm(x_pack)

In [77]:
## final hidden layer
ht.shape

torch.Size([1, 7, 9])

In [78]:
ht[-1].shape

torch.Size([7, 9])

## Model

In [59]:
class LSTMModel(torch.nn.Module):
    """Single-layer LSTM sentiment classifier producing one logit per review."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding, LSTM and output layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the LSTM hidden state.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Index 0 is the padding token; its embedding is kept at zero.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        """Return logits of shape (batch, 1).

        Args:
            x: integer tensor of padded token ids, shape (batch, seq_len).
            s: true sequence lengths, in decreasing order (required by
                `pack_padded_sequence` with its default `enforce_sorted=True`).
        """
        embedded = self.embeddings(x)
        # Packing lets the LSTM stop at each sequence's real length.
        packed = pack_padded_sequence(embedded, s, batch_first=True)
        _, (h_n, _) = self.lstm(packed)
        final_state = h_n[-1]
        return self.linear(final_state)

In [52]:
def train_epocs(model, epochs=10, lr=0.001):
    """Train `model` with Adam and periodically report validation metrics.

    Batches are read from the notebook-level `train_dl`; after every epoch the
    model is evaluated on the notebook-level `val_dl` via `val_metrics`.
    Tensors are moved to whatever device the model lives on, so the function
    works on CPU as well as on any CUDA device.

    Args:
        model: module called as `model(x, s)` that returns logits of shape
            (batch, 1).
        epochs: number of passes over `train_dl`.
        lr: Adam learning rate.

    Prints the mean training loss, validation loss and validation accuracy
    on epochs where `i % 5 == 1` (the 2nd, 7th, 12th, ...).
    """
    # Follow the model's device instead of hard-coding CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # pack_padded_sequence needs the batch sorted by decreasing length
            s, sort_index = torch.sort(s, 0, descending=True)
            # lengths stay on the CPU as a plain Python list
            s = s.numpy().tolist()
            x = x[sort_index].long().to(device)
            y = y[sort_index].float().to(device)
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # weight by batch size so the epoch average is per-sample
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [53]:
def val_metrics(model, valid_dl):
    """Compute the mean loss and accuracy of `model` over `valid_dl`.

    The model is switched to eval mode and the whole pass runs under
    `torch.no_grad()`, so no autograd graph is built for the validation
    batches. Tensors are moved to the device the model lives on.

    Args:
        model: module called as `model(x, s)` that returns logits of shape
            (batch, 1).
        valid_dl: iterable yielding `(x, s, y)` batches, where `x` holds padded
            token ids, `s` is a CPU tensor of sequence lengths and `y` holds
            the 0/1 labels.

    Returns:
        `(loss, accuracy)`: the per-sample mean binary cross-entropy as a
        Python float, and the fraction of correct predictions as a tensor.
    """
    model.eval()
    # Follow the model's device instead of hard-coding CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in valid_dl:
            # pack_padded_sequence needs the batch sorted by decreasing length
            s, sort_index = torch.sort(s, 0, descending=True)
            s = s.numpy().tolist()
            x = x[sort_index].long().to(device)
            y = y[sort_index].float().to(device).unsqueeze(1)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # a logit above 0 corresponds to a probability above 0.5
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            # weight by batch size so the final average is per-sample
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [55]:
# Loaders used for the real training runs. `train_epocs` reads the
# notebook-level names `train_dl` and `val_dl`, so both must be defined here.
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [56]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 100).cuda()

33920


In [57]:
train_epocs(model, epochs=15, lr=0.01)

train loss 0.718 val loss 0.672 and val accuracy 0.578
train loss 0.531 val loss 0.638 and val accuracy 0.708
train loss 0.452 val loss 0.572 and val accuracy 0.710


In [58]:
train_epocs(model, epochs=20, lr=0.001)

train loss 0.252 val loss 0.507 and val accuracy 0.800
train loss 0.212 val loss 0.489 and val accuracy 0.806
train loss 0.185 val loss 0.484 and val accuracy 0.809
train loss 0.163 val loss 0.491 and val accuracy 0.810


In [82]:
def save_model(m, p):
    """Write the `state_dict` of module `m` to the file `p`.

    Missing parent directories of `p` are created first, so saving into a
    fresh sub-directory (e.g. `PATH/"models"`) does not fail.
    """
    from pathlib import Path
    target = Path(p)
    target.parent.mkdir(parents=True, exist_ok=True)
    torch.save(m.state_dict(), target)


def load_model(m, p):
    """Load the `state_dict` stored at `p` into module `m` (in place).

    The checkpoint is mapped onto the device of `m`'s parameters, so weights
    saved from a GPU model can be restored on a CPU-only machine.
    """
    first_param = next(m.parameters(), None)
    # With no parameters to inspect, fall back to torch.load's default mapping.
    device = first_param.device if first_param is not None else None
    m.load_state_dict(torch.load(p, map_location=device))

In [87]:
p = PATH/"models/model-81.pth"
save_model(model, p)

In [88]:
val_metrics(model, valid_dl)

(0.49832358241081237, tensor(0.8218, device='cuda:0'))

In [89]:
load_model(model, p)

## GRU model with dropout

In [104]:
class GRUModel(torch.nn.Module):
    """Single-layer GRU sentiment classifier with dropout on the embeddings."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding, dropout, GRU and output layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the GRU hidden state.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Index 0 is the padding token; its embedding is kept at zero.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        """Return logits of shape (batch, 1).

        Args:
            x: integer tensor of padded token ids, shape (batch, seq_len).
            s: true sequence lengths, in decreasing order (required by
                `pack_padded_sequence` with its default `enforce_sorted=True`).
        """
        embedded = self.dropout(self.embeddings(x))
        lengths = list(s)
        # Packing lets the GRU stop at each sequence's real length.
        packed = pack_padded_sequence(embedded, lengths, batch_first=True)
        _, h_n = self.gru(packed)
        return self.linear(h_n[-1])

In [105]:
vocab_size = len(words)
print(vocab_size)
model2 = GRUModel(vocab_size, 50, 50).cuda()

33920


In [106]:
train_epocs(model2, epochs=30, lr=0.01)

train loss 0.684 val loss 0.674 and val accuracy 0.577
train loss 0.605 val loss 0.850 and val accuracy 0.641
train loss 0.438 val loss 0.444 and val accuracy 0.812
train loss 0.273 val loss 0.398 and val accuracy 0.855
train loss 0.185 val loss 0.478 and val accuracy 0.861
train loss 0.139 val loss 0.470 and val accuracy 0.872


In [107]:
p = PATH/"models/model-gru-87.pth"
save_model(model2, p)

## Bidirectional and multiple layers GRUs / LSTMs

In [123]:
# Back to a tiny batch for inspecting shapes.
# Note: this rebinds the notebook-level `train_dl`.
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Grab one batch: x = padded token ids, s = true lengths, y = labels.
x,s,y = next(iter(train_dl)) # here s is the length of the sentences

In [124]:
# Small layers for shape experiments.
vocab_size = len(words)
embedding_dim = 10
hidden_dim = 9
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
# lstm1: one bidirectional layer.
lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
# lstm2: two stacked bidirectional layers with dropout between the layers.
lstm2 = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)

In [125]:
# Sort the batch by decreasing length, as pack_padded_sequence expects.
s, index = s.sort(0, descending=True)
x = x[index]
# Look up the embeddings for the padded token ids.
x = embed(x.long())
# Packed form of the batch, carrying each sequence's true length.
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [130]:
# Feed the packed batch (not the padded tensor) so the LSTM stops at each
# sequence's true length and the final states are not computed over padding.
lstm_out, (ht, ct) = lstm1(x_pack)

In [131]:
ht.shape

torch.Size([2, 7, 9])

In [132]:
ht[-2,:,:].shape

torch.Size([7, 9])

In [128]:
# Feed the packed batch (not the padded tensor) so the LSTM stops at each
# sequence's true length and the final states are not computed over padding.
lstm_out, (ht2, ct2) = lstm2(x_pack)

In [129]:
ht2.shape

torch.Size([4, 7, 9])

In [134]:
ht2[-2,:,:].shape, ht2[-1,:,:].shape

(torch.Size([7, 9]), torch.Size([7, 9]))

In [136]:
# Concatenate the last layer's final forward (ht2[-2,:,:]) and backward (ht2[-1,:,:]) hidden states
h = torch.cat((ht2[-2,:,:], ht2[-1,:,:]), dim = 1)
h.shape

torch.Size([7, 18])

In [144]:
class LSTMBiModel(torch.nn.Module):
    """Two-layer bidirectional LSTM sentiment classifier (one logit per review)."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding, bidirectional LSTM and output layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the hidden state per direction.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Index 0 is the padding token; its embedding is kept at zero.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.3,
            bidirectional=True,
        )
        # The classifier sees the forward and backward states side by side.
        self.linear = nn.Linear(2*hidden_dim, 1)

    def forward(self, x, s):
        """Return logits of shape (batch, 1).

        Args:
            x: integer tensor of padded token ids, shape (batch, seq_len).
            s: true sequence lengths, in decreasing order (required by
                `pack_padded_sequence` with its default `enforce_sorted=True`).
        """
        embedded = self.embeddings(x)
        packed = pack_padded_sequence(embedded, s, batch_first=True)
        _, (h_n, _) = self.lstm(packed)
        # The last two entries of h_n are the top layer's forward and backward states.
        forward_state = h_n[-2]
        backward_state = h_n[-1]
        combined = torch.cat([forward_state, backward_state], dim=1)
        return self.linear(combined)

In [145]:
vocab_size = len(words)
model3 = LSTMBiModel(vocab_size, 50, 50).cuda()

In [None]:
train_epocs(model3, epochs=30, lr=0.01)

train loss 0.364 val loss 0.392 and val accuracy 0.829
train loss 0.260 val loss 0.410 and val accuracy 0.832


## Bidirectional GRUs

In [None]:
class GRUBiModel(torch.nn.Module):
    """Two-layer bidirectional GRU sentiment classifier (one logit per review)."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding, bidirectional GRU and output layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the hidden state per direction.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Index 0 is the padding token; its embedding is kept at zero.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.3,
            bidirectional=True,
        )
        # The classifier sees the forward and backward states side by side.
        self.linear = nn.Linear(2*hidden_dim, 1)

    def forward(self, x, s):
        """Return logits of shape (batch, 1).

        Args:
            x: integer tensor of padded token ids, shape (batch, seq_len).
            s: true sequence lengths, in decreasing order (required by
                `pack_padded_sequence` with its default `enforce_sorted=True`).
        """
        embedded = self.embeddings(x)
        packed = pack_padded_sequence(embedded, s, batch_first=True)
        _, h_n = self.gru(packed)
        # The last two entries of h_n are the top layer's forward and backward states.
        forward_state = h_n[-2]
        backward_state = h_n[-1]
        combined = torch.cat([forward_state, backward_state], dim=1)
        return self.linear(combined)

## Exercise:
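Start with pre-trained embeddings: initialize the embedding layer with pre-trained word vectors (e.g. GloVe) instead of learning it from scratch.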

## References

The model in this notebook is adapted from this [pytorch tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html). 