# 1 - RNN + LSTM

## import lib

In [None]:
import torch
import torch.nn.functional as F
from torchtext.legacy import data

from torchtext.legacy import datasets
import time
import random

# Set cuDNN's `deterministic` flag; together with the fixed seeds below this
# is meant to make GPU runs repeatable.
torch.backends.cudnn.deterministic = True

In [None]:
# --- Reproducibility -------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)  # seed PyTorch's global RNG

# --- Data and optimisation hyperparameters ---------------------------------
VOCABULARY_SIZE, BATCH_SIZE = 20_000, 128
NUM_EPOCHS = 15
LEARNING_RATE = 0.0001

# --- Model dimensions ------------------------------------------------------
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 128, 256, 1

# --- Compute device: GPU when PyTorch can see one, CPU otherwise -----------
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## Preparing data

In [None]:
# TEXT describes how a raw review is tokenised. The spaCy model is named
# explicitly (the same one the demo cell loads) instead of relying on the
# legacy 'en' shortcut. include_lengths=True makes each batch carry
# (padded_tokens, lengths), which pack_padded_sequence needs.
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  include_lengths=True)
# Float labels, as required by binary_cross_entropy_with_logits.
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so `random_state=random.seed(...)` only worked
# through its seeding side effect. Seed explicitly and pass the resulting RNG
# state so the 80/20 train/validation split is reproducible by construction.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')



Num Train: 20000
Num Valid: 5000
Num Test: 25000


In [None]:
# Both vocabularies are fitted on the training split only; the two calls are
# independent of each other.
LABEL.build_vocab(train_data)
TEXT.build_vocab(
    train_data,
    max_size=VOCABULARY_SIZE,
)

vocab_size, num_classes = len(TEXT.vocab), len(LABEL.vocab)
print(f'Vocabulary size: {vocab_size}\nNumber of classes: {num_classes}')

Vocabulary size: 20002
Number of classes: 2


In [None]:
# Options shared by the three iterators. sort_within_batch=True is necessary
# for pack_padded_sequence in the model's forward pass.
loader_options = {
    'batch_size': BATCH_SIZE,
    'sort_within_batch': True,
    'device': DEVICE,
}
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_data), **loader_options)

In [None]:
# Sanity check: print the tensor shapes of the first batch of every loader.
for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        # batch.text is a (tokens, lengths) pair because include_lengths=True.
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        break  # only the first batch is of interest

Train
Text matrix size: torch.Size([132, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([59, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])


## Build the model

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-layer LSTM classifier: embedding -> LSTM -> linear.

    Args:
        input_dim: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of units of the final linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Sub-modules are created in the same order as before so that a fixed
        # seed keeps producing the same initial weights.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text, text_length):
        """Return one logit per sequence as a flat tensor.

        Args:
            text: token indices, [sentence len, batch size].
            text_length: length of every sequence in the batch; moved to the
                CPU before packing.
        """
        lengths_on_cpu = text_length.to('cpu')

        # [sentence len, batch size] -> [sentence len, batch size, embedding size]
        token_vectors = self.embedding(text)
        packed_input = nn.utils.rnn.pack_padded_sequence(token_vectors,
                                                         lengths_on_cpu)

        # Only the final hidden state, [1, batch size, hidden size], is used.
        _, (final_hidden, _) = self.rnn(packed_input)

        scores = self.fc(final_hidden.squeeze(0))
        return scores.view(-1)

In [None]:
# The embedding table needs one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)

# Re-seed immediately before construction so the initial weights are
# reproducible regardless of what ran earlier.
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=INPUT_DIM,
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=OUTPUT_DIM).to(DEVICE)

# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

## Train the model

In [None]:
def compute_binary_accuracy(model, data_loader, device):
    """Return the accuracy of ``model`` on ``data_loader`` in percent.

    Args:
        model: callable taking ``(text, text_lengths)`` and returning one
            logit per example; must provide ``eval()``.
        data_loader: iterable of batches exposing ``text`` (a
            ``(tokens, lengths)`` pair) and ``label``.
        device: unused; kept so existing call sites keep working.

    Returns:
        A 0-dim float tensor holding the accuracy as a percentage.

    Raises:
        ValueError: if ``data_loader`` yields no examples. Previously this
            surfaced as an AttributeError on ``int.float()``.

    Note:
        The model is left in eval mode; callers that keep training must call
        ``model.train()`` again.
    """
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.text
            logits = model(text, text_lengths)
            # A probability above 0.5 is predicted as the positive class.
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += batch_data.label.size(0)
            correct_pred += (predicted_labels == batch_data.label.long()).sum()

    if num_examples == 0:
        raise ValueError('compute_binary_accuracy: data_loader yielded no examples')
    return correct_pred.float() / num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # The accuracy evaluation at the end of each epoch switches the model to
    # eval mode, so training mode is re-enabled at the top of every epoch.
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        text, text_lengths = batch_data.text

        # Forward pass; the loss is computed on the raw logits.
        logits = model(text, text_lengths)
        cost = F.binary_cross_entropy_with_logits(logits, batch_data.label)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Log the current mini-batch loss every 50th batch.
        if batch_idx % 50 == 0:
            progress = (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                        f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                        f'Cost: {cost:.4f}')
            print(progress)

    # End-of-epoch evaluation on the training and validation splits.
    with torch.no_grad():
        train_acc = compute_binary_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_binary_accuracy(model, valid_loader, DEVICE)
    print(f'training accuracy: {train_acc:.2f}%\nvalid accuracy: {valid_acc:.2f}%')

    elapsed_min = (time.time() - start_time) / 60
    print(f'Time elapsed: {elapsed_min:.2f} min')

total_min = (time.time() - start_time) / 60
print(f'Total Training Time: {total_min:.2f} min')
test_acc = compute_binary_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_acc:.2f}%')

Epoch: 001/015 | Batch 000/157 | Cost: 0.6926
Epoch: 001/015 | Batch 050/157 | Cost: 0.6961
Epoch: 001/015 | Batch 100/157 | Cost: 0.6822
Epoch: 001/015 | Batch 150/157 | Cost: 0.6801
training accuracy: 57.45%
valid accuracy: 55.80%
Time elapsed: 0.21 min
Epoch: 002/015 | Batch 000/157 | Cost: 0.6784
Epoch: 002/015 | Batch 050/157 | Cost: 0.6727
Epoch: 002/015 | Batch 100/157 | Cost: 0.5835
Epoch: 002/015 | Batch 150/157 | Cost: 0.6063
training accuracy: 70.53%
valid accuracy: 70.06%
Time elapsed: 0.39 min
Epoch: 003/015 | Batch 000/157 | Cost: 0.5987
Epoch: 003/015 | Batch 050/157 | Cost: 0.5703
Epoch: 003/015 | Batch 100/157 | Cost: 0.5279
Epoch: 003/015 | Batch 150/157 | Cost: 0.5966
training accuracy: 75.06%
valid accuracy: 73.86%
Time elapsed: 0.57 min
Epoch: 004/015 | Batch 000/157 | Cost: 0.4822
Epoch: 004/015 | Batch 050/157 | Cost: 0.4943
Epoch: 004/015 | Batch 100/157 | Cost: 0.4635
Epoch: 004/015 | Batch 150/157 | Cost: 0.5266
training accuracy: 78.39%
valid accuracy: 76.46%

# 2 - RNN + LSTM + GloVe

## Install lib and import lib

In [None]:
!pip install torchdata

In [None]:
!pip install -U torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 7.0 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.7 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+

In [None]:
import torch
import torch.nn.functional as F
import torchtext
import random

from torchtext.legacy import data

from torchtext.legacy import datasets
import random

In [None]:
# --- Reproducibility -------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)  # seed PyTorch's global RNG

# Checkpoint file name; only the commented-out manual pipeline further down
# refers to it.
MODELNAME = "imdb-rnn.model"

# --- Data and optimisation hyperparameters ---------------------------------
# (the commented-out manual pipeline used BATCHSIZE = 64 instead)
VOCABULARY_SIZE, BATCH_SIZE = 20_000, 128
NUM_EPOCHS = 15
LEARNING_RATE = 0.0001

# --- Model dimensions ------------------------------------------------------
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 128, 256, 1

# --- Compute device, kept as a plain string in this section ----------------
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

## Preparing data

One of the main concepts of TorchText is the `Field`. These define how your data should be processed. In our sentiment classification task the data consists of both the raw string of the review and the sentiment, either "pos" or "neg".

The parameters of a `Field` specify how the data should be processed. 

We use the `TEXT` field to define how the review should be processed, and the `LABEL` field to process the sentiment. 

Our `TEXT` field has `tokenize='spacy'` as an argument. This defines that the "tokenization" (the act of splitting the string into discrete "tokens") should be done using the [spaCy](https://spacy.io) tokenizer. If no `tokenize` argument is passed, the default is simply splitting the string on spaces. We also need to specify a `tokenizer_language` which tells torchtext which spaCy model to use. We use the `en_core_web_sm` model which has to be downloaded with `python -m spacy download en_core_web_sm` before you run this notebook!

`LABEL` is defined by a `LabelField`, a special subset of the `Field` class specifically used for handling labels. We will explain the `dtype` argument later.

For more on `Fields`, go [here](https://github.com/pytorch/text/blob/master/torchtext/data/field.py).

We also set the random seeds for reproducibility. 

In [None]:
# TEXT describes how a raw review is tokenised. The spaCy model is named
# explicitly, as the accompanying text asks for, instead of relying on the
# legacy 'en' shortcut. include_lengths=True makes each batch carry
# (padded_tokens, lengths), which pack_padded_sequence needs.
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  include_lengths=True)
# Float labels, as required by binary_cross_entropy_with_logits.
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so `random_state=random.seed(...)` only worked
# through its seeding side effect. Seed explicitly and pass the resulting RNG
# state so the 80/20 train/validation split is reproducible by construction.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())



downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 23.5MB/s]


In [None]:
# train_data = [(label, tokenizer(line)) for label, line in train_iter]
# train_data.sort(key = lambda x: len(x[1]))
# test_data = [(label, tokenizer(line)) for label, line in test_iter]
# test_data.sort(key = lambda x: len(x[1]))
# for i in range(10):
  # print(train_data[i])

In [None]:
# Report how many examples ended up in each split.
for split_name, split in (('Train', train_data),
                          ('Valid', valid_data),
                          ('Test', test_data)):
    print(f'Num {split_name}: {len(split)}')

Num Train: 20000
Num Valid: 5000
Num Test: 25000


### Build the vocabulary based on the top "VOCABULARY_SIZE" words:

The `TEXT.vocab` dictionary holds the word counts and the word indices. The vocabulary size is `VOCABULARY_SIZE + 2` because it also contains two special tokens, one for unknown words and one for padding: `<unk>` and `<pad>`.

In [None]:
# Both vocabularies are fitted on the training split only; the two calls are
# independent of each other.
LABEL.build_vocab(train_data)
TEXT.build_vocab(
    train_data,
    max_size=VOCABULARY_SIZE,
    vectors='glove.6B.100d',          # pretrained GloVe vectors
    unk_init=torch.Tensor.normal_,    # initialiser for words without a vector
)

vocab_size, num_classes = len(TEXT.vocab), len(LABEL.vocab)
print(f'Vocabulary size: {vocab_size}\nNumber of classes: {num_classes}')

.vector_cache/glove.6B.zip: 862MB [02:39, 5.39MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 29464.39it/s]


Vocabulary size: 20002
Number of classes: 2


In [None]:
# Options shared by the three iterators. sort_within_batch=True is necessary
# for pack_padded_sequence in the model's forward pass.
loader_options = {
    'batch_size': BATCH_SIZE,
    'sort_within_batch': True,
    'device': DEVICE,
}
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_data), **loader_options)

In [None]:
# Sanity check: print the tensor shapes of the first batch of every loader.
for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        # batch.text is a (tokens, lengths) pair because include_lengths=True.
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        break  # only the first batch is of interest

Train
Text matrix size: torch.Size([132, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([59, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])


In [None]:
# def make_vocab(train_data, min_freq):
#   vocab = {}
#   for label, tokenlist in train_data:
#     for token in tokenlist:
#       if token not in vocab:
#         vocab[token] = 0
#       vocab[token] += 1
#   vocablist = [('<unk>', 0), ('<pad>', 0), ('<cls>', 0), ('<eos>', 0)]
#   vocabidx = {}
#   for token, freq in vocab.items():
#     if freq >= min_freq:
#       idx = len(vocablist)
#       vocablist.append((token, freq))
#       vocabidx[token]=idx
#   vocabidx['<unk>']=0
#   vocabidx['<pad>']=1
#   vocabidx['<cls>']=2
#   vocabidx['<eos>']=3
#   return vocablist, vocabidx

# vocablist, vocabidx = make_vocab(train_data, 10)

In [None]:
# def preprocess(data, vocabidx):
#   rr = []
#   for label, tokenlist in data:
#     tkl = ['<cls>']
#     for token in tokenlist:
#       tkl.append(token if token in vocabidx else '<unk>')
#     tkl.append('<eos>')
#     rr.append((label, tkl))
#   return rr

# train_data = preprocess(train_data, vocabidx)
# test_data = preprocess(test_data, vocabidx)
# for i in range(10):
#   print(train_data[i])

In [None]:
# def make_batch(data, batchsize):
#   bb = []
#   blabel = []
#   btokenlist = []
#   for label, tokenlist in data: 
#     blabel.append(label)
#     btokenlist.append(tokenlist)
#     if len(blabel) >= batchsize:
#       bb.append((btokenlist, blabel))
#       blabel = []
#       btokenlist = []
#   if len(blabel) > 0:
#     bb.append((btokenlist, blabel))
#   return bb

# train_data = make_batch(train_data, BATCHSIZE)
# test_data = make_batch(test_data, BATCHSIZE)
# for i in range(10):
#   print(train_data[i])

In [None]:
# def padding(bb):
#   for tokenlists, labels in bb: 
#     maxlen = max([len(x) for x in tokenlists])
#     for tkl in tokenlists:
#       for i in range(maxlen - len(tkl)):
#         tkl.append('<pad>')
#   return bb

# train_data = padding(train_data)
# test_data = padding(test_data)
# for i in range(10):
#   print(train_data[i])

In [None]:
# def word2id(bb, vocabidx):
#   rr = []
#   for tokenlists, labels in bb:
#     id_labels = [1 if label == 'pos' else 0 for label in labels]
#     id_tokenlists = []
#     for tokenlist in tokenlists:
#       id_tokenlists.append([vocabidx[token] for token in tokenlist])
#     rr.append((id_tokenlists, id_labels))
#   return rr

# train_data = word2id(train_data, vocabidx)
# test_data = word2id(test_data, vocabidx)
# for i in range(10):
#   print(train_data[i])

## Build the model

In [None]:
# class MyRNN(torch.nn.Module):
#   def __init__(self):
#     super(MyRNN, self).__init__()
#     vocabsize = len(vocablist)
#     self.emb = torch.nn.Embedding(vocabsize, 300, padding_idx=vocabidx['<pad>'])
#     self.l1 = torch.nn.Linear(300,300)
#     self.l2 = torch.nn.Linear(300, 2)
#   def forward(self, x):
#     e = self.emb(x)
#     h = torch.zeros(e[0].size(),dtype=torch.float32).to(DEVICE)
#     for i in range(x.size()[0]):
#       h = F.relu(e[i] + self.l1(h))
#     return self.l2(h)

In [None]:
# model = MyRNN().to(DEVICE)
# print(model)

MyRNN(
  (emb): Embedding(20439, 300, padding_idx=1)
  (l1): Linear(in_features=300, out_features=300, bias=True)
  (l2): Linear(in_features=300, out_features=2, bias=True)
)


In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-layer LSTM classifier: embedding -> LSTM -> linear.

    Args:
        input_dim: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of units of the final linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Sub-modules are created in the same order as before so that a fixed
        # seed keeps producing the same initial weights.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text, text_length):
        """Return one logit per sequence as a flat tensor.

        Args:
            text: token indices, [sentence len, batch size].
            text_length: length of every sequence in the batch; moved to the
                CPU before packing.
        """
        lengths_on_cpu = text_length.to('cpu')

        # [sentence len, batch size] -> [sentence len, batch size, embedding size]
        token_vectors = self.embedding(text)
        packed_input = nn.utils.rnn.pack_padded_sequence(token_vectors,
                                                         lengths_on_cpu)

        # Only the final hidden state, [1, batch size, hidden size], is used.
        _, (final_hidden, _) = self.rnn(packed_input)

        scores = self.fc(final_hidden.squeeze(0))
        return scores.view(-1)

In [None]:
INPUT_DIM = len(TEXT.vocab)

# The vocabulary was built with pretrained GloVe vectors, but nothing ever
# loaded them into the model, so this section trained exactly like the
# randomly initialised model of section 1 (the logged costs are identical).
# Size the embedding from the pretrained vectors (100-d, not EMBEDDING_DIM)
# and copy them in. Without vectors, fall back to the configured size.
pretrained_embeddings = TEXT.vocab.vectors
if pretrained_embeddings is not None:
    embedding_dim = pretrained_embeddings.size(1)
else:
    embedding_dim = EMBEDDING_DIM

torch.manual_seed(RANDOM_SEED)
model = RNN(INPUT_DIM, embedding_dim, HIDDEN_DIM, OUTPUT_DIM)
if pretrained_embeddings is not None:
    # Overwrite the random initialisation; the weights stay trainable.
    with torch.no_grad():
        model.embedding.weight.copy_(pretrained_embeddings)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Show the module summary (sub-module names and their sizes).
print(model)

RNN(
  (embedding): Embedding(20002, 128)
  (rnn): LSTM(128, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


## Train the model 

In [None]:
# def train():
#   model = MyRNN().to(DEVICE)
#   optimizer = torch.optim.Adam(model.parameters(), lr=LR)
#   for epoch in range(EPOCH):
#     loss = 0
#     for tokenlists, labels in train_data:
#       tokenlists = torch.tensor(tokenlists, dtype=torch.int64).transpose(0, 1).to(DEVICE)
#       labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
#       optimizer.zero_grad()
#       y = model(tokenlists)
#       print(len(y))
#       batchloss = F.cross_entropy(y, labels)
#       batchloss.backward()
#       optimizer.step()
#       loss = loss + batchloss.item()
#     print("epoch", epoch, ": loss")
#   torch.save(model.state_dict(), MODELNAME)

# train()

In [None]:
# def test():
#   total = 0
#   correct = 0
#   model = MyRNN().to(DEVICE)
#   model.load_state_dict(torch.load(MODELNAME))
#   model.eval()
#   for tokenlists, labels in test_data:
#     total += len(labels)
#     tokenlists = torch.tensor(tokenlists, dtype=torch.int64).transpose(0, 1).to(DEVICE)
#     labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
#     y = model(tokenlists)
#     pred_labels = y.max(dim=1)[1]
#     correct += (pred_labels == labels).sum()
#   print("correct:", correct.item())
#   print("total:", total)
#   print("accuracy:", (correct.item() / float(total)))

In [None]:
# train()
# test()

In [None]:
def compute_binary_accuracy(model, data_loader, device):
    """Return the accuracy of ``model`` on ``data_loader`` in percent.

    Args:
        model: callable taking ``(text, text_lengths)`` and returning one
            logit per example; must provide ``eval()``.
        data_loader: iterable of batches exposing ``text`` (a
            ``(tokens, lengths)`` pair) and ``label``.
        device: unused; kept so existing call sites keep working.

    Returns:
        A 0-dim float tensor holding the accuracy as a percentage.

    Raises:
        ValueError: if ``data_loader`` yields no examples. Previously this
            surfaced as an AttributeError on ``int.float()``.

    Note:
        The model is left in eval mode; callers that keep training must call
        ``model.train()`` again.
    """
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.text
            logits = model(text, text_lengths)
            # A probability above 0.5 is predicted as the positive class.
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += batch_data.label.size(0)
            correct_pred += (predicted_labels == batch_data.label.long()).sum()

    if num_examples == 0:
        raise ValueError('compute_binary_accuracy: data_loader yielded no examples')
    return correct_pred.float() / num_examples * 100

In [None]:
import time

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # The accuracy evaluation at the end of each epoch switches the model to
    # eval mode, so training mode is re-enabled at the top of every epoch.
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        text, text_lengths = batch_data.text

        # Forward pass; the loss is computed on the raw logits.
        logits = model(text, text_lengths)
        cost = F.binary_cross_entropy_with_logits(logits, batch_data.label)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Log the current mini-batch loss every 50th batch.
        if batch_idx % 50 == 0:
            progress = (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                        f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                        f'Cost: {cost:.4f}')
            print(progress)

    # End-of-epoch evaluation on the training and validation splits.
    with torch.no_grad():
        train_acc = compute_binary_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_binary_accuracy(model, valid_loader, DEVICE)
    print(f'training accuracy: {train_acc:.2f}%\nvalid accuracy: {valid_acc:.2f}%')

    elapsed_min = (time.time() - start_time) / 60
    print(f'Time elapsed: {elapsed_min:.2f} min')

total_min = (time.time() - start_time) / 60
print(f'Total Training Time: {total_min:.2f} min')
test_acc = compute_binary_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_acc:.2f}%')

Epoch: 001/015 | Batch 000/157 | Cost: 0.6926
Epoch: 001/015 | Batch 050/157 | Cost: 0.6961
Epoch: 001/015 | Batch 100/157 | Cost: 0.6822
Epoch: 001/015 | Batch 150/157 | Cost: 0.6801
training accuracy: 57.45%
valid accuracy: 55.80%
Time elapsed: 0.18 min
Epoch: 002/015 | Batch 000/157 | Cost: 0.6784
Epoch: 002/015 | Batch 050/157 | Cost: 0.6727
Epoch: 002/015 | Batch 100/157 | Cost: 0.5835
Epoch: 002/015 | Batch 150/157 | Cost: 0.6063
training accuracy: 70.53%
valid accuracy: 70.06%
Time elapsed: 0.35 min
Epoch: 003/015 | Batch 000/157 | Cost: 0.5987
Epoch: 003/015 | Batch 050/157 | Cost: 0.5703
Epoch: 003/015 | Batch 100/157 | Cost: 0.5279
Epoch: 003/015 | Batch 150/157 | Cost: 0.5966
training accuracy: 75.06%
valid accuracy: 73.86%
Time elapsed: 0.53 min
Epoch: 004/015 | Batch 000/157 | Cost: 0.4822
Epoch: 004/015 | Batch 050/157 | Cost: 0.4943
Epoch: 004/015 | Batch 100/157 | Cost: 0.4635
Epoch: 004/015 | Batch 150/157 | Cost: 0.5266
training accuracy: 78.39%
valid accuracy: 76.46%

## Demo

In [None]:
import spacy
# spaCy English pipeline; predict_sentiment() below only uses its tokenizer.
nlp = spacy.load("en_core_web_sm")

def predict_sentiment(model, sentence):
    """Return the model's sigmoid output for a single raw sentence.

    The sentence is tokenised with the module-level spaCy pipeline ``nlp``,
    mapped to indices through ``TEXT.vocab.stoi`` and fed to ``model`` as a
    batch of one.

    Args:
        model: callable taking ``(tokens, lengths)`` and returning one logit;
            it is switched to eval mode.
        sentence: raw text to classify.

    Returns:
        The sigmoid of the model's logit as a Python float.

    Raises:
        ValueError: if ``sentence`` produces no tokens. Previously this failed
            inside ``pack_padded_sequence`` with an unrelated-looking error.
    """
    # based on:
    # https://github.com/bentrevett/pytorch-sentiment-analysis/blob/
    # master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError('predict_sentiment: sentence contains no tokens')

    # NOTE(review): presumably stoi maps out-of-vocabulary tokens to <unk>;
    # confirm against the torchtext Vocab in use.
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # [sentence len] -> [sentence len, 1]: a batch holding one sequence.
    tensor = torch.LongTensor(indexed).to(DEVICE).unsqueeze(1)
    # The lengths stay on the CPU, which is what sequence packing expects.
    length_tensor = torch.LongTensor([len(indexed)])

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# The bare call on the last line is not printed explicitly; the notebook
# echoes its return value as the cell output.
print('Probability positive:')
predict_sentiment(model, "I really love this movie. This movie is so great!")

Probability positive:


0.842251181602478

# 3 - Multilayer bidirectional RNN + LSTM

## import lib

In [None]:
# --- Reproducibility -------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)  # seed PyTorch's global RNG

# --- Data and optimisation hyperparameters ---------------------------------
VOCABULARY_SIZE, BATCH_SIZE = 20_000, 128
NUM_EPOCHS = 15
LEARNING_RATE = 0.0001

# --- Model architecture: stacked, bidirectional LSTM -----------------------
NUM_LAYERS, BIDIRECTIONAL = 2, True
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 128, 128, 1

# --- Compute device: GPU when PyTorch can see one, CPU otherwise -----------
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

## Preparing data

In [None]:
# TEXT describes how a raw review is tokenised. The spaCy model is named
# explicitly (the same one the demo cell loads) instead of relying on the
# legacy 'en' shortcut. include_lengths=True makes each batch carry
# (padded_tokens, lengths), which pack_padded_sequence needs.
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  include_lengths=True)
# Float labels, as required by binary_cross_entropy_with_logits.
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so `random_state=random.seed(...)` only worked
# through its seeding side effect. Seed explicitly and pass the resulting RNG
# state so the 80/20 train/validation split is reproducible by construction.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')



Num Train: 20000
Num Valid: 5000
Num Test: 25000


In [None]:
# Both vocabularies are fitted on the training split only; the two calls are
# independent of each other.
LABEL.build_vocab(train_data)
TEXT.build_vocab(
    train_data,
    max_size=VOCABULARY_SIZE,
)

vocab_size, num_classes = len(TEXT.vocab), len(LABEL.vocab)
print(f'Vocabulary size: {vocab_size}\nNumber of classes: {num_classes}')

Vocabulary size: 20002
Number of classes: 2


In [None]:
# Options shared by the three iterators. sort_within_batch=True is necessary
# for pack_padded_sequence in the model's forward pass.
loader_options = {
    'batch_size': BATCH_SIZE,
    'sort_within_batch': True,
    'device': DEVICE,
}
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_data), **loader_options)

In [None]:
# Sanity check: print the tensor shapes of the first batch of every loader.
for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        # batch.text is a (tokens, lengths) pair because include_lengths=True.
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        break  # only the first batch is of interest

Train
Text matrix size: torch.Size([132, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([59, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])


## Build the model

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Stacked, optionally bidirectional LSTM classifier.

    Args:
        input_dim: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of units of the final linear layer.
        num_layers: number of stacked LSTM layers. ``None`` (the default)
            falls back to the module-level ``NUM_LAYERS``.
        bidirectional: whether the LSTM runs in both directions. ``None``
            (the default) falls back to the module-level ``BIDIRECTIONAL``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 num_layers=None, bidirectional=None):

        super().__init__()

        # Keep the original behaviour of reading the notebook-level settings
        # when the caller does not pass the architecture explicitly.
        if num_layers is None:
            num_layers = NUM_LAYERS
        if bidirectional is None:
            bidirectional = BIDIRECTIONAL
        self.bidirectional = bool(bidirectional)
        num_directions = 2 if self.bidirectional else 1

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=num_layers,
                           bidirectional=self.bidirectional)
        # The classifier input width follows the number of directions instead
        # of being hard-coded to hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, text, text_length):
        """Return one logit per sequence as a flat tensor.

        Args:
            text: token indices, [sentence len, batch size].
            text_length: length of every sequence in the batch; moved to the
                CPU before packing.
        """
        # [sentence len, batch size] => [sentence len, batch size, embedding size]
        embedded = self.embedding(text)

        packed = nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'))

        # hidden: [num_layers * num_directions, batch size, hidden size]
        _, (hidden, _) = self.rnn(packed)

        if self.bidirectional:
            # Last layer: forward state is hidden[-2], backward is hidden[-1].
            combined = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Unidirectional: hidden[-2] would be the previous *layer*.
            combined = hidden[-1]

        # combined is already [batch size, features]; no squeeze is needed.
        return self.fc(combined).view(-1)

In [None]:
# The embedding table needs one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)

# Re-seed immediately before construction so the initial weights are
# reproducible regardless of what ran earlier.
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=INPUT_DIM,
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=OUTPUT_DIM).to(DEVICE)

# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

## Train the model 

In [None]:
def compute_binary_accuracy(model, data_loader, device):
    """Return the accuracy of ``model`` on ``data_loader`` in percent.

    Args:
        model: callable taking ``(text, text_lengths)`` and returning one
            logit per example; must provide ``eval()``.
        data_loader: iterable of batches exposing ``text`` (a
            ``(tokens, lengths)`` pair) and ``label``.
        device: unused; kept so existing call sites keep working.

    Returns:
        A 0-dim float tensor holding the accuracy as a percentage.

    Raises:
        ValueError: if ``data_loader`` yields no examples. Previously this
            surfaced as an AttributeError on ``int.float()``.

    Note:
        The model is left in eval mode; callers that keep training must call
        ``model.train()`` again.
    """
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.text
            logits = model(text, text_lengths)
            # A probability above 0.5 is predicted as the positive class.
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += batch_data.label.size(0)
            correct_pred += (predicted_labels == batch_data.label.long()).sum()

    if num_examples == 0:
        raise ValueError('compute_binary_accuracy: data_loader yielded no examples')
    return correct_pred.float() / num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # The accuracy evaluation at the end of each epoch switches the model to
    # eval mode, so training mode is re-enabled at the top of every epoch.
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        text, text_lengths = batch_data.text

        # Forward pass; the loss is computed on the raw logits.
        logits = model(text, text_lengths)
        cost = F.binary_cross_entropy_with_logits(logits, batch_data.label)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # Log the current mini-batch loss every 50th batch.
        if batch_idx % 50 == 0:
            progress = (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                        f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                        f'Cost: {cost:.4f}')
            print(progress)

    # End-of-epoch evaluation on the training and validation splits.
    with torch.no_grad():
        train_acc = compute_binary_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_binary_accuracy(model, valid_loader, DEVICE)
    print(f'training accuracy: {train_acc:.2f}%\nvalid accuracy: {valid_acc:.2f}%')

    elapsed_min = (time.time() - start_time) / 60
    print(f'Time elapsed: {elapsed_min:.2f} min')

total_min = (time.time() - start_time) / 60
print(f'Total Training Time: {total_min:.2f} min')
test_acc = compute_binary_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_acc:.2f}%')

Epoch: 001/015 | Batch 000/157 | Cost: 0.6932
Epoch: 001/015 | Batch 050/157 | Cost: 0.6960
Epoch: 001/015 | Batch 100/157 | Cost: 0.6795
Epoch: 001/015 | Batch 150/157 | Cost: 0.6831
training accuracy: 58.94%
valid accuracy: 57.70%
Time elapsed: 0.30 min
Epoch: 002/015 | Batch 000/157 | Cost: 0.6762
Epoch: 002/015 | Batch 050/157 | Cost: 0.6331
Epoch: 002/015 | Batch 100/157 | Cost: 0.5971
Epoch: 002/015 | Batch 150/157 | Cost: 0.5944
training accuracy: 69.57%
valid accuracy: 69.92%
Time elapsed: 0.60 min
Epoch: 003/015 | Batch 000/157 | Cost: 0.6172
Epoch: 003/015 | Batch 050/157 | Cost: 0.5505
Epoch: 003/015 | Batch 100/157 | Cost: 0.4957
Epoch: 003/015 | Batch 150/157 | Cost: 0.6229
training accuracy: 75.52%
valid accuracy: 74.62%
Time elapsed: 0.91 min
Epoch: 004/015 | Batch 000/157 | Cost: 0.4683
Epoch: 004/015 | Batch 050/157 | Cost: 0.5007
Epoch: 004/015 | Batch 100/157 | Cost: 0.5525
Epoch: 004/015 | Batch 150/157 | Cost: 0.5566
training accuracy: 79.42%
valid accuracy: 77.62%

# 4 - RNN + LSTM + GloVe + dropout

## import lib

In [None]:
import torch
from spacy.tokenizer import Tokenizer
# from torchtext import data
# from torchtext import datasets
from torchtext.legacy import data
from torchtext.legacy import datasets

# SEED = 11
# torch.manual_seed(SEED)                         ## Reproducibility
# --- Reproducibility -------------------------------------------------------
# cuDNN's `deterministic` flag plus a fixed seed for PyTorch's global RNG.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# --- Data and optimisation hyperparameters ---------------------------------
VOCABULARY_SIZE, BATCH_SIZE = 20_000, 128
NUM_EPOCHS = 15
LEARNING_RATE = 0.0001

# --- Model architecture: stacked, bidirectional LSTM -----------------------
NUM_LAYERS, BIDIRECTIONAL = 2, True
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 128, 128, 1

# --- Compute device: GPU when PyTorch can see one, CPU otherwise -----------
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# TEXT = data.Field(tokenize = 'spacy', include_lengths = True)   ## Text field
# LABEL = data.LabelField(dtype = torch.float)                    ## Label Field

## Preparing data

In [None]:
import random
# TEXT describes how a raw review is tokenised. The spaCy model is named
# explicitly (the same one the demo cell loads) instead of relying on the
# legacy 'en' shortcut. include_lengths=True makes each batch carry
# (padded_tokens, lengths), which pack_padded_sequence needs.
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  include_lengths=True)
# Float labels, as required by binary_cross_entropy_with_logits.
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so `random_state=random.seed(...)` only worked
# through its seeding side effect. Seed explicitly and pass the resulting RNG
# state so the 80/20 train/validation split is reproducible by construction.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')



downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 51.2MB/s]


Num Train: 20000
Num Valid: 5000
Num Test: 25000


In [None]:
# Both vocabularies are fitted on the training split only; the two calls are
# independent of each other.
LABEL.build_vocab(train_data)
TEXT.build_vocab(
    train_data,
    max_size=VOCABULARY_SIZE,
    vectors='glove.6B.100d',          # pretrained GloVe vectors
    unk_init=torch.Tensor.normal_,    # initialiser for words without a vector
)

vocab_size, num_classes = len(TEXT.vocab), len(LABEL.vocab)
print(f'Vocabulary size: {vocab_size}\nNumber of classes: {num_classes}')

.vector_cache/glove.6B.zip: 862MB [02:40, 5.37MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 29088.80it/s]


Vocabulary size: 20002
Number of classes: 2


In [None]:
# Options shared by the three iterators. sort_within_batch=True is necessary
# for pack_padded_sequence in the model's forward pass.
loader_options = {
    'batch_size': BATCH_SIZE,
    'sort_within_batch': True,
    'device': DEVICE,
}
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_data), **loader_options)

In [None]:
# Sanity check: print the tensor shapes of the first batch of every loader.
for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        # batch.text is a (tokens, lengths) pair because include_lengths=True.
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        break  # only the first batch is of interest

Train
Text matrix size: torch.Size([133, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([61, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([42, 128])
Target vector size: torch.Size([128])


## Build the model

In [None]:
import torch.nn as nn

class Model(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            # Inter-layer dropout only exists with >1 layer;
                            # passing it for a single layer just emits a warning.
                            dropout=dropout if n_layers > 1 else 0)

        # The classifier consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            text: token ids, shape = (sent_length, batch_size).
            text_lengths: unpadded length of every sequence in the batch,
                sorted in decreasing order (pack_padded_sequence default).

        Returns:
            Tensor of shape (batch_size, output_dim) holding raw logits.
        """
        ## text shape = (sent_length, batch_size)
        ## embedded shape = (sent_length, batch_size, emb_dim)
        embedded = self.dropout(self.embedding(text))

        ## pack_padded_sequence needs the lengths on the CPU
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        ## hidden shape = (num_layers * num_directions, batch_size, hid_dim)
        ## the per-step outputs and the cell state are not needed for classification
        _, (hidden, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            ## concat the last layer's final forward (hidden[-2]) and backward (hidden[-1]) states
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            ## unidirectional: the last layer's final state is hidden[-1]
            final_state = hidden[-1, :, :]

        ## final_state shape = (batch_size, hid_dim * num_directions)
        return self.fc(self.dropout(final_state))

In [None]:
# Hyperparameters for the pretrained-embedding LSTM run.
VOCABULARY_SIZE = 20000
# NOTE(review): a later cell re-creates the optimizer with a hard-coded
# lr = 0.001, so this value is not the one used during training.
LEARNING_RATE = 1e-4
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BIDIRECTIONAL = True

# Must equal the width of the pretrained vectors (TEXT.vocab.vectors is [20002, 100]).
EMBEDDING_DIM = 100  # 128
NUM_LAYERS = 2 
HIDDEN_DIM = 128 # 256
OUTPUT_DIM = 1
DROPOUT = 0.4
# Vocabulary index of the padding token; passed to the model as pad_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [None]:
# The embedding table needs one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)

# Re-seed so the weight initialisation is repeatable, then build the model on DEVICE.
torch.manual_seed(RANDOM_SEED)
model = Model(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(DEVICE)

# NOTE(review): `optimizer` is rebound by a later cell (Adam with lr = 0.001)
# before training starts, so this instance is never stepped.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Total number of scalar entries across all parameters that require gradients.
train_params = sum(
    tensor.numel()
    for tensor in model.parameters()
    if tensor.requires_grad
)
print(f"There are {train_params} trainable parameters")

There are 2631241 trainable parameters


##  Replace initial embedding with pretrained embedding

In [None]:
# Shape of the pretrained vector table: (vocabulary entries, vector width).
TEXT.vocab.vectors.size()

torch.Size([20002, 100])

In [None]:
# Overwrite the embedding weights in place with the pretrained vectors;
# copy_ requires the two tensors to have compatible shapes.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.3374, -0.1778, -0.3035,  ...,  0.2770,  0.6455, -0.8957],
        [ 0.4928, -0.0141, -0.2747,  ...,  0.0493,  0.8484,  0.4671],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4098,  1.0487, -0.2304,  ..., -0.6889, -0.8636, -0.6693],
        [-0.5235,  0.8734,  0.6664,  ...,  0.3166,  0.0621,  0.3844],
        [ 0.0667,  0.3963, -0.5351,  ..., -0.5661,  0.4373,  0.5835]],
       device='cuda:0')

## Replace `<unk>` and `<pad>` with zeros (they were initialized with the normal distribution)

In [None]:
# Vocabulary index of the unknown-word token.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Zero the unknown and padding rows of the embedding table after the
# pretrained vectors were copied in.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4098,  1.0487, -0.2304,  ..., -0.6889, -0.8636, -0.6693],
        [-0.5235,  0.8734,  0.6664,  ...,  0.3166,  0.0621,  0.3844],
        [ 0.0667,  0.3963, -0.5351,  ..., -0.5661,  0.4373,  0.5835]],
       device='cuda:0')


## Train the model

In [None]:
# NOTE: these imports repeat the ones in the notebook's first cell.
import torch
import torch.nn.functional as F
from torchtext.legacy import data

from torchtext.legacy import datasets
import time
import random

# Ask cuDNN for deterministic algorithms, for run-to-run reproducibility.
torch.backends.cudnn.deterministic = True

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of logits in `preds` whose rounded sigmoid equals `y`.

    `preds` holds raw logits and `y` holds 0/1 targets; the result is a
    0-dim tensor.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()  # float so the division below is not integer division
    return hits.sum() / len(hits)
    
def binary_classification_metrics(prediction, ground_truth):
    '''
    Computes metrics for binary classification

    Arguments:
    prediction, torch tensor of logits (num_samples) - raw model outputs;
        a sample counts as positive when round(sigmoid(logit)) == 1
    ground_truth, torch tensor of 0/1 labels (num_samples) - true labels

    Returns:
    precision, recall, f1, accuracy - classification metrics as Python floats.
    A ratio whose denominator is zero (e.g. precision when nothing was
    predicted positive) is reported as 0.0 instead of raising
    ZeroDivisionError.
    '''

    def _ratio(numerator, denominator):
        # 0/0 is reported as 0.0 so degenerate batches do not crash the caller.
        return numerator / denominator if denominator else 0.0

    predicted = torch.round(torch.sigmoid(prediction))
    predicted_pos, predicted_neg = predicted == 1, predicted == 0
    actual_pos, actual_neg = ground_truth == 1, ground_truth == 0

    # Confusion-matrix counts, computed with tensor ops rather than a
    # per-element Python loop.
    tp = int((predicted_pos & actual_pos).sum())      ## true positive
    tn = int((predicted_neg & actual_neg).sum())      ## true negative
    fp = int((predicted_pos & actual_neg).sum())      ## false positive
    fn = int((predicted_neg & actual_pos).sum())      ## false negative

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (precision * recall), precision + recall)

    return precision, recall, f1, accuracy

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `text` as a (tokens, lengths) pair and `label`.
    Returns (mean batch loss, mean batch accuracy), both averaged over the
    number of batches.
    """
    loss_total = 0
    accuracy_total = 0

    model.train()
    for batch in iterator:
        tokens, lengths = batch.text
        targets = batch.label

        optimizer.zero_grad()
        # The model receives the lengths as a CPU tensor.
        logits = model(tokens, lengths.to('cpu')).squeeze(1)

        batch_loss = criterion(logits, targets)
        batch_accuracy = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        accuracy_total += batch_accuracy.item()

    n_batches = len(iterator)
    return loss_total / n_batches, accuracy_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    The model is put in eval mode and gradients are disabled. Returns
    (mean batch loss, mean batch accuracy), both averaged over the number
    of batches.
    """
    loss_total = 0
    accuracy_total = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label

            # The model receives the lengths as a CPU tensor.
            logits = model(tokens, lengths.to('cpu')).squeeze(1)

            loss_total += criterion(logits, targets).item()
            accuracy_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, accuracy_total / n_batches

In [None]:
def metrics(model, iterator, criterion):
    """Return (mean batch loss, F1 score) of `model` over `iterator`.

    The F1 score is computed from true-positive / false-positive /
    false-negative counts accumulated over *all* batches, so it is the F1 of
    the whole dataset rather than an average of per-batch F1 values (which
    depends on how examples happen to be batched and is undefined for batches
    without positives). If there are no positives at all, F1 is 0.0.

    Each batch must expose `text` as a (tokens, lengths) pair and `label`
    holding 0/1 targets. A sample is predicted positive when
    round(sigmoid(logit)) == 1.
    """
    epoch_loss = 0
    tp = fp = fn = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:

            text, text_lengths = batch.text
            predictions = model(text, text_lengths.to('cpu')).squeeze(1)
            epoch_loss += criterion(predictions, batch.label).item()

            predicted = torch.round(torch.sigmoid(predictions))
            predicted_pos, predicted_neg = predicted == 1, predicted == 0
            actual_pos, actual_neg = batch.label == 1, batch.label == 0

            tp += int((predicted_pos & actual_pos).sum())
            fp += int((predicted_pos & actual_neg).sum())
            fn += int((predicted_neg & actual_pos).sum())

    # F1 = 2*TP / (2*TP + FP + FN), i.e. the harmonic mean of precision and recall.
    denominator = 2 * tp + fp + fn
    epoch_f1 = 2 * tp / denominator if denominator else 0.0

    return epoch_loss / len(iterator), epoch_f1

In [None]:
import torch.optim as optim

# Rebinds `optimizer`: this Adam instance (lr = 0.001) is the one used for
# training. NOTE(review): the LEARNING_RATE constant (1e-4) is not used here.
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [None]:
# Binary cross-entropy that takes raw logits (the model applies no sigmoid itself).
criterion = nn.BCEWithLogitsLoss()
# Move the loss module to DEVICE (GPU when available).
criterion = criterion.to(DEVICE)

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a `(minutes, seconds)` tuple of ints; fractional parts are
    truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = duration - whole_minutes * 60
    return whole_minutes, int(leftover_seconds)

In [None]:
# Lowest validation loss seen so far; decides when to checkpoint.
best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    
    # One pass over the training data, then a gradient-free pass over validation.
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_accuracy = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint whenever validation loss improves.
    # NOTE(review): this saves the whole module object rather than its
    # state_dict, and nothing in the visible cells loads 'model.pt' back.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model, 'model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_accuracy*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 16s
	Train Loss: 0.669 | Train Acc: 58.55%
	 Val. Loss: 0.688 |  Val. Acc: 56.11%
Epoch: 02 | Epoch Time: 0m 16s
	Train Loss: 0.623 | Train Acc: 65.27%
	 Val. Loss: 0.571 |  Val. Acc: 69.20%
Epoch: 03 | Epoch Time: 0m 16s
	Train Loss: 0.640 | Train Acc: 62.55%
	 Val. Loss: 0.600 |  Val. Acc: 68.36%
Epoch: 04 | Epoch Time: 0m 16s
	Train Loss: 0.478 | Train Acc: 77.66%
	 Val. Loss: 0.522 |  Val. Acc: 72.66%
Epoch: 05 | Epoch Time: 0m 16s
	Train Loss: 0.382 | Train Acc: 83.91%
	 Val. Loss: 0.321 |  Val. Acc: 87.13%
Epoch: 06 | Epoch Time: 0m 16s
	Train Loss: 0.383 | Train Acc: 84.36%
	 Val. Loss: 0.748 |  Val. Acc: 61.05%
Epoch: 07 | Epoch Time: 0m 16s
	Train Loss: 0.503 | Train Acc: 75.05%
	 Val. Loss: 0.357 |  Val. Acc: 84.71%
Epoch: 08 | Epoch Time: 0m 16s
	Train Loss: 0.289 | Train Acc: 88.44%
	 Val. Loss: 0.287 |  Val. Acc: 88.44%
Epoch: 09 | Epoch Time: 0m 16s
	Train Loss: 0.252 | Train Acc: 89.86%
	 Val. Loss: 0.362 |  Val. Acc: 83.32%
Epoch: 10 | Epoch T

In [None]:
# Evaluate on the held-out test split.
# NOTE(review): this scores the in-memory model as left by the final training
# epoch; the best-validation checkpoint ('model.pt') is not loaded first.
test_loss, test_acc = evaluate(model, test_loader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.306 | Test Acc: 88.74%
