In [1]:
#!pip install portalocker==2.8.2

In [2]:
import torch
import torchtext
import random
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
import collections




In [3]:
# Training split of IMDB; this iterator is consumed while building the vocabulary.
train_iter = IMDB(split='train')
# spaCy-backed tokenizer (`en_core_web_sm` model), used both to build the
# vocabulary and to encode review text into token ids.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def yield_tokens_batch(train_iter):
    """Lazily yield the token list of every example in ``train_iter``.

    ``train_iter`` must produce ``(label, text)`` pairs. The label is ignored
    and the text is split with the module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for _label, text in train_iter)

# Vocabulary: capped at MAX_VOCAB_SIZE entries via `max_tokens` (per the
# torchtext docs the cap includes the special tokens). A `min_freq=3` cut-off
# was the earlier alternative to the size cap.
MAX_VOCAB_SIZE = 25000
voc = build_vocab_from_iterator(
    yield_tokens_batch(train_iter),
    max_tokens=MAX_VOCAB_SIZE,
    specials=["<PAD>", "<unk>"],
)
# Tokens missing from the vocabulary resolve to the <unk> index.
voc.set_default_index(voc['<unk>'])

new_stoi = voc.get_stoi()
PAD_IDX = new_stoi['<PAD>']
print("The index of '<PAD>' is", PAD_IDX)


def label_transform(x):
    """Map a raw dataset label to a class index: 2 -> 1, anything else -> 0.

    (A string-label variant, ``x == 'pos'``, was used previously.)
    """
    return 1 if x == 2 else 0


def text_transform(x):
    """Tokenize ``x`` and return the vocabulary index of every token."""
    return [voc[token] for token in tokenizer(x)]

The index of '<PAD>' is 0


In [4]:
import numpy as np
def _dump_examples(path, examples):
    """Write every ``(label, text)`` example to ``path``, one per line.

    The on-disk layout is unchanged from the original cell: the text followed
    directly by the label (no separator) and a newline.
    """
    # The context manager closes the file even if a write fails; UTF-8 is
    # explicit so non-ASCII review text does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as handle:
        for example in examples:
            handle.write(str(example[1]) + str(example[0]) + "\n")


DEV_SIZE = 7500    # number of shuffled training examples held out for validation
SPLIT_SEED = 1234  # fixed seed so the train/dev split is the same on every run

train_iter = IMDB(split='train')
test_iter = IMDB(split='test')

train_list = list(train_iter)
# A dedicated Random instance makes the shuffle reproducible without
# touching the global `random` state.
random.Random(SPLIT_SEED).shuffle(train_list)

dev_list = train_list[:DEV_SIZE]
_dump_examples("dev.txt", dev_list)

test_list = list(test_iter)
_dump_examples("test.txt", test_list)

# Everything after the held-out slice stays in the training set.
train_list = train_list[DEV_SIZE:]

In [5]:
# Share of positive reviews (raw label 2) in the held-out validation split.
count_pos = sum(1 for label, _text in dev_list if label == 2)
print("%ge Positive reviews in validation dataset = ", count_pos * 100 / len(dev_list))

%ge Positive reviews in validation dataset =  50.0


In [6]:
def collate_batch(batch):
    """Turn a list of ``(label, text)`` examples into padded model inputs.

    Returns a 3-tuple of tensors:
      * token ids, shape ``(batch, longest_sequence)``, right-padded with ``PAD_IDX``
      * class indices, shape ``(batch,)``
      * unpadded sequence lengths, shape ``(batch,)``
    """
    label_list, text_list, length_list = [], [], []
    for _label, _text in batch:
        label_list.append(label_transform(_label))
        tokens = text_transform(_text)
        # Keep the true length so the model can pack the padded batch.
        length_list.append(len(tokens))
        text_list.append(torch.LongTensor(tokens))

    # Pad with the module-level PAD_IDX (derived from the vocabulary) instead
    # of a locally hard-coded 0, so padding always matches the index the
    # embedding layer treats as padding.
    padded = pad_sequence(text_list, padding_value=PAD_IDX, batch_first=True)
    return padded, torch.LongTensor(label_list), torch.LongTensor(length_list)

In [7]:
batch_size = 5

# All three loaders share the same collate function; only the training loader
# reshuffles. drop_last=True discards a final incomplete batch.
train_iterator = DataLoader(train_list, batch_size, shuffle=True,
                              collate_fn=collate_batch, drop_last=True)
valid_iterator = DataLoader(dev_list, batch_size, shuffle=False,
                              collate_fn=collate_batch, drop_last=True)
test_iterator = DataLoader(test_list, batch_size, shuffle=False,
                              collate_fn=collate_batch, drop_last=True)

# len() of a DataLoader is the number of batches, not the number of examples,
# so the messages say "batches".
print(f'Number of training batches: {len(train_iterator)}')
print(f'Number of dev batches: {len(valid_iterator)}')
print(f'Number of test batches: {len(test_iterator)}')

Number of training examples: 3500
Number of dev examples: 1500
Number of test examples: 5000


In [8]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
# `device` is used later to place the model and the input batches.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU not available, CPU used


In [9]:
import torch.nn as nn
class LSTM(nn.Module):
    """Sequence classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output classes (size of the returned scores).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout_rate: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_index: vocabulary index used for padding.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout_rate,
        pad_index,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidirectional,
            # nn.LSTM applies dropout only between stacked layers and warns
            # when a non-zero rate is given for a single layer.
            dropout=dropout_rate if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Return unnormalised class scores of shape ``(batch, output_dim)``.

        Args:
            ids: padded token ids, shape ``(batch, seq_len)``.
            length: unpadded length of each sequence (tensor or sequence of ints).
        """
        embedded = self.dropout(self.embedding(ids))
        # pack_padded_sequence needs the lengths as a CPU int64 tensor, even
        # when the ids live on the GPU.
        lengths = torch.as_tensor(length, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state feeds the classifier, so the per-step
        # outputs are not unpacked.
        _, (hidden, _cell) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # The last two entries of `hidden` are the top layer's two
            # directions; the concatenation order is kept as-is so existing
            # checkpoints stay compatible.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

In [10]:
# Model hyper-parameters.
INPUT_DIM = len(voc)          # one embedding row per vocabulary entry
EMBEDDING_DIM = 300
HIDDEN_DIM = 300
OUTPUT_DIM = 2                # num unique label values
PAD_IDX = voc['<PAD>']
n_layers = 2
bidirectional = True
dropout_rate = 0.5

model = LSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout_rate=dropout_rate,
    pad_index=PAD_IDX,
)

In [11]:
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of the network, with thousands separators.
n_trainable = count_parameters(model)
print(f'The model has {n_trainable:,} trainable parameters')

The model has 11,110,802 trainable parameters


In [12]:
import torch.optim as optim

# Adam over every parameter of the model; the learning rate is kept in its
# own name so it is easy to tune.
lr = 5e-4
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [13]:
# Cross-entropy over the model's raw class scores (the model's last layer is a
# plain Linear); the targets are the integer class indices from the batches.
criterion = nn.CrossEntropyLoss()

In [14]:
# Place the network and the loss module on the selected device.
model, criterion = model.to(device), criterion.to(device)

In [15]:
def get_accuracy(prediction, label):
    """Return the fraction of rows whose highest score matches ``label``.

    ``prediction`` is a 2-D tensor of per-class scores (one row per example)
    and ``label`` holds the expected class index of each row. The result is a
    0-dim tensor.
    """
    n_examples, _n_classes = prediction.shape
    hits = prediction.argmax(dim=-1).eq(label).sum()
    return hits / n_examples

In [16]:
from tqdm import tqdm
def train(iterator, model, criterion, optimizer, device):
    """Run one optimisation pass over ``iterator`` and report mean metrics.

    Each item of ``iterator`` is an ``(ids, label, length)`` batch. ``ids`` and
    ``label`` are moved to ``device``; ``length`` is handed to the model as-is.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.train()
    batch_losses, batch_accs = [], []
    for ids, label, length in tqdm(iterator, desc="training..."):
        ids, label = ids.to(device), label.to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)

        # Standard update: clear stale gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(accuracy.item())
    return np.mean(batch_losses), np.mean(batch_accs)


In [17]:
def evaluate(iterator, model, criterion, device):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    The model is switched to eval mode and gradients are disabled for the
    whole pass. Each item of ``iterator`` is an ``(ids, label, length)`` batch.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.eval()
    batch_losses, batch_accs = [], []
    with torch.no_grad():
        for ids, label, length in tqdm(iterator, desc="evaluating..."):
            label = label.to(device)
            prediction = model(ids.to(device), length)
            batch_losses.append(criterion(prediction, label).item())
            batch_accs.append(get_accuracy(prediction, label).item())
    return np.mean(batch_losses), np.mean(batch_accs)


In [18]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Both arguments are in seconds. The result is ``(minutes, seconds)`` as
    ints, with fractional seconds truncated.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds

In [19]:
n_epochs = 5
best_valid_loss = float("inf")

# Per-epoch history, keyed by metric name.
metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    start_time = time.time()
    train_loss, train_acc = train(train_iterator, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_iterator, model, criterion, device)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    for key, value in (
        ("train_losses", train_loss),
        ("train_accs", train_acc),
        ("valid_losses", valid_loss),
        ("valid_accs", valid_acc),
    ):
        metrics[key].append(value)

    # Checkpoint only when the validation loss improves, so "lstm.pt" always
    # holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "lstm.pt")

    print(f"epoch: {epoch}")
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc*100:.2f}%")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc*100:.2f}%")


training...: 100%|███████████████████████████████████████████████████████| 3500/3500 [2:06:48<00:00,  2.17s/it]
evaluating...: 100%|███████████████████████████████████████████████████████| 1500/1500 [03:13<00:00,  7.74it/s]


epoch: 0
Epoch: 01 | Epoch Time: 130m 1s
train_loss: 0.677, train_acc: 57.07%
valid_loss: 0.620, valid_acc: 66.40%


training...: 100%|███████████████████████████████████████████████████████| 3500/3500 [2:12:14<00:00,  2.27s/it]
evaluating...: 100%|███████████████████████████████████████████████████████| 1500/1500 [03:14<00:00,  7.69it/s]


epoch: 1
Epoch: 02 | Epoch Time: 135m 29s
train_loss: 0.599, train_acc: 67.50%
valid_loss: 0.492, valid_acc: 75.49%


training...: 100%|███████████████████████████████████████████████████████| 3500/3500 [2:14:00<00:00,  2.30s/it]
evaluating...: 100%|███████████████████████████████████████████████████████| 1500/1500 [03:15<00:00,  7.68it/s]


epoch: 2
Epoch: 03 | Epoch Time: 137m 15s
train_loss: 0.422, train_acc: 80.98%
valid_loss: 0.327, valid_acc: 86.04%


training...: 100%|███████████████████████████████████████████████████████| 3500/3500 [2:13:16<00:00,  2.28s/it]
evaluating...: 100%|███████████████████████████████████████████████████████| 1500/1500 [03:15<00:00,  7.67it/s]


epoch: 3
Epoch: 04 | Epoch Time: 136m 31s
train_loss: 0.322, train_acc: 86.45%
valid_loss: 0.307, valid_acc: 87.73%


training...: 100%|███████████████████████████████████████████████████████| 3500/3500 [2:13:07<00:00,  2.28s/it]
evaluating...: 100%|███████████████████████████████████████████████████████| 1500/1500 [03:13<00:00,  7.76it/s]

epoch: 4
Epoch: 05 | Epoch Time: 136m 20s
train_loss: 0.268, train_acc: 89.05%
valid_loss: 0.293, valid_acc: 88.17%





In [20]:
# Restore the checkpoint written by the training loop (lowest validation loss).
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine still loads on a CPU-only one and vice versa.
model.load_state_dict(torch.load('lstm.pt', map_location=device))

test_loss, test_acc = evaluate(test_iterator, model, criterion, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

evaluating...: 100%|███████████████████████████████████████████████████████| 5000/5000 [10:24<00:00,  8.01it/s]

Test Loss: 0.299 | Test Acc: 87.56%





In [21]:
# Sanity check: print the predicted classes for the first few validation batches.
max_n_test_instances = 5

# Select inference mode here instead of relying on a previous cell having left
# the model in eval mode, and skip gradient tracking since nothing is trained.
model.eval()
with torch.no_grad():
    # range() comes first so zip stops after max_n_test_instances batches
    # without drawing an extra batch from the loader.
    for _, (ids, labels, length) in zip(range(max_n_test_instances), valid_iterator):
        score = model(ids.to(device), length)
        print(score.argmax(dim=-1))


tensor([0, 0, 1, 1, 1])
tensor([0, 1, 0, 0, 1])
tensor([1, 1, 0, 1, 0])
tensor([1, 0, 0, 0, 1])
tensor([1, 1, 1, 1, 1])


In [22]:
def predict_sentiment(text, model, tokenizer, vocab, device):
    """Classify a single piece of text and return the winning class and its probability.

    Args:
        text: raw input string.
        model: classifier called as ``model(ids, length)``.
        tokenizer: accepted for call compatibility; not used (see note).
        vocab: accepted for call compatibility; not used (see note).
        device: device the token ids are moved to before the forward pass.

    Returns:
        ``(predicted_class, predicted_probability)``: the arg-max class index
        and its softmax probability.

    Note:
        The text is encoded with the module-level ``text_transform``, which
        uses the notebook's own tokenizer and vocabulary, so the ``tokenizer``
        and ``vocab`` arguments have no effect on the result.
    """
    ids = text_transform(text)
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    # Run in eval mode so dropout is disabled and without building an autograd
    # graph, then put the model back in whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(tensor, length).squeeze(dim=0)
    finally:
        model.train(was_training)

    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

In [23]:
text = "This film is great!"

# Pass the vocabulary object `voc`; the name `vocab` is the factory function
# imported from torchtext.vocab, not the vocabulary built in this notebook.
predict_sentiment(text, model, tokenizer, voc, device)

(1, 0.9980506896972656)

In [24]:
text = "This film is terrible!"

# Pass the vocabulary object `voc`; the name `vocab` is the factory function
# imported from torchtext.vocab, not the vocabulary built in this notebook.
predict_sentiment(text, model, tokenizer, voc, device)

(0, 0.986133873462677)