In [1]:
#!pip install portalocker==2.8.2

In [1]:
import torch
import torchtext
import random
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
train_iter = IMDB(split='train')
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def yield_tokens_batch(train_iter):
    for (label, line) in train_iter:
      yield tokenizer(line)

# CHANGED
MAX_VOCAB_SIZE = 25000
#voc = build_vocab_from_iterator(yield_tokens_batch(train_iter), min_freq =3, specials=["<PAD>","<unk>"])
voc = build_vocab_from_iterator(yield_tokens_batch(train_iter), max_tokens=MAX_VOCAB_SIZE, specials=["<PAD>","<unk>"])
voc.set_default_index(voc['<unk>'])
# CHANGED
new_stoi = voc.get_stoi()
PAD_IDX = new_stoi['<PAD>']
print("The index of '<PAD>' is", PAD_IDX)
#label_transform = lambda x: 1 if x == 'pos' else 0
label_transform = lambda x: 1 if x == 2 else 0
text_transform = lambda x: [voc[token] for token in tokenizer(x)]

The index of '<PAD>' is 0


In [3]:
import numpy as np
train_iter = IMDB(split='train')
test_iter = IMDB(split='test')
train_list = list(train_iter)
random.shuffle(train_list)

dev_list = train_list[:7500]
devfile = open("dev.txt", "w")
for element in dev_list:
    devfile.write(str(element[1]) + str(element[0])+"\n")
devfile.close()

test_list = list(test_iter)
testfile = open("test.txt", "w")
for element in test_list:
    testfile.write(str(element[1])+ str(element[0]) + "\n")
testfile.close()

train_list = train_list[7500:]

In [4]:
count_pos = 0
for label, line in dev_list:
    #print(f"Label: {label}")
    #print(f"Line: '{line}'")
    if label == 2:
        count_pos = count_pos + 1
print("%ge Positive reviews in validation dataset = ", count_pos * 100 / len(dev_list))

%ge Positive reviews in validation dataset =  50.17333333333333


In [5]:
def collate_batch(batch):
   label_list, text_list = [], []
   max_len = 50
   for (_label, _text) in batch:
        # CHANGED
        label_list.append(label_transform(_label))
        #label_list.append(_label)
        processed_text = torch.LongTensor(text_transform(_text))
        #processed_text = nn.ConstantPad1d((0, max_len - processed_text.shape[0]), 0)(processed_text)
        text_list.append(processed_text)

   # CHANGED
   PAD_IDX = 0
   return   pad_sequence(text_list, padding_value=PAD_IDX),torch.LongTensor(label_list)

In [6]:
#batch_size = 2
batch_size = 5

train_iterator = DataLoader(train_list, batch_size, shuffle=True,
                              collate_fn=collate_batch, drop_last=True)
valid_iterator = DataLoader(dev_list, batch_size, shuffle=False,
                              collate_fn=collate_batch, drop_last=True)
test_iterator = DataLoader(test_list, batch_size, shuffle=False,
                              collate_fn=collate_batch, drop_last=True)
print(f'Number of training examples: {len(train_iterator)}')
print(f'Number of dev examples: {len(valid_iterator)}')
print(f'Number of test examples: {len(test_iterator)}')

Number of training examples: 3500
Number of dev examples: 1500
Number of test examples: 5000


In [7]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [8]:
import torch.nn as nn
class LR(nn.Module):
    def __init__(self, input_dim, embedding_dim, output_dim, padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=padding_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
    def forward(self, text):
        embedded = self.embedding(text).squeeze().sum(0)
        return self.fc(embedded)

In [9]:
INPUT_DIM = voc.__len__()
EMBEDDING_DIM = 100
# CHANGED
#HIDDEN_DIM = 256
OUTPUT_DIM = 1
PAD_IDX = voc.__getitem__('<PAD>')
model = LR(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM,PAD_IDX)

In [10]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,500,101 trainable parameters


In [11]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-6)

In [12]:
criterion = nn.BCEWithLogitsLoss()

In [13]:
model = model.to(device)
criterion = criterion.to(device)

In [14]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [15]:
from tqdm import tqdm
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for inputs, labels in iterator:

        optimizer.zero_grad()

        predictions = model(inputs.to(device)).squeeze(1).to(device)

        loss = criterion(predictions,labels.float().to(device)).to(device)

        acc = binary_accuracy(predictions, labels.float().to(device))

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for  inputs, labels in iterator:

            predictions = model(inputs.to(device)).squeeze(1).to(device)

            loss = criterion(predictions,labels.float().to(device)).to(device)

            acc = binary_accuracy(predictions, labels.float().to(device))

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [17]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 15s
	Train Loss: 4.608 | Train Acc: 51.89%
	 Val. Loss: 4.167 |  Val. Acc: 52.33%
Epoch: 02 | Epoch Time: 0m 14s
	Train Loss: 4.112 | Train Acc: 52.31%
	 Val. Loss: 3.751 |  Val. Acc: 53.05%
Epoch: 03 | Epoch Time: 0m 15s
	Train Loss: 3.719 | Train Acc: 52.86%
	 Val. Loss: 3.392 |  Val. Acc: 53.65%
Epoch: 04 | Epoch Time: 0m 15s
	Train Loss: 3.388 | Train Acc: 53.49%
	 Val. Loss: 3.098 |  Val. Acc: 54.07%
Epoch: 05 | Epoch Time: 0m 15s
	Train Loss: 3.105 | Train Acc: 54.10%
	 Val. Loss: 2.846 |  Val. Acc: 54.63%


In [19]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 2.839 | Test Acc: 54.16%


In [20]:
# Test of model correctness
max_n_test_instances = 5
i = 1
for  inputs, labels in valid_iterator:
  score = model(inputs.to(device))
  print(score)
  if i >= max_n_test_instances:
    break
  else:
    i += 1


tensor([[-3.3263],
        [-8.7919],
        [ 2.2856],
        [-0.9200],
        [-4.6624]], grad_fn=<AddmmBackward0>)
tensor([[-1.3006],
        [-5.1110],
        [-1.9126],
        [ 7.6993],
        [ 7.5721]], grad_fn=<AddmmBackward0>)
tensor([[ -6.1139],
        [  2.5826],
        [  3.1684],
        [-21.5025],
        [  6.1800]], grad_fn=<AddmmBackward0>)
tensor([[  0.0962],
        [ -0.1970],
        [ -7.7068],
        [-11.7926],
        [  6.8536]], grad_fn=<AddmmBackward0>)
tensor([[-5.6539],
        [-7.1100],
        [-3.3802],
        [ 9.2038],
        [11.5038]], grad_fn=<AddmmBackward0>)
