In [3]:
!pip install torch torchtext torchdata tqdm torchinfo numpy pandas matplotlib seaborn watermark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
Collecting torchinfo
  Downloading torchinfo-1.7.2-py3-none-any.whl (22 kB)
Collecting watermark
  Downloading watermark-2.3.1-py2.py3-none-any.whl (7.2 kB)
Collecting portalocker>=2.0.0
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchinfo, portalocker, jedi, torchdata, watermark
Successfully installed jedi-0.18.2 portalocker-2.7.0 torchdata-0.5.1 torchinfo-1.7.2 watermark-2.3.1


In [4]:
%load_ext watermark
%reload_ext watermark

In [5]:
%watermark -a 'Pushpakant Behera' -h -m -v -p torch,torchtext,torchdata,numpy,pandas,matplotlib,seaborn

Author: Pushpakant Behera

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.9.0

torch     : 1.13.1+cu116
torchtext : 0.14.1
torchdata : 0.5.1
numpy     : 1.22.4
pandas    : 1.3.5
matplotlib: 3.5.3
seaborn   : 0.11.2

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 5.10.147+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

Hostname: bd6d9dacda51



In [6]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import torchtext
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from tqdm.auto import tqdm
from torchinfo import summary

%matplotlib inline
sns.set()

In [7]:
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

torch.backends.cudnn.deterministic = True

In [8]:
imdb_trainset, imdb_testset = IMDB(root='.', split=('train', 'test')) # iterable dataset

In [9]:
cnt = 0
for idx, (label, line) in enumerate(imdb_trainset):
    if label != 1:
        print(label, line)
        cnt = cnt + 1

    if cnt == 10:
        break

2 Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn't really understand, and whose naivety is all the more striking in contrast with the natives.<br /><br />But I'd have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. <br /><br />This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.
2 Zentropa is the most original movie I've seen in years. If you like unique thrillers that are influenced by film noir, then this is just the right cure for all of those Hollywood summer blockbusters clogging the theaters these days. Von Trier's follow-ups like Breaki

In [10]:
# Preprocesing

'''
    Here is an example for typical NLP data processing with tokenizer and vocabulary.
    The first step is to build a vocabulary with the raw training dataset.
    Here we use built in factory function build_vocab_from_iterator which accepts iterator that yield list or iterator of tokens.
    Users can also pass any special symbols to be added to the vocabulary.
'''
tokenizer = get_tokenizer('basic_english')
# tokenizer = get_tokenizer('spacy')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

token_generator = yield_tokens(imdb_trainset)

vocab = build_vocab_from_iterator(token_generator, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [11]:
print(vocab.get_itos())



In [12]:
print(vocab.get_stoi())



In [13]:
# The vocabulary block converts a list of tokens into integers. For example,
print(vocab(['here', 'is', 'an', 'example', 'I']))

[131, 9, 40, 464, 0]


In [14]:
'''
Prepare the text processing pipeline with the tokenizer and vocabulary.
The text and label pipelines will be used to process the raw data strings from the dataset iterators.
'''
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [15]:
'''
The text pipeline converts a text string into a list of integers based on the lookup table defined in the vocabulary.
The label pipeline converts the label into integers. For example,
'''
print(text_pipeline('here is the an example'))
print(label_pipeline('10'))

[131, 9, 1, 40, 464]
9


In [49]:
LEARNING_RATE = 3e-4
BATCH_SIZE = 128
EPOCHS = 16

EMBEDDING_DIM = 64
HIDDEN_DIM = 256
NUM_LAYERS = 16

# NUM_CLASSES = len(set([label for (label, text) in imdb_trainset]))
NUM_CLASSES = 1
VOCABULARY_SIZE = len(vocab)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [50]:
'''
    torch.utils.data.DataLoader is recommended for PyTorch users.
    It works with a map-style dataset that implements the getitem() and len() protocols, and represents a map from indices/keys to data samples.
    It also works with an iterable dataset with the shuffle argument of False.

    Before sending to the model, collate_fn function works on a batch of samples generated from DataLoader.
    The input to collate_fn is a batch of data with the batch size in DataLoader, and collate_fn processes them according to the data processing pipelines declared previously.
    Pay attention here and make sure that collate_fn is declared as a top level def.
    This ensures that the function is available in each worker.

    In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of nn.EmbeddingBag.
    The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor.
    Label is a tensor saving the labels of individual text entries.
'''

'''
    The offsets list in the collate_batch function is used to keep track of the starting indices of each text sample in the batch, 
        so that the word indices of each sample can be properly aligned when they are concatenated into a single tensor.
    In more detail, during the loop over the samples in the batch, the processed_text tensor is created for each sample, 
        which contains the word indices of the tokenized text. The length of this tensor corresponds to the number of tokens in the text.
    The offsets list is then updated to include the cumulative sum of the lengths of all the previous tensors, 
        so that the value at each index of the offsets list represents the starting index of the corresponding text sample in the concatenated tensor.
    For example, suppose we have a batch of two text samples, where the first sample has 10 tokens and the second sample has 8 tokens.
    When the processed_text tensor is created for the first sample, its length is 10, so the current value of the offsets list is [0].
    When the processed_text tensor is created for the second sample, its length is 8, so the offsets list is updated to [0, 10], 
        since the second sample starts at index 10 in the concatenated tensor.
    These offsets are later used to split the concatenated tensor into individual text samples during the forward pass of the model,
        so that each sample is processed independently and the output of the model corresponds to the correct sample in the batch.
'''

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]

    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(DEVICE), text_list.to(DEVICE), offsets.to(DEVICE)

train_loader = DataLoader(imdb_trainset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch, drop_last=True)
test_loader = DataLoader(imdb_testset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch, drop_last=False)

In [51]:
# BoW (Bag of Words) model
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        '''
        The model is composed of the nn.EmbeddingBag layer plus a linear layer for the classification purpose.
        nn.EmbeddingBag with the default mode of “mean” computes the mean value of a “bag” of embeddings.
        Although the text entries here have different lengths, nn.EmbeddingBag module requires no padding here since the text lengths are saved in offsets.
        Additionally, since nn.EmbeddingBag accumulates the average across the embeddings on the fly,
            nn.EmbeddingBag can enhance the performance and memory efficiency to process a sequence of tensors.
        '''
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 5e-1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [56]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, hidden_dim, embedding_dim, num_classes, num_layers):
        super().__init__()
 
        self.output_dim = num_classes
        self.hidden_dim = hidden_dim
 
        self.no_layers = num_layers
        self.vocab_size = vocab_size
    
        # embedding and RNN layers
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=self.hidden_dim,
                           num_layers=num_layers, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
    
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, text, offsets):
        # initialize hidden state
        hidden = self.init_hidden()

        # embed the input
        embedded = self.embedding(text, offsets)

        # pass through the RNN layer
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = rnn_out.contiguous().view(-1, self.hidden_dim) 

        # apply dropout
        out = self.dropout(rnn_out)

        # pass through the linear layer
        out = self.fc(out).squeeze() # squeeze as it is binary classification

        # apply sigmoid activation
        sig_out = self.sig(out)

        return sig_out

    '''
        Why do we need to init_hidden every epoch? Shouldn't it be that the model inherit the hidden parameters from last epoch and continue training on them.
        >   The answer lies in init_hidden. It is not the hidden layer weights but the initial hidden state in RNN/LSTM, which is h0 in the formulas.
            For every epoch, we should re-initialize a new beginner hidden state, this is because during the testing,
            our model will have no information about the test sentence and will have a zero initial hidden state.
    '''
    def init_hidden(self):
        # Initializes hidden state
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of RNN

        # if using the LSTM, we need a tuple of hidden states but for RNN and GRU we need only one
        # h0 = torch.zeros((self.no_layers, BATCH_SIZE, self.hidden_dim)).to(DEVICE)
        # c0 = torch.zeros((self.no_layers, BATCH_SIZE, self.hidden_dim)).to(DEVICE)
        # hidden = (h0, c0)

        hidden = torch.zeros((self.no_layers, self.hidden_dim)).to(DEVICE)
        return hidden

In [68]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_dim, embedding_dim, num_classes, num_layers):
        super().__init__()
 
        self.output_dim = num_classes
        self.hidden_dim = hidden_dim
 
        self.no_layers = num_layers
        self.vocab_size = vocab_size
    
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim,
                           num_layers=num_layers, batch_first=True)
        
        self.dropout = nn.Dropout(0.3)
    
        self.fc = nn.Linear(self.hidden_dim, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, text, offsets):
        hidden = self.init_hidden()

        embedded = self.embedding(text, offsets)

        lstm_out, hidden = self.lstm(embedded, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 

        out = self.dropout(lstm_out)

        out = self.fc(out).squeeze()

        sig_out = self.sig(out)

        return sig_out

    def init_hidden(self):
        h0 = torch.zeros((self.no_layers, self.hidden_dim)).to(DEVICE)
        c0 = torch.zeros((self.no_layers, self.hidden_dim)).to(DEVICE)
        hidden = (h0, c0)

        return hidden

In [76]:
class SentimentGRU(nn.Module):
    def __init__(self, vocab_size, hidden_dim, embedding_dim, num_classes, num_layers):
        super().__init__()
 
        self.output_dim = num_classes
        self.hidden_dim = hidden_dim
 
        self.no_layers = num_layers
        self.vocab_size = vocab_size
    
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=self.hidden_dim,
                           num_layers=num_layers, batch_first=True)
        
        self.dropout = nn.Dropout(0.3)
    
        self.fc = nn.Linear(self.hidden_dim, num_classes)
        self.sig = nn.Sigmoid()

    def forward(self, text, offsets):
        hidden = self.init_hidden()

        embedded = self.embedding(text, offsets)

        gru_out, hidden = self.gru(embedded, hidden)
        gru_out = gru_out.contiguous().view(-1, self.hidden_dim) 

        out = self.dropout(gru_out)

        out = self.fc(out).squeeze()

        sig_out = self.sig(out)

        return sig_out

    def init_hidden(self):
        hidden = torch.zeros((self.no_layers, self.hidden_dim)).to(DEVICE)
        return hidden

In [77]:
# criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.BCELoss()
# model = TextClassificationModel(VOCABULARY_SIZE, EMBEDDING_DIM, NUM_CLASSES).to(DEVICE)
# model = SentimentRNN(VOCABULARY_SIZE, HIDDEN_DIM, EMBEDDING_DIM, NUM_CLASSES, NUM_LAYERS).to(DEVICE)
# model = SentimentLSTM(VOCABULARY_SIZE, HIDDEN_DIM, EMBEDDING_DIM, NUM_CLASSES, NUM_LAYERS).to(DEVICE)
model = SentimentGRU(VOCABULARY_SIZE, HIDDEN_DIM, EMBEDDING_DIM, NUM_CLASSES, NUM_LAYERS).to(DEVICE)

optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [78]:
import time

def train(dataloader):
    model.train()
    log_interval = 50
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted = model(text, offsets)

        loss = criterion(predicted, label.float())
        loss.backward()
        optimizer.step()

        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| loss {:3f}  |'.format(epoch, idx, 99, loss,))
            start_time = time.time()

def evaluate(dataloader):
    model.eval()

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted = model(text, offsets)
            loss = criterion(predicted, label.float())
    return loss

In [79]:
import time

total_accu = 0
for epoch in tqdm(range(1, EPOCHS + 1), total=EPOCHS):
    epoch_start_time = time.time()
    train(train_loader)
    loss = evaluate(test_loader)
    print('-' * 60)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          '| loss {:3f} '.format(epoch, time.time() - epoch_start_time,
                                           loss))
    print('-' * 60)

  0%|          | 0/16 [00:00<?, ?it/s]

| epoch   1 |    50/   99 batches | loss 0.693362  |


KeyboardInterrupt: ignored