# Example notebook to download and use PyTorch text datasets
In this notebook, [YelpReviewPolarity](https://pytorch.org/text/stable/datasets.html#torchtext.datasets.YelpReviewPolarity) is used.

**References:**
- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [1]:
import torch
import torchtext
from torchtext.datasets import YelpReviewPolarity

import os

In [2]:
DATA_DIR = './data_tmp'
if not os.path.isdir(DATA_DIR):
    os.mkdir(DATA_DIR)

In [3]:
t_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t_device
#TODO Use this device in all the places

device(type='cpu')

In [4]:
# WARN: If  you get an error regarding confirm_token, restart the notebook and run again
train_dataset_iter, test_dataset_iter = YelpReviewPolarity(DATA_DIR)
(train_dataset_iter, test_dataset_iter)

(<torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f4a26b1b280>,
 <torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f4a26b10a00>)

## Dataset Configurations
Get the dataset configurations necessary for building the neural network.

In [5]:
# First, create a tokenizer and vocabulary for the given dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("subword")

def yield_tokenizer(data_iterator):
    # We don't need the label_id here
    for label_id, text in data_iterator:
        yield tokenizer(text)

# Create a vocabulary from the training corpus
# Note: these iterators are one-off and cannot be reused 
train_dataset_iter = YelpReviewPolarity(DATA_DIR, split='train')
MIN_FREQ=2 # Include only those tokens with a frequency of MIN_FREQ or greater in the vocabulary 
vocabulary = build_vocab_from_iterator(yield_tokenizer(train_dataset_iter), min_freq=MIN_FREQ, specials=["<unk>"])
vocabulary.set_default_index(vocabulary["<unk>"])
oov_index = vocabulary.get_default_index()
print(f"Index of Out-of-vocabulary words (<unk>) is:{oov_index}")

Index of Out-of-vocabulary words (<unk>) is:0


In [6]:
# In practice, we define various data processing steps as a pipeline
# Convert text to sequence of numbers
text_pipeline = lambda text: vocabulary(tokenizer(text))
# Labels are [1, 2], convert them into [0, 1]
label_pipeline = lambda label: int(label) - 1 

In [7]:
MAX_SEQ_LEN = 128
PADDING_INDEX = 1 #TODO Is this right thing to do?

def preprocess_data_using_collate_batch(batch):
    """Given a batch of samples generated by DataLoader. This collate function uses the data processing \
        pipelines defined earlier.

        Both padding and truncation (upto MAX_SEQ_LEN) are applied within every batch.
        Reference: https://pytorch.org/text/main/tutorials/sst2_classification_non_distributed.html

    Args:
        batch (iterator): A batch of (label, text) samples

    Returns:
        tuple: (batch_label_tensor, batch_token_id_tensor)
    """
    label_list, text_tensor_list = [], []

    # Process and accumulate the samples within this batch
    for label, text in batch:
        # Preprocess & Accumulate the labels
        label = label_pipeline(label)
        label_list.append(label)

        #  Preprocess & Accumulate the texts
        text_token_ids = text_pipeline(text)
        # TODO Find if pytorchtext (0.11 version) supports truncation & padding in any way
        if len(text_token_ids) > MAX_SEQ_LEN:
            # Truncation
            text_token_ids = text_token_ids[:MAX_SEQ_LEN]
        else:
            # Padding
            len_to_be_padded = MAX_SEQ_LEN - len(text_token_ids)
            pad = [PADDING_INDEX] * len_to_be_padded
            text_token_ids = text_token_ids + pad
        
        # Create the tensor from the token-id sequence
        text_tensor = torch.tensor(text_token_ids, dtype=torch.int64)
        text_tensor_list.append(text_tensor) # A list of tensors
    
    # Aggregate the sample tensors into bulk tensors; 
    
    # Bulk tensor for label
    batch_label_tensor = torch.tensor(label_list, dtype=torch.int64)

    # Bulk tensor for text; convert a list of tensor objects into a single tensor object
    batch_token_id_tensor = torch.stack(text_tensor_list)

    return batch_label_tensor, batch_token_id_tensor

In [8]:
# # [Only for Test] Create the one-off iterators and then the dataloaders
# from torch.utils.data import DataLoader
# _, test_dataset_iter = YelpReviewPolarity(DATA_DIR)
# test_dataloader = DataLoader(test_dataset_iter, batch_size=8, shuffle=False, collate_fn=preprocess_data_using_collate_batch)
# for label,text in test_dataloader:
#     print(label.size())
#     print(text.size())
#     break

In [9]:
# The configurations
EPOCHS = 10
BATCH_SIZE = 32
EMB_SIZE = 128
VOCAB_SIZE = len(vocabulary)
# Note: these iterators are one-off and cannot be reused 
train_dataset_iter = YelpReviewPolarity(DATA_DIR, split='train')
CLASSES = set([label for label,_ in train_dataset_iter])
NUM_CLASSES = len(CLASSES)

print('BATCH_SIZE', 'EPOCHS', 'EMB_SIZE', 'VOCAB_SIZE', 'MAX_SEQ_LEN', 'CLASSES')
print(BATCH_SIZE, '\t', EPOCHS, '\t', EMB_SIZE, '\t', VOCAB_SIZE, '\t', MAX_SEQ_LEN, '\t', CLASSES)

BATCH_SIZE EPOCHS EMB_SIZE VOCAB_SIZE MAX_SEQ_LEN CLASSES
32 	 10 	 128 	 153157 	 128 	 {1, 2}


### Utility functions to train and test the classifier

In [10]:
import time

from torch.nn.utils import clip_grad_norm_

def train_one_epoch(model, train_dataloader, optimizer, criterion, clip_norm, cur_epoch, log_interval=500):
    """Train the 'model' on one epoch on the 'train_dataloader'.
    Returns the model object back.
    """
    model.train() # Train mode
    train_size = len(train_dataloader)
    cur_true_count, cur_sample_count = 0,0
    start_time = time.time()

    for iteration, (batch_labels, batch_token_seq) in enumerate(train_dataloader):
        # Reset grads
        optimizer.zero_grad()
        # Predict
        batch_prediction = model(batch_token_seq)
        # Compute loss
        loss = criterion(batch_prediction, batch_labels)
        # Compute the gradients
        loss.backward()
        # Clip the gradients; to prevent exploding gradients
        clip_grad_norm_(model.parameters(), max_norm=clip_norm)
        # Update the network parameters
        optimizer.step()

        # Compute the accuracy metrics
        pred_labels = batch_prediction.argmax(dim=1)
        true_count = (pred_labels == batch_labels).sum().item()
        cur_true_count += true_count
        cur_sample_count += batch_labels.size(dim=0) # Same as doing .size()[0]
        cur_accuracy = cur_true_count/cur_sample_count

        # Log the metrics
        if iteration % log_interval == 0 and iteration > 0:
            elapsed_time = time.time() - start_time
            print(f"Epoch: {cur_epoch:3d} \t Batches: {iteration:5d}/{train_size:5d} \t \
                Time: {elapsed_time:5.2f}s \t Train Accuracy: {cur_accuracy:8.3f}")

            # reset the metrics
            cur_true_count, cur_sample_count = 0,0
            start_time = time.time()

    # Return the model
    return model

In [18]:
def evaluate(model, eval_dataloader):
    """Evaluate the model on eval_dataloader and return the accuracy
    """
    final_true_count, final_sample_count = 0,0

    with torch.no_grad():
        for i, (batch_labels, batch_token_seq) in enumerate(eval_dataloader):
            batch_prediction = model(batch_token_seq)
            pred_labels = batch_prediction.argmax(dim=1)
            true_count = (pred_labels == batch_labels).sum().item()

            final_true_count += true_count
            final_sample_count += batch_labels.size(dim=0)

        # Compute & return the accuracy
        return (final_true_count/final_sample_count)

**Prepare train & validation splits**

In [12]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Create the one-off iterators and then the dataloaders
train_dataset_iter, test_dataset_iter = YelpReviewPolarity(DATA_DIR)
# Convert iterable-type datasets into map-type datasets
train_dataset = to_map_style_dataset(train_dataset_iter)
test_dataset = to_map_style_dataset(test_dataset_iter)

# Split 95% and 5% for train & validation sets
total_train_size = len(train_dataset)
train_size = int(total_train_size * 0.95)
train_split, val_split = random_split(train_dataset, [train_size, total_train_size-train_size])

In [13]:
# Create the dataloaders for train and test datasets
from torch.utils.data import DataLoader

# Create the dataloaders
train_dataloader = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True, collate_fn=preprocess_data_using_collate_batch)
val_dataloader = DataLoader(val_split, batch_size=BATCH_SIZE, shuffle=False, collate_fn=preprocess_data_using_collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=preprocess_data_using_collate_batch)

## Train & Test
**Train the model**

In [14]:
# Import the model class
import sys
sys.path.insert(0, '../mini-self-attention/')
from transformer_classifier import TransformerClassifier

NUM_HEADS = 8
NUM_OF_BLOCKS = 6
classifier_model = TransformerClassifier(emb_size=EMB_SIZE, heads=NUM_HEADS, num_of_blocks=NUM_OF_BLOCKS, seq_len=MAX_SEQ_LEN, \
    vocab_size=VOCAB_SIZE, num_classes=NUM_CLASSES)

classifier_model

TransformerClassifier(
  (token_embedding_layer): Embedding(153157, 128)
  (position_embedding_layer): Embedding(128, 128)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (multihead_attention): MultiHeadSelfAttention(
        (layer_weight_keys): Linear(in_features=128, out_features=1024, bias=False)
        (layer_weight_queries): Linear(in_features=128, out_features=1024, bias=False)
        (layer_weight_values): Linear(in_features=128, out_features=1024, bias=False)
        (layer_merge_attention_heads): Linear(in_features=1024, out_features=128, bias=True)
      )
      (norm_layer1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (feed_forward): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=128, bias=True)
      )
      (norm_layer2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerBlock(
      (multihead_attention

In [15]:
LEARNING_RATE=0.1
lr_warmup_after_epochs=5
gradient_clip = 1.0

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(classifier_model.parameters(), lr=LEARNING_RATE)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_warmup_after_epochs, gamma=0.1)

In [16]:
# The Training Process
total_accuracy = None
for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    # Train one full epoch
    train_one_epoch(classifier_model, train_dataloader, optimizer, criterion, gradient_clip, epoch)

    # Evaluate
    val_accuracy = evaluate(classifier_model, val_dataloader)
    if total_accuracy is not None and total_accuracy > val_accuracy:
        # update the learning rate
        lr_scheduler.step()
    else:
        # Consider the current accuracy as best accuracy
        total_accuracy = val_accuracy
    
    # Print the metrics at the end of every epoch
    print('-' * 60)
    time_taken = time.time() - epoch_start_time
    print(f"End of epoch:{epoch:3d} \t Time taken:{time_taken:5.2f}s \t Val Accuracy:{val_accuracy:8.3f}")
    print('-' * 60)

print("End of training process")

Epoch:   1 	 Batches:   500/16625 	                 Time: 659.16s 	 Train Accuracy:    0.568
Epoch:   1 	 Batches:  1000/16625 	                 Time: 654.65s 	 Train Accuracy:    0.651
Epoch:   1 	 Batches:  1500/16625 	                 Time: 607.33s 	 Train Accuracy:    0.708
Epoch:   1 	 Batches:  2000/16625 	                 Time: 611.95s 	 Train Accuracy:    0.728
Epoch:   1 	 Batches:  2500/16625 	                 Time: 610.04s 	 Train Accuracy:    0.754
Epoch:   1 	 Batches:  3000/16625 	                 Time: 610.88s 	 Train Accuracy:    0.767
Epoch:   1 	 Batches:  3500/16625 	                 Time: 606.71s 	 Train Accuracy:    0.778
Epoch:   1 	 Batches:  4000/16625 	                 Time: 611.28s 	 Train Accuracy:    0.791
Epoch:   1 	 Batches:  4500/16625 	                 Time: 605.99s 	 Train Accuracy:    0.797
Epoch:   1 	 Batches:  5000/16625 	                 Time: 610.87s 	 Train Accuracy:    0.802
Epoch:   1 	 Batches:  5500/16625 	                 Time: 606.64s 	 Tr

ValueError: too many values to unpack (expected 2)

**Testing the model**

In [19]:
print("Model Performance on the Test data")
test_accuracy = evaluate(classifier_model, test_dataloader)
print(f"Test Accuracy:{test_accuracy:8.3f}")

Model Performance on the Test data
Test Accuracy:   0.847
