# Example notebook to download and use PyTorch text datasets
This notebook downloads the [YelpReviewPolarity](https://pytorch.org/text/stable/datasets.html#torchtext.datasets.YelpReviewPolarity) dataset, builds a vocabulary from it, and trains a transformer-based classifier on it.

**References:**
- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [1]:
import torch
import torchtext
from torchtext.datasets import YelpReviewPolarity

import os

In [2]:
# Root folder handed to the torchtext dataset loader
DATA_DIR = './data_tmp'
# exist_ok keeps the cell safe to re-run and avoids the check-then-create race;
# makedirs also creates any missing parent folders
os.makedirs(DATA_DIR, exist_ok=True)

In [3]:
# Select the GPU when PyTorch can see one; otherwise stay on the CPU
t_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
t_device

device(type='cpu')

In [4]:
# WARN: if this fails with an error about `confirm_token`, restart the notebook and run it again
train_dataset_iter, test_dataset_iter = YelpReviewPolarity(DATA_DIR, split=('train', 'test'))
train_dataset_iter, test_dataset_iter

(<torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f9d107c14f0>,
 <torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f9d91092f10>)

## Dataset Configurations
Get the dataset configurations necessary for building the neural network.

In [5]:
# First, create a tokenizer and vocabulary for the given dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenizer shared by the vocabulary builder and by the text pipeline defined further down.
# NOTE(review): the "subword" option presumably needs an extra tokenizer package installed — confirm.
tokenizer = get_tokenizer("subword")

def yield_tokenizer(data_iterator):
    """Lazily yield the token list of every sample in `data_iterator`.

    Args:
        data_iterator (iterable): Yields (label, text) pairs.

    Yields:
        The result of `tokenizer(text)` for each sample, in order.
    """
    # Only the text is tokenized; the label of each sample is ignored
    yield from (tokenizer(sample_text) for _label, sample_text in data_iterator)

# Build the vocabulary from the training corpus.
# The dataset iterators are single-use, so a fresh training iterator is created here.
train_dataset_iter = YelpReviewPolarity(DATA_DIR, split='train')
vocabulary = build_vocab_from_iterator(
    yield_tokenizer(train_dataset_iter),
    specials=["<unk>"],
)

# Tokens that are missing from the vocabulary fall back to the index of "<unk>"
vocabulary.set_default_index(vocabulary["<unk>"])
oov_index = vocabulary.get_default_index()
print(f"Index of Out-of-vocabulary words (<unk>) is:{oov_index}")

Index of Out-of-vocabulary words (<unk>) is:0


In [6]:
# The preprocessing steps are expressed as two small pipeline functions

def text_pipeline(text):
    """Tokenize `text` and convert the tokens into a sequence of numbers via the vocabulary."""
    tokens = tokenizer(text)
    return vocabulary(tokens)


def label_pipeline(label):
    """Shift a dataset label from [1, 2] to [0, 1]."""
    return int(label) - 1

In [7]:
def preprocess_data_using_collate_batch(batch):
    """Collate function for DataLoader: turn a batch of raw samples into three 1-D tensors.

    Labels and texts are run through `label_pipeline` and `text_pipeline`. The token ids of
    all samples are concatenated into one flat tensor; because the concatenation loses the
    sample boundaries, an 'offsets' tensor records the index at which each sample starts.

    Args:
        batch (iterable): A batch of (label, text) samples

    Returns:
        tuple: (batch_label_tensor, batch_token_id_tensor, batch_offset_tensor); the label and
            offset tensors hold one entry per sample, while the length of the token-id tensor
            varies from batch to batch
    """
    labels, token_id_tensors, lengths = [], [], []

    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))

        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_id_tensors.append(token_ids)
        # NOTE: no padding or truncation is applied here (open TODO from the original author)
        lengths.append(token_ids.size(dim=0))

    # Sample i starts where samples 0..i-1 end, i.e. at the running sum of the earlier lengths:
    # lengths [15, 25, 28, ...] -> offsets [0, 15, 40, ...]
    batch_offset_tensor = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    batch_label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Flatten the list of per-sample tensors into a single tensor
    batch_token_id_tensor = torch.cat(token_id_tensors)

    return batch_label_tensor, batch_token_id_tensor, batch_offset_tensor

In [8]:
# # [Only for Test] Create the one-off iterators and then the dataloaders
# from torch.utils.data import DataLoader
# train_dataset_iter, test_dataset_iter = YelpReviewPolarity(DATA_DIR)
# train_dataloader = DataLoader(train_dataset_iter, batch_size=8, shuffle=False, collate_fn=preprocess_data_using_collate_batch)
# test_dataloader = DataLoader(test_dataset_iter, batch_size=8, shuffle=False, collate_fn=preprocess_data_using_collate_batch)

In [9]:
# Hyper-parameters and dataset-derived sizes needed to build the network
EPOCHS = 10
BATCH_SIZE = 16
EMB_SIZE = 512
VOCAB_SIZE = len(vocabulary)

# The dataset iterators are single-use, so a fresh one is created to collect the distinct labels
train_dataset_iter = YelpReviewPolarity(DATA_DIR, split='train')
CLASSES = {label for label, _ in train_dataset_iter}
NUM_CLASSES = len(CLASSES)

print('BATCH_SIZE', 'EPOCHS', 'EMB_SIZE', 'VOCAB_SIZE', 'CLASSES')
print(BATCH_SIZE, '\t', EPOCHS, '\t', EMB_SIZE, '\t', VOCAB_SIZE, CLASSES)

BATCH_SIZE EPOCHS EMB_SIZE VOCAB_SIZE CLASSES
16 	 10 	 512 	 293202 {1, 2}


### Utility functions to train and test the classifier

In [10]:
import time

from torch.nn.utils import clip_grad_norm_

def train_one_epoch(model, train_dataloader, optimizer, criterion, clip_norm, cur_epoch, log_interval=500):
    """Run a single training pass of `model` over `train_dataloader`.

    Every batch goes through a forward pass, loss computation, backward pass,
    gradient-norm clipping and an optimizer step. A running accuracy is printed
    every `log_interval` batches and restarted after each report.

    Args:
        model: Network called as model(token_ids, offsets).
        train_dataloader: Sized iterable yielding (labels, token_ids, offsets) batches.
        optimizer: Optimizer that updates the parameters of `model`.
        criterion: Loss called as criterion(predictions, labels).
        clip_norm: Maximum gradient norm passed to clip_grad_norm_.
        cur_epoch (int): Epoch number, used only in the log line.
        log_interval (int): Number of batches between two log lines.

    Returns:
        The same `model` object, after the updates of this epoch.
    """
    model.train()  # Switch to training mode
    num_batches = len(train_dataloader)
    window_correct = 0
    window_seen = 0
    window_started = time.time()

    for step, batch in enumerate(train_dataloader):
        labels, token_ids, offsets = batch

        # Forward and backward pass, starting from clean gradients
        optimizer.zero_grad()
        logits = model(token_ids, offsets)
        criterion(logits, labels).backward()

        # Guard against exploding gradients before applying the update
        clip_grad_norm_(model.parameters(), max_norm=clip_norm)
        optimizer.step()

        # Running accuracy over the batches seen since the last log line
        window_correct += (logits.argmax(dim=1) == labels).sum().item()
        window_seen += labels.size(dim=0)
        window_accuracy = window_correct / window_seen

        if step % log_interval == 0 and step > 0:
            elapsed = time.time() - window_started
            print(f"Epoch: {cur_epoch:3d} \t Batches: {step:5d}/{num_batches:5d} \t \
                Time: {elapsed:5.2f}s \t Accuracy: {window_accuracy:8.3f}")

            # Start a fresh measurement window
            window_correct = 0
            window_seen = 0
            window_started = time.time()

    return model

In [11]:
def evaluate(model, eval_dataloader):
    """Compute the classification accuracy of `model` on `eval_dataloader`.

    The model is switched to evaluation mode for the duration of the call and
    restored to its previous mode afterwards; no gradients are tracked.

    Args:
        model (torch.nn.Module): Network called as model(token_ids, offsets),
            returning one row of class scores per sample.
        eval_dataloader (iterable): Yields (labels, token_ids, offsets) batches.

    Returns:
        float: Fraction of correctly classified samples, or 0.0 when the
            dataloader yields no samples.
    """
    final_true_count, final_sample_count = 0, 0

    # Remember the current mode so that the caller's train/eval state is left untouched
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Each batch is a plain 3-tuple; there is no batch index to unpack
            for batch_labels, batch_token_seq, batch_offset in eval_dataloader:
                batch_prediction = model(batch_token_seq, batch_offset)
                pred_labels = batch_prediction.argmax(dim=1)

                final_true_count += (pred_labels == batch_labels).sum().item()
                final_sample_count += batch_labels.size(dim=0)
    finally:
        model.train(was_training)

    # An empty dataloader has no meaningful accuracy; avoid dividing by zero
    if final_sample_count == 0:
        return 0.0
    return final_true_count / final_sample_count

**Prepare train & validation splits**

In [12]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Seed for the train/validation split, so that re-running the notebook reproduces the same partition
SPLIT_SEED = 42

# The dataset iterators are single-use, so fresh ones are created here
train_dataset_iter, test_dataset_iter = YelpReviewPolarity(DATA_DIR)
# Convert iterable-type datasets into map-type datasets (len() is needed below)
train_dataset = to_map_style_dataset(train_dataset_iter)
test_dataset = to_map_style_dataset(test_dataset_iter)

# Split 95% and 5% for train & validation sets
total_train_size = len(train_dataset)
train_size = int(total_train_size * 0.95)
train_split, val_split = random_split(
    train_dataset,
    [train_size, total_train_size - train_size],
    # A dedicated, seeded generator makes the split deterministic without touching the global RNG
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [13]:
# Create the dataloaders for train and test datasets
from torch.utils.data import DataLoader

def _build_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader that uses the shared batch size and collate function."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=preprocess_data_using_collate_batch,
    )


# Only the training split is shuffled; validation and test keep their order
train_dataloader = _build_loader(train_split, shuffle=True)
val_dataloader = _build_loader(val_split, shuffle=False)
test_dataloader = _build_loader(test_dataset, shuffle=False)

**Train the model**

In [14]:
# Import the model class
import sys
sys.path.insert(0, '../mini-self-attention/')
from transformer_classifier import TransformerClassifier

# Build the classifier from the dataset configuration computed earlier
classifier_model = TransformerClassifier(
    emb_size=EMB_SIZE,
    heads=8,
    num_of_blocks=6,
    seq_len=512,
    vocab_size=VOCAB_SIZE,
    num_classes=NUM_CLASSES,
)

# Display the layer structure of the model
classifier_model

TransformerClassifier(
  (token_embedding_layer): Embedding(293202, 512)
  (position_embedding_layer): Embedding(512, 512)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (multihead_attention): MultiHeadSelfAttention(
        (layer_weight_keys): Linear(in_features=512, out_features=4096, bias=False)
        (layer_weight_queries): Linear(in_features=512, out_features=4096, bias=False)
        (layer_weight_values): Linear(in_features=512, out_features=4096, bias=False)
        (layer_merge_attention_heads): Linear(in_features=4096, out_features=512, bias=True)
      )
      (norm_layer1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (feed_forward): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (norm_layer2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerBlock(
      (multihead_attenti

In [15]:
# Optimisation hyper-parameters
LEARNING_RATE = 0.1
lr_warmup_step = 5   # passed to StepLR as step_size
gradient_clip = 1.0  # maximum gradient norm used during training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(classifier_model.parameters(), lr=LEARNING_RATE)
# StepLR multiplies the learning rate by `gamma` once every `step_size` scheduler steps
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=lr_warmup_step,
    gamma=0.1,
)

In [16]:
# The training process: one pass over the training split per epoch, followed by validation
total_accuracy = None  # Reference validation accuracy; None until the first epoch finishes
for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    train_one_epoch(classifier_model, train_dataloader, optimizer, criterion, gradient_clip, epoch)

    val_accuracy = evaluate(classifier_model, val_dataloader)
    if total_accuracy is not None and total_accuracy > val_accuracy:
        # Validation accuracy fell below the reference: advance the LR scheduler
        lr_scheduler.step()
    else:
        # First epoch, or no regression: this accuracy becomes the new reference
        total_accuracy = val_accuracy

    # End-of-epoch report, framed by two separator lines
    time_taken = time.time() - epoch_start_time
    print(
        '-' * 60,
        f"End of epoch:{epoch:3d} \t Time taken:{time_taken:5.2f}s \t Val Acc:{val_accuracy:8.3f}",
        '-' * 60,
        sep='\n',
    )

print("End of training process")

TypeError: forward() takes 2 positional arguments but 3 were given

**Testing the model**

In [None]:
# Final check of the trained classifier on the test split
print("Model Performance on the Test data")
# evaluate() takes the model first and the dataloader second
test_accuracy = evaluate(classifier_model, test_dataloader)
print(f"Test Accuracy:{test_accuracy:8.3f}")