BucketIterator groups examples (sequences) of similar lengths into the same batch. This yields batches that need very little padding, which is what we want when training models on text data.

This gives a large gain for recurrent models (RNN, GRU, LSTM) and transformer models (BERT, RoBERTa, GPT-2, XLNet, etc.), because padding is kept to a minimum.

Using a BucketIterator is similar to applying a DataLoader to a PyTorch Dataset.

PyTorchText can handle splits! There is no need to write the same lines of code again for the train and validation splits.

.tsv: tab-separated values file.
.csv: comma-separated values file.

An Iterator will sample a batch of sentences, each will (usually) have different lengths, say: 32, 25, 10 tokens in each sentence. It will then look at the longest sentence within the batch, here 32 tokens, and then all sentences shorter than that will be padded to that length, 32 tokens. This will be done by appending a pad_token, which is <pad> by default, to the end of the sentences.

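BucketIterator: do not set `fix_length` in `data.Field`. A fixed length pads every sequence to the same size, so grouping examples of similar length no longer reduces padding.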

In [1]:
# _*_ coding: utf-8 _*_

import os
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

def load_dataset(test_sen=None, fix_length=200, batch_size=32):
    """Load IMDB, build vocabularies with GloVe vectors and return bucketed iterators.

    Concepts used below:

    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about the way of preprocessing.
    fix_length : TorchText can leave the input variable-length and dynamically pad each sequence to the
                 longest sequence in that batch. Passing ``fix_length`` instead pads every sequence to
                 that one length, which cancels the padding savings of the BucketIterator.
    build_vocab : Builds a vocabulary mapping every unique word in ``train_data`` to an index and
                  attaches the GloVe embedding of each word to that index.
    vocab.vectors : Tensor holding the pre-trained word embeddings; its size is printed below.
    BucketIterator : Defines an iterator that batches examples of similar lengths together
                     to minimize the amount of padding needed.

    Arguments:
        test_sen: Unused; kept so that existing callers passing it keep working.
        fix_length: Value forwarded to ``data.Field(fix_length=...)``. Defaults to 200 (the
            previous hard-coded value). Pass ``None`` to pad each batch only up to its own
            longest sequence.
        batch_size: Batch size used for the train, validation and test iterators.
            Defaults to 32 (the previous hard-coded value).

    Returns:
        Tuple ``(TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter)``.
    """

    def whitespace_tokenize(text):
        # Plain whitespace tokenizer.
        return text.split()

    TEXT = data.Field(sequential=True, tokenize=whitespace_tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=fix_length)
    # `dtype` must be a torch.dtype. The previous value, torch.FloatTensor, is a
    # tensor type and is rejected when label batches are converted to tensors.
    LABEL = data.LabelField(dtype=torch.float)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)
    print("Length of Text Vocabulary: " + str(vocab_size))
    print("Vector size of Text Vocabulary: ", word_embeddings.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Split the training data again to obtain a validation set.
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        # Examples are grouped by the number of tokens in their text.
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    # Alternative with the default configuration:
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter

In [3]:
load_dataset()

Length of Text Vocabulary: 251639
Vector size of Text Vocabulary:  torch.Size([251639, 300])
Label Length: 2




(<torchtext.data.field.Field at 0x7fa1bbc87d68>,
 251639,
 tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 <torchtext.data.iterator.BucketIterator at 0x7fa0e97d2fd0>,
 <torchtext.data.iterator.BucketIterator at 0x7fa0e97d2da0>,
 <torchtext.data.iterator.BucketIterator at 0x7fa0e97d2e48>)

In [None]:
def files_to_tsv(partition_path, save_path='./'):
    """Merge the sentiment files of one partition into a single .tsv file.

    Every file found under ``<partition_path>/pos`` and ``<partition_path>/neg``
    is read, passed through ``fix_text`` and written as one
    ``sentiment<TAB>text`` row. Rows are joined with newlines.

    Arguments:
        partition_path (:obj:`str`):
            Directory of the partition used: train or test.
        save_path (:obj:`str`):
            Path where to save the final .tsv file.

    Returns:
        :obj:`str`: Filename (not the full path) of the created .tsv file.
    """

    # Rows of the final file, each already formatted as `sentiment<TAB>text`.
    rows = []

    # Print partition.
    print(partition_path)

    # Loop through each sentiment.
    for sentiment in ['pos', 'neg']:

        # Directory holding the files of this sentiment.
        sentiment_path = os.path.join(partition_path, sentiment)

        # For each file in that directory.
        for file_name in tqdm(os.listdir(sentiment_path), desc=f'{sentiment} Files'):

            # Read the file inside a context manager so the handle is closed
            # right away instead of leaking one handle per review.
            with open(os.path.join(sentiment_path, file_name), mode='r', encoding='utf-8') as review_file:
                file_content = review_file.read()

            # Fix any format errors.
            file_content = fix_text(file_content)

            # NOTE(review): content is written as-is; a tab or newline inside a
            # review would break the row layout - confirm inputs are single-line.
            rows.append("%s\t%s" % (sentiment, file_content))

    # Normalise first so a trailing slash in `partition_path` does not produce
    # an empty partition name in the file name.
    partition_name = os.path.basename(os.path.normpath(partition_path))
    tsv_filename = partition_name + '_pos_neg_%d.tsv' % len(rows)

    # Write to TSV file; the context manager guarantees flush and close.
    with open(os.path.join(save_path, tsv_filename), mode='w', encoding='utf-8') as tsv_file:
        tsv_file.write('\n'.join(rows))

    # Return TSV file name.
    return tsv_filename
  

# Directory that receives the generated .tsv files.
data_path = '/content'

# Build the .tsv file for the training partition.
train_filename = files_to_tsv('/content/aclImdb/train', data_path)

# Build the .tsv file for the test partition.
test_filename = files_to_tsv('/content/aclImdb/test', data_path)

In [None]:
def text_tokenizer(x):
    """Dummy text tokenizer: return the text unchanged.

    Replace the body with your own tokenizer.
    """
    return x


def label_tokenizer(x):
    """Dummy label encoder: return the label unchanged.

    Replace the body with your own label encoder.
    """
    return x

# Field describing the text column; it runs `text_tokenizer` and keeps casing.
TEXT = torchtext.data.Field(
    sequential=True,
    tokenize=text_tokenizer,
    lower=False,
)

# Field describing the label column; it runs `label_tokenizer` and builds no vocabulary.
LABEL = torchtext.data.Field(
    sequential=True,
    tokenize=label_tokenizer,
    use_vocab=False,
)

# (column name, field) pairs, listed as label first and text second.
datafields = [("label", LABEL), ("text", TEXT)]


# Build the train and validation iterators; each one batches together
# text sequences of similar length.
torchtext_train_dataloader, torchtext_valid_dataloader = torchtext.data.BucketIterator.splits(
    # Datasets the iterators draw examples from.
    (train_dataset, valid_dataset),
    # One batch size per dataset: train first, validation second.
    batch_sizes=(train_batch_size, valid_batch_size),
    # Sorting: the key is the text length; it is applied inside each batch
    # (sort_within_batch=True) but not to the dataset as a whole (sort=False).
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    sort=False,
    # Shuffle data on each epoch run.
    shuffle=True,
    # Repeat the iterator for multiple epochs.
    # NOTE(review): with repeat=True, looping over the iterator object directly
    # presumably never stops on its own - confirm against the torchtext docs.
    repeat=True,
    # Device to load batches on.
    device=device,
)

In [None]:
# Number of epochs used for this demonstration.
epochs = 1

# Walk through every epoch.
for epoch in range(epochs):

    # The batches have to be regenerated before each pass over the data.
    torchtext_train_dataloader.create_batches()

    # Visit the batches produced by the BucketIterator.
    for sample_id, batch in enumerate(torchtext_train_dataloader.batches):
        # Collect the text of every example in this batch.
        batch_text = [example.text for example in batch]

        # Show the length of each text in the batch.
        print('Batch examples lengths: %s' % str(list(map(len, batch_text))))

        # Stop after batch index 10 - enough to illustrate the idea.
        if sample_id == 10:
            break