# Classic fixed padding applied to the whole dataset

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-cased"

# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

# Instantiate the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs, padding each one to 128 tokens.

    ``examples`` is indexed with the ``"sentence1"`` and ``"sentence2"`` keys,
    and both values are passed to the module-level ``tokenizer`` as a pair.
    The call requests ``padding="max_length"`` together with truncation and
    ``max_length=128``, i.e. a fixed length for the whole dataset.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    
# Tokenize in batches, then drop the raw-text and "idx" columns, rename
# "label" to "labels", and switch the output format to "torch".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [6]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
)

# Print the input_ids shape of the first five batches only.
for step, batch in zip(range(5), train_dataloader):
    print(batch["input_ids"].shape)

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


# Dynamic padding: pad each batch separately

In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-cased"

# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

# Instantiate the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs without padding them.

    Only ``truncation=True`` is passed to the module-level ``tokenizer``;
    no ``padding`` argument is given this time, so padding is left to a
    later step (the data collator used when batches are built).
    """
    sentence_pair = (examples["sentence1"], examples["sentence2"])
    return tokenizer(*sentence_pair, truncation=True)

# Tokenize in batches, then drop the "idx" and raw-text columns, rename
# "label" to "labels", and switch the output format to "torch".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [10]:
# We will use a data collator to pad our samples dynamically.

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# The collator is handed to the DataLoader as its collate_fn, so padding
# happens when each batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Print the input_ids shape of the first seven batches only.
for step, batch in zip(range(7), train_dataloader):
    print(batch["input_ids"].shape)
    
# As we can see, the padded length differs from batch to batch. This is dynamic padding.
# As you can see, all of the batches are shorter than the fixed padding length of 128.
# Dynamic padding will almost always be faster on CPUs and GPUs (but not on TPUs).

torch.Size([16, 72])
torch.Size([16, 81])
torch.Size([16, 84])
torch.Size([16, 74])
torch.Size([16, 73])
torch.Size([16, 85])
torch.Size([16, 88])
