In [2]:
from datasets  import load_dataset
from transformers import AutoTokenizer

# Download (or reuse from the local cache) the MRPC task of the GLUE benchmark.
raw_datasets = load_dataset(path='glue', name='mrpc')

# The checkpoint name selects which pretrained tokenizer is loaded.
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples, max_length = 128):
    """Tokenize a batch of sentence pairs to one fixed sequence length.

    Args:
        examples: Batch mapping that provides 'sentence1' and 'sentence2'
            entries (this is what ``map(..., batched=True)`` passes in).
        max_length: Length every encoded pair is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here, so
            existing ``map(tokenize_function, batched=True)`` calls are
            unaffected.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        padding='max_length',   # always pad up to max_length, not to the longest item
        truncation=True,        # cut pairs that exceed max_length
        max_length=max_length,
    )

# Tokenize every split, then keep only the columns to be batched:
# drop the raw text/index columns, rename 'label' to 'labels',
# and have the datasets return torch tensors.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['idx', 'sentence1', 'sentence2'])
    .rename_column('label', 'labels')
    .with_format('torch')
)

Reusing dataset glue (/home/jnavio/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 4/4 [00:00<00:00,  7.34ba/s]
100%|██████████| 1/1 [00:00<00:00, 15.84ba/s]
100%|██████████| 2/2 [00:00<00:00,  6.75ba/s]


In [3]:
# Inspect the splits and the columns that remain after tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1725
    })
})

In [4]:
from torch.utils.data import DataLoader
# Shuffled batches of 16 examples drawn from the training split.
train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=16,
    shuffle=True,
)

In [10]:
# Print the input_ids shape of the first seven batches (steps 0 through 6).
# range() comes first in zip() so no extra batch is fetched once it runs out.
for step, batch in zip(range(7), train_dataloader):
    print(batch['input_ids'].shape)

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


In [8]:
# Look at the first raw (untokenized) example of the training split.
raw_datasets['train'][0]

{'idx': 0,
 'label': 1,
 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}

In [11]:
# Applying dynamic padding:
def tokenize_function(examples):
    """Tokenize a batch of sentence pairs without padding them.

    Only truncation is requested here; no padding argument is passed, so
    padding is left to whatever collates the batches later.

    Args:
        examples: Batch mapping that provides 'sentence1' and 'sentence2'
            entries.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Re-tokenize every split with the function defined just above, then apply
# the same column clean-up as before and return torch tensors.
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['idx', 'sentence1', 'sentence2'])
    .rename_column('label', 'labels')
    .with_format('torch')
)

100%|██████████| 4/4 [00:00<00:00,  7.32ba/s]
100%|██████████| 1/1 [00:00<00:00, 12.35ba/s]
100%|██████████| 2/2 [00:00<00:00,  8.73ba/s]


In [13]:
from torch.utils.data import DataLoader
# Dynamic padding requires a data collator.
from transformers import DataCollatorWithPadding

# The collator is handed to the DataLoader as collate_fn, so padding is
# applied per batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Print the input_ids shape of the first twelve batches (steps 0 through 11).
# range() comes first in zip() so no extra batch is fetched once it runs out.
for step, batch in zip(range(12), train_dataloader):
    print(batch['input_ids'].shape)

# This is faster on CPU and GPU, but it does not work with TPUs, which need fixed-length inputs.

torch.Size([16, 77])
torch.Size([16, 98])
torch.Size([16, 78])
torch.Size([16, 73])
torch.Size([16, 68])
torch.Size([16, 83])
torch.Size([16, 73])
torch.Size([16, 83])
torch.Size([16, 80])
torch.Size([16, 75])
torch.Size([16, 74])
torch.Size([16, 84])
