In [7]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

# Tokenize both sequences together, with padding and truncation enabled,
# and ask for PyTorch tensors back.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# One label per sequence; the forward pass below reads `.loss` from the output.
batch["labels"] = torch.tensor([1, 1])

# from_pretrained() hands the model back in evaluation mode (dropout off),
# so switch to training mode before taking an optimisation step.
model.train()

# Single optimisation step: forward pass, backward pass, parameter update.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset and display its splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 120566.97 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 101867.73 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 290068.33 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [8]:
# Tokenize each sentence column of the training split on its own.
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [14]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` entries of ``example`` as a pair.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            When called through ``map(..., batched=True)`` these are
            presumably lists of strings rather than single strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        two entries and ``truncation=True``.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

In [15]:
# Apply tokenize_function to the raw dataset splits in batched mode,
# then display the resulting columns.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 3668/3668 [00:00<00:00, 14100.41 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 13325.10 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 16014.96 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### Dynamic Padding

In [16]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer used to encode the datasets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# First eight training rows, keeping every column except
# "idx", "sentence1" and "sentence2".
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
# Length of each sample's input_ids before any padding is applied.
list(map(len, samples["input_ids"]))

[52, 59, 47, 69, 60, 50, 66, 32]

In [18]:
# Collate the selected samples into one batch and show the shape of each field.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 69]),
 'token_type_ids': torch.Size([8, 69]),
 'attention_mask': torch.Size([8, 69]),
 'labels': torch.Size([8])}

In [21]:
# Attempt to tokenize the GLUE SST-2 dataset
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset and display its splits.
dataset = load_dataset(path="glue", name="sst2")
dataset

Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 1831362.56 examples/s]
Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 275949.38 examples/s]
Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 392549.09 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
def tokenize_sentence(example):
    """Tokenize the ``sentence`` entry of ``example``.

    Args:
        example: Mapping with a ``"sentence"`` entry. When called through
            ``map(..., batched=True)`` this is presumably a list of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with that
        entry and ``truncation=True``.
    """
    sentence = example["sentence"]
    return tokenizer(sentence, truncation=True)

# Tokenize the dataset in batched mode; the result is bound back to the
# same name `dataset`.
dataset = dataset.map(tokenize_sentence, batched=True)

Map: 100%|██████████| 67349/67349 [00:02<00:00, 30991.79 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 26365.20 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 28457.94 examples/s]


In [None]:
# Rebuild `dataset` as a plain dict of split name -> Dataset.
dataset = dict(dataset.items())
# End the cell with the bare name so the notebook displays the result;
# an assignment on its own produces no cell output.
dataset

{'train': Dataset({
     features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 67349
 }),
 'validation': Dataset({
     features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 872
 }),
 'test': Dataset({
     features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1821
 })}