In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


1. Remove the columns corresponding to values the model does not expect.
2. Rename the column "label" to "labels".
3. Set the format of the datasets so they return PyTorch tensors.


In [2]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2","idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [3]:
from torch.utils.data import DataLoader

# "data_collator" precesses batch data.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn= data_collator)

In [4]:
for batch in train_dataloader:
    continue
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([4]),
 'input_ids': torch.Size([4, 61]),
 'token_type_ids': torch.Size([4, 61]),
 'attention_mask': torch.Size([4, 61])}

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

All 🤗 Transformers models will return the loss when labels are provided, and we also get the logits

In [6]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)




In [7]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_training_steps=num_training_steps, num_warmup_steps=0)

# The training loop

In [8]:
import torch
import evaluate
from tqdm.auto import tqdm


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
metric = evaluate.load("glue", "mrpc")

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    loss_per_epoch = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss_per_epoch += loss.item()
        loss.backward()
        

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        # Add a batch of predictions and references for the evaluation module's stack.
        metric.add_batch(predictions=predictions, references=batch["labels"])

    acc , f1 = metric.compute().values()
    print("Epoch {}, loss: {:.4f}, eval_acc: {:.4f}, eval_f1: {:.4f}".format(epoch, loss_per_epoch, acc, f1))

        



 10%|█         | 461/4590 [00:29<12:30,  5.50it/s]

Epoch 0, loss: 249.5071, eval_acc: 0.8309, eval_f1: 0.8882


 20%|██        | 921/4590 [00:58<08:55,  6.85it/s]

Epoch 1, loss: 120.6058, eval_acc: 0.8113, eval_f1: 0.8744


 30%|███       | 1379/4590 [01:27<09:46,  5.47it/s]

Epoch 2, loss: 32.6165, eval_acc: 0.8113, eval_f1: 0.8702


 40%|████      | 1839/4590 [01:57<06:37,  6.92it/s]

Epoch 3, loss: 13.2402, eval_acc: 0.8309, eval_f1: 0.8770


 50%|█████     | 2297/4590 [02:26<07:02,  5.42it/s]

Epoch 4, loss: 6.2114, eval_acc: 0.8382, eval_f1: 0.8858


 60%|██████    | 2757/4590 [02:56<04:29,  6.79it/s]

Epoch 5, loss: 4.0725, eval_acc: 0.8333, eval_f1: 0.8803


 70%|███████   | 3215/4590 [03:25<04:14,  5.40it/s]

Epoch 6, loss: 3.7532, eval_acc: 0.8284, eval_f1: 0.8789


 80%|████████  | 3675/4590 [03:54<02:15,  6.76it/s]

Epoch 7, loss: 1.7521, eval_acc: 0.8235, eval_f1: 0.8792


 90%|█████████ | 4133/4590 [04:24<01:24,  5.43it/s]

Epoch 8, loss: 0.8437, eval_acc: 0.8211, eval_f1: 0.8781


100%|█████████▉| 4589/4590 [04:53<00:00, 15.70it/s]

Epoch 9, loss: 0.4370, eval_acc: 0.8211, eval_f1: 0.8781


100%|██████████| 4590/4590 [05:10<00:00, 15.70it/s]