In [1]:
# Quieten the libraries so that cell outputs stay readable.

# transformers: only CRITICAL messages get through.
import transformers
transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# datasets: only ERROR (and above) messages get through.
import datasets
datasets.logging.set_verbosity(datasets.logging.ERROR)

# Python warnings: drop every FutureWarning raised from here on.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


#### Simple training in PyTorch

In [2]:
import torch
# transformers.AdamW is deprecated (and removed in recent transformers releases);
# torch.optim.AdamW is the supported replacement. Note that its defaults
# (e.g. weight decay) differ slightly from the old transformers implementation.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
# Adding "labels" to the batch makes the model output carry a .loss (read below).
batch["labels"] = torch.tensor([1, 1])

# from_pretrained() hands the model back in evaluation mode (dropout disabled);
# switch to training mode before computing a training loss.
model.train()

# One optimisation step: clear stale gradients, forward, backward, update.
optimizer = AdamW(model.parameters())
optimizer.zero_grad()
loss = model(**batch).loss
loss.backward()
optimizer.step()

#### Tokenizing a DatasetDict

In [3]:
# Define the checkpoint
checkpoint = 'bert-base-uncased'

# Import the DatasetDict
from datasets import load_dataset
raw_dataset = load_dataset("glue", "mrpc")
print("\nraw_dataset['train'].features.keys(): ", raw_dataset['train'].features.keys())
print("raw_dataset['train'].num_rows: ", raw_dataset['train'].num_rows)

# Tokenize the Datasets in DatasetDict
# The tokenizer returns a dictionary. We want to keep the DatasetDict type,
# so we use the .map() method with batched=True.
from transformers import AutoTokenizer
# Build the tokenizer in this cell so it does not depend on a `tokenizer`
# variable left over from an earlier cell.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(sequences):
    """Tokenize the 'sentence1' / 'sentence2' columns of ``sequences`` together.

    Both columns are handed to the notebook-level ``tokenizer`` in a single
    call with ``truncation=True``; whatever the tokenizer returns is returned
    unchanged.
    """
    first_sentences = sequences['sentence1']
    second_sentences = sequences['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)
# Apply the tokenization to every split of the DatasetDict.
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
# "\n" (not "\t"): the original "\tokenized..." was parsed as a TAB escape
# followed by "okenized...", which garbled the printed label.
print("\ntokenized_datasets['train'].features.keys(): ", tokenized_datasets['train'].features.keys())
print("tokenized_datasets['train'].num_rows: ", tokenized_datasets['train'].num_rows)

print("\nNote that the batched=True argument does not result in a batched output")
print("raw_dataset['train'] and tokenized_datasets['train'] have the same number of rows.")
print("=> batched=True instructs the tokenizer to process per batch and so speed up")

100%|██████████| 3/3 [00:00<00:00, 181.68it/s]



raw_dataset['train'].features.keys():  dict_keys(['sentence1', 'sentence2', 'label', 'idx'])
raw_dataset['train'].num_rows:  3668


                                                                  

	okenized_datasets['train'].features.keys():  dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])
tokenized_datasets['train'].num_rows:  3668

Note that the batched=True argument does not result in a batched output
raw_dataset['train] and tokenized_datasets['train'] have the same number of rows.
=> batched=True instructs the tokenizer to process per batch and so speed up




#### Batching the DatasetDict

In [4]:
# Build the collator that will pad batches using the tokenizer's settings
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Take the first 8 training rows and drop the columns that are not fed to the model
batch_samples = tokenized_datasets["train"][:8]
batch_samples = {
    column: values
    for column, values in batch_samples.items()
    if column not in {"idx", "sentence1", "sentence2"}
}
print("\nlen of samples['input_ids']: ", list(map(len, batch_samples["input_ids"])))

# Run the collator on the batch and show the sequence lengths after padding
dyn_padded_batch = data_collator(batch_samples)
print("\nlen of dyn_padded_batch['input_ids']: ", list(map(len, dyn_padded_batch["input_ids"])))


len of samples['input_ids']:  [50, 59, 47, 67, 59, 50, 62, 32]

len of dyn_padded_batch['input_ids']:  [67, 67, 67, 67, 67, 67, 67, 67]


#### Finetuning with Trainer

In [6]:
# Everything needed from transformers for this section
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Training configuration: only the output directory is set, the rest is default
training_args = TrainingArguments(output_dir="test-trainer")

# Fresh classification model with two output labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Wire model, arguments, data splits, collator and tokenizer into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Quick training (the returned TrainOutput is displayed as the cell result)
trainer.train()

{'loss': 0.5231, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}
{'loss': 0.3052, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'train_runtime': 179.3616, 'train_samples_per_second': 61.351, 'train_steps_per_second': 7.677, 'train_loss': 0.3422743077717922, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.3422743077717922, metrics={'train_runtime': 179.3616, 'train_samples_per_second': 61.351, 'train_steps_per_second': 7.677, 'train_loss': 0.3422743077717922, 'epoch': 3.0})

#### Predictions and Evaluations

In [7]:
import numpy as np
import evaluate

# Run the trained model over the validation split
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

# Turn each row of logits into the index of its largest value
preds = predictions.predictions.argmax(axis=-1)

# Score the predicted class ids against the reference labels
# (the resulting metrics dict is displayed as the cell result)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

(408, 2) (408,)


{'accuracy': 0.8627450980392157, 'f1': 0.9044368600682594}

#### Training with Predictions and Evaluations

In [8]:
# Define a function that computes metrics
def compute_metrics(eval_preds):
    """Score a ``(logits, labels)`` pair with the GLUE/MRPC metric.

    The metric is loaded with ``evaluate.load("glue", "mrpc")`` on every call.
    Logits are reduced to class ids by taking the argmax over the last axis,
    and the result of ``metric.compute`` is returned.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    raw_logits, gold_labels = eval_preds
    class_ids = np.argmax(raw_logits, axis=-1)
    return glue_mrpc.compute(predictions=class_ids, references=gold_labels)

# Rebuild the training setup, this time evaluating at the end of every epoch
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)

# Start again from the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same Trainer as before, plus the metrics callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Retrain the model; metrics are reported after each epoch
trainer.train()

{'eval_loss': 0.46431678533554077, 'eval_accuracy': 0.8357843137254902, 'eval_f1': 0.8870151770657673, 'eval_runtime': 2.4582, 'eval_samples_per_second': 165.978, 'eval_steps_per_second': 20.747, 'epoch': 1.0}
{'loss': 0.5664, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}
{'eval_loss': 0.46998754143714905, 'eval_accuracy': 0.8284313725490197, 'eval_f1': 0.8852459016393442, 'eval_runtime': 2.4604, 'eval_samples_per_second': 165.826, 'eval_steps_per_second': 20.728, 'epoch': 2.0}
{'loss': 0.3747, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'eval_loss': 0.5406590104103088, 'eval_accuracy': 0.8651960784313726, 'eval_f1': 0.9056603773584906, 'eval_runtime': 2.789, 'eval_samples_per_second': 146.29, 'eval_steps_per_second': 18.286, 'epoch': 3.0}
{'train_runtime': 187.5065, 'train_samples_per_second': 58.686, 'train_steps_per_second': 7.344, 'train_loss': 0.4061395981732537, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.4061395981732537, metrics={'train_runtime': 187.5065, 'train_samples_per_second': 58.686, 'train_steps_per_second': 7.344, 'train_loss': 0.4061395981732537, 'epoch': 3.0})