This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course).

In [None]:
#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/Dh9CL8fyG80?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

In [1]:
! pip install datasets transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
from datasets import load_dataset
# AutoTokenizer: класс из transformers, автоматически подбирающий токенизатор для указанной модели.
# DataCollatorWithPadding: класс, который автоматически дополняет входные данные до одной длины, чтобы можно было эффективно обрабатывать их батчами.
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")


In [7]:
# Загружается набор данных GLUE, поднабор MRPC (Microsoft Research Paraphrase Corpus).
# MRPC — это задача определения, являются ли две строки перефразировкой друг друга (классификация бинарных меток).
# Возвращаемый объект — словарь с разделением на train, validation, и (иногда) test части.
# {
#   'train': Dataset({
#       'sentence1': [...],  # Первая строка
#       'sentence2': [...],  # Вторая строка
#       'label': [...],      # Метки (0 или 1)
#       'idx': [...]         # Индексы записей
#   }),
#   'validation': ...,
#   'test': ...
# }

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [8]:
# checkpoint: определяет конкретную предобученную модель. Здесь используется модель bert-base-cased, которая учитывает регистр букв.
# AutoTokenizer.from_pretrained(checkpoint): загружает токенизатор, соответствующий указанной модели (в данном случае BERT). Этот токенизатор превращает текст в числовые представления (токены), которые могут быть поданы в модель.

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
# Функция принимает на вход батч примеров (например, из датасета).
# Она токенизирует два текстовых поля (sentence1 и sentence2) одновременно, обрезая их до максимальной длины, допустимой для модели (truncation=True).
# Возвращает результат токенизации в виде словаря, содержащего токены (input_ids, attention_mask и, при необходимости, token_type_ids).

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

# raw_datasets.map: применяет функцию токенизации к каждому элементу датасета.
# batched=True: функция применяется к батчам данных, а не к отдельным строкам, что ускоряет обработку.
# После этого tokenized_datasets содержит токенизированные данные.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [14]:
tokenized_datasets['train']["sentence1"][:5]

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']

In [15]:
tokenized_datasets['train']["sentence2"][:5]

['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
 "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
 'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .',
 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .']

In [16]:
tokenized_datasets['train']["label"][:5]

[1, 0, 1, 0, 1]

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
  tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
  tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [None]:
for batch in train_dataloader:
    break
print({k: v.shape for k, v in batch.items()})

{'attention_mask': torch.Size([8, 63]), 'input_ids': torch.Size([8, 63]), 'labels': torch.Size([8]), 'token_type_ids': torch.Size([8, 63])}


In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7512, grad_fn=<NllLossBackward>) torch.Size([8, 2])


In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
loss = outputs.loss
loss.backward()
optimizer.step()

# Don't forget to zero your gradients once your optimizer step is done!
optimizer.zero_grad()

In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)

cuda


In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

HBox(children=(FloatProgress(value=0.0, max=1377.0), HTML(value='')))




In [None]:
from datasets import load_metric

metric= load_metric("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8284313725490197, 'f1': 0.8809523809523808}