<a href="https://colab.research.google.com/github/Bieguni101/hugging-face-nlp-cource/blob/main/fine_tuning_a_pretrained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Load the dataset.
from datasets import load_dataset

# GLUE dataset card: https://huggingface.co/datasets/glue
# MRPC is a corpus of sentence pairs automatically extracted from online news
# sources, with human annotations stating whether the two sentences of a pair
# are semantically equivalent.
# Loading it yields a DatasetDict with train, validation and test splits.
raw_datasets = load_dataset(path="glue", name="mrpc")

# Expected content of raw_datasets:
# DatasetDict({
#     train: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx'],
#         num_rows: 3668
#     })
#     validation: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx'],
#         num_rows: 408
#     })
#     test: Dataset({
#         features: ['sentence1', 'sentence2', 'label', 'idx'],
#         num_rows: 1725
#     })
# })

# Indexing a split returns one concrete example (a sentence pair):
# raw_train_dataset = raw_datasets["train"]
# raw_train_dataset[15]
# {'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
#  'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
#  'label': 0,
#  'idx': 16}

# Indexing a split by feature name returns every value of that feature:
# raw_datasets["train"]["sentence1"]


In [None]:
# Preprocess the dataset.
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"

# Tokenizer that converts natural-language text into model inputs.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` fields together as a pair, with truncation enabled."""
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# `map` runs the same function over every split of the DatasetDict;
# batched=True is passed so the function is called on batches of examples.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Dynamic padding: each batch is padded to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fine-tune with the Trainer API.
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

# TrainingArguments bundles every hyperparameter used for training and evaluation;
# only the output directory is set here, everything else keeps its default.
training_args = TrainingArguments(output_dir="test-trainer")

# Pretrained encoder with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Evaluate the fine-tuned model.
import numpy as np
import evaluate

# The object returned by `predict` exposes three fields:
# predictions, label_ids and metrics.
# Its `predictions` field is a two-dimensional array.
predictions = trainer.predict(tokenized_datasets["validation"])

# Pick the highest-scoring class along the last axis for every row.
preds = np.argmax(predictions.predictions, axis=-1)

# The GLUE/MRPC metric reports the model's accuracy and F1 score on the validation set.
metric = evaluate.load("glue", "mrpc")
metric.compute(
    predictions=preds,
    references=predictions.label_ids,
)

In [20]:
# Reshape the tokenized dataset for a plain PyTorch training loop:
# drop the raw-text/index columns, rename "label" to "labels", and return tensors.
#
# `tokenized_datasets` is reassigned in place, so the steps are guarded on the
# columns that are actually present. This keeps the cell idempotent: running it
# a second time no longer fails because the columns were already removed/renamed.
existing_columns = tokenized_datasets["train"].column_names

columns_to_drop = [
    column for column in ("sentence1", "sentence2", "idx") if column in existing_columns
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

if "label" in existing_columns:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

tokenized_datasets.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

# Settings shared by both loaders: batches of 8, padded by the data collator.
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

# DataLoaders over the preprocessed splits; only the training split is shuffled.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [None]:
from transformers import AutoModelForSequenceClassification

# Reload the pretrained checkpoint with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Sanity check: fetch a single batch from the training loader and run it through
# the model. `next(iter(...))` states that intent directly instead of a
# `for ...: break` loop.
batch = next(iter(train_dataloader))
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Optimizer and learning-rate scheduler.
import torch
from transformers import get_scheduler

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Linear schedule with no warmup: the learning rate decays from its
# maximum (5e-5) down to 0 over the whole training run.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
# Uncomment to inspect the total number of optimization steps:
# print(num_training_steps)

# Train on a GPU when one is available: training on CPU may take several hours,
# while on a GPU it only takes a few minutes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Uncomment to inspect the selected device:
# device

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimization step.
progress_bar = tqdm(total=num_training_steps)

# Put the model in training mode before the manual training loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then backpropagate the loss returned by the model.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
import evaluate

metric = evaluate.load("glue", "mrpc")

# Put the model in evaluation mode and run the validation loop with gradient
# tracking disabled.
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        # Move every tensor of the batch onto the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit along the last dimension.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate the metric over every batch added above.
metric.compute()