# 文本分类微调

This guide will show you how to:
1. 在 IMDb 数据集上微调 DistilBERT，判断一条影评的情感是 positive 还是 negative。

In [1]:
from datasets import load_dataset
# The argument can be the name of a dataset; here it is "imdb".
imdb = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the module from /home/dyd/.cache/huggingface/modules/datasets_modules/datasets/imdb/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0 (last modified on Tue Aug 15 08:13:01 2023) since it couldn't be found locally at imdb., or remotely on the Hugging Face Hub.


In [2]:
# Peek at the first raw example of the test split.
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

Tokenizer:
1. splitting the input into words, subwords, or symbols that are called tokens.
2. mapping each token to an integer.
3. adding additional inputs that may be useful to the model.

In [3]:
# Load the pretrained DistilBERT tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    The tokenizer is called with ``truncation=True``; no padding is requested
    here.

    Args:
        examples: Mapping with a ``"text"`` entry. In this notebook it is
            called both with a single example and, through ``map`` with
            ``batched=True``, with a batch of examples.

    Returns:
        The object returned by the tokenizer call for that text.
    """
    raw_text = examples["text"]
    encoded = tokenizer(raw_text, truncation=True)
    return encoded


In [4]:
# Sanity check: run the preprocessing function on a single test example.
preprocess_function(imdb["test"][0])

{'input_ids': [101, 1045, 2293, 16596, 1011, 10882, 1998, 2572, 5627, 2000, 2404, 2039, 2007, 1037, 2843, 1012, 16596, 1011, 10882, 5691, 1013, 2694, 2024, 2788, 2104, 11263, 25848, 1010, 2104, 1011, 12315, 1998, 28947, 1012, 1045, 2699, 2000, 2066, 2023, 1010, 1045, 2428, 2106, 1010, 2021, 2009, 2003, 2000, 2204, 2694, 16596, 1011, 10882, 2004, 17690, 1019, 2003, 2000, 2732, 10313, 1006, 1996, 2434, 1007, 1012, 10021, 4013, 3367, 20086, 2015, 1010, 10036, 19747, 4520, 1010, 25931, 3064, 22580, 1010, 1039, 2290, 2008, 2987, 1005, 1056, 2674, 1996, 4281, 1010, 1998, 16267, 2028, 1011, 8789, 3494, 3685, 2022, 9462, 2007, 1037, 1005, 16596, 1011, 10882, 1005, 4292, 1012, 1006, 1045, 1005, 1049, 2469, 2045, 2024, 2216, 1997, 2017, 2041, 2045, 2040, 2228, 17690, 1019, 2003, 2204, 16596, 1011, 10882, 2694, 1012, 2009, 1005, 1055, 2025, 1012, 2009, 1005, 1055, 18856, 17322, 2094, 1998, 4895, 7076, 8197, 4892, 1012, 1007, 2096, 2149, 7193, 2453, 2066, 7603, 1998, 2839, 2458, 1010, 16596, 1011,

In [5]:
from transformers import DataCollatorWithPadding

# Apply the preprocessing function over the entire dataset with the 🤗 Datasets
# `map` method; batched=True passes a batch of examples to each call.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

# Pad at collation time rather than up front: DataCollatorWithPadding pads each
# batch dynamically to the longest sequence in that batch, which is more
# efficient than padding the whole dataset to the maximum length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
tokenized_imdb
# Inspect the structure of the resulting dataset: 25000 rows for train and 25000 for test
# (the dataset also ships an "unsupervised" split).

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

## 定义评估函数

In [7]:
import evaluate
import numpy as np

# Load the "accuracy" metric through the `evaluate` library; compute_metrics below uses it.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced with ``argmax`` over axis 1 before scoring, so they are
            expected to be per-class scores with classes on axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


In [8]:
# Class id -> human-readable label name.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}

# Reverse mapping (label name -> class id), derived from id2label so the two
# dictionaries cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

## 分析一下模型的输出

In [9]:
# Sample review used to inspect the model's outputs.
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

from transformers import AutoModelForSequenceClassification

# The `tokenizer` created earlier in the notebook (same "distilbert-base-uncased"
# checkpoint) is reused here; loading it a second time is redundant.

# Load DistilBERT with a sequence-classification head. The number of labels is
# taken from id2label so it always agrees with the label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.

In [10]:
test_inputs = tokenizer(text, return_tensors="pt")
test_inputs
# The tokenizer output is {input_ids: tensor, attention_mask: tensor}

{'input_ids': tensor([[  101,  2023,  2001,  1037, 17743,  1012,  2025,  3294, 11633,  2000,
          1996,  2808,  1010,  2021,  4372,  2705,  7941,  2989,  2013,  2927,
          2000,  2203,  1012,  2453,  2022,  2026,  5440,  1997,  1996,  2093,
          1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

1. Model output is a `SequenceClassifierOutput` object
   ([Doc](https://huggingface.co/docs/transformers/v4.32.0/en/main_classes/output#transformers.modeling_outputs.SequenceClassifierOutput)).
2. Pass `output_attentions=True` to the model to get the attention weights.
3. `output.attentions` is a tuple of `torch.FloatTensor` (one for each layer), each of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
4. `output.hidden_states` is returned when `output_hidden_states=True` is passed to the model or when `config.output_hidden_states=True`.


In [11]:
import torch

# Forward pass without gradient tracking; `output` is a SequenceClassifierOutput
# object, and output_attentions=True asks the model to return attentions too.
with torch.no_grad():
    output = model(**test_inputs, output_attentions=True)

# Shape of the first entry of the attentions tuple.
output.attentions[0].shape


torch.Size([1, 12, 32, 32])

## 开始训练

In [19]:
from transformers import TrainingArguments, Trainer
# use TrainingArguments to define the arguments

training_args = TrainingArguments(
    # NOTE(review): absolute, machine-specific path — consider making this configurable.
    output_dir="/media/dyd/UDISK/output_model/",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.1,
    # Run evaluation at the end of every epoch.
    evaluation_strategy="epoch",

)

# Trainer ties together the model, the arguments above, the tokenized train/test
# splits, the dynamic-padding collator and the accuracy metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

100%|██████████| 3126/3126 [33:02<00:00,  1.58it/s]
 16%|█▌        | 500/3126 [03:26<18:29,  2.37it/s]

{'loss': 0.1163, 'learning_rate': 1.6801023672424827e-05, 'epoch': 0.32}


In [13]:
test_inputs = tokenizer(text, return_tensors="pt")
test_inputs
# The tokenizer output is {input_ids: tensor, attention_mask: tensor}

{'input_ids': tensor([[  101,  2023,  2001,  1037, 17743,  1012,  2025,  3294, 11633,  2000,
          1996,  2808,  1010,  2021,  4372,  2705,  7941,  2989,  2013,  2927,
          2000,  2203,  1012,  2453,  2022,  2026,  5440,  1997,  1996,  2093,
          1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [17]:
import torch

# Inference setup. Move the model to the CPU to run it on `test_inputs`, and
# switch to evaluation mode so dropout is disabled: the model can still be in
# training mode after Trainer.train(), for example when training was
# interrupted, which would make the outputs nondeterministic.
model.to("cpu")
model.eval()

# Forward pass without gradient tracking; `output` is a SequenceClassifierOutput
# object, and output_attentions=True asks the model to return attentions too.
with torch.no_grad():
    output = model(**test_inputs, output_attentions=True)

# Shape of the first entry of the attentions tuple.
output.attentions[0].size()


torch.Size([1, 12, 32, 32])

In [18]:
# Classification scores produced by the forward pass.
logits = output.logits

# Take the argmax along the last (class) dimension. Without `dim`, argmax
# indexes the flattened tensor, which only equals the class id when the batch
# holds a single sequence; with dim=-1, `.item()` raises for a larger batch
# instead of silently returning a wrong id.
pred_class_id = logits.argmax(dim=-1).item()

# Map the predicted class id back to its label name.
model.config.id2label[pred_class_id]

'POSITIVE'