# 文本分类实例（v2.0）

## 1. 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## 2. 加载数据集

In [2]:
# Read the hotel-review CSV as a single "train" split, then discard every row
# whose "review" field is None.
dataset = load_dataset(
    "csv",
    data_files="../4-datasets/dataset/ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda example: example["review"] is not None)
dataset

Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 29.49it/s]
Generating train split: 7766 examples [00:00, 140285.22 examples/s]
Filter: 100%|██████████| 7766/7766 [00:00<00:00, 181088.68 examples/s]


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## 3. 划分数据集

In [3]:
# Hold out 10% of the rows as the validation ("test") split. A fixed seed keeps
# the split, and therefore the metrics reported further down, reproducible
# across kernel restarts.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## 4. 创建Dataloader

In [4]:
import torch
# Load the tokenizer from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("../3-model/rb3_download")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch mapping that provides a "review" entry (the texts)
            and a "label" entry.

    Returns:
        The tokenizer output for the reviews, truncated to at most 128 tokens
        (no padding is requested here), with an extra "labels" entry copied
        from ``examples["label"]``.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits in batched mode. The original columns are removed so
# that only the fields produced by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 8080.31 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 7280.93 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [5]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

train_set = tokenized_datasets["train"]
valid_set = tokenized_datasets["test"]

# A single collator pads each batch for both loaders.
collator = DataCollatorWithPadding(tokenizer)

# Training batches are shuffled; validation batches keep dataset order.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collator)
valid_loader = DataLoader(valid_set, batch_size=64, shuffle=False, collate_fn=collator)

In [6]:
# Pull one batch from the training loader to inspect the collated tensors.
next(iter(train_loader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101, 4384, 1862,  ...,    0,    0,    0],
        [ 101, 1057,  857,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ..., 1369, 1912,  102],
        ...,
        [ 101,  126, 3299,  ...,  119, 6844,  102],
        [ 101, 4384, 1862,  ...,    0,    0,    0],
        [ 101, 2791, 7313,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 0, 1, 1, 0])}

## 5. 创建模型及优化器

In [7]:
from torch.optim import Adam

# Load the checkpoint with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained("../3-model/rb3_download")

# Place the model on the GPU when CUDA is available; otherwise it stays on CPU.
if torch.cuda.is_available():
    model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../3-model/rb3_download and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Displays the repr of the bound method (it is not called); that repr embeds the
# module tree, so this cell serves as a way to look at the architecture.
# NOTE(review): a bare `model` expression would presumably show the same tree
# without the "<bound method ...>" wrapper - confirm before changing.
model.parameters

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [9]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## 6. 训练与验证

In [13]:
import evaluate

# Bundle accuracy, recall, F1 and precision into one combined metric object.
# NOTE: the name is misspelled ("mertics"), but my_evaluate() refers to it by
# exactly this name, so any rename has to be made in both places together.
clf_mertics = evaluate.combine(["accuracy", "recall", "f1", "precision"])

In [14]:
def my_evaluate():
    """Score the global ``model`` on ``valid_loader``.

    Switches the model to eval mode, runs every validation batch under
    ``torch.inference_mode()``, feeds the arg-max predictions and the batch
    labels into the global ``clf_mertics`` object, and returns whatever its
    ``compute()`` call produces. The model is left in eval mode on return.
    """
    model.eval()
    on_gpu = torch.cuda.is_available()
    with torch.inference_mode():
        for batch in valid_loader:
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            clf_mertics.add_batch(
                predictions=logits.argmax(dim=-1),
                references=batch["labels"].long(),
            )

    return clf_mertics.compute()


def train(epoch=5, log_step=100):
    """Fine-tune the global ``model`` on ``train_loader``, validating after each epoch.

    Args:
        epoch: Number of passes over ``train_loader``.
        log_step: The training loss is printed whenever the running step
            counter (which keeps counting across epochs) is a multiple of
            this value.
    """
    on_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # my_evaluate() puts the model in eval mode, so training mode has to be
        # re-enabled at the start of every epoch.
        model.train()
        for batch in train_loader:
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if not global_step % log_step:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1
        # End-of-epoch validation; the printed dict holds all combined metrics.
        clf = my_evaluate()
        print(f"ep: {ep}, acc: {clf}")

## 7. 模型训练

In [15]:
# Run fine-tuning with the defaults declared by train(): epoch=5, log_step=100.
train()

ep: 0, global_step: 0, loss: 0.2725081741809845
ep: 0, global_step: 100, loss: 0.19376027584075928
ep: 0, global_step: 200, loss: 0.135680690407753
ep: 0, acc: {'accuracy': 0.9034749034749034, 'recall': 0.9466666666666667, 'f1': 0.9298409728718428, 'precision': 0.9136029411764706}
ep: 1, global_step: 300, loss: 0.10361789911985397
ep: 1, global_step: 400, loss: 0.17135611176490784
ep: 1, acc: {'accuracy': 0.8918918918918919, 'recall': 0.9276190476190476, 'f1': 0.9206049149338374, 'precision': 0.9136960600375235}
ep: 2, global_step: 500, loss: 0.04030793532729149
ep: 2, global_step: 600, loss: 0.22250525653362274
ep: 2, acc: {'accuracy': 0.8893178893178894, 'recall': 0.9542857142857143, 'f1': 0.9209558823529412, 'precision': 0.8898756660746003}
ep: 3, global_step: 700, loss: 0.04941809922456741
ep: 3, global_step: 800, loss: 0.06463012844324112
ep: 3, acc: {'accuracy': 0.9073359073359073, 'recall': 0.9428571428571428, 'f1': 0.9322033898305084, 'precision': 0.9217877094972067}
ep: 4, glo

In [33]:
# Re-run validation on the current model and print the combined metrics.
# NOTE(review): this cell's execution count (33) is out of sequence with the
# cells around it (15 and 16), so the printed numbers may not come from a clean
# top-to-bottom run - re-execute the notebook to confirm.
data = my_evaluate()
print(data)

{'accuracy': 0.8931788931788932, 'recall': 0.9314285714285714, 'f1': 0.9217719132893496, 'precision': 0.9123134328358209}


## 8. 模型预测

In [16]:
# Single-sentence prediction with the fine-tuned model.
sen = "我觉得这家酒店不错，饭很好吃"
# Maps class index -> human-readable label. The (misspelled) name is kept
# because a later cell reads `id2lable` when configuring the pipeline.
id2lable = {0: "差评", 1: "好评"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to the GPU only when CUDA exists, mirroring the guard used
    # when the model itself was placed; an unconditional .cuda() raises on
    # CPU-only machines.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen} \n模型预测结果：{id2lable.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃 
模型预测结果：好评


In [17]:
from transformers import pipeline

# Wrap the fine-tuned model in a text-classification pipeline. GPU 0 is used
# when CUDA is available and -1 (CPU) otherwise, so the cell also runs on
# machines without a GPU instead of failing on a hard-coded device index.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [18]:
# Classify the sample sentence. At this point the result carries a generic
# "LABEL_<n>" name rather than a readable label.
pipe(sen)

[{'label': 'LABEL_1', 'score': 0.9957742094993591}]

In [19]:
from transformers import pipeline

# Attach the index -> label mapping to the model config so the pipeline reports
# readable labels instead of generic ones.
model.config.id2label = id2lable

# Rebuild the pipeline. GPU 0 is used when CUDA is available and -1 (CPU)
# otherwise, so the cell does not fail on machines without a GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [20]:
# Same call after model.config.id2label was set: the label is now reported as
# the mapped text.
pipe(sen)

[{'label': '好评', 'score': 0.9957742094993591}]