# 文本分类实例（v2.0）

## 1. 导入相关包

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

## 2. 加载数据集

In [6]:
# Read the ChnSentiCorp CSV as a single "train" split, then keep only the rows
# whose "review" field is not None.
dataset = load_dataset(
    "csv",
    data_files="./dataset/ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda example: example["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## 3. 划分数据集

In [7]:
# Hold out 10% of the rows as the "test" split (used as the validation set below).
# The seed is fixed so that Restart & Run All reproduces the same split; without
# it the held-out rows change on every run.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## 4. 数据预处理与创建DataLoader

In [11]:
import torch
# Load the tokenizer from the local checkpoint directory; the model is later
# loaded from the same path.
tokenizer = AutoTokenizer.from_pretrained("../3-model/rb3_download")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch mapping with a "review" entry (the texts) and a
            "label" entry (the matching labels).

    Returns:
        The tokenizer's output for the reviews, truncated to at most 128
        tokens, with the labels stored under the "labels" key.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    # Store the targets under "labels" next to the tokenizer fields.
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits in batches and drop the original columns, leaving only
# the fields returned by process_function.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [12]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

train_set = tokenized_datasets["train"]
valid_set = tokenized_datasets["test"]

# Training batches are shuffled; validation batches keep dataset order.
# Each loader gets its own padding collator built from the tokenizer.
train_loader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)
valid_loader = DataLoader(
    valid_set,
    batch_size=64,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

In [13]:
# Peek at the first collated training batch to check the padded tensors.
next(iter(train_loader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101,  857, 4638,  ..., 3215, 5277,  102],
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        ...,
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ..., 7215, 8024,  102],
        [ 101, 3302, 1218,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
        0, 1, 1, 1, 1, 0, 0, 1])}

## 6. 创建模型及优化器

In [14]:
from torch.optim import Adam

model = AutoModelForSequenceClassification.from_pretrained("../3-model/rb3_download")

# Use the GPU when CUDA is available; otherwise the model stays where it was loaded.
model = model.cuda() if torch.cuda.is_available() else model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../3-model/rb3_download and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# The method is referenced, not called: the notebook shows the bound method's
# repr, which embeds the model's full module tree.
model.parameters

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [16]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## 7. 训练与验证

In [19]:
def evaluate():
    """Compute accuracy of the global `model` over `valid_loader`.

    Puts the model in eval mode, runs every validation batch under
    `torch.inference_mode()`, and counts predictions (argmax over the last
    logits dimension) that equal the batch's "labels".

    Returns:
        The number of correct predictions divided by `len(valid_set)`.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    correct = 0
    with torch.inference_mode():
        for batch in valid_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = logits.argmax(dim=-1)
            matches = predictions.long() == batch["labels"].long()
            correct += matches.float().sum()
    return correct / len(valid_set)


def train(epoch=5, log_step=100):
    """Fine-tune the global `model` on `train_loader` and validate every epoch.

    Args:
        epoch: Number of full passes over `train_loader`.
        log_step: The current loss is printed whenever the running step
            counter is a multiple of this value.

    The step counter runs across epochs (it is not reset per epoch). After
    each epoch `evaluate()` is called and its result is printed.
    """
    step = 0
    for ep in range(epoch):
        # evaluate() switches to eval mode, so re-enable training mode each epoch.
        model.train()
        for batch in train_loader:
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            loss.backward()
            optimizer.step()
            if step % log_step == 0:
                print(f"ep: {ep}, global_step: {step}, loss: {loss.item()}")
            step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")

## 8. 模型训练

In [20]:
# Run training with the defaults (epoch=5, log_step=100); validation accuracy
# is printed after every epoch.
train()

ep: 0, global_step: 0, loss: 0.08012499660253525
ep: 0, global_step: 100, loss: 0.06912457197904587
ep: 0, global_step: 200, loss: 0.048224736005067825
ep: 0, acc: 0.9021878838539124
ep: 1, global_step: 300, loss: 0.12312793731689453
ep: 1, global_step: 400, loss: 0.1710444837808609
ep: 1, acc: 0.8918918967247009
ep: 2, global_step: 500, loss: 0.017726244404911995
ep: 2, global_step: 600, loss: 0.0028615002520382404
ep: 2, acc: 0.8751608729362488
ep: 3, global_step: 700, loss: 0.04014812409877777
ep: 3, global_step: 800, loss: 0.06961947679519653
ep: 3, acc: 0.8906049132347107
ep: 4, global_step: 900, loss: 0.006873792503029108
ep: 4, global_step: 1000, loss: 0.020569581538438797
ep: 4, acc: 0.8931788802146912


## 9. 模型预测

In [26]:
# Classify one sample sentence with the fine-tuned model.
sen = "我觉得这家酒店不错，饭很好吃"
# NOTE: the name is misspelled ("lable") but kept because a later cell reads it.
id2lable = {0: "差评", 1: "好评"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to whatever device the model is on, not unconditionally
    # to CUDA, so this cell also runs on CPU-only machines.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen} \n模型预测结果：{id2lable.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃 
模型预测结果：好评


In [22]:
from transformers import pipeline

# Wrap the fine-tuned model and tokenizer in a text-classification pipeline.
# Use GPU 0 when CUDA is available and fall back to CPU (-1) otherwise; a
# hard-coded device=0 fails on machines without a GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [23]:
# Classify the sample sentence; the result is a list holding one dict with
# "label" and "score" keys.
pipe(sen)

[{'label': 'LABEL_1', 'score': 0.9999467134475708}]

In [24]:
from transformers import pipeline

# Store the id -> label-name mapping on the model config so the pipeline
# reports readable labels instead of the default "LABEL_<n>" names.
model.config.id2label = id2lable

# Use GPU 0 when CUDA is available and fall back to CPU (-1) otherwise; a
# hard-coded device=0 fails on machines without a GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [25]:
# Classify the sample sentence again; the "label" field now comes from
# model.config.id2label.
pipe(sen)

[{'label': '好评', 'score': 0.9999467134475708}]