### 文本分类 demo


In [2]:
from modelscope import AutoTokenizer, AutoModelForSequenceClassification

2023-11-14 22:09:38,368 - modelscope - INFO - PyTorch version 2.0.1+cu117 Found.
2023-11-14 22:09:38,372 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2023-11-14 22:09:38,373 - modelscope - INFO - Loading ast index from C:\Users\gu\.cache\modelscope\ast_indexer
2023-11-14 22:09:38,744 - modelscope - INFO - Loading done! Current index file version is 1.9.4, with md5 6894a85867070fb123f27e8238dc0219 and a total number of 945 components indexed


In [3]:
import pandas as pd

# Load the hotel-review corpus and discard rows that contain missing values.
data = pd.read_csv("ChnSentiCorp_htl_all.csv").dropna()
# Preview the first five cleaned rows.
data.head(5)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


#### 创建 dataset


In [4]:
from torch.utils.data import Dataset


class MyDataset(Dataset):
    """Map-style dataset yielding ``(review, label)`` pairs read from a CSV file.

    Args:
        csv_path: Path to a CSV file containing ``review`` and ``label``
            columns. Defaults to the hotel-review corpus file in the
            current working directory.
    """

    def __init__(self, csv_path="ChnSentiCorp_htl_all.csv"):
        # Clean the frame loaded *here*. The previous version called dropna()
        # on a notebook-level `data` variable, so the class silently depended
        # on an earlier cell having been executed.
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair stored at positional ``index``."""
        # Fetch the row once instead of doing two separate iloc lookups.
        row = self.data.iloc[index]
        return row["review"], row["label"]

    def __len__(self):
        """Return the number of rows remaining after dropping missing values."""
        return len(self.data)

In [5]:
dataset = MyDataset()
# Print the first five (review, label) samples as a quick sanity check.
for sample_idx in range(5):
    sample = dataset[sample_idx]
    print(sample)

('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', 1)
('商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!', 1)
('早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。', 1)
('宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~', 1)
('CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风', 1)


#### 划分数据集


In [6]:
import torch
from torch.utils.data import random_split

# Seed the split so the same samples land in train/validation on every run;
# without a generator the partition changes each time the cell is executed.
SPLIT_SEED = 42
trainset, validset = random_split(
    dataset,
    lengths=[0.9, 0.1],  # fractions assigned to the training / validation subsets
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

#### 创建 dataloader


In [7]:
import torch

# Tokenizer for the pretrained checkpoint, loaded at an explicitly pinned revision.
tokenizer = AutoTokenizer.from_pretrained(
    "dienstag/rbt4-h312",
    revision="v1.0.0",
)


def collate_func(batch):
    """Collate a list of ``(text, label)`` samples into one tokenized batch.

    Each text is padded or truncated to 128 tokens, and the labels are
    attached to the returned encoding under the ``"labels"`` key.
    """
    texts = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]
    # Shorter texts are padded up to max_length, longer ones are truncated.
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Add the labels field alongside the tokenizer outputs.
    encoded["labels"] = torch.tensor(labels)
    return encoded

2023-11-14 22:09:40,822 - modelscope - INFO - Use user-specified model revision: v1.0.0


In [8]:
from torch.utils.data import DataLoader

# Training batches are shuffled; validation batches keep a fixed order.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_func)

# Peek at the first training batch to sanity-check the collated tensors.
next(iter(trainloader))

{'input_ids': tensor([[ 101, 2347, 5307,  ...,    0,    0,    0],
        [ 101, 3302, 1218,  ...,    0,    0,    0],
        [ 101, 2161, 7667,  ...,    0,    0,    0],
        ...,
        [ 101, 1199, 3517,  ...,    0,    0,    0],
        [ 101,  671,  702,  ...,    0,    0,    0],
        [ 101, 7478, 2382,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
        1, 0, 1, 1, 0, 0, 1, 1])}

#### 创建模型和优化器


In [9]:
from torch.optim import Adam

# Load the classifier at the same pinned revision that the tokenizer uses,
# so both always come from the same snapshot of the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "dienstag/rbt4-h312",
    revision="v1.0.0",
)

# Move the model to the GPU *before* constructing the optimizer, as the
# torch.optim documentation recommends.
if torch.cuda.is_available():
    model = model.cuda()

optimizer = Adam(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at C:\Users\gu\.cache\modelscope\hub\dienstag\rbt4-h312 and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 训练与验证


In [10]:
def evaluate():
    """Compute classification accuracy of ``model`` over ``validloader``.

    Returns:
        The number of correct predictions divided by ``len(validset)``.
    """
    model.eval()
    use_cuda = torch.cuda.is_available()
    correct = 0
    with torch.inference_mode():
        for batch in validloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=1)
            # Count the positions where the predicted class equals the label.
            hits = predictions.long() == batch["labels"].long()
            correct += hits.float().sum()
    return correct / len(validset)


def train(epoch, log_step):
    """Fine-tune ``model`` on ``trainloader`` and report accuracy after each epoch.

    Args:
        epoch: Number of passes over the training loader.
        log_step: The loss is printed whenever the running step count is a
            multiple of this value.
    """
    use_cuda = torch.cuda.is_available()
    step_count = 0
    for epoch_idx in range(epoch):
        model.train()
        for batch in trainloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()  # apply the parameter update
            if step_count % log_step == 0:
                # The log text (including its spelling) is kept exactly as originally emitted.
                print(f"ep:{epoch_idx}, glabal_step:{step_count}, loss:{loss.item()}")
            step_count += 1
        accuracy = evaluate()
        print(f"ep:{epoch_idx}, acc:{accuracy}")

#### 模型训练


In [11]:
# Training schedule: number of epochs and how often (in steps) to log the loss.
epoch = 5
log_step = 100
train(epoch, log_step)

ep:0, glabal_step:0, loss:0.7202074527740479
ep:0, glabal_step:100, loss:0.5324186682701111
ep:0, glabal_step:200, loss:0.24276354908943176
ep:0, acc:0.8853092789649963
ep:1, glabal_step:300, loss:0.2996819019317627
ep:1, glabal_step:400, loss:0.24471639096736908
ep:1, acc:0.8891752362251282
ep:2, glabal_step:500, loss:0.18358942866325378
ep:2, glabal_step:600, loss:0.1821025311946869
ep:2, acc:0.8865979313850403
ep:3, glabal_step:700, loss:0.1744198054075241
ep:3, glabal_step:800, loss:0.2696484923362732
ep:3, acc:0.8917525410652161
ep:4, glabal_step:900, loss:0.30255070328712463
ep:4, glabal_step:1000, loss:0.08854479342699051
ep:4, acc:0.9007731676101685


#### 模型预测


In [20]:
sen = "哎呦！好垃圾！"
id2_label = {0: "差评", 1: "好评"}
model.eval()
with torch.inference_mode():
    # Tokenize into tensors, capping the length the same way the training
    # batches were capped so an over-long sentence cannot overflow the model.
    inputs = tokenizer(sen, truncation=True, max_length=128, return_tensors="pt")
    # Put the tensors on whichever device the model lives on; an unconditional
    # .cuda() would crash on a CPU-only machine.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：哎呦！好垃圾！
模型预测结果:好评
