## Step1 导入相关包

In [1]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from transformers import AutoTokenizer,AutoModelForSequenceClassification

## Step2 加载数据

In [3]:
# Import only the loader that is actually used instead of `from datasets import *`,
# so it stays obvious where every name in the notebook comes from.
from datasets import load_dataset

# Read the review CSV (columns: label, review). With split="train" a single
# Dataset is returned, as the cell output shows.
data = load_dataset('csv',data_files='./ChnSentiCorp_htl_all.csv',split="train")
data

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [5]:
def _has_review(example):
    """Return True when the example's 'review' field is not None."""
    return example['review'] is not None


# Keep only the rows that carry review text.
data = data.filter(_has_review)
data

Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [7]:
# Peek at the first three review texts.
data["review"][:3]

['距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.',
 '商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!',
 '早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。']

## Step3 创建Dataset

（Step2 中通过 `load_dataset` 得到的 `data` 已经是 `Dataset` 对象，因此这里无需再手动定义 Dataset 类。）

## Step4 划分数据集

In [8]:
# Hold out 10% of the rows as the evaluation split. The fixed seed makes the
# split, and therefore the reported accuracy, reproducible across kernel restarts.
data = data.train_test_split(test_size=0.1, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step5 创建DataLoader

In [9]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "hfl/rbt3" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

In [19]:
def process_fun(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: Batch mapping with a "review" entry (the texts) and a
            "label" entry (the targets).

    Returns:
        The tokenizer output for the reviews, truncated to at most 128 tokens,
        with the labels stored under the "labels" key.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
    )
    # Store the targets under "labels" alongside the tokenizer fields.
    encoded["labels"] = examples['label']
    return encoded

In [34]:
# Tokenize both splits in batches. The original columns are dropped, so only
# the fields returned by process_fun remain.
tokenizer_data = data.map(process_fun,batched=True,remove_columns=data["train"].column_names)

In [48]:
# Keep a handle on each tokenized split; the last line displays the test split.
tokenizer_data_train, tokenizer_data_test = tokenizer_data['train'], tokenizer_data['test']
tokenizer_data_test

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 777
})

In [36]:
from transformers import DataCollatorWithPadding

In [37]:
# Batch collator built from the tokenizer; it is passed to the DataLoaders as collate_fn.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
from torch.utils.data import DataLoader

# Training batches: 32 examples each, reshuffled on every pass.
trainloader = DataLoader(
    tokenizer_data_train,
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)
# Validation batches: 64 examples each, kept in dataset order.
valiloader = DataLoader(
    tokenizer_data_test,
    batch_size=64,
    shuffle=False,
    collate_fn=collator,
)

In [39]:
# Display the loader object as a quick check that it was created.
trainloader

<torch.utils.data.dataloader.DataLoader at 0x7fa2b2f81b80>

In [41]:
# Show the first collated validation batch.
next(iter(valiloader))

{'input_ids': tensor([[ 101, 3341, 7270,  ...,    0,    0,    0],
        [ 101, 1765, 4415,  ...,  749, 2990,  102],
        [ 101, 2456, 6379,  ...,    0,    0,    0],
        ...,
        [ 101, 5018,  753,  ...,    0,    0,    0],
        [ 101,  122,  119,  ...,    0,    0,    0],
        [ 101, 2595,  817,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 1, 0

## Step6 创建模型及优化器

In [42]:
# `torch` has to be imported in this cell: it calls torch.cuda below, while the
# notebook's standalone `import torch` cell only comes later. Without this line
# a fresh "Restart & Run All" stops here with a NameError.
import torch
from torch.optim import Adam

# Pretrained rbt3 weights plus a classification head; the warning in the
# output reports that the classifier weights are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU when one is available; otherwise the model stays where it was loaded.
if torch.cuda.is_available():
    model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# parameters() returns a generator (see the output), so no tensors are listed here.
model.parameters()

<generator object Module.parameters at 0x7fa2b2d086d0>

In [44]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(),lr=2e-5)

## Step7 训练与验证

In [45]:
import torch


In [64]:
def evaluate():
    """Compute the classification accuracy of ``model`` over ``valiloader``.

    The denominator is the number of labels actually seen while iterating the
    loader, not the size of a separately named dataset. The result therefore
    stays correct if ``valiloader`` is rebuilt over different data or does not
    yield every row.

    Returns:
        The fraction of correct predictions as a 0-dim float tensor, or the
        float ``0.0`` when ``valiloader`` yields no batches.
    """
    model.eval()
    acc_num = 0
    total = 0
    with torch.inference_mode():
        for batch in valiloader:
            if torch.cuda.is_available():
                batch = {k:v.cuda() for k,v in batch.items()}
            output = model(**batch)
            # The predicted class is the index of the largest logit.
            pred = torch.argmax(output.logits,dim=-1)
            labels = batch['labels']
            acc_num += (pred.long() == labels.long()).float().sum()
            total += labels.numel()
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return acc_num / total

def _run_train_step(batch):
    """Run one optimisation step on ``batch`` and return the model output."""
    if torch.cuda.is_available():
        batch = {name: tensor.cuda() for name, tensor in batch.items()}
    optimizer.zero_grad()
    output = model(**batch)
    output.loss.backward()
    optimizer.step()
    return output


def train(epoch=3,log_step=100):
    """Train ``model`` on ``trainloader`` and report accuracy after each epoch.

    Args:
        epoch: Number of full passes over ``trainloader``.
        log_step: The loss is printed whenever the running step count is a
            multiple of this value (step 0 included).
    """
    global_step = 0
    for ep in range(epoch):
        # Switch to training mode at the start of every epoch.
        model.train()
        for batch in trainloader:
            output = _run_train_step(batch)
            if global_step % log_step == 0:
                print(f"ep:{ep}, global_step:{global_step}, loss:{output.loss.item()}")
            global_step += 1
        acc = evaluate()
        print(f"e:{ep}, acc:{acc}")


## Step8 模型训练

In [59]:
# Number of examples in the tokenized test split.
tokenizer_data_test.num_rows


777

In [65]:
# Run training with the defaults (epoch=3, log_step=100).
train()

ep:0, global_step:0, loss:0.06920930743217468
ep:0, global_step:100, loss:0.07966431230306625
ep:0, global_step:200, loss:0.002455147448927164
e:0, acc:0.8803088665008545
ep:1, global_step:300, loss:0.010180889628827572
ep:1, global_step:400, loss:0.00998788420110941
e:1, acc:0.8764479160308838
ep:2, global_step:500, loss:0.0695374384522438
ep:2, global_step:600, loss:0.0030672855209559202
e:2, acc:0.8828828930854797


## Step9 模型预测

In [66]:
sen = '我觉得这家酒店中等偏差，其中隔音效果不好！'
# Maps the predicted class index to the string shown to the user
# (0 -> negative review, 1 -> positive review). The name is kept as-is
# because the pipeline cell further down reads it.
id2_lable = {0:'差评！',1:'好评！'}
model.eval()
with torch.inference_mode():
    # Called `inputs` so the builtin `input()` is not shadowed.
    inputs = tokenizer(sen,return_tensors='pt')
    # Move the tensors to the GPU only when one exists, using the same guard
    # as the cells that place the model, so this also runs on CPU-only machines.
    if torch.cuda.is_available():
        inputs = {k:v.cuda() for k,v in inputs.items()}
    output = model(**inputs)
    pred = torch.argmax(output.logits,dim=-1)
    print(f"输入：{sen}，模型预测结果：{id2_lable.get(pred.item())}")

输入：我觉得这家酒店中等偏差，其中隔音效果不好！，模型预测结果：差评！


In [67]:
from transformers import pipeline

# Put the human-readable labels on the model config so the pipeline reports them.
model.config.id2label = id2_lable
# device=0 selects the first GPU; -1 makes the pipeline run on the CPU, so the
# cell no longer fails on machines without CUDA.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [68]:
# Classify the sample sentence through the pipeline.
pipe(sen)

[{'label': '差评！', 'score': 0.9614468216896057}]