## Step1 导入相关包

In [1]:
import os
# Point Hugging Face downloads at a mirror endpoint. The variable is set before
# `transformers` is imported — presumably so the hub client picks it up at
# import time (TODO confirm against the huggingface_hub documentation).
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from transformers import AutoTokenizer,AutoModelForSequenceClassification

## Step2 加载数据

In [2]:
# Load the hotel-review CSV directly as a Hugging Face `Dataset`.
# Only `load_dataset` is imported (instead of `from datasets import *`) so the
# notebook namespace stays clean and it is obvious where each name comes from.
from datasets import load_dataset
data = load_dataset('csv',data_files='./ChnSentiCorp_htl_all.csv',split="train")
data

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [3]:
# Drop rows whose `review` field is missing. The displayed row count goes from
# 7766 to 7765, i.e. one row had no review text.
data = data.filter(lambda x: x['review'] is not None)
data

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [4]:
# Peek at the first three review texts as a sanity check.
data["review"][:3]

['距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.',
 '商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!',
 '早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。']

## Step3 创建Dataset

（Step2 中 `load_dataset` 返回的已经是 `Dataset` 对象，因此这里无需再自定义 Dataset 类。）

## Step4 划分数据集

In [5]:
# Split into 90% train / 10% test. A fixed seed makes the split (and therefore
# the accuracy numbers reported further down) reproducible across
# "Restart & Run All"; without it every run evaluates on a different test set.
data = data.train_test_split(test_size=0.1, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step5 创建DataLoader

In [6]:
from transformers import AutoTokenizer
# Tokenizer for the same "hfl/rbt3" checkpoint that the model cell loads later.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

In [7]:
def process_fun(examples, max_length=128):
    """Tokenize a batch of reviews and attach the labels.

    Intended for `Dataset.map(..., batched=True)`.

    Args:
        examples: Batch mapping with a ``"review"`` list of texts and a
            ``"label"`` list of class ids.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated. Defaults to 128 (the previously hard-coded
            value), so existing callers are unaffected.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry copied from
        ``examples["label"]``.
    """
    # No padding here: examples keep their own length after tokenization.
    encoded = tokenizer(examples["review"], max_length=max_length, truncation=True)
    # The key is renamed from "label" to "labels".
    encoded["labels"] = examples['label']
    return encoded

In [8]:
# Tokenize both splits in batches. The original columns are removed so that
# only the tokenizer fields plus `labels` remain in the result.
tokenizer_data = data.map(process_fun,batched=True,remove_columns=data["train"].column_names)

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

In [9]:
# Convenience handles for the two tokenized splits; display the test split.
tokenizer_data_train = tokenizer_data['train']
tokenizer_data_test = tokenizer_data['test']
tokenizer_data_test

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 777
})

In [10]:
from transformers import DataCollatorWithPadding

In [11]:
# Collate function for the DataLoaders: tokenization above does not pad, so
# the collator is what brings the examples of a batch to a common length.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
from torch.utils.data import DataLoader
# Training batches are shuffled; validation batches are larger and keep the
# dataset order. Both rely on `collator` for batching.
trainloader = DataLoader(tokenizer_data_train,batch_size=32,shuffle=True,collate_fn=collator)
valiloader = DataLoader(tokenizer_data_test,batch_size=64,shuffle=False,collate_fn=collator)

In [13]:
# Display the loader object (only its repr is shown; no batch is fetched).
trainloader

<torch.utils.data.dataloader.DataLoader at 0x7f023e0ff3a0>

In [14]:
# Fetch and display the first validation batch to inspect the collated tensors.
next(iter(valiloader))

{'input_ids': tensor([[ 101, 6820, 6121,  ...,    0,    0,    0],
        [ 101,  671, 2458,  ...,  833, 1036,  102],
        [ 101, 1762, 1920,  ...,  689,  704,  102],
        ...,
        [ 101,  122,  119,  ..., 4960, 4197,  102],
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101, 2791, 7313,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 0, 0

## Step6 创建模型及优化器

In [15]:
from torch.optim import Adam
import torch
# Load the pretrained "hfl/rbt3" encoder with a sequence-classification head.
# The classifier weights are newly initialised (see the warning printed below)
# and are learned during training.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU when one is available; otherwise the model stays on the CPU.
if torch.cuda.is_available():
    model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# `parameters()` returns a generator, so this only displays the generator repr.
model.parameters()

<generator object Module.parameters at 0x7f023e116510>

In [17]:
# Adam over every model parameter with a learning rate of 2e-5.
optimizer = Adam(model.parameters(),lr=2e-5)

## Step7 训练与验证

In [22]:
# Import the metrics library under an alias: this notebook also defines a
# function called `evaluate()`. With a plain `import evaluate`, re-running this
# cell would rebind the name to the module and make `train()` fail with
# "'module' object is not callable".
import evaluate as hf_evaluate

accuracy = hf_evaluate.load("accuracy")

In [36]:
def evaluate():
    """Run the global `model` over `valiloader` and return the accuracy metric.

    Puts the model in eval mode, predicts the arg-max class for every
    validation batch under `torch.inference_mode()`, feeds predictions and
    reference labels to the global `accuracy` metric, and returns the result
    of `accuracy.compute()`.

    NOTE: this function shares its name with the `evaluate` library; the name
    is kept because `train()` calls it.
    """
    model.eval()
    with torch.inference_mode():
        for batch in valiloader:
            # Move the batch to the GPU under the same condition used for the model.
            if torch.cuda.is_available():
                batch = {k:v.cuda() for k,v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits,dim=-1)
            accuracy.add_batch(predictions=pred.long(), references=batch['labels'].long())
    return accuracy.compute()

def train(epoch=3,log_step=100):
    """Fine-tune the global `model` and report validation accuracy per epoch.

    Args:
        epoch: Number of passes over `trainloader`.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value (the counter keeps running across epochs).

    After each epoch `evaluate()` is called and its result is printed.
    """
    use_cuda = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            # Standard optimisation step: reset grads, forward, backward, update.
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if not global_step % log_step:
                print(f"ep:{ep}, global_step:{global_step}, loss:{output.loss.item()}")
            global_step += 1
        acc = evaluate()
        print(f"e:{ep}, {acc}")


## Step8 模型训练

In [37]:
# Number of examples in the tokenized test split.
tokenizer_data_test.num_rows


777

In [38]:
# Run training with the defaults (3 epochs, loss logged every 100 steps).
train()

ep:0, global_step:0, loss:0.02355360798537731
ep:0, global_step:100, loss:0.044752880930900574
ep:0, global_step:200, loss:0.003578264033421874
e:0, {'accuracy': 0.8738738738738738}
ep:1, global_step:300, loss:0.005413041915744543
ep:1, global_step:400, loss:0.0007925404352135956
e:1, {'accuracy': 0.8751608751608752}
ep:2, global_step:500, loss:0.0641748458147049
ep:2, global_step:600, loss:0.021910175681114197
e:2, {'accuracy': 0.8764478764478765}


## Step9 模型预测

In [39]:
# Classify one hand-written review with the fine-tuned model.
sen = '我觉得这家酒店中等偏差，其中隔音效果不好！'
# Class id -> human-readable label (0 = negative review, 1 = positive review).
id2_lable = {0:'差评！',1:'好评！'}
model.eval()
with torch.inference_mode():
    # Named `inputs` rather than `input` so the builtin is not shadowed.
    inputs = tokenizer(sen,return_tensors='pt')
    # Move to the GPU only when one exists, matching how the model was placed;
    # an unconditional `.cuda()` crashes on CPU-only machines.
    if torch.cuda.is_available():
        inputs = {k:v.cuda() for k,v in inputs.items()}
    output = model(**inputs)
    pred = torch.argmax(output.logits,dim=-1)
    print(f"输入：{sen}，模型预测结果：{id2_lable.get(pred.item())}")

输入：我觉得这家酒店中等偏差，其中隔音效果不好！，模型预测结果：差评！


In [40]:
from transformers import pipeline
# Attach the readable label names to the model config so the pipeline reports
# them instead of generic label ids.
model.config.id2label = id2_lable
# Use GPU 0 when CUDA is available and fall back to the CPU (-1) otherwise;
# a hard-coded `device=0` fails on machines without a GPU.
pipe = pipeline("text-classification",model=model,tokenizer=tokenizer,device=0 if torch.cuda.is_available() else -1)

In [41]:
# Classify the same sentence through the pipeline; it returns label and score.
pipe(sen) 

[{'label': '差评！', 'score': 0.9921389222145081}]