In [37]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# Local checkpoint directory shared by the tokenizer and the classifier.
MODEL_DIR = "../models/rbt3/"

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/rbt3/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Review/label dataset backed by a CSV file.

    The CSV is read with pandas and every row containing a missing value is
    dropped. Each item is a ``(review, label)`` pair taken from the
    ``review`` and ``label`` columns.

    Args:
        csv_path: Path of the CSV file to load. Defaults to the path the
            notebook originally hard-coded, so ``MyDataset()`` keeps working.
    """

    def __init__(self, csv_path="../Datasets/ChnSentiCorp_htl_all.csv") -> None:
        super().__init__()
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair at positional ``index``."""
        # Positional lookup (iloc): dropna() leaves gaps in the index labels.
        # Fetch the row once instead of once per column.
        row = self.data.iloc[index]
        return row["review"], row["label"]

    def __len__(self):
        """Return the number of rows left after dropping incomplete ones."""
        return len(self.data)




In [50]:
import torch
def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into one tokenized batch.

    The texts are tokenized with the global ``tokenizer``, truncated and
    padded to exactly 128 tokens, and returned as PyTorch tensors. The labels
    are added to the result under the ``"labels"`` key.
    """
    texts = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    encoded["labels"] = torch.tensor(labels)
    return encoded

from torch.utils.data import random_split,DataLoader

dataset = MyDataset()
# To inspect a few raw samples: [dataset[i] for i in range(5)]

# Seed the split so the train/validation partition is the same on every run;
# with an unseeded split, accuracy figures from different runs are measured
# on different validation sets and cannot be compared.
split_generator = torch.Generator().manual_seed(42)
trainset, validset = random_split(dataset, lengths=[0.9, 0.1], generator=split_generator)
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_fn)
# Accuracy does not depend on sample order, so validation is not shuffled.
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# To read a single batch from the loader: next(iter(trainloader))

In [None]:
# Show one collated batch as a sanity check. Printing every batch would
# iterate over the whole training set and flood the cell output.
print(next(iter(trainloader)))

In [53]:
from torch.optim import Adam

# Move the model to the GPU before building the optimizer, as the torch.optim
# documentation recommends, so the optimizer is constructed over the
# parameters that will actually be trained.
if torch.cuda.is_available():
    model = model.cuda()

optimizer = Adam(model.parameters(), lr = 2e-5)

def evaluate():
    """Measure the accuracy of the global ``model`` on ``validloader``.

    The model is switched to eval mode and run under ``torch.inference_mode``.
    Returns the count of correct argmax predictions divided by
    ``len(validset)``.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    correct = 0
    with torch.inference_mode():
        for batch in validloader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            scores = model(**batch).logits
            predictions = scores.argmax(dim=-1)
            hits = predictions.long() == batch["labels"].long()
            correct += hits.float().sum()
    return correct / len(validset)


def train(epoch=3, log_step=100):
    """Fine-tune the global ``model`` on ``trainloader``.

    Args:
        epoch: Number of passes over the training loader.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value.

    After each epoch the validation accuracy from ``evaluate()`` is printed.
    """
    use_gpu = torch.cuda.is_available()
    step_count = 0  # counts optimizer steps across all epochs
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so re-enable training mode.
        model.train()
        for batch in trainloader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if step_count % log_step == 0:
                print(f"ep: {ep}, global_step: {step_count}, loss: {loss.item()}")
            step_count += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")


In [54]:
# Run fine-tuning with the defaults (3 epochs, loss logged every 100 steps).
train()

ep: 0, global_step: 0, loss: 0.20925581455230713
ep: 0, global_step: 100, loss: 0.42007529735565186
ep: 0, global_step: 200, loss: 0.2036401778459549
ep: 0, acc: 0.89304119348526
ep: 1, global_step: 300, loss: 0.12355365604162216
ep: 1, global_step: 400, loss: 0.10119751840829849
ep: 1, acc: 0.89304119348526
ep: 2, global_step: 500, loss: 0.08520162105560303
ep: 2, global_step: 600, loss: 0.16565218567848206
ep: 2, acc: 0.8878865838050842


In [64]:
# Classify a single sentence with the fine-tuned model.
sen = "我觉得这个酒店装修挺好的，但是有点脏"
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to the GPU only when one exists, matching the guard
    # used in train() and evaluate(); an unconditional .cuda() fails on
    # CPU-only machines.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    output = model(**inputs)
    print(output)
    # The model returns an output object; argmax must be taken over its
    # logits tensor, not over the object itself.
    pred = torch.argmax(output.logits, dim=-1)
    print(f"输入: {sen}\n 模型输出结果:{pred.item()}")

SequenceClassifierOutput(loss=None, logits=tensor([[-1.4047,  0.8411]], device='cuda:0'), hidden_states=None, attentions=None)
