In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW 
from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Seed PyTorch's global RNG before any random work happens. The classification
# head is newly (randomly) initialised when the model is loaded, and the
# shuffled DataLoader draws from the same global RNG, so seeding here makes a
# fresh Restart & Run All repeatable (GPU kernels may still add small
# nondeterminism).
SEED = 42
torch.manual_seed(SEED)

# Load the tokenizer and the model from ONE checkpoint identifier so they can
# never come from different sources. The original mixed "./bert-base-chinese"
# (local directory only) with 'bert-base-chinese'.
# NOTE(review): per the Hugging Face `from_pretrained` docs the identifier may
# be a local directory or a Hub model id; a bare name is expected to use a
# same-named local directory when one exists and the Hub otherwise -- confirm
# for the installed transformers version.
PRETRAINED_NAME = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=2)

# Toy fine-tuning set (simplified; real fine-tuning needs far more data).
# Each entry pairs a review with its label: 1 = negative, 0 = positive.
_labeled_reviews = [
    ("看完只觉得浪费了两个小时，再也不想看第二遍。", 1),
    ("包装环保、整洁美观，整体体验非常好。", 0),
    ("这部电影太糟糕了，剧情毫无逻辑。", 1),
    ("食物新鲜，配送速度快，非常满意。", 0),
    ("演员演技差，导演水平也不行。", 1),
    ("包装精美，服务态度好，下次还会光顾。", 0),
]

# Split the pairs back into the two parallel lists the rest of the notebook uses.
train_texts = [review for review, _sentiment in _labeled_reviews]
train_labels = [sentiment for _review, sentiment in _labeled_reviews]

# Dataset wrapper that tokenizes each review on demand
class ReviewDataset(Dataset):
    """Map-style dataset pairing review texts with integer labels.

    Args:
        texts: Indexable collection of reviews; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict of tensors.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        review = str(self.texts[item])
        target = self.labels[item]

        # Pad/truncate to max_len and ask for PyTorch tensors; token type ids
        # are not requested because only a single segment is encoded.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(review, **tokenizer_options)

        # Drop the leading batch axis so the DataLoader can stack samples itself.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Hyperparameters and the training DataLoader
MAX_LEN = 64     # forwarded to the tokenizer as max_length (pad / truncate target)
BATCH_SIZE = 2   # samples per optimisation step
EPOCHS = 3       # full passes over the training set

train_dataset = ReviewDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
# Reshuffle the samples every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Optimizer and learning-rate scheduler.
# The schedule is stepped once per batch, so its horizon is
# (batches per epoch) x (number of epochs); no warmup steps are used.
total_steps = EPOCHS * len(train_loader)
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Training device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Training loop: one optimizer step and one scheduler step per batch.
for epoch in range(EPOCHS):
    model.train()  # train mode at the start of every epoch
    for batch in train_loader:
        # Move each tensor of the batch to the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def predict_sentiment(text):
    """Classify a single piece of text as positive or negative.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``MAX_LEN``.

    Args:
        text: The sentence to classify.

    Returns:
        "正面" (positive) when the predicted class index is 0, otherwise
        "负面" (negative) -- matching the training labels (0 = positive,
        1 = negative).
    """
    # The training loop leaves the model in train mode; switch to eval mode so
    # dropout is disabled and repeated calls give the same prediction.
    model.eval()

    # Encode with the same settings the training dataset uses.
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Move the inputs to the device the model lives on.
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Forward pass only; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Index of the highest logit for the single input, as a plain Python int
    # rather than a one-element tensor.
    _, preds = torch.max(outputs.logits, dim=1)
    predicted_class = preds.item()

    return "正面" if predicted_class == 0 else "负面"

In [8]:
# Sentences to classify.
# NOTE(review): both sentences also appear verbatim in `train_texts`, so this
# only shows the model fits its training data, not that it generalises.
sentences = [
    "看完只觉得浪费了两个小时，再也不想看第二遍。",
    "包装环保、整洁美观，整体体验非常好。"
]

# Classify each sentence and print the text followed by its predicted sentiment.
for sentence in sentences:
    sentiment = predict_sentiment(sentence)
    print(f"句子: '{sentence}'", f"情感倾向: {sentiment}\n", sep="\n")

句子: '看完只觉得浪费了两个小时，再也不想看第二遍。'
情感倾向: 负面

句子: '包装环保、整洁美观，整体体验非常好。'
情感倾向: 正面

