In [None]:
import pandas as pd
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one CSV row per requested sample.

    The CSV is loaded once with ``pd.read_csv``; rows are expected to expose
    a ``text`` column and a ``label`` column. Tokenization happens lazily in
    ``__getitem__``.

    Args:
        csv_file: Path or file-like object forwarded to ``pd.read_csv``.
        tokenizer: Callable invoked with the text plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keywords;
            its result must be indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        label2id: Mapping from the raw ``label`` cell value to a class id.
        max_length: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, csv_file, tokenizer, label2id, max_length=512):
        self.max_length = max_length
        self.label2id = label2id
        self.tokenizer = tokenizer
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return it as a dict.

        Returns:
            A dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs after ``squeeze(0)``) and ``'labels'`` (the
            class id looked up in ``label2id``).

        Raises:
            KeyError: If the row's label is not a key of ``label2id``.
        """
        row = self.data.iloc[idx]
        # str() so that non-string cells (e.g. numbers) can still be tokenized.
        sentence = str(row['text'])
        class_id = self.label2id[row['label']]

        tokenized = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # squeeze(0) removes the leading size-1 dimension added by
        # return_tensors='pt' for a single input.
        sample = {
            field: tokenized[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = class_id
        return sample


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class MiniRBTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
        num_labels: Output size of the linear head.
        dropout_prob: Dropout probability applied to the first-token
            representation before the head (defaults to the previous
            hard-coded value of 0.3).
    """

    def __init__(self, model_name="hfl/minirbt-h256", num_labels=3, dropout_prob=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, output_attentions=False):
        """Compute classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            output_attentions: When True, ask the encoder for its attention
                maps and return them alongside the logits. Defaults to False
                so that the maps are not computed and thrown away on every
                training / evaluation / export pass.

        Returns:
            ``logits`` when ``output_attentions`` is False (the default),
            otherwise the tuple ``(logits, attentions)`` where ``attentions``
            is the encoder output's ``attentions`` field.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        # Position 0 of the sequence axis is used as the sentence
        # representation (the [CLS] token, per the original author's note).
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(cls_output))
        if output_attentions:
            return logits, outputs.attentions
        return logits

In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

# Label text -> integer class id
label2id = {
    '积极': 0,  # positive
    '中性': 1,  # neutral
    '消极': 2,  # negative
}

# Tokenizer loaded from the same checkpoint name the classifier defaults to
tokenizer = BertTokenizer.from_pretrained('hfl/minirbt-h256')

# Datasets for the two splits; both share the tokenizer and the label map
train_dataset = SentimentDataset(
    csv_file='data/base_train.csv',
    tokenizer=tokenizer,
    label2id=label2id,
)
val_dataset = SentimentDataset(
    csv_file='data/base_val.csv',
    tokenizer=tokenizer,
    label2id=label2id,
)

# Batched loaders; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)


In [None]:
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score
from transformers import logging

# Silence transformers' informational / warning log output.
logging.set_verbosity_error()

# Seed torch's RNGs before anything random happens in this cell so that the
# classifier-head initialisation, the DataLoader shuffle order and dropout
# masks are repeatable across "Restart & Run All".
# NOTE: this does not by itself guarantee bit-identical results on GPU.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MiniRBTClassifier('hfl/minirbt-h256').to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
epochs = 4

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Standard step: clear grads -> forward -> loss -> backward -> update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses over this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Average training loss: {avg_loss:.4f}')

In [None]:
from sklearn.metrics import classification_report

# Evaluate the current `model` on the validation loader and report
# macro-F1 plus a per-class breakdown.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    # The bar is labelled 'Validation' rather than reusing the training
    # loop's `epoch` variable, so this cell does not depend on leftover loop
    # state (and does not raise NameError when run without training first).
    for batch in tqdm(val_loader, desc='Validation'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
f1 = f1_score(all_labels, all_preds, average='macro')
print(f'Validation F1 Score: {f1:.4f}')

# Pass the class ids explicitly and order the names by id, so the report
# (a) pairs every name with the right id regardless of dict ordering and
# (b) does not raise when a class is missing from the validation data.
classes_by_id = sorted(label2id.items(), key=lambda item: item[1])
print(classification_report(
    all_labels,
    all_preds,
    labels=[class_id for _, class_id in classes_by_id],
    target_names=[name for name, _ in classes_by_id],
))

In [None]:
# Persist only the learned weights (the state_dict), not the whole module
# object: reloading requires instantiating MiniRBTClassifier first and then
# calling load_state_dict.
import os

# torch.save does not create missing parent directories.
os.makedirs('model', exist_ok=True)
torch.save(model.state_dict(), 'model/minirbt.pth')

In [None]:
from transformers import BertTokenizer
import torch

# Build a fresh classifier (on CPU) and load the trained weights into it.
model = MiniRBTClassifier("hfl/minirbt-h256")
# map_location="cpu" lets a checkpoint that was saved from a CUDA model be
# loaded on a machine without a GPU; weights_only=True restricts unpickling
# to tensors / plain containers.
model.load_state_dict(
    torch.load("model/minirbt.pth", map_location="cpu", weights_only=True)
)
# Inference mode (disables dropout) before tracing the graph.
model.eval()

tokenizer = BertTokenizer.from_pretrained("hfl/minirbt-h256")
# Example input used only to trace the model during export.
inputs = tokenizer("这个产品真不错", return_tensors="pt", max_length=512, padding="max_length", truncation=True)

torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["attention_mask"]),
    "model/minirbt.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    # Batch size and sequence length are declared dynamic for the inputs;
    # only the batch dimension is dynamic for the logits.
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "seq_len"},
        "attention_mask": {0: "batch_size", 1: "seq_len"},
        "logits": {0: "batch_size"},
    },
    opset_version=17,
)

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType


# Run onnxruntime's dynamic quantization on the exported ONNX file and write
# the result to a second file in the same directory.
exported_onnx_path = "model/minirbt.onnx"
quantized_onnx_path = "model/minirbt_quant.onnx"

quantize_dynamic(
    model_input=exported_onnx_path,
    model_output=quantized_onnx_path,
    # QuantType.QUInt8 is the alternative weight type.
    weight_type=QuantType.QInt8,
)