In [None]:
import polars as pl

# Export the raw data: read every comment and its status from the wl_Comment
# table of waline.sqlite and dump the result as newline-delimited JSON.
raw_comments = pl.read_database_uri(
    query="SELECT comment,status FROM wl_Comment",
    uri="sqlite://waline.sqlite",
    engine="adbc",
)

# Binary target column: `status == "waiting"` stored as an unsigned 8-bit flag.
is_waiting = pl.col("status").eq("waiting").cast(pl.UInt8).alias("label")

raw_comments.select(pl.col("comment"), is_waiting).write_ndjson("dataset.ndjson")

In [None]:
import polars as pl

# Reload the exported dataset. Anything shaped like a markup tag (`<...>`) is
# removed from the comment text and the label column is kept as UInt8.
comment_without_tags = pl.col("comment").str.replace_all(r"<[^>]*>", "")
label_as_uint8 = pl.col("label").cast(pl.UInt8)

df = pl.read_ndjson("dataset.ndjson").select(comment_without_tags, label_as_uint8)
# df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 3. Train / test split (80 % / 20 %, fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    df["comment"], df["label"], test_size=0.2, random_state=42
)

# 4. TF-IDF vectorisation.
# The comments appear to be predominantly Chinese (see the sample comments
# below), and Chinese text has no whitespace between words. The default word
# analyzer would therefore treat each unbroken run of characters as ONE token,
# giving features that almost never repeat between comments. Character
# uni-/bi-grams avoid that without needing an external word segmenter.
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 5. Train the classifier: a support-vector machine whose class weights are
# balanced (class_weight="balanced") to compensate for uneven label counts.
model = SVC(class_weight="balanced")
model.fit(X_train_tfidf, y_train)

# 6. Evaluate on the held-out split.
y_pred = model.predict(X_test_tfidf)
print("准确率:", accuracy_score(y_test, y_pred))
print("分类报告:\n", classification_report(y_test, y_pred))

# 7. Predict labels for previously unseen comments.
new_comments = [
    "打电话是我想你聊个天把误会说清楚毕竟我是真心对你好",
    "我觉得这个服务有点问题。",
]
new_comments_tfidf = vectorizer.transform(new_comments)
predictions = model.predict(new_comments_tfidf)
# Labels come from the export step: 1 = status was "waiting", 0 = any other status.
print("预测结果:", predictions)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

# Tokenizer matching the pretrained "bert-base-chinese" checkpoint; it is handed
# to the train/validation datasets created further down in this cell.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


# Dataset definition
class CommentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        comments: Indexable, sized sequence of comment strings.
        labels: Indexable sequence of integer class ids aligned with ``comments``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors carrying a leading batch dimension of size 1.
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, comments, labels, tokenizer, max_len=128):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` tensors for item ``idx``."""
        encoding = self.tokenizer(
            self.comments[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # Drop the size-1 batch dimension added by return_tensors="pt" so the
            # DataLoader can stack items into (batch, max_len).
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # Class-index targets for nn.CrossEntropyLoss are documented as int64;
            # uint8 targets are not accepted by every PyTorch version / backend.
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }


# Split into training and validation sets (80 % / 20 %, fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["comment"].to_list(), df["label"].to_list(), test_size=0.2, random_state=42
)

train_dataset = CommentDataset(train_texts, train_labels, tokenizer)
val_dataset = CommentDataset(val_texts, val_labels, tokenizer)

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load the pretrained checkpoint with a 2-label classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

# Prefer CUDA, then Apple MPS, then CPU. torch.backends.mps.is_available() is
# the documented availability check; torch.mps.is_available() may be missing
# on older PyTorch releases.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
model.to(device)

# Training setup
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()


def train_epoch(model, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        train_loader: Iterable of batches; each batch is a dict with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        tuple: ``(mean loss per batch, fraction of correctly classified samples)``.
        Both values are ``0.0`` when the loader yields no batches.
    """
    model.train()
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss, correct = 0.0, 0
    n_batches, n_samples = 0, 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # CrossEntropyLoss expects int64 class indices.
        labels = batch["label"].to(device=device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
        n_batches += 1
        n_samples += labels.size(0)

    if n_batches == 0:
        return 0.0, 0.0
    # Normalise by what this loader actually produced, not by a global dataset.
    return total_loss / n_batches, correct / n_samples


def evaluate(model, val_loader, criterion):
    """Compute loss and accuracy of ``model`` over ``val_loader`` without gradients.

    NOTE: a later cell of this notebook defines another function that is also
    named ``evaluate`` (ONNX based, different signature); once that cell has
    run, this definition is shadowed.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        val_loader: Iterable of batches; each batch is a dict with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        tuple: ``(mean loss per batch, fraction of correctly classified samples)``.
        Both values are ``0.0`` when the loader yields no batches.
    """
    model.eval()
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss, correct = 0.0, 0
    n_batches, n_samples = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # CrossEntropyLoss expects int64 class indices.
            labels = batch["label"].to(device=device, dtype=torch.long)

            outputs = model(input_ids, attention_mask=attention_mask)
            total_loss += criterion(outputs.logits, labels).item()
            correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            n_batches += 1
            n_samples += labels.size(0)

    if n_batches == 0:
        return 0.0, 0.0
    # Normalise by what this loader actually produced, not by a global dataset.
    return total_loss / n_batches, correct / n_samples


# Fine-tuning loop: each epoch does one training pass followed by one
# validation pass and prints the four resulting metrics.
epochs = 2
for epoch in range(epochs):
    train_loss, train_acc = train_epoch(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    val_loss, val_acc = evaluate(
        model=model,
        val_loader=val_loader,
        criterion=criterion,
    )
    print(
        f"Epoch {epoch+1}/{epochs}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}"
    )

# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can later be reloaded with from_pretrained().
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_bert")

In [None]:
import torch

from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned classifier saved under ./fine_tuned_bert.
# (For an un-tuned baseline, load "bert-base-chinese" with num_labels=2 instead.)
model = BertForSequenceClassification.from_pretrained("./fine_tuned_bert")

# Prefer CUDA, then Apple MPS, then CPU. torch.backends.mps.is_available() is
# the documented availability check; torch.mps.is_available() may be missing
# on older PyTorch releases.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# The tokenizer files were saved into the same directory as the model.
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_bert")
model.to(device)


def predict_single_comment(comment, model, tokenizer, max_len=128):
    """Classify a single comment and return the predicted class index.

    Args:
        comment: Text to classify.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length the encoded text is padded / truncated to.

    Returns:
        int: Index of the largest logit. What each index means is defined by
        the training labels; in this notebook 1 = comment status was
        "waiting" and 0 = any other status.
    """
    model.eval()
    # Send inputs to the device the model is actually on rather than to a
    # notebook-level `device` variable that may be undefined or out of sync.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    encoding = tokenizer(
        comment,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    return prediction


# Smoke test: classify one hard-coded comment and print the predicted label.
comment = r"12321"
predicted_label = predict_single_comment(
    comment=comment,
    model=model,
    tokenizer=tokenizer,
)
print(f"Predicted Label: {predicted_label}")

In [None]:
import onnxruntime as ort
import numpy as np


def evaluate(model_file: str, val_texts: list[str], val_labels: list[int]):
    """Measure the accuracy of an exported ONNX classifier on labelled texts.

    NOTE: this reuses the name ``evaluate`` of the PyTorch validation helper
    defined earlier in the notebook and shadows it once this cell has run.
    It also relies on the notebook-level ``tokenizer``.

    Args:
        model_file: Path to the ``.onnx`` model. The model is run with
            ``input_ids`` / ``attention_mask`` inputs and a ``logits`` output.
        val_texts: Texts to classify, one inference call per text.
        val_labels: Integer class ids aligned with ``val_texts``.

    Returns:
        tuple: ``(accuracy, wrong)`` where ``accuracy`` is the fraction of
        ``val_texts`` classified correctly (``0.0`` for empty input) and
        ``wrong`` lists the misclassified texts in input order.
    """
    session = ort.InferenceSession(
        model_file,
        providers=["CPUExecutionProvider"],  # CPU deployment
        sess_options=ort.SessionOptions(),
    )
    # Intended as a memory optimisation (smaller arena growth).
    # NOTE(review): confirm the CPU execution provider honours this option.
    session.set_providers(
        ["CPUExecutionProvider"],
        provider_options=[{"arena_extend_strategy": "kSameAsRequested"}],
    )
    correct = 0
    wrong = []

    for i, text in enumerate(val_texts):
        encoding = tokenizer(
            text,
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="np",
        )
        outputs = session.run(
            ["logits"],
            {
                "input_ids": encoding["input_ids"],
                "attention_mask": encoding["attention_mask"],
            },
        )
        # outputs[0] holds the logits for the single text in this batch.
        predicted = int(np.argmax(outputs[0], axis=1)[0])
        if predicted == val_labels[i]:
            correct += 1
        else:
            wrong.append(text)

    # Divide by the number of texts actually scored, not by a global dataset.
    total = len(val_texts)
    accuracy = correct / total if total else 0.0
    return accuracy, wrong


# Score the "bert_model.onnx" and "bert_model_quantized.onnx" exports on the
# same validation data; each result is an (accuracy, misclassified_texts) pair.
original = evaluate("bert_model.onnx", val_texts, val_labels)
quantized = evaluate("bert_model_quantized.onnx", val_texts, val_labels)
original, quantized

In [None]:
import torch

from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned classifier saved under ./fine_tuned_bert.
# (For an un-tuned baseline, load "bert-base-chinese" with num_labels=2 instead.)
model = BertForSequenceClassification.from_pretrained("./fine_tuned_bert")

# Prefer CUDA, then Apple MPS, then CPU. torch.backends.mps.is_available() is
# the documented availability check; torch.mps.is_available() may be missing
# on older PyTorch releases.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# The tokenizer files were saved into the same directory as the model.
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_bert")
model.to(device)


def predict_single_comment(comment, model, tokenizer, max_len=128):
    """Classify a single comment and return the predicted class index.

    Args:
        comment: Text to classify.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length the encoded text is padded / truncated to.

    Returns:
        int: Index of the largest logit. What each index means is defined by
        the training labels; in this notebook 1 = comment status was
        "waiting" and 0 = any other status.
    """
    model.eval()
    # Send inputs to the device the model is actually on rather than to a
    # notebook-level `device` variable that may be undefined or out of sync.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    encoding = tokenizer(
        comment,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    return prediction


# Smoke test: classify one hard-coded comment and print the predicted label.
comment = r"12321"
predicted_label = predict_single_comment(
    comment=comment,
    model=model,
    tokenizer=tokenizer,
)
print(f"Predicted Label: {predicted_label}")