<a href="https://colab.research.google.com/github/DevinTang-cs/bert_test_new/blob/main/test_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi
!pip install transformers datasets accelerate torch

In [3]:
import time

In [None]:
# Build a ready-made sentiment-analysis pipeline from a pretrained Chinese
# RoBERTa checkpoint and try it on a short phrase ("so annoying").
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="uer/roberta-base-finetuned-jd-binary-chinese")
print(classifier("烦死了"))

In [5]:
# Time a single pipeline call on a long review text.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock, which can be adjusted
# while the measurement is running.
time1 = time.perf_counter()
print(classifier("该政府机关文档管理信息系统以解决政务文档管理痛点、助力电子政务建设为核心目标，研发逻辑清晰、实践价值突出，整体设计与实现符合政府信息化管理需求，具体评推动电子政务发展的有效工具，仅需在后续迭代中进一步强化数据加密、多终端适配等细节，可更好满足政府日益精细化的信息化管理需求。"))
time2 = time.perf_counter()
# Elapsed seconds for the call above (including the print).
print(time2-time1)

[{'label': 'positive (stars 4 and 5)', 'score': 0.5878700613975525}]
0.17975926399230957


In [6]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# 1. Load the Chinese BERT checkpoint: its tokenizer plus a sequence
#    classification model configured with num_labels=15 output classes.
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=15)

In [None]:
# 2. Load the example data (news classification, not sentiment analysis)
dataset = load_dataset("clue", "tnews")  # official CLUE news classification data
label_names = dataset["train"].features["label"].names
# Build an id -> label-name mapping (optional, handy for inspection)
label_map = {i: name for i, name in enumerate(label_names)}
print("标签映射关系：", label_map)

def preprocess(examples):
    """Tokenize the ``sentence`` column of a batch of dataset examples.

    Uses the notebook-level ``tokenizer``. Sequences are truncated and padded
    to ``max_length=128`` so every encoded example has the same length.

    Args:
        examples: Mapping that exposes the raw texts under the ``'sentence'`` key.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    sentences = examples['sentence']
    return tokenizer(
        sentences,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Tokenize every split; batched=True hands preprocess() a batch of examples per call.
encoded = dataset.map(preprocess, batched=True)

# Sanity check: the largest label id determines how many output classes are needed.
train_labels = encoded["train"]["label"]
print("训练集标签最大值：", max(train_labels))  # if this prints 14, num_labels must be 15

In [None]:
# 3. Training configuration
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate once per epoch (this argument was previously named `evaluation_strategy`)
    per_device_train_batch_size=8,
    num_train_epochs=3,
    # Mixed precision needs a CUDA device. Enabling it unconditionally breaks
    # CPU-only runtimes, so it is switched on only when a GPU is available.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"].select(range(1000)),  # small subset for a quick test run
    eval_dataset=encoded["validation"].select(range(200)),
)

# 4. Start training
trainer.train()

In [12]:
# Checkpoint directory that the fine-tuned model is loaded from below.
model_path = "./results/checkpoint-375"
# Load the original tokenizer (bert-base-chinese)
original_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Save the tokenizer files into the checkpoint directory (model_path) to fill
# in the files missing there, so the tokenizer can be loaded from the same path.
original_tokenizer.save_pretrained(model_path)

('./results/checkpoint-375/tokenizer_config.json',
 './results/checkpoint-375/special_tokens_map.json',
 './results/checkpoint-375/vocab.txt',
 './results/checkpoint-375/added_tokens.json')

In [13]:
# Load the fine-tuned model and its tokenizer from the checkpoint directory.
# model_path is defined in the previous cell; note that this rebinds the
# notebook-level `tokenizer` and `model` names.
tokenizer = BertTokenizer.from_pretrained(model_path)  # tokenizer saved alongside the checkpoint
model = BertForSequenceClassification.from_pretrained(model_path)  # fine-tuned weights

In [14]:
def predict_preprocess(text):
    """Encode ``text`` for inference with the notebook-level ``tokenizer``.

    The same truncation, padding and ``max_length=128`` settings as the
    training-time preprocessing are used, and the result is returned as
    PyTorch tensors (``return_tensors="pt"``).

    Args:
        text: Raw text to encode.

    Returns:
        The tokenizer output for ``text``.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=128,  # kept equal to the max_length used for training
        return_tensors="pt",
    )
    return tokenizer(text, **tokenizer_options)

In [15]:
import torch

In [17]:
def predict(text):
    """Predict the class id of a single text with the notebook-level ``model``.

    Args:
        text: One raw text. The result is reduced with ``.item()``, so exactly
            one prediction is expected.

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    # Tokenize the text into model inputs.
    inputs = predict_preprocess(text)
    # The encoded tensors do not necessarily live on the same device as the
    # model (e.g. when the model sits on a GPU after training), so move them
    # there to avoid a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Make sure dropout is disabled so repeated calls give the same answer.
    model.eval()
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)  # outputs.logits holds the unnormalized class scores
    # Pick the highest-scoring class and convert the 1-element tensor to an int.
    predictions = torch.argmax(outputs.logits, dim=1).item()
    return predictions

# Example: predict the category of one piece of text (with num_labels=15 the
# class ids are expected to be 0-14, each standing for a news category).
test_text = "哈哈哈哈哈哈哈哈哈哈"
predicted_label = predict(test_text)

print(f"预测类别：{predicted_label}")  # prints e.g. 5 (supposing 5 stands for the finance category)

预测类别：2
