本次训练未专门加入混淆数据（例如“筋膜枪”这类易混淆词），先尝试经典方法并测试其效果。


处理初始数据集：初始安全数据集采用 THUC 的词库数据集。由于时间限制，这里选择了这种简单的实现方式。其他思路还有：选取现有的词向量数据集，
例如该链接中提及的 https://github.com/Embedding/Chinese-Word-Vectors/blob/master/README_zh.md ，
只需要取它们的词表即可。
汇总处理代码如下：

```
import os

# Name of the aggregated output file (created in the current directory).
output_file = "汇总结果.txt"


def extract_first_field(line):
    """Return the first tab-separated field of ``line`` without its line ending.

    The line terminator is removed before splitting so that a line containing
    no tab does not carry its own newline into the result (which previously
    produced an extra blank line in the output file).

    Args:
        line: One raw line as read from a word-list file.

    Returns:
        The text before the first tab, or the whole line (minus its line
        ending) when there is no tab. Empty for a blank line.
    """
    return line.rstrip("\r\n").split("\t")[0]


def main():
    """Write the first field of every line of each ``.txt`` file in the cwd to ``output_file``."""
    with open(output_file, "w", encoding="utf-8") as outfile:
        # Sorted so the output order does not depend on the file system.
        for file_name in sorted(os.listdir(".")):
            # Only .txt inputs are read; the output file itself lives in the
            # same directory and must not be fed back in.
            if not file_name.endswith(".txt") or file_name == output_file:
                continue
            with open(file_name, "r", encoding="utf-8") as infile:
                for line in infile:
                    first_field = extract_first_field(line)
                    # Blank lines carry no word; do not copy them through.
                    if first_field:
                        outfile.write(first_field + "\n")

    print(f"汇总完成，结果已保存到 {output_file}")


if __name__ == "__main__":
    main()

```



敏感词数据集为 腾讯的严格敏感词数据集 https://github.com/cjh0613/tencent-sensitive-words

此外，还对安全词库和危险词库中的重叠部分做了处理：将两个词库重叠的词语从安全词库中去除。

```
import os

# Input folders and the output path.
safe_folder = "safe"
unsafe_folder = "unsafe"
output_file = "filtered_safe.txt"


def load_words(folder):
    """Collect the distinct words found in the ``.txt`` files of ``folder``.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored so that no empty "word" ends up in the set.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A set with one entry per distinct non-empty stripped line.
    """
    words = set()
    for file_name in os.listdir(folder):
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(folder, file_name), "r", encoding="utf-8") as infile:
            words.update(word for word in map(str.strip, infile) if word)
    return words


def main():
    """Write every safe word that is not also an unsafe word to ``output_file``."""
    safe_words = load_words(safe_folder)
    unsafe_words = load_words(unsafe_folder)

    # A word present in both lists is treated as unsafe, so drop it from safe.
    filtered_safe_words = safe_words - unsafe_words

    with open(output_file, "w", encoding="utf-8") as outfile:
        # sorted() orders by code point, which keeps the output stable.
        outfile.writelines(word + "\n" for word in sorted(filtered_safe_words))

    print(f"处理完成，过滤后的词语已保存到 {output_file}")


if __name__ == "__main__":
    main()
```


In [8]:
# 导入必要的库
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Dataset root, with one sub-directory of word lists per class.
data_dir = "./data"
safe_dir, unsafe_dir = (
    os.path.join(data_dir, class_name) for class_name in ("safe", "unsafe")
)

def load_texts_from_directory(directory, label):
    """Load every non-blank line of each ``.txt`` file in ``directory`` as a labelled example.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, and the order of the examples determines what the seeded
    train/test split produces, so sorting keeps that split reproducible.

    Args:
        directory: Path of the folder to scan; only regular files whose name
            ends with ``.txt`` are read.
        label: Value stored under ``"label"`` for every example from this folder.

    Returns:
        A list of ``{"text": <stripped line>, "label": label}`` dicts, one per
        non-blank line.
    """
    texts = []
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not (file_name.endswith(".txt") and os.path.isfile(file_path)):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            # One example per line; whitespace-only lines are dropped.
            stripped_lines = (line.strip() for line in file.read().splitlines())
            texts.extend(
                {"text": line, "label": label} for line in stripped_lines if line
            )
    return texts

# Read both classes. Label convention: 0 = safe, 1 = unsafe.
safe_texts = load_texts_from_directory(safe_dir, label=0)
unsafe_texts = load_texts_from_directory(unsafe_dir, label=1)

# Merge the two classes into a single in-memory dataset.
all_texts = [*safe_texts, *unsafe_texts]
dataset = Dataset.from_list(all_texts)

# Hold out 20% of the examples for evaluation; the seed fixes the shuffle.
dataset = dataset.train_test_split(seed=42, test_size=0.2)
dataset = DatasetDict(
    {split_name: dataset[split_name] for split_name in ("train", "test")}
)


In [10]:
dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 159090
})

设置 Hugging Face 环境变量（HF_ENDPOINT 镜像地址），因为服务器无法直接访问 Hugging Face。

In [None]:
import os

# Optional: route Hugging Face Hub downloads through a mirror when the server
# cannot reach huggingface.co directly. Uncomment the assignment to enable it.
# NOTE(review): huggingface_hub appears to read HF_ENDPOINT when it is first
# imported, so this probably has to run before `transformers` is imported —
# confirm.
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com/"

# Report the endpoint actually in effect. The variable is HF_ENDPOINT (not
# HF_HOME), and it may be unset when the line above stays commented out.
hf_endpoint = os.getenv("HF_ENDPOINT")
if hf_endpoint is None:
    print("The HF_ENDPOINT environment variable is not set; the default endpoint will be used.")
else:
    print(f"The HF_ENDPOINT environment variable is set to: {hf_endpoint}")


分词时，由于只是词汇级别的分类，所以将 max_length 设置为 64，以提高训练速度。

In [15]:
# Tokenizer of the pretrained Chinese RoBERTa checkpoint used for fine-tuning.
model_name = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the ``"text"`` field, padding/truncating every entry to 64 tokens."""
    return tokenizer(
        example["text"],
        max_length=64,
        truncation=True,
        padding="max_length",
    )

# Tokenize both splits in batches, drop the raw text column, and expose the
# target under the name "labels".
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Have the datasets return PyTorch tensors.
tokenized_datasets.set_format("torch")

# Pretrained encoder with a 2-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Map:   0%|          | 0/159090 [00:00<?, ? examples/s]

Map:   0%|          | 0/39773 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the classifier again and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)

def compute_metrics(eval_pred):
    """Compute accuracy and binary precision/recall/F1 from an evaluation result.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predictions = torch.tensor(logits).argmax(dim=1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )
    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

# Training configuration (batch sizes are chosen to fit the available GPU memory).
# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the best accuracy is reloaded when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=256,  # adjust to the GPU memory available
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,  # mixed-precision training (saves GPU memory, improves speed)
)

# Wire the model, data splits, tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Evaluate on the held-out split and show the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Persist the model and tokenizer side by side so they can be reloaded together.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

Using device: cuda:0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.102,0.091224,0.968019,0.92353,0.949679,0.898783
2,0.0706,0.092971,0.970407,0.927928,0.973282,0.886614
3,0.0501,0.097566,0.970885,0.930725,0.952142,0.91025


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation Results: {'eval_loss': 0.09756579250097275, 'eval_accuracy': 0.9708847710758555, 'eval_f1': 0.9307250538406318, 'eval_precision': 0.9521419828641371, 'eval_recall': 0.9102504095483267, 'eval_runtime': 88.0386, 'eval_samples_per_second': 451.768, 'eval_steps_per_second': 14.119, 'epoch': 3.0}


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned model and its tokenizer; the path must match the
# directory they were saved to after training.
model_path = "./saved_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

def predict(text):
    """Classify a single text with the reloaded model.

    Args:
        text: The word or phrase to classify.

    Returns:
        A ``(predicted_class, confidence)`` tuple: the index of the largest
        logit and the largest softmax probability.
    """
    # Tokenize and move every input tensor to the model's device.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    predicted_class = logits.argmax(dim=1).item()
    confidence = logits.softmax(dim=1).max().item()
    return predicted_class, confidence

# Probe pairs: a base word next to a related or contrasting word.
text_pairs = [["枪", "筋膜枪"], ["刀", "餐刀"], ["炸弹", "气球"],["枪", "鱼枪"]]

# Classify every text, keeping the pair structure for side-by-side display.
results = [
    [
        {
            "text": text,
            "predicted_class": "Safe" if label == 0 else "Unsafe",
            "confidence": f"{confidence:.2f}",
        }
        for text, (label, confidence) in ((item, predict(item)) for item in pair)
    ]
    for pair in text_pairs
]

# Print one table row per pair, with the two predictions side by side.
print(f"{'Text 1':<10} {'Class 1':<10} {'Conf 1':<10} | {'Text 2':<10} {'Class 2':<10} {'Conf 2':<10}")
print("=" * 60)
for first, second in results:
    left = f"{first['text']:<10} {first['predicted_class']:<10} {first['confidence']:<10}"
    right = f"{second['text']:<10} {second['predicted_class']:<10} {second['confidence']:<10}"
    print(f"{left} | {right}")


Using device: cuda
Text 1     Class 1    Conf 1     | Text 2     Class 2    Conf 2    
枪          Unsafe     1.00       | 筋膜枪        Unsafe     1.00      
刀          Unsafe     0.97       | 餐刀         Safe       0.83      
炸弹         Unsafe     1.00       | 气球         Safe       0.86      
枪          Unsafe     1.00       | 鱼枪         Safe       0.99      


总结：目前的模型具有一定的泛化能力。我并没有专门添加混淆数据，但是模型仍然能够处理一部分易混淆的词语，例如“枪”和“鱼枪”。


但是对于“筋膜枪”这个词，模型以 1.00 的置信度将其判定为不安全词汇。首先考虑到的原因是上述词库中并没有与“筋膜枪”类似的正例数据；后期可以考虑定期做增量训练，或者丰富原有的正例数据集。


这里我的正例词库只有15万条左右，


但是百度百科词条高达 28586901 条。而腾讯的这个危险词汇数据库只有四万多条数据，两类数据相对不平衡，不过这也符合真实世界的数据分布。


此外，可以考虑对危险词汇数据集做数据增强，例如：使用拼音或者谐音字替换；选取单个字或者词替换为英文翻译；在词组中添加“~”“，”“。”等符号进行分割；对某些词进行简写；以及随机重组词语等等。


在此基础上，再结合大模型对数据集进行少量扩充，在实际落地的时候可能达到更好的效果。