数据

In [5]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import os
from models.dnf import DNF, DeltaDelayedExponentialDecayScheduler
import numpy as np
from pymongo import MongoClient

class WeiboDataset(Dataset):
    """Weibo samples exposed as ``(feature tensor, one-hot label)`` pairs.

    Records are read from a MongoDB collection when ``mongo_uri``, ``db_name``
    and ``collection_name`` are all given; otherwise from every ``*.json``
    file in ``folder_path``. Each record is expected to be a mapping with a
    ``general`` sub-mapping holding the feature values and a ``label`` entry.

    Args:
        mongo_uri: MongoDB connection string.
        db_name: Database to read from.
        collection_name: Collection to read from.
        folder_path: Folder of JSON files, used when no MongoDB target is given.
        selected_features: Keys to read from ``general``, in order.
            Defaults to ``P1`` .. ``P8``.

    Raises:
        ValueError: If neither a complete MongoDB target nor ``folder_path``
            is supplied.

    If the MongoDB load fails the error is printed and the dataset is left
    empty (``len(dataset) == 0``) instead of half-initialised.
    """

    def __init__(self, mongo_uri=None, db_name=None, collection_name=None, folder_path=None, selected_features=None):
        # Default to P1..P8 when no feature subset is given.
        self.selected_features = selected_features if selected_features else [f"P{i}" for i in range(1, 9)]
        self.num_features = len(self.selected_features)
        # Create the containers before any I/O so the object stays usable
        # (as an empty dataset) even when loading fails.
        self.features = []
        self.labels = []

        if mongo_uri and db_name and collection_name:
            records = self._load_from_mongo(mongo_uri, db_name, collection_name)
            if records is None:
                return
        elif folder_path:
            # Local files are kept as a fallback data source.
            records = self._load_from_folder(folder_path)
        else:
            raise ValueError("必须提供MongoDB连接信息或本地数据文件夹路径")

        for item in records:
            try:
                p_values, label = self._parse_record(item)
            except (KeyError, ValueError, TypeError, AttributeError) as e:
                item_id = item.get('_id', '未知') if isinstance(item, dict) else '未知'
                print(f"数据 {item_id} 异常: {str(e)}")
                continue  # skip the invalid sample
            # Append both together so features and labels never get misaligned.
            self.features.append(p_values)
            self.labels.append(label)

        # Diagnostics
        print(f"\n数据集诊断信息:")
        print(f"总样本数: {len(self.labels)}")
        if self.labels:
            print(f"正样本比例: {sum(self.labels)/len(self.labels):.2%}")
            print(f"特征维度: {len(self.features[0])}")
            print(f"示例特征: {self.features[0]}")
            print(f"对应标签: {self.labels[0]}")

    @staticmethod
    def _load_from_mongo(mongo_uri, db_name, collection_name):
        """Return all documents of the collection, or ``None`` if loading fails."""
        client = None
        try:
            client = MongoClient(mongo_uri)
            records = list(client[db_name][collection_name].find({}))
            print(f"从MongoDB加载了 {len(records)} 条记录")
            return records
        except Exception as e:
            # Best-effort by design: report the failure and let the caller
            # continue with an empty dataset.
            print(f"连接MongoDB出错: {str(e)}")
            return None
        finally:
            # Close the connection on the failure path too.
            if client is not None:
                client.close()

    @staticmethod
    def _load_from_folder(folder_path):
        """Return the parsed content of every ``*.json`` file in ``folder_path``."""
        records = []
        # Sorted so the sample order does not depend on the file system.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith('.json'):
                with open(os.path.join(folder_path, filename), 'r', encoding="utf-8") as f:
                    records.append(json.load(f))
        return records

    def _parse_record(self, item):
        """Return ``(feature_values, label)`` for one raw record."""
        general = item.get('general') or {}
        p_values = []
        for key in self.selected_features:
            val = general.get(key, 0.0)  # a missing feature counts as 0.0
            # Null and blank values are treated like missing ones.
            if val is None or (isinstance(val, str) and val.strip() == ""):
                val = 0.0
            p_values.append(float(val))

        # Labels may arrive as numbers, booleans or strings.
        label_val = item.get('label', 0)
        label = 1 if str(label_val).strip() in ("1", "true", "True") else 0
        return p_values, label

    def __len__(self):
        """Return the number of successfully parsed samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the float feature tensor and the 2-element one-hot label at ``idx``."""
        label = self.labels[idx]
        one_hot = torch.zeros(2)
        one_hot[label] = 1.0
        return torch.FloatTensor(self.features[idx]), one_hot


# MongoDB connection info
# The connection string is read from the environment so that credentials are
# not stored in the notebook.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in this file and should be rotated.
mongo_uri = os.environ.get("WEIBO_MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "WEIBO_MONGO_URI is not set; export the MongoDB connection string before running this cell"
    )
db_name = "TRAING_DATA"  # or whichever database is actually in use
collection_name = "WEIBO_21"  # collection that holds the samples

# Feature selection
selected_features = ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8"]  # any combination is allowed

# Load the data from MongoDB
dataset = WeiboDataset(
    mongo_uri=mongo_uri,
    db_name=db_name,
    collection_name=collection_name,
    selected_features=selected_features
)

# Report the result
print(f"成功加载 {len(dataset)} 个样本")
if len(dataset) > 0:
    print(dataset[0])

ModuleNotFoundError: No module named 'torch'

In [None]:
class DNFClassifier(nn.Module):
    """Wrap a ``DNF`` module and squash its output with a sigmoid.

    The constructor arguments are forwarded to ``DNF`` unchanged.
    """

    def __init__(self, num_preds, num_conjuncts, n_out, delta=0.01, weight_init_type="normal"):
        super().__init__()
        self.dnf = DNF(
            num_preds,
            num_conjuncts,
            n_out,
            delta,
            weight_init_type=weight_init_type,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(dnf(x))``."""
        raw = self.dnf(x)
        return self.sigmoid(raw)

In [None]:
# Model configuration
num_preds = dataset.num_features  # one input per selected feature
num_conjuncts = 10                # number of conjunctions (tunable)
n_out = 2                         # output size used for the binary task

model = DNFClassifier(num_preds, num_conjuncts, n_out)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Delta scheduler (these values need tuning per experiment)
delta_scheduler = DeltaDelayedExponentialDecayScheduler(
    delta_decay_rate=0.1,
    delta_decay_steps=50,
    delta_decay_delay=100,
    initial_delta=0.01,
)

# ------------------------------------------------------------------
# Split the data 80/20 into training and validation subsets and wrap
# each subset in a loader.
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, val_size]
)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)
# ------------------------------------------------------------------


TRAIN

In [None]:
import copy

import matplotlib.pyplot as plt

# Training settings
num_epochs = 50
patience = 5  # number of non-improving epochs tolerated before stopping
best_val_loss = float("inf")
trigger_times = 0
# Start from a snapshot of the current weights so the final torch.save always
# has something to write, even if the validation loss never improves.
best_model_state = copy.deepcopy(model.state_dict())

train_loss_list = []
val_loss_list = []
accuracy_list = []

for epoch in range(num_epochs):
    # --------------------- training phase ---------------------
    model.train()
    running_train_loss = 0
    for step, (inputs, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Advance the delta schedule using the global step index
        current_step = epoch * len(train_loader) + step
        delta_scheduler.step(model.dnf, current_step)

        # Weight by batch size so the epoch average is per sample
        running_train_loss += loss.item() * inputs.size(0)

    avg_train_loss = running_train_loss / len(train_loader.dataset)
    train_loss_list.append(avg_train_loss)

    # --------------------- validation phase ---------------------
    model.eval()
    total_loss = 0
    total_samples = 0
    corrects = 0  # running count of correct predictions
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            total_samples += inputs.size(0)

            # Predicted class = index of the larger output; the targets are
            # one-hot, so argmax recovers the class index as well.
            predicted = outputs.argmax(dim=1)
            true_labels = labels.argmax(dim=1)
            corrects += (predicted == true_labels).sum().item()

    avg_val_loss = total_loss / total_samples
    accuracy = corrects / total_samples
    val_loss_list.append(avg_val_loss)
    accuracy_list.append(accuracy)

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, " 
          f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

    # --------------------- early stopping ---------------------
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0
        # state_dict() hands back references to the live parameters, so it
        # must be deep-copied; otherwise later optimizer steps overwrite the
        # "best" snapshot and the last weights get saved instead.
        best_model_state = copy.deepcopy(model.state_dict())
    else:
        trigger_times += 1
        print(f"EarlyStopping Trigger Times: {trigger_times}")
        if trigger_times >= patience:
            print("Early stopping!\n")
            break



# Save the best model
torch.save(best_model_state, "./models/best_model.pth")

Epoch 1/50, Training Loss: 0.6802, Validation Loss: 0.6679, Accuracy: 0.5274
Epoch 2/50, Training Loss: 0.6439, Validation Loss: 0.6225, Accuracy: 0.6946
Epoch 3/50, Training Loss: 0.6024, Validation Loss: 0.5957, Accuracy: 0.7123
Epoch 4/50, Training Loss: 0.5854, Validation Loss: 0.5891, Accuracy: 0.7135
Epoch 5/50, Training Loss: 0.5807, Validation Loss: 0.5868, Accuracy: 0.7095
Epoch 6/50, Training Loss: 0.5791, Validation Loss: 0.5843, Accuracy: 0.7100
Epoch 7/50, Training Loss: 0.5783, Validation Loss: 0.5835, Accuracy: 0.7112
Epoch 8/50, Training Loss: 0.5772, Validation Loss: 0.5835, Accuracy: 0.7118
Epoch 9/50, Training Loss: 0.5767, Validation Loss: 0.5823, Accuracy: 0.7100
Epoch 10/50, Training Loss: 0.5765, Validation Loss: 0.5820, Accuracy: 0.7123
Epoch 11/50, Training Loss: 0.5759, Validation Loss: 0.5817, Accuracy: 0.7106
Epoch 12/50, Training Loss: 0.5761, Validation Loss: 0.5810, Accuracy: 0.7123
Epoch 13/50, Training Loss: 0.5757, Validation Loss: 0.5814, Accuracy: 0.

eval

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Restore the checkpoint written by the training cell and switch to eval mode.
model.load_state_dict(torch.load("./models/best_model.pth"))
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        # Reduce model outputs and one-hot targets to class indices.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        true_labels = torch.argmax(labels, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(true_labels)

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Metrics are computed on class indices, so the binary F1 average applies.
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='binary')

print("Validation Accuracy: {:.4f}".format(accuracy))
print("Validation F1 Score: {:.4f}".format(f1))
print("\nClassification Report:\n", classification_report(all_labels, all_preds))

Validation Accuracy: 0.7135
Validation F1 Score: 0.7161

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.69      0.71       895
           1       0.69      0.74      0.72       857

    accuracy                           0.71      1752
   macro avg       0.71      0.71      0.71      1752
weighted avg       0.71      0.71      0.71      1752



In [None]:
# Run after training: extract the symbolic rules (the threshold is tunable).
rules = model.dnf.get_rules(threshold=0.5)

# selected_features
print("==== 合取规则 ====")
for rule in rules["conjuncts"]:
    # Rules containing the empty-set marker are not printed.
    if "∅" in rule:
        continue
    print(rule)

print("\n==== 最终分类规则 ====")
for class_idx, rule in rules["disjuncts"].items():
    print(f"Class {class_idx}: {rule}")

==== 合取规则 ====
conj0 = P2 ∧ ¬P5
conj1 = ¬P5
conj2 = P2 ∧ ¬P5
conj3 = P2 ∧ ¬P5
conj4 = P2 ∧ ¬P5
conj5 = P2
conj6 = P2 ∧ ¬P3 ∧ ¬P4 ∧ ¬P5 ∧ ¬P6
conj7 = ¬P2
conj8 = P2
conj9 = P2

==== 最终分类规则 ====
Class 0: conj2 ∨ conj4 ∨ conj5 ∨ conj6 ∨ ¬conj7 ∨ conj9
Class 1: ¬conj1 ∨ ¬conj2 ∨ ¬conj3 ∨ ¬conj4 ∨ ¬conj5 ∨ ¬conj6 ∨ conj7 ∨ ¬conj9


In [None]:
# NOTE(review): this cell is disabled (fully commented out). Before re-enabling
# it, fix the join logic in explain_rule: the rule is split on whitespace, the
# operator tokens ("∧" / "∨") match neither branch and are dropped, and the
# remaining terms are always joined with " 或 " (or). A conjunction such as
# "P2 ∧ ¬P5" is therefore rendered as a disjunction, and the trailing
# .replace() calls never find anything to replace.
# def explain_rule(rule: str, feature_map: dict) -> str:
#     """将符号规则转换为自然语言"""
#     explanation = []
#     for term in rule.split():
#         if "P" in term:
#             # 提取特征和符号
#             sign = "非" if "¬" in term else ""
#             p_num = term.split("P")[-1]
#             explanation.append(f"{sign}{feature_map[int(p_num)]}")
#         elif "conj" in term:
#             explanation.append(f"({term})")
#     return " 或 ".join(explanation).replace("∧", " 且 ").replace("∨", " 或 ")
# feature_map = {
#     1: "信息充分性", 2: "信息准确性", 
#     3: "内容完整性", 4: "意图正当性",
#     5: "发布者信誉", 6: "情感中立性",
#     7: "无诱导行为", 8: "信息一致性"
# }

In [None]:
# def generate_semantic_report(rules, feature_map):
#     """生成完整语义报告"""
#     report = []
#     # 合取规则解释
#     report.append("## 基础特征组合规则")
#     for conj in rules["conjuncts"]:
#         if "∅" not in conj:
#             _, expr = conj.split("=")
#             report.append(f"- 当 {explain_rule(expr.strip(), feature_map)} 时触发该规则")
    
#     # 最终决策规则解释
#     report.append("\n## 最终决策逻辑")
#     for cls, rule in rules["disjuncts"].items():
#         if "∅" not in rule:
#             cls_name = "虚假信息" if cls == 0 else "真实信息"
#             report.append(f"### {cls_name}判定条件")
#             report.append(f"满足以下任一条件即判定为{cls_name}：")
#             for term in rule.split("∨"):
#                 report.append(f"  - {explain_rule(term.strip(), feature_map)}")
#     return "\n".join(report)
# print(generate_semantic_report(rules, feature_map))

## 基础特征组合规则
- 当 信息准确性 或 非发布者信誉 时触发该规则
- 当 非发布者信誉 时触发该规则
- 当 信息准确性 或 非发布者信誉 时触发该规则
- 当 信息准确性 或 非发布者信誉 时触发该规则
- 当 信息准确性 或 非发布者信誉 时触发该规则
- 当 信息准确性 时触发该规则
- 当 信息准确性 或 非内容完整性 或 非意图正当性 或 非发布者信誉 或 非情感中立性 时触发该规则
- 当 非信息准确性 时触发该规则
- 当 信息准确性 时触发该规则
- 当 信息准确性 时触发该规则

## 最终决策逻辑
### 虚假信息判定条件
满足以下任一条件即判定为虚假信息：
  - (conj2)
  - (conj4)
  - (conj5)
  - (conj6)
  - (¬conj7)
  - (conj9)
### 真实信息判定条件
满足以下任一条件即判定为真实信息：
  - (¬conj1)
  - (¬conj2)
  - (¬conj3)
  - (¬conj4)
  - (¬conj5)
  - (¬conj6)
  - (conj7)
  - (¬conj9)
