In [None]:
# This cell runs before the cell that holds the main import block, so it
# imports everything it uses itself (pandas and train_test_split were
# previously undefined here).
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Fetch the Boston housing dataset from OpenML.
boston = fetch_openml(name='boston', version=1, as_frame=True)

# Features and target as pandas objects (as_frame=True).
X = boston.data
y = boston.target

# Feature selection.
# The Boston housing dataset provides these features:
# ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
#  'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
# Use all of them or any subset for an experiment.
features_to_use = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',
                  'RM', 'AGE', 'DIS', 'RAD', 'TAX',
                  'PTRATIO', 'B', 'LSTAT']

# Target handling.
# The target is 'MEDV' (median value of owner-occupied homes in $1000's).
y = y.astype(float)  # make sure the target is floating point

# Drop every sample with a missing value in a selected feature or the target.
data = pd.concat([X[features_to_use], y.rename('MEDV')], axis=1).dropna()
X = data[features_to_use]
y = data['MEDV']

# 80/20 train/test split with a fixed seed for reproducibility.
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Feature names kept for later use (e.g. when printing rules).
feature_labels = features_to_use

In [None]:
# ============================
# Import the required libraries
# ============================
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
from scipy.stats import ttest_rel
from sklearn.metrics import pairwise_distances

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# ============================
# 数据加载与预处理
# ============================

from sklearn.datasets import fetch_openml

# Download the Boston housing dataset from OpenML as pandas objects.
boston = fetch_openml(name='boston', version=1, as_frame=True)

# Columns used as model inputs. The dataset provides
# ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
#  'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'];
# trim the list below to experiment with a subset.
features_to_use = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',
    'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT',
]

# Feature frame, and the target ('MEDV', median value of owner-occupied
# homes in $1000's) cast to float.
X = boston.data
y = boston.target.astype(float)

# Put the selected features and the target side by side and discard every
# row that has a missing value in any of them.
data = pd.concat([X[features_to_use], y.rename('MEDV')], axis=1).dropna()
X = data[features_to_use]
y = data['MEDV']

# 80/20 train/test split on the raw arrays, seeded for reproducibility.
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Feature names kept for later use.
feature_labels = features_to_use

# Fuzzification layer extended with a per-rule attribute-attention mechanism.
class AttentionFuzzyLayer(nn.Module):
    """Gaussian membership layer whose per-attribute exponents are attention-scaled.

    Learnable tensors, all of shape (n_rules, input_dim):
    - c: membership centres (random normal initialisation)
    - sigma: membership widths (initialised to one)
    - attention_weights: attention logits (random normal initialisation)

    The buffer ``attribute_mask`` has the same shape; 1 marks an active
    attribute and 0 a pruned one.
    """

    def __init__(self, input_dim, n_rules):
        super().__init__()
        self.input_dim = input_dim
        self.n_rules = n_rules

        shape = (n_rules, input_dim)

        # Centres and widths of the Gaussian membership functions.
        self.c = nn.Parameter(torch.randn(*shape))
        self.sigma = nn.Parameter(torch.ones(*shape))

        # Attention logits; forward() squashes them with a sigmoid.
        self.attention_weights = nn.Parameter(torch.randn(*shape))

        # Attribute mask (1 = active, 0 = pruned).
        self.register_buffer('attribute_mask', torch.ones(*shape))

    def forward(self, x):
        """Return ``(phi, attention)`` for a batch ``x`` of shape (batch_size, input_dim).

        phi has shape (batch_size, n_rules) and attention has shape
        (n_rules, input_dim).
        """
        # Sigmoid attention with pruned attributes forced to zero.
        gate = torch.sigmoid(self.attention_weights) * self.attribute_mask

        # Floor the widths so the division below never hits zero.
        width = torch.clamp(self.sigma.unsqueeze(0), min=1e-3)

        # Broadcast (batch, 1, dim) against (1, rules, dim).
        offset = x.unsqueeze(1) - self.c.unsqueeze(0)

        # Log of the Gaussian membership, per attribute.
        log_membership = -0.5 * ((offset / width) ** 2)

        # Weight each attribute by its attention, then add up over attributes.
        rule_exponent = (log_membership * gate.unsqueeze(0)).sum(dim=2)

        # Firing strength of every rule.
        return torch.exp(rule_exponent), gate

# Normalisation layer extended with a rule-attention mechanism.
class AttentionNormalizedLayer(nn.Module):
    """Scale rule firing strengths by a learned per-rule attention and normalise them."""

    def __init__(self, n_rules):
        super().__init__()
        self.n_rules = n_rules

        # One attention logit per rule, initialised to one.
        self.rule_attention_weights = nn.Parameter(torch.ones(n_rules))

    def forward(self, phi):
        """Return ``(psi, rule_attention)`` for ``phi`` of shape (batch_size, n_rules).

        psi is the attention-weighted firing strength divided by its per-sample
        sum; rule_attention has shape (n_rules,).
        """
        rule_attention = torch.sigmoid(self.rule_attention_weights)

        # Re-weight every rule's firing strength by its attention.
        weighted = phi * rule_attention.unsqueeze(0)

        # The small constant keeps the division defined when nothing fires.
        total = weighted.sum(dim=1, keepdim=True) + 1e-8
        return weighted / total, rule_attention

# Consequent (weighted) layer.
class AttentionWeightedLayer(nn.Module):
    """Linear consequent per rule, scaled by the normalised firing strength."""

    def __init__(self, input_dim, n_rules):
        super().__init__()
        self.input_dim = input_dim
        self.n_rules = n_rules

        # Consequent coefficients; column 0 is the bias term.
        self.a = nn.Parameter(torch.randn(n_rules, input_dim + 1))

    def forward(self, x, psi):
        """Return per-rule contributions of shape (batch_size, n_rules).

        x has shape (batch_size, input_dim) and psi has shape
        (batch_size, n_rules).
        """
        # Prepend a column of ones so the bias is part of the same product.
        bias_column = torch.ones(x.size(0), 1, device=x.device)
        augmented = torch.cat([bias_column, x], dim=1)  # (batch, input_dim + 1)

        # (batch, 1, dim + 1) * (1, rules, dim + 1), summed over features.
        rule_outputs = (augmented.unsqueeze(1) * self.a.unsqueeze(0)).sum(dim=2)

        return psi * rule_outputs

# Output layer.
class OutputLayer(nn.Module):
    """Collapse per-rule contributions into one prediction per sample."""

    def forward(self, f):
        """Sum ``f`` of shape (batch_size, n_rules) over rules -> (batch_size,)."""
        return torch.sum(f, dim=1)

# 定义扩展后的 SOFENN 模型
class AttentionDynamicAttributeAndRuleSOFENN(nn.Module):
    def __init__(self, input_dim, n_rules, attention_threshold=0.1):
        """Build the four-layer network.

        Parameters:
        - input_dim: number of input attributes
        - n_rules: initial number of fuzzy rules
        - attention_threshold: stored on the instance as ``attention_threshold``
        """
        super().__init__()
        self.input_dim, self.n_rules = input_dim, n_rules
        self.attention_threshold = attention_threshold

        # Sub-layers, created in the order the forward pass uses them.
        self.fuzzy_layer = AttentionFuzzyLayer(input_dim, n_rules)
        self.normalized_layer = AttentionNormalizedLayer(n_rules)
        self.weighted_layer = AttentionWeightedLayer(input_dim, n_rules)
        self.output_layer = OutputLayer()

    def forward(self, x):
        phi, attention = self.fuzzy_layer(x)
        psi, rule_attention = self.normalized_layer(phi)
        f = self.weighted_layer(x, psi)
        output = self.output_layer(f)
        return output, phi, attention, rule_attention

    def train_step(self, x, target, optimizer, lambda_attention=1e-7, lambda_rule_attention=1e-8, lambda_diversity=1e-4):
        """
        执行一次训练步骤。

        参数：
        - x: 输入数据，形状：(batch_size, input_dim)
        - target: 目标数据，形状：(batch_size,)
        - optimizer: 优化器实例
        - lambda_attention: 属性注意力权重的正则化系数
        - lambda_rule_attention: 规则注意力权重的正则化系数
        - lambda_diversity: 多样性正则化的系数

        返回：
        - loss.item(): 当前批次的总损失
        - output: 模型的输出
        """
        self.train()
        optimizer.zero_grad()
        output, _, attention, rule_attention = self.forward(x)
        # 计算预测损失（均方误差损失）
        loss_pred = nn.functional.mse_loss(output, target)

        # 添加属性注意力正则化损失（L1 正则化）
        loss_attention = lambda_attention * attention.abs().sum()

        # 计算规则注意力正则化损失（L1 正则化）
        loss_rule_attention = lambda_rule_attention * rule_attention.abs().sum()

        # 添加多样性正则化损失（鼓励不同规则的注意力权重不同）
        if self.n_rules > 1:
            # 计算注意力权重的余弦相似度矩阵
            attention_norm = attention / (attention.norm(dim=1, keepdim=True) + 1e-8)
            similarity_matrix = torch.matmul(attention_norm, attention_norm.t())
            # 计算非对角线的平均相似度
            diversity_loss = torch.sum(similarity_matrix) - torch.diag(similarity_matrix).sum()
            diversity_loss = diversity_loss / (self.n_rules * (self.n_rules - 1))
        else:
            diversity_loss = torch.tensor(0.0).to(attention.device)

        loss_diversity = lambda_diversity * diversity_loss

        # 总损失
        loss = loss_pred + loss_attention + loss_rule_attention + loss_diversity

        if torch.isnan(loss):
            print("Loss is NaN. Stopping training.")
            return loss.item(), output

        loss.backward()

        optimizer.step()

        return loss.item(), output

    def prune_attributes_per_rule(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01):
        """
        剪除每个规则中注意力权重低于阈值的属性，并冻结其相关参数。
        如果剪枝后模型在验证集上的性能下降超过容忍度，则撤销剪枝。

        参数：
        - threshold: 剪枝阈值，默认为 0.1
        - X_val: 验证集特征，形状：(num_val_samples, input_dim)
        - y_val: 验证集目标，形状：(num_val_samples,)
        - performance_drop_tolerance: 性能下降容忍度，默认为 0.01（即 1%）

        返回：
        - pruned_dict: 字典，键为规则索引，值为被剪除的属性索引列表
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")

        # 保存剪枝前的模型状态和验证损失
        original_state = copy.deepcopy(self.state_dict())
        self.eval()
        with torch.no_grad():
            output_before = self.infer(X_val)
            loss_before = nn.functional.mse_loss(output_before, y_val)

        # 执行剪枝操作
        pruned_dict = {}
        with torch.no_grad():
            attention = torch.sigmoid(self.fuzzy_layer.attention_weights)  # (n_rules, input_dim)

            for rule_idx in range(self.n_rules):
                if torch.all(self.fuzzy_layer.attribute_mask[rule_idx] == 0):
                    continue  # 跳过已被完全剪除的规则

                prune_indices = torch.where(
                    (attention[rule_idx] < threshold) & (self.fuzzy_layer.attribute_mask[rule_idx] == 1)
                )[0].tolist()

                if prune_indices:
                    # 更新属性掩码
                    self.fuzzy_layer.attribute_mask[rule_idx, prune_indices] = 0.0

                    # 冻结被剪除属性的相关参数
                    self.fuzzy_layer.attention_weights[rule_idx, prune_indices].requires_grad = False
                    self.fuzzy_layer.c[rule_idx, prune_indices].requires_grad = False
                    self.fuzzy_layer.sigma[rule_idx, prune_indices].requires_grad = False
                    # 对 prune_indices 中的每个索引加 1，因为偏置项占用了第一个位置
                    prune_indices_plus_one = [idx + 1 for idx in prune_indices]
                    self.weighted_layer.a[rule_idx, prune_indices_plus_one].requires_grad = False  # +1 是因为有偏置项

                    pruned_dict[rule_idx] = prune_indices

        # 检查是否有规则的所有属性都被剪除，并处理
        with torch.no_grad():
            all_pruned_rules = torch.where(self.fuzzy_layer.attribute_mask.sum(dim=1) == 0)[0].tolist()

        if all_pruned_rules:
            print(f"Rules with all attributes pruned: {all_pruned_rules}")
            # 设置这些规则的规则注意力权重为非常低，确保被 prune_rules 剪除
            self.normalized_layer.rule_attention_weights.data[all_pruned_rules] = -1e6  # 安全地进行赋值操作

        # 剪枝后的验证损失
        self.eval()
        with torch.no_grad():
            output_after = self.infer(X_val)
            loss_after = nn.functional.mse_loss(output_after, y_val)

        # 判断性能是否下降超过容忍度
        performance_drop = (loss_after - loss_before) / loss_before

        if performance_drop > performance_drop_tolerance:
            # 性能下降超过容忍度，撤销剪枝操作
            self.load_state_dict(original_state)
            print(f"Pruning was reverted due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            pruned_dict = {}  # 清空剪枝记录
            pruned = False
        else:
            # 性能没有显著下降，执行规则剪枝
            pruned = self.prune_rules(threshold=0.005)
            if pruned:
                print(f"Pruned rules after attribute pruning.")

        return pruned_dict

    def prune_rules(self, threshold=0.1):
        """
        剪除规则注意力权重低于阈值的规则，并从模型中完全移除这些规则。

        参数：
        - threshold: 剪枝阈值，默认值为 0.1

        返回：
        - pruned: 布尔值，指示是否实际移除了规则
        """
        pruned = False
        with torch.no_grad():
            # 获取规则注意力权重
            rule_attention = torch.sigmoid(self.normalized_layer.rule_attention_weights)
            # 找到需要移除的规则索引（rule_attention < threshold）
            low_attention_indices = torch.where(rule_attention < threshold)[0]

            # 找到需要移除的规则索引（所有属性已被剪除）
            no_active_attrs_indices = torch.where(self.fuzzy_layer.attribute_mask.sum(dim=1) == 0)[0]

            # 合并需要移除的规则索引
            prune_indices = torch.cat([low_attention_indices, no_active_attrs_indices])
            prune_indices = torch.unique(prune_indices)

            if len(prune_indices) == 0:
                # 没有需要移除的规则
                return pruned

            # 保留的规则索引（rule_attention >= threshold AND 有活跃属性）
            keep_indices = torch.where(
                (rule_attention >= threshold) & (self.fuzzy_layer.attribute_mask.sum(dim=1) > 0)
            )[0]

            # 更新模型参数，移除低重要性的规则
            self.fuzzy_layer.c = nn.Parameter(self.fuzzy_layer.c.data[keep_indices])
            self.fuzzy_layer.sigma = nn.Parameter(self.fuzzy_layer.sigma.data[keep_indices])
            self.fuzzy_layer.attention_weights = nn.Parameter(self.fuzzy_layer.attention_weights.data[keep_indices])
            self.normalized_layer.rule_attention_weights = nn.Parameter(
                self.normalized_layer.rule_attention_weights.data[keep_indices]
            )
            self.weighted_layer.a = nn.Parameter(self.weighted_layer.a.data[keep_indices])

            # 更新属性掩码，移除被剪除规则的掩码行
            self.fuzzy_layer.attribute_mask = self.fuzzy_layer.attribute_mask.data[keep_indices].clone()

            # 更新规则数量
            self.n_rules = len(keep_indices)
            self.fuzzy_layer.n_rules = self.n_rules
            self.normalized_layer.n_rules = self.n_rules
            self.weighted_layer.n_rules = self.n_rules

            pruned = True  # 标记为已剪枝

        # 确保 attribute_mask 的维度与 n_rules 一致
        assert self.fuzzy_layer.attribute_mask.shape[0] == self.n_rules, \
            f"After pruning, attribute_mask has shape {self.fuzzy_layer.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"After pruning, n_rules: {self.n_rules}, attribute_mask shape: {self.fuzzy_layer.attribute_mask.shape}")

        return pruned

    def grow_rule(self, X_new):
        """
        添加一个新的规则。

        参数：
        - X_new: 新规则的初始数据，形状：(num_samples, input_dim)
        """
        # 获取设备和数据类型
        device = self.fuzzy_layer.c.device
        dtype = self.fuzzy_layer.c.dtype

        # 使用 X_new 计算新的规则中心和标准差
        new_c = torch.tensor(X_new.mean(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, input_dim)
        new_sigma = torch.tensor(X_new.std(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, input_dim)

        # 计算现有规则的属性注意力权重的平均值
        if self.n_rules > 0:
            existing_attention_weights = torch.sigmoid(self.fuzzy_layer.attention_weights).data  # (n_rules, input_dim)
            attention_mean = existing_attention_weights.mean(dim=0, keepdim=True)  # (1, input_dim)
        else:
            attention_mean = torch.ones(1, self.input_dim, dtype=dtype).to(device)  # 初始化为 1

        # 将新规则的属性注意力权重初始化为平均值并加入随机扰动
        noise = torch.randn_like(attention_mean) * 0.05  # 调整扰动大小以控制多样性
        new_attention_weights = (attention_mean + noise).clamp(0, 1).detach()  # 保持在 [0, 1] 范围内

        # 将新规则的规则注意力权重初始化为与现有权重的均值 logit 相同，并加入随机扰动
        if self.n_rules > 0:
            existing_rule_attention_logits = self.normalized_layer.rule_attention_weights.data  # (n_rules,)
            rule_attention_mean_logit = existing_rule_attention_logits.mean().unsqueeze(0)  # (1,)
            rule_attention_noise = torch.randn_like(rule_attention_mean_logit) * 0.05  # 调整扰动大小
            new_rule_attention_weight = (rule_attention_mean_logit + rule_attention_noise).detach()
        else:
            rule_attention_mean_logit = torch.tensor([0.0], dtype=dtype).to(device)  # 中性 logit
            new_rule_attention_weight = rule_attention_mean_logit.clone().detach()  # (1,)

        # 初始化后件参数为小的随机值
        new_a = torch.randn(1, self.input_dim + 1, dtype=dtype).to(device) * 0.01  # (1, input_dim + 1)

        # 将新的参数添加到模型中
        self.fuzzy_layer.c = nn.Parameter(torch.cat([self.fuzzy_layer.c.data, new_c], dim=0))  # (n_rules + 1, input_dim)
        self.fuzzy_layer.sigma = nn.Parameter(torch.cat([self.fuzzy_layer.sigma.data, new_sigma], dim=0))  # (n_rules + 1, input_dim)
        self.fuzzy_layer.attention_weights = nn.Parameter(torch.cat([self.fuzzy_layer.attention_weights.data, new_attention_weights], dim=0))  # (n_rules + 1, input_dim)
        self.normalized_layer.rule_attention_weights = nn.Parameter(torch.cat([self.normalized_layer.rule_attention_weights.data, new_rule_attention_weight], dim=0))  # (n_rules + 1,)
        self.weighted_layer.a = nn.Parameter(torch.cat([self.weighted_layer.a.data, new_a], dim=0))  # (n_rules + 1, input_dim + 1)

        # 更新属性掩码，添加新规则的掩码行
        new_attribute_mask = torch.ones(1, self.input_dim, dtype=self.fuzzy_layer.attribute_mask.dtype).to(device)  # (1, input_dim)
        self.fuzzy_layer.attribute_mask = torch.cat([self.fuzzy_layer.attribute_mask, new_attribute_mask], dim=0)  # (n_rules + 1, input_dim)

        # 更新规则数量
        self.n_rules += 1
        self.fuzzy_layer.n_rules = self.n_rules
        self.normalized_layer.n_rules = self.n_rules
        self.weighted_layer.n_rules = self.n_rules

        # 确保 attribute_mask 的维度与 n_rules 一致
        assert self.fuzzy_layer.attribute_mask.shape[0] == self.n_rules, \
            f"After growing, attribute_mask has shape {self.fuzzy_layer.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"New rule added. Total rules: {self.n_rules}")

    def infer(self, x, targets=None):
        """
        执行推理。

        参数：
        - x: 输入数据，形状：(batch_size, input_dim)
        - targets: 目标数据，形状：(batch_size,)，可选

        返回：
        - 如果 targets 为 None，返回模型输出。
        - 否则，返回模型输出和损失值。
        """
        with torch.no_grad():
            self.eval()
            output, _, _, _ = self.forward(x)
            if targets is None:
                return output
            else:
                loss = nn.functional.mse_loss(output, targets)
                return output, loss.item()

    def extract_rules(self, scaler_X, scaler_y, feature_names=None):
        """
        提取模型的模糊规则。

        参数：
        - scaler_X: 输入数据的标准化器
        - scaler_y: 输出数据的标准化器
        - feature_names: 特征名称列表

        返回：
        - rules: 包含规则字符串的列表
        """
        if feature_names is None:
            feature_names = [f'Input {i+1}' for i in range(self.input_dim)]

        rules = []
        c = self.fuzzy_layer.c.detach().cpu().numpy()  # (n_rules, input_dim)
        sigma = self.fuzzy_layer.sigma.detach().cpu().numpy()  # (n_rules, input_dim)
        attention_weights = torch.sigmoid(self.fuzzy_layer.attention_weights).detach().cpu().numpy()  # (n_rules, input_dim)
        attribute_mask = self.fuzzy_layer.attribute_mask.detach().cpu().numpy()
        rule_attention_weights = torch.sigmoid(self.normalized_layer.rule_attention_weights).detach().cpu().numpy()
        a = self.weighted_layer.a.detach().cpu().numpy()  # (n_rules, input_dim + 1)

        # 反标准化
        c_orig = c * scaler_X.scale_.reshape(1, -1) + scaler_X.mean_.reshape(1, -1)
        sigma_orig = sigma * scaler_X.scale_.reshape(1, -1)
        a_orig = a.copy()
        a_orig[:, 1:] = a[:, 1:] / scaler_X.scale_.reshape(1, -1) * scaler_y.scale_[0]
        a_orig[:, 0] = scaler_y.scale_[0] * a[:, 0] + scaler_y.mean_[0] - np.sum(
            a[:, 1:] * scaler_X.mean_.reshape(1, -1) / scaler_X.scale_.reshape(1, -1) * scaler_y.scale_[0],
            axis=1
        )

        for j in range(self.n_rules):
            # 包含规则注意力权重
            rule_str = (f"Rule {j+1} (Rule Attention: {rule_attention_weights[j]:.4f}): IF ")
            antecedents = []
            for i in range(self.input_dim):
                if attribute_mask[j, i] == 0:
                    continue  # 忽略被剪枝的属性
                attention_value = attention_weights[j, i]
                mu = c_orig[j, i]
                sigma_i = sigma_orig[j, i]
                antecedents.append(
                    f"[{feature_names[i]} (Attn: {attention_value:.4f}) is Gaussian(c={mu:.4f}, σ={sigma_i:.4f})]"
                )
            antecedent_str = " AND ".join(antecedents) if antecedents else "True"
            rule_str += antecedent_str + " THEN Output = "

            # 构建后件部分
            a_j = a_orig[j, :]  # (input_dim + 1,)
            consequent_terms = [f"{a_j[0]:.4f}"]
            for idx, coef in enumerate(a_j[1:]):
                if attribute_mask[j, idx] == 0:
                    continue  # 忽略被剪枝的属性
                attention_value = attention_weights[j, idx]
                if coef >= 0:
                    term = f"+ {coef:.4f} * {feature_names[idx]} (Attn: {attention_value:.4f})"
                else:
                    term = f"- {abs(coef):.4f} * {feature_names[idx]} (Attn: {attention_value:.4f})"
                consequent_terms.append(term)
            consequent_str = " ".join(consequent_terms)
            rule_str += consequent_str
            rules.append(rule_str)
        return rules

    def save_model(self, path):
        torch.save(self.state_dict(), path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        self.load_state_dict(torch.load(path))
        print(f"Model loaded from {path}")

from scipy.stats import norm  # 确保导入了 norm 函数

def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """
    Closed-form overlap measure between two Gaussian membership functions.

    The value is ``2 * Phi(-d)`` with ``d = |c1 - c2| / sqrt(sigma1^2 + sigma2^2)``
    and ``Phi`` the standard normal CDF. It is 1 for identical centres and
    goes to 0 as the centres move apart.

    Parameters:
    - c1, sigma1: centre and width of the first function
    - c2, sigma2: centre and width of the second function

    Returns:
    - overlap_area: the overlap measure; 0 when both widths are zero
    """
    combined_width = np.sqrt(sigma1**2 + sigma2**2)
    if combined_width == 0:
        # Two degenerate (zero-width) functions: defined as no overlap.
        return 0
    separation = np.abs(c1 - c2) / combined_width
    return 2 * norm.cdf(-separation)

def compute_iov(model):
    """
    Compute the Average Overlap Index (Iov).

    For every attribute, the largest pairwise overlap (as given by
    compute_overlap_analytic) among the rules for which that attribute is
    active is taken. The index is the mean of these maxima over the attributes
    that have at least two active rules.

    Parameters:
    - model: trained SOFENN model; ``c``, ``sigma`` and ``attribute_mask`` of
      its ``fuzzy_layer`` are read

    Returns:
    - average_iov: the average overlap index, or 0 when no attribute has two
      active rules
    """
    fuzzy = model.fuzzy_layer
    c = fuzzy.c.detach().cpu().numpy()                            # (n_rules, input_dim)
    sigma = fuzzy.sigma.detach().cpu().numpy()                    # (n_rules, input_dim)
    attribute_mask = fuzzy.attribute_mask.detach().cpu().numpy()  # (n_rules, input_dim)
    n_rules, input_dim = c.shape

    overlap_sum = 0
    attributes_counted = 0

    for attr in range(input_dim):
        # Rules that still use this attribute.
        active_rules = np.where(attribute_mask[:, attr] == 1)[0]
        if len(active_rules) < 2:
            continue  # a pair of rules is needed to speak of overlap

        # Largest overlap over all unordered pairs of active rules.
        best = -np.inf
        for position, rule_i in enumerate(active_rules[:-1]):
            for rule_j in active_rules[position + 1:]:
                overlap = compute_overlap_analytic(
                    c[rule_i, attr], sigma[rule_i, attr],
                    c[rule_j, attr], sigma[rule_j, attr],
                )
                if overlap > best:
                    best = overlap

        if best == -np.inf:
            continue
        overlap_sum += best
        attributes_counted += 1

    if attributes_counted == 0:
        return 0  # nothing to average

    return overlap_sum / attributes_counted

def compute_ifspe(model):
    """
    Compute the Average Fuzzy Set Position Index (Ifspe).

    For every attribute the active rules are sorted by centre and each pair of
    neighbouring Gaussians (centres v_l <= v_{l+1}, widths s_l, s_{l+1})
    contributes ``2 * |0.5 - phi| + psi`` with

        phi = exp(-0.5 * ((v_{l+1} - v_l) / (s_l + s_{l+1}))^2)
        psi = exp(-0.5 * ((v_{l+1} - v_l) / (s_l - s_{l+1}))^2)   (0 if s_l == s_{l+1})

    phi and psi are the membership values at the two points where the
    neighbouring Gaussians intersect. They depend on the distance between the
    centres, so the index does not change when all centres are shifted by the
    same amount. (An earlier version used the sum of the centres, which made
    the result depend on absolute position.)

    NOTE(review): the sum of the terms is divided by ``n_rules * input_dim``,
    not by the number of neighbouring pairs actually evaluated - confirm this
    is the intended normalisation.

    Parameters:
    - model: trained SOFENN model; ``c``, ``sigma`` and ``attribute_mask`` of
      its ``fuzzy_layer`` are read

    Returns:
    - average_ifspe: the index, or 0 when no attribute has two active rules
    """
    c = model.fuzzy_layer.c.detach().cpu().numpy()                            # (n_rules, input_dim)
    sigma = model.fuzzy_layer.sigma.detach().cpu().numpy()                    # (n_rules, input_dim)
    attribute_mask = model.fuzzy_layer.attribute_mask.detach().cpu().numpy()  # (n_rules, input_dim)
    n_rules, input_dim = c.shape

    total_ifspe = 0
    valid_terms = 0

    for attr in range(input_dim):
        # Rules that still use this attribute.
        active_rules = np.where(attribute_mask[:, attr] == 1)[0]
        if len(active_rules) < 2:
            continue  # a pair of rules is needed

        # Order the active fuzzy sets by centre so neighbours are adjacent.
        order = active_rules[np.argsort(c[active_rules, attr])]
        centers = c[order, attr]
        widths = sigma[order, attr]

        for v_l, v_lp1, s_l, s_lp1 in zip(centers[:-1], centers[1:], widths[:-1], widths[1:]):
            gap = v_lp1 - v_l

            phi = np.exp(-0.5 * (gap / (s_l + s_lp1)) ** 2)
            width_difference = s_l - s_lp1
            if width_difference == 0:
                # Equal widths: the second intersection does not exist.
                psi = 0
            else:
                psi = np.exp(-0.5 * (gap / width_difference) ** 2)

            # Absolute value keeps every term non-negative.
            total_ifspe += 2 * abs(0.5 - phi) + psi
            valid_terms += 1

    if valid_terms == 0:
        return 0  # nothing was evaluated

    average_ifspe = total_ifspe / (n_rules * input_dim)
    return average_ifspe

# Training routine for the SOFENN model
def train_attention_dynamic_attribute_and_rule_sofenn(
    X_train_np, y_train_np, X_val_np, y_val_np,
    initial_n_rules=3, epochs=1500, batch_size=32, lr=0.01,
    prune_frequency=190, prune_threshold=0.1,
    best_model_path='best_sofenn_model.pth'
):
    """
    Train an AttentionDynamicAttributeAndRuleSOFENN model with mini-batches,
    growing and pruning rules/attributes during training, and keep the state
    that reached the lowest validation loss.

    Inputs and targets are standardised with scalers fitted on the training
    data only; the validation data is transformed with the same scalers.

    Parameters:
    - X_train_np: training features, shape (num_samples, input_dim)
    - y_train_np: training targets, shape (num_samples,)
    - X_val_np: validation features, shape (num_val_samples, input_dim)
    - y_val_np: validation targets, shape (num_val_samples,)
    - initial_n_rules: initial number of rules, default 3
    - epochs: number of training epochs, default 1500
    - batch_size: samples per mini-batch, default 32
    - lr: learning rate, default 0.01
    - prune_frequency: attribute pruning is attempted every this many epochs
    - prune_threshold: attribute-pruning threshold (also passed to the model
      as ``attention_threshold``)
    - best_model_path: file the best state dict is written to,
      default 'best_sofenn_model.pth'

    Returns:
    - model: trained SOFENN model; the best validation state is loaded into
      it when that state still matches the final rule structure, otherwise
      the final weights are kept
    - scaler_X: fitted input scaler
    - scaler_y: fitted target scaler
    """
    # Standardise inputs and targets
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)  # reuse the training scaler
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    # Convert to PyTorch tensors
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    # Build the model
    input_dim = X_train_scaled_tensor.shape[1]
    model = AttentionDynamicAttributeAndRuleSOFENN(
        input_dim=input_dim,
        n_rules=initial_n_rules,
        attention_threshold=prune_threshold
    ).to(device)

    # Optimizer and learning-rate schedule
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # Per-epoch diagnostics (collected locally; not returned)
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'active_rules': [],
        'total_rules': [],
        'val_rmse': [],
        'attribute_weights': [],
        'rule_attention_weights': [],
        'pruned_attributes': [],
        'total_active_attributes': []
    }

    # Best validation loss seen so far and the matching state dict
    best_val_loss = float('inf')
    best_model_state = None

    # Rule growth settings
    patience = 25  # epochs without significant improvement before growing a rule
    grow_threshold = 0.0001  # minimum validation-loss decrease that counts as improvement
    no_improve_epochs = 0
    prev_val_loss = float('inf')

    max_rules = 3  # upper bound on the number of rules
    attention_threshold = 0.05  # rule-attention level below which a rule is inactive / pruned

    # Structural pruning is only performed during the first 95% of the epochs
    pruning_stop_epoch = int(epochs * 0.95)

    # Directory for result files
    os.makedirs('results_sofenn', exist_ok=True)

    for epoch in range(epochs):
        model.train()
        # Mini-batch training over a fresh random permutation
        permutation = torch.randperm(X_train_scaled_tensor.size()[0])
        epoch_loss = 0
        num_batches = 0
        for i in range(0, X_train_scaled_tensor.size()[0], batch_size):
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            loss_train, _ = model.train_step(
                batch_x,
                batch_y,
                optimizer,
                lambda_attention=1e-7,
                lambda_rule_attention=1e-8,
                lambda_diversity=1e-4
            )
            epoch_loss += loss_train
            num_batches += 1

        epoch_loss /= num_batches

        model.eval()

        with torch.no_grad():
            output, phi, attention, rule_attention = model.forward(X_val_scaled_tensor)
            loss_val = nn.functional.mse_loss(output, y_val_scaled_tensor)
            # Undo the target scaling to report RMSE in original units
            y_val_pred = scaler_y.inverse_transform(output.cpu().numpy().reshape(-1, 1)).flatten()
            y_val_true = scaler_y.inverse_transform(y_val_scaled_tensor.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

        # Advance the learning-rate schedule
        scheduler.step()

        # Count the rules whose attention is at or above the activity threshold
        rule_attention_np = torch.sigmoid(model.normalized_layer.rule_attention_weights).detach().cpu().numpy()
        num_active_rules = np.sum(rule_attention_np >= attention_threshold)

        # Record diagnostics
        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(loss_val.item())
        training_info['active_rules'].append(num_active_rules)
        training_info['total_rules'].append(model.n_rules)
        training_info['val_rmse'].append(val_rmse)

        # Attribute attention, averaged over rules
        attention_weights = torch.sigmoid(model.fuzzy_layer.attention_weights).detach().cpu().numpy()
        avg_attribute_weights = attention_weights.mean(axis=0)

        training_info['attribute_weights'].append(avg_attribute_weights)
        training_info['rule_attention_weights'].append(rule_attention_np)

        # Number of (rule, attribute) pairs that are still active
        attribute_mask = model.fuzzy_layer.attribute_mask.detach().cpu().numpy()
        total_active_attributes = np.sum(attribute_mask)
        training_info['total_active_attributes'].append(total_active_attributes)

        # Checkpoint on a new best validation loss
        if loss_val.item() < best_val_loss:
            best_val_loss = loss_val.item()
            best_model_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), best_model_path)
            print(f"Epoch {epoch+1}: New best validation loss: {loss_val.item():.4f}. Model saved.")

        # Periodic progress report
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {loss_val.item():.4f} - Val RMSE: {val_rmse:.4f} - Total Rules: {model.n_rules} - Active Rules: {num_active_rules}")

        # Track how long the validation loss has gone without a significant drop
        if loss_val.item() < prev_val_loss - grow_threshold:
            no_improve_epochs = 0
            prev_val_loss = loss_val.item()
        else:
            no_improve_epochs += 1

        # Grow a rule after `patience` epochs without significant improvement
        if no_improve_epochs >= patience and model.n_rules < max_rules:
            print(f"Epoch {epoch+1}: No significant improvement in validation loss, growing a new rule. Current rules: {model.n_rules}")
            # Initialise the new rule from the 10% of validation samples with the largest error
            residuals = (y_val_scaled_tensor.cpu().numpy() - output.cpu().numpy())
            high_error_indices = np.argsort(np.abs(residuals))[-int(0.1 * len(residuals)):]
            # Use the standardised features: the model only ever sees scaled
            # inputs, so the new rule must be placed in that space too.
            X_new_rule = X_val_scaled[high_error_indices]
            model.grow_rule(X_new_rule)
            print(f"Epoch {epoch+1}: Added new rule. Total rules: {model.n_rules}")
            no_improve_epochs = 0

            # The parameter set changed, so rebuild optimizer and scheduler
            optimizer = optim.AdamW(model.parameters(), lr=lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - epoch - 1)
            print("Optimizer and scheduler re-initialized after growing a new rule.")

        # Attribute pruning every `prune_frequency` epochs
        if (epoch + 1) % prune_frequency == 0 and epoch < pruning_stop_epoch:
            pruned_dict = model.prune_attributes_per_rule(
                threshold=prune_threshold,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01  # tolerated relative loss increase
            )
            training_info['pruned_attributes'].append(pruned_dict)
            if pruned_dict:
                print(f"Epoch {epoch+1}: Pruned attributes per rule: {pruned_dict}")
                optimizer = optim.AdamW(model.parameters(), lr=lr)
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after pruning rules.")

        # Rule pruning every 50 epochs
        if (epoch + 1) % 50 == 0 and epoch < pruning_stop_epoch:
            pruned = model.prune_rules(threshold=attention_threshold)
            if pruned:
                print(f"Epoch {epoch+1}: Pruned rules. Total rules: {model.n_rules}")
                optimizer = optim.AdamW(model.parameters(), lr=lr)
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after pruning rules.")

    # Restore the best state, but only if it still fits the model: rules may
    # have been grown or pruned after the snapshot, and load_state_dict would
    # then raise on mismatched shapes after copying the tensors that do match.
    if best_model_state is not None:
        current_state = model.state_dict()
        same_layout = (
            best_model_state.keys() == current_state.keys()
            and all(best_model_state[key].shape == current_state[key].shape
                    for key in current_state)
        )
        if same_layout:
            model.load_state_dict(best_model_state)
            print("Loaded the best model based on validation loss.")
        else:
            print("Best checkpoint does not match the final rule structure. Using the final model.")
    else:
        print("No improvement during training. Using the final model.")

    return model, scaler_X, scaler_y

# Run the SOFENN experiments with the extended model defined above
# Experiment settings
initial_n_rules = 3
learning_rate = 0.01
epochs = 1500
batch_size = 512
prune_frequency = 25
prune_threshold = 0.25
repeats = 5  # number of repetitions

# Result containers
results_sofenn = []
test_rmse_list = []
time_list = []
for repeat in range(repeats):
    start_time = time.time()

    # Carve a validation set out of the training data
    X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
        X_train_np, y_train_np, test_size=0.2, random_state=repeat
    )

    # Where the best checkpoint of this repetition is stored
    best_model_path = f'results_sofenn/best_sofenn_model_repeat{repeat+1}.pth'

    # Train the model. Validate on the held-out validation split, not on the
    # test set: the validation data drives checkpoint selection, rule growth
    # and pruning, so using test data there would leak it into the model.
    sofenn_model, scaler_X, scaler_y = train_attention_dynamic_attribute_and_rule_sofenn(
        X_train_sub, y_train_sub, X_val_sub, y_val_sub,
        initial_n_rules=initial_n_rules,
        epochs=epochs,
        batch_size=batch_size,
        lr=learning_rate,
        prune_frequency=prune_frequency,
        prune_threshold=prune_threshold,
        best_model_path=best_model_path
    )

    # Evaluate on the test set
    sofenn_model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_test_scaled = scaler_X.transform(X_test_np)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
    with torch.no_grad():
        y_pred_scaled = sofenn_model.infer(X_test_tensor)
        y_pred = scaler_y.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
        y_true = y_test_np  # compare against the unscaled targets
        test_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        test_rmse_list.append(test_rmse)
    # Record the elapsed time
    end_time = time.time()
    time_taken = end_time - start_time
    time_list.append(time_taken)
    print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")

    # Extract and print the fuzzy rules
    rules = sofenn_model.extract_rules(scaler_X, scaler_y, feature_names=features_to_use)
    print(f"\nFuzzy Rules for Repeat={repeat+1}:")
    for rule in rules:
        print(rule)
        print()

    # # Save the rules to a file
    # with open(f'results_sofenn/rules_repeat{repeat+1}.txt', 'w') as f:
    #     for rule in rules:
    #         f.write(rule + '\n')

    # Save the model
    torch.save(sofenn_model.state_dict(), f'results_sofenn/sofenn_model_repeat{repeat+1}.pth')

    # Interpretability indices: Average Overlap Index (Iov) and
    # Average Fuzzy Set Position Index (Ifspe)
    average_iov = compute_iov(sofenn_model)
    average_ifspe = compute_ifspe(sofenn_model)
    print(f"Repeat {repeat+1}: Average Overlap Index (Iov)={average_iov:.4f}, Average Fuzzy Set Position Index (Ifspe)={average_ifspe:.4f}")

    # Store the results of this repetition
    result = {
        'repeat': repeat + 1,
        'test_rmse': test_rmse,
        'time_taken': time_taken,
        'total_active_attributes': np.sum(sofenn_model.fuzzy_layer.attribute_mask.detach().cpu().numpy()),
        'average_iov': average_iov,
        'average_ifspe': average_ifspe
    }
    results_sofenn.append(result)

# Print the results of every repetition
for res in results_sofenn:
    print(f"Repeat {res['repeat']}: Test RMSE={res['test_rmse']:.4f}, Time={res['time_taken']:.2f}s, Total Active Attributes={res['total_active_attributes']}, Average Iov={res['average_iov']:.4f}, Average Ifspe={res['average_ifspe']:.4f}")

# Mean and standard deviation of RMSE and run time
test_rmse_mean = np.mean(test_rmse_list)
test_rmse_std = np.std(test_rmse_list)
time_mean = np.mean(time_list)
time_std = np.std(time_list)

# Mean and standard deviation of Iov and Ifspe
average_iov_list = [res['average_iov'] for res in results_sofenn]
average_ifspe_list = [res['average_ifspe'] for res in results_sofenn]
average_iov_mean = np.mean(average_iov_list)
average_iov_std = np.std(average_iov_list)
average_ifspe_mean = np.mean(average_ifspe_list)
average_ifspe_std = np.std(average_ifspe_list)

# Summary
print("\nResults:")
print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")
print(f"Average Overlap Index (Iov): {average_iov_mean:.4f} ± {average_iov_std:.4f}")
print(f"Average Fuzzy Set Position Index (Ifspe): {average_ifspe_mean:.4f} ± {average_ifspe_std:.4f}")

# Average number of active (rule, attribute) pairs
total_attributes_list = [res['total_active_attributes'] for res in results_sofenn]
average_total_attributes = np.mean(total_attributes_list)
print(f"\nAverage Total Active Attributes over {repeats} repeats: {average_total_attributes:.2f}")


## ADAR-SOFENN

In [None]:
# Location of the zipped YearPredictionMSD file on the UCI archive
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'

# The file has no header row: the first column is the target year,
# followed by 90 numbered feature columns.
col_names = ['Year', *(f'Feature_{i}' for i in range(1, 91))]

# Load the data set
data = pd.read_csv(data_url, names=col_names, header=None)

# Separate target and features
y = data['Year']
X = data.drop(columns='Year')

# Feature names for later use; both names refer to the same list
features_to_use = feature_labels = list(X.columns)

# Split into training and test sets
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=42
)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
import seaborn as sns
import os
import time
from scipy.stats import norm  # 新增

def reinitialize_optimizer(model, old_optimizer, lr=0.01):
    """
    Create a fresh AdamW optimizer for ``model`` and carry over the old
    optimizer's per-parameter state where possible.

    State is matched by parameter object: a parameter that is still part of
    ``model`` and has state in ``old_optimizer`` keeps a copy of that state.
    Parameters that were replaced by new ``nn.Parameter`` objects start with
    empty state.

    The previous implementation compared ``id(param)`` with the integer
    indices that ``optimizer.state_dict()`` uses, so nothing ever matched and
    the state was never transferred.

    Parameters:
    - model: model whose current parameters the new optimizer should manage
    - old_optimizer: previous optimizer instance, or None
    - lr: learning rate of the new optimizer, default 0.01

    Returns:
    - new_optimizer: the newly created AdamW optimizer
    """
    new_optimizer = optim.AdamW(model.parameters(), lr=lr)

    if old_optimizer is None:
        return new_optimizer

    migrated = 0
    for group in new_optimizer.param_groups:
        for param in group['params']:
            if not param.requires_grad or param not in old_optimizer.state:
                continue
            old_state = old_optimizer.state[param]
            if not old_state:
                continue
            # Clone tensors so the two optimizers never share buffers
            new_optimizer.state[param] = {
                key: value.clone() if torch.is_tensor(value) else value
                for key, value in old_state.items()
            }
            migrated += 1

    if migrated:
        print("Optimizer state has been partially loaded.")
    else:
        print("Warning: Could not fully load optimizer state. Continuing with new optimizer state.")

    return new_optimizer

class AttentionDynamicAttributeAndRuleANFIS(nn.Module):
    """
    ANFIS-style fuzzy regressor with per-rule attribute attention, per-rule
    rule attention, and run-time growth/pruning of rules and attributes.

    Learnable parameters (all with ``n_rules`` rows):
    - mu, sigma: Gaussian membership centres and widths, (n_rules, n_inputs)
    - attention_weights: attribute-attention logits, (n_rules, n_inputs)
    - rule_attention_weights: rule-attention logits, (n_rules,)
    - consequents: linear consequent coefficients, (n_rules, n_inputs)

    Buffer:
    - attribute_mask: 1 where an attribute is active in a rule, 0 where it
      has been pruned, (n_rules, n_inputs)
    """

    def __init__(self, n_inputs, n_rules, X_train, attention_threshold=0.1):
        """
        Parameters:
        - n_inputs: number of input attributes
        - n_rules: initial number of rules
        - X_train: training inputs used to place the initial rule centres
          with KMeans
        - attention_threshold: attribute-attention threshold stored on the
          instance
        """
        super(AttentionDynamicAttributeAndRuleANFIS, self).__init__()
        self.n_inputs = n_inputs
        self.attention_threshold = attention_threshold  # attribute-attention threshold
        self.n_rules = n_rules  # initial number of rules

        # Initialise the membership-function centres with KMeans clustering
        kmeans = KMeans(n_clusters=n_rules, random_state=42)
        kmeans.fit(X_train)
        cluster_centers = kmeans.cluster_centers_  # (n_rules, n_inputs)

        # Membership-function parameters
        self.mu = nn.Parameter(torch.tensor(cluster_centers, dtype=torch.float32))  # centres
        self.sigma = nn.Parameter(torch.ones(n_rules, n_inputs))  # widths

        # Attribute-attention logits
        self.attention_weights = nn.Parameter(torch.randn(n_rules, n_inputs))

        # Rule-attention logits
        self.rule_attention_weights = nn.Parameter(torch.ones(n_rules))

        # Consequent coefficients (regression)
        self.consequents = nn.Parameter(torch.randn(n_rules, n_inputs))

        # Per-rule attribute mask (1 = active, 0 = pruned)
        self.register_buffer('attribute_mask', torch.ones(n_rules, n_inputs))

    def forward(self, x):
        """
        Run the fuzzy inference.

        Parameters:
        - x: inputs, shape (batch_size, n_inputs)

        Returns:
        - output: predictions, shape (batch_size,)
        - firing_strength: un-normalised rule activations before rule
          attention is applied, shape (batch_size, n_rules)
        - attention: masked attribute attention, shape (n_rules, n_inputs)
        - rule_attention: rule attention, shape (n_rules,)
        """
        # Push the logits of pruned attributes to a large negative value
        masked_attention_weights = self.attention_weights.clone()
        masked_attention_weights[self.attribute_mask == 0] = -1e6

        # Attribute attention (sigmoid), zeroed where the attribute is pruned
        attention = torch.sigmoid(masked_attention_weights) * self.attribute_mask  # (n_rules, n_inputs)

        # Rule attention (sigmoid)
        rule_attention = torch.sigmoid(self.rule_attention_weights)  # (n_rules,)

        # Add broadcast dimensions
        x_expanded = x.unsqueeze(1)  # (batch_size, 1, n_inputs)
        mu_expanded = self.mu.unsqueeze(0)  # (1, n_rules, n_inputs)
        sigma_expanded = self.sigma.unsqueeze(0)  # (1, n_rules, n_inputs)

        # Keep sigma away from zero to avoid division by zero
        sigma_expanded = torch.clamp(sigma_expanded, min=1e-3)

        # Log of the Gaussian membership degree
        log_gauss = -0.5 * ((x_expanded - mu_expanded) ** 2) / (sigma_expanded ** 2)

        # Weight each attribute's contribution by its attention
        log_gauss_weighted = log_gauss * attention.unsqueeze(0)  # (batch_size, n_rules, n_inputs)

        # Sum over the attributes
        sum_log_gauss = log_gauss_weighted.sum(dim=2)  # (batch_size, n_rules)

        # Rule activation
        firing_strength = torch.exp(sum_log_gauss)  # (batch_size, n_rules)

        # Scale the activation by the rule attention
        firing_strength_weighted = firing_strength * rule_attention.unsqueeze(0)  # (batch_size, n_rules)

        # Normalise the activations over the rules
        sum_firing_strength = firing_strength_weighted.sum(dim=1, keepdim=True) + 1e-8
        norm_firing_strength = firing_strength_weighted / sum_firing_strength  # (batch_size, n_rules)

        # Consequent coefficients, scaled by attribute attention
        consequents_weighted = self.consequents * attention  # (n_rules, n_inputs)
        consequents_weighted_expanded = consequents_weighted.unsqueeze(0)  # (1, n_rules, n_inputs)

        # Per-rule output: linear combination of the inputs
        rule_outputs = torch.sum(consequents_weighted_expanded * x_expanded, dim=2)  # (batch_size, n_rules)

        # Final output: activation-weighted sum of the rule outputs
        output = torch.sum(norm_firing_strength * rule_outputs, dim=1)  # (batch_size,)

        return output, firing_strength, attention, rule_attention

    def train_step(self, x, target, optimizer, lambda_attention=1e-7, lambda_rule_attention=1e-8, lambda_diversity=1e-4):
        """
        Perform one optimisation step.

        Parameters:
        - x: inputs, shape (batch_size, n_inputs)
        - target: targets, shape (batch_size,)
        - optimizer: optimizer instance
        - lambda_attention: coefficient of the attribute-attention L1 penalty
        - lambda_rule_attention: coefficient of the rule-attention L1 penalty
        - lambda_diversity: coefficient of the diversity penalty

        Returns:
        - loss.item(): total loss of this batch
        - output: model output for this batch

        If the loss is NaN, a message is printed and the method returns
        without updating the parameters.
        """
        self.train()
        optimizer.zero_grad()
        output, _, attention, rule_attention = self.forward(x)
        # Prediction loss (mean squared error)
        loss_pred = F.mse_loss(output, target)

        # L1 penalty on the attribute attention
        loss_attention = lambda_attention * attention.abs().sum()

        # L1 penalty on the rule attention
        loss_rule_attention = lambda_rule_attention * rule_attention.abs().sum()

        # Diversity penalty: encourage different rules to attend differently
        if self.n_rules > 1:
            # Cosine-similarity matrix of the rules' attention vectors
            attention_norm = attention / (attention.norm(dim=1, keepdim=True) + 1e-8)
            similarity_matrix = torch.matmul(attention_norm, attention_norm.t())
            # Mean of the off-diagonal similarities
            diversity_loss = torch.sum(similarity_matrix) - torch.diag(similarity_matrix).sum()
            diversity_loss = diversity_loss / (self.n_rules * (self.n_rules - 1))
        else:
            diversity_loss = torch.tensor(0.0).to(attention.device)

        loss_diversity = lambda_diversity * diversity_loss

        # Total loss
        loss = loss_pred + loss_attention + loss_rule_attention + loss_diversity

        if torch.isnan(loss):
            print("Loss is NaN. Stopping training.")
            return loss.item(), output

        loss.backward()

        # Gradient clipping could be applied here if needed:
        # torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)

        optimizer.step()

        return loss.item(), output

    def prune_attributes_per_rule(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01):
        """
        Mask out, per rule, the attributes whose attention is below
        ``threshold``. The pruning is reverted if the validation loss rises
        by more than the tolerance; otherwise rule pruning is run afterwards.

        Parameters:
        - threshold: attribute-attention pruning threshold, default 0.1
        - X_val: validation inputs, shape (num_val_samples, n_inputs)
        - y_val: validation targets, shape (num_val_samples,)
        - performance_drop_tolerance: tolerated relative increase of the
          validation loss, default 0.01 (i.e. 1%)

        Returns:
        - pruned_dict: maps rule index to the list of pruned attribute
          indices; empty if nothing was pruned or the pruning was reverted

        Raises:
        - ValueError: if X_val or y_val is missing
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")

        # Snapshot the state and the validation loss before pruning
        original_state = copy.deepcopy(self.state_dict())
        self.eval()
        with torch.no_grad():
            output_before = self.infer(X_val)
            loss_before = F.mse_loss(output_before, y_val)

        # Prune
        pruned_dict = {}
        with torch.no_grad():
            attention = torch.sigmoid(self.attention_weights)  # (n_rules, n_inputs)

            for rule_idx in range(self.n_rules):
                if torch.all(self.attribute_mask[rule_idx] == 0):
                    continue  # rule already has no active attribute

                prune_indices = torch.where((attention[rule_idx] < threshold) & (self.attribute_mask[rule_idx] == 1))[0].tolist()

                if prune_indices:
                    # Masking is what disables the attribute: forward()
                    # multiplies by the mask, so the pruned entries no longer
                    # influence the output. (Setting requires_grad on an
                    # indexed slice, as done previously, only touched a
                    # temporary copy and had no effect.)
                    self.attribute_mask[rule_idx, prune_indices] = 0.0

                    pruned_dict[rule_idx] = prune_indices

        # Rules that lost all of their attributes
        with torch.no_grad():
            all_pruned_rules = torch.where(self.attribute_mask.sum(dim=1) == 0)[0].tolist()

        if all_pruned_rules:
            print(f"Rules with all attributes pruned: {all_pruned_rules}")
            # Give them a very low rule-attention logit so prune_rules removes them
            self.rule_attention_weights.data[all_pruned_rules] = -1e6

        # Validation loss after pruning
        self.eval()
        with torch.no_grad():
            output_after = self.infer(X_val)
            loss_after = F.mse_loss(output_after, y_val)

        # Relative change of the validation loss
        performance_drop = (loss_after - loss_before) / loss_before

        if performance_drop > performance_drop_tolerance:
            # Too much degradation: restore the snapshot
            self.load_state_dict(original_state)
            print(f"Pruning was reverted due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            pruned_dict = {}  # nothing remains pruned
        else:
            # Accept the pruning and remove rules that became negligible
            pruned = self.prune_rules(threshold=0.005)
            if pruned:
                print("Pruned rules after attribute pruning.")

        return pruned_dict

    def prune_rules(self, threshold=0.1):
        """
        Remove the rules whose rule attention is below ``threshold`` or that
        have no active attribute left. The parameters are rebuilt as new
        ``nn.Parameter`` objects containing only the kept rows.

        Parameters:
        - threshold: rule-attention pruning threshold, default 0.1

        Returns:
        - pruned: True if at least one rule was removed
        """
        pruned = False
        with torch.no_grad():
            rule_attention = torch.sigmoid(self.rule_attention_weights)
            # Rules with too little attention
            low_attention_indices = torch.where(rule_attention < threshold)[0]

            # Rules whose attributes have all been pruned
            no_active_attrs_indices = torch.where(self.attribute_mask.sum(dim=1) == 0)[0]

            # Union of both sets
            prune_indices = torch.cat([low_attention_indices, no_active_attrs_indices])
            prune_indices = torch.unique(prune_indices)

            if len(prune_indices) == 0:
                # Nothing to remove
                return pruned

            # Rules to keep: enough attention AND at least one active attribute
            keep_indices = torch.where((rule_attention >= threshold) & (self.attribute_mask.sum(dim=1) > 0))[0]

            # Rebuild the parameters from the kept rows
            self.mu = nn.Parameter(self.mu.data[keep_indices])
            self.sigma = nn.Parameter(self.sigma.data[keep_indices])
            self.attention_weights = nn.Parameter(self.attention_weights.data[keep_indices])
            self.rule_attention_weights = nn.Parameter(self.rule_attention_weights.data[keep_indices])
            self.consequents = nn.Parameter(self.consequents.data[keep_indices])

            # Drop the mask rows of the removed rules
            self.attribute_mask = self.attribute_mask.data[keep_indices].clone()

            # Update the rule count
            self.n_rules = len(keep_indices)

            pruned = True

        # The mask must stay aligned with the rule count
        assert self.attribute_mask.shape[0] == self.n_rules, \
            f"After pruning, attribute_mask has shape {self.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"After pruning, n_rules: {self.n_rules}, attribute_mask shape: {self.attribute_mask.shape}")

        return pruned

    def grow_rule(self, X_new):
        """
        Append one new rule.

        The new centre and width are the per-attribute mean and standard
        deviation of ``X_new``. The new rule's attribute attention starts
        near the mean attention of the existing rules, and its rule-attention
        logit near the mean existing logit. All attributes of the new rule
        are active.

        Parameters:
        - X_new: samples used to initialise the rule, shape
          (num_samples, n_inputs); they must be in the same (scaled) space
          as the inputs passed to ``forward``
        """
        device = self.mu.device
        dtype = self.mu.dtype

        # Centre and width of the new rule
        new_mu = torch.tensor(X_new.mean(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, n_inputs)
        new_sigma = torch.tensor(X_new.std(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, n_inputs)

        # Mean attribute attention of the existing rules
        if self.n_rules > 0:
            existing_attention_weights = torch.sigmoid(self.attention_weights).data  # (n_rules, n_inputs)
            attention_mean = existing_attention_weights.mean(dim=0, keepdim=True)  # (1, n_inputs)
        else:
            attention_mean = torch.ones(1, self.n_inputs, dtype=dtype).to(device)

        # Perturb the mean attention slightly for diversity, then convert it
        # to a logit: attention_weights holds logits (forward applies the
        # sigmoid), so storing the probability itself would start the new
        # rule at sigmoid(p) instead of p.
        noise = torch.randn_like(attention_mean) * 0.05
        eps = 1e-4  # keeps the logit finite at 0 and 1
        new_attention = (attention_mean + noise).clamp(eps, 1 - eps)
        new_attention_weights = (torch.log(new_attention) - torch.log1p(-new_attention)).detach()

        # Rule-attention logit: mean of the existing logits plus a small perturbation
        if self.n_rules > 0:
            existing_rule_attention_logits = self.rule_attention_weights.data  # (n_rules,)
            rule_attention_mean_logit = existing_rule_attention_logits.mean().unsqueeze(0)  # (1,)
            rule_attention_noise = torch.randn_like(rule_attention_mean_logit) * 0.05
            new_rule_attention_weight = (rule_attention_mean_logit + rule_attention_noise).detach()
        else:
            rule_attention_mean_logit = torch.tensor([0.0], dtype=dtype).to(device)  # neutral logit
            new_rule_attention_weight = rule_attention_mean_logit.clone().detach()  # (1,)

        # Small random consequent coefficients
        new_consequents = torch.randn(1, self.n_inputs, dtype=dtype).to(device) * 0.01  # (1, n_inputs)

        # Append the new row to every parameter
        self.mu = nn.Parameter(torch.cat([self.mu.data, new_mu], dim=0))
        self.sigma = nn.Parameter(torch.cat([self.sigma.data, new_sigma], dim=0))
        self.attention_weights = nn.Parameter(torch.cat([self.attention_weights.data, new_attention_weights], dim=0))
        self.rule_attention_weights = nn.Parameter(torch.cat([self.rule_attention_weights.data, new_rule_attention_weight], dim=0))
        self.consequents = nn.Parameter(torch.cat([self.consequents.data, new_consequents], dim=0))

        # Append an all-active mask row for the new rule
        new_attribute_mask = torch.ones(1, self.n_inputs, dtype=self.attribute_mask.dtype).to(device)
        self.attribute_mask = torch.cat([self.attribute_mask, new_attribute_mask], dim=0)

        # Update the rule count
        self.n_rules += 1

        # The mask must stay aligned with the rule count
        assert self.attribute_mask.shape[0] == self.n_rules, \
            f"After growing, attribute_mask has shape {self.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"New rule added. Total rules: {self.n_rules}")

    def infer(self, x, targets=None):
        """
        Run inference without gradient tracking; switches the module to
        evaluation mode.

        Parameters:
        - x: inputs, shape (batch_size, n_inputs)
        - targets: optional targets, shape (batch_size,)

        Returns:
        - the model output if ``targets`` is None
        - otherwise a tuple (output, MSE loss as a float)
        """
        with torch.no_grad():
            self.eval()
            output, _, _, _ = self.forward(x)
            if targets is None:
                return output
            else:
                loss = F.mse_loss(output, targets)
                return output, loss.item()

    def save_model(self, path):
        """Write the state dict to ``path``."""
        torch.save(self.state_dict(), path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        """Load a state dict from ``path`` into this model."""
        self.load_state_dict(torch.load(path))
        print(f"Model loaded from {path}")

    def plot_membership_functions(self, feature_names=None):
        """
        Plot the Gaussian membership functions of every rule, one figure per
        rule, over the input range [-3, 3]. Pruned attributes are skipped.

        Parameters:
        - feature_names: list of feature names; defaults to 'Input 1', ...
        """
        mus = self.mu.detach().cpu().numpy()
        sigmas = self.sigma.detach().cpu().numpy()
        attentions = torch.sigmoid(self.attention_weights).detach().cpu().numpy()
        rule_attentions = torch.sigmoid(self.rule_attention_weights).detach().cpu().numpy()
        xn = np.linspace(-3, 3, 1000)

        n_inputs = self.n_inputs
        if feature_names is None:
            feature_names = [f'Input {i+1}' for i in range(n_inputs)]

        for r in range(self.n_rules):
            rule_attention_value = rule_attentions[r]
            plt.figure(figsize=(10, 6))
            plt.title(f"Rule {r + 1}, Rule Attention: {rule_attention_value:.4f}")
            for j in range(n_inputs):
                # Only active attributes are drawn
                if self.attribute_mask[r, j] == 0:
                    continue
                attention_value = attentions[r, j]
                # Membership curve, labelled with the attribute's attention
                membership = np.exp(-0.5 * ((xn - mus[r, j]) ** 2) / (sigmas[r, j] ** 2 + 1e-8))
                plt.plot(xn, membership, label=f"{feature_names[j]} (Attn: {attention_value:.4f})")
            plt.legend()
            plt.xlabel('Input')
            plt.ylabel('Membership degree')
            plt.grid(True)
            plt.show()

def extract_fuzzy_rules(anfis_model, scaler_X, feature_names=None):
    """
    Render the fuzzy rules of an ANFIS model as human-readable strings.

    Membership centres and widths are mapped back to the original feature
    scale with ``scaler_X.scale_`` / ``scaler_X.mean_``. Attributes whose
    ``attribute_mask`` entry is 0 are left out of both the antecedent and the
    consequent. Attention values are reported after a sigmoid.

    Parameters:
    - anfis_model: model exposing mu, sigma, attention_weights,
      rule_attention_weights, consequents and attribute_mask tensors
    - scaler_X: fitted input scaler providing ``scale_`` and ``mean_``
    - feature_names: list of feature names; defaults to 'Input 1', 'Input 2', ...

    Returns:
    - rules: list with one rule string per rule
    """
    def _as_numpy(tensor):
        return tensor.detach().cpu().numpy()

    std_centers = _as_numpy(anfis_model.mu)  # (n_rules, n_inputs)
    std_widths = _as_numpy(anfis_model.sigma)  # (n_rules, n_inputs)
    attr_attention = _as_numpy(torch.sigmoid(anfis_model.attention_weights))
    rule_attention = _as_numpy(torch.sigmoid(anfis_model.rule_attention_weights))
    coefficients = _as_numpy(anfis_model.consequents)

    # Undo the standardization of the membership parameters.
    centers = std_centers * scaler_X.scale_ + scaler_X.mean_
    widths = std_widths * scaler_X.scale_

    mask = _as_numpy(anfis_model.attribute_mask)

    rule_count, input_count = std_centers.shape
    names = feature_names
    if names is None:
        names = [f'Input {k+1}' for k in range(input_count)]

    rules = []
    for r in range(rule_count):
        # Indices of the attributes that survived pruning for this rule.
        kept = [c for c in range(input_count) if mask[r, c] != 0]

        premises = [
            f"[{names[c]} (Attn: {attr_attention[r, c]:.4f}) "
            f"is Gaussian(c={centers[r, c]:.4f}, σ={widths[r, c]:.4f})]"
            for c in kept
        ]
        terms = [
            f"({coefficients[r, c]:.4f} * {names[c]} "
            f"(Attn: {attr_attention[r, c]:.4f}))"
            for c in kept
        ]

        header = f"Rule {r+1} (Rule Attention: {rule_attention[r]:.4f}): IF "
        if_part = " AND ".join(premises) if premises else "True"
        then_part = " + ".join(terms) if terms else "0"
        rules.append(header + if_part + " THEN Output = " + then_part)

    return rules

def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """
    Closed-form overlap measure between two Gaussian membership functions.

    The value is ``2 * Phi(-|c1 - c2| / sqrt(sigma1**2 + sigma2**2))`` with
    ``Phi`` the standard normal CDF, so it equals 1 for coinciding centres and
    decreases as the centres move apart.

    Parameters:
    - c1, sigma1: centre and standard deviation of the first Gaussian
    - c2, sigma2: centre and standard deviation of the second Gaussian

    Returns:
    - the overlap value, or 0 when both standard deviations are zero
    """
    pooled_spread = np.sqrt(sigma1**2 + sigma2**2)
    if pooled_spread != 0:
        separation = np.abs(c1 - c2) / pooled_spread
        return 2 * norm.cdf(-separation)
    # Degenerate case: both widths are zero, the ratio is undefined.
    return 0

def compute_iov(model):
    """
    Compute the Average Overlap Index (Iov) of an ANFIS model.

    For every input attribute, the largest pairwise overlap (as given by
    ``compute_overlap_analytic``) among the rules whose ``attribute_mask``
    entry equals 1 is taken; the index is the mean of these maxima over the
    attributes that have at least two such rules.

    Parameters:
    - model: ANFIS model exposing mu, sigma and attribute_mask tensors

    Returns:
    - average_iov: the mean maximum overlap, or 0 if no attribute qualifies
    """
    centers = model.mu.detach().cpu().numpy()  # (n_rules, n_inputs)
    widths = model.sigma.detach().cpu().numpy()  # (n_rules, n_inputs)
    mask = model.attribute_mask.detach().cpu().numpy()  # (n_rules, n_inputs)
    _, input_count = centers.shape

    overlap_sum = 0
    counted = 0

    for col in range(input_count):
        live_rules = np.where(mask[:, col] == 1)[0]
        # A pairwise overlap needs at least two active rules.
        if len(live_rules) >= 2:
            peak = -np.inf
            for pos, first in enumerate(live_rules):
                for second in live_rules[pos + 1:]:
                    candidate = compute_overlap_analytic(
                        centers[first, col], widths[first, col],
                        centers[second, col], widths[second, col],
                    )
                    if candidate > peak:
                        peak = candidate
            if peak != -np.inf:
                overlap_sum += peak
                counted += 1

    if counted == 0:
        return 0  # nothing to average

    return overlap_sum / counted

def compute_ifspe(model):
    """
    Compute the Average Fuzzy Set Position Index (Ifspe) of an ANFIS model.

    For every input attribute, the Gaussian membership functions of the rules
    whose ``attribute_mask`` entry equals 1 are sorted by centre. For each
    adjacent pair with centres ``v_l <= v_{l+1}`` and widths ``s_l, s_{l+1}``:

        phi = exp(-0.5 * ((v_{l+1} - v_l) / (s_l + s_{l+1}))**2)
        psi = exp(-0.5 * ((v_{l+1} - v_l) / (s_l - s_{l+1}))**2)

    ``phi`` is the height at which the two curves cross between their centres
    and ``psi`` the height of their second crossing (0 when both widths are
    equal, because there is no second crossing). The pair contributes
    ``2 * |0.5 - phi| + psi``.

    Both heights depend on the *distance* between the centres. The previous
    implementation used their sum, which made the index change when the input
    axis was merely shifted.

    Parameters:
    - model: ANFIS model exposing mu, sigma and attribute_mask tensors

    Returns:
    - average_ifspe: the summed pair terms divided by n_inputs * n_rules
      (non-negative), or 0 if no attribute has two active rules
    """
    mus = model.mu.detach().cpu().numpy()  # (n_rules, n_inputs)
    # Use |sigma| as the width: the membership curves plotted elsewhere in this
    # file depend on sigma only through sigma**2, so the sign carries no meaning.
    sigmas = np.abs(model.sigma.detach().cpu().numpy())  # (n_rules, n_inputs)
    attribute_mask = model.attribute_mask.detach().cpu().numpy()  # (n_rules, n_inputs)
    n_rules, n_inputs = mus.shape

    total_ifspe = 0
    valid_terms = 0

    for j in range(n_inputs):
        # Rules for which this attribute is still active.
        active_rules = np.where(attribute_mask[:, j] == 1)[0]
        if len(active_rules) < 2:
            continue  # at least two fuzzy sets are needed to form a pair

        # Order the active fuzzy sets by centre.
        order = active_rules[np.argsort(mus[active_rules, j])]
        centers = mus[order, j]
        widths = sigmas[order, j]

        # Walk over adjacent pairs of the sorted fuzzy sets.
        for v_l, v_lp1, s_l, s_lp1 in zip(centers[:-1], centers[1:],
                                          widths[:-1], widths[1:]):
            gap = v_lp1 - v_l
            width_sum = s_l + s_lp1
            width_diff = s_l - s_lp1

            if width_sum == 0:
                # Two zero-width spikes: they coincide or never meet.
                phi = 1.0 if gap == 0 else 0.0
            else:
                phi = np.exp(-0.5 * (gap / width_sum) ** 2)

            if width_diff == 0:
                psi = 0
            else:
                psi = np.exp(-0.5 * (gap / width_diff) ** 2)

            # abs() keeps every term non-negative.
            total_ifspe += 2 * abs(0.5 - phi) + psi
            valid_terms += 1

    if valid_terms == 0:
        return 0  # nothing to average

    # Normalization factor is L * D (number of inputs times number of rules).
    average_ifspe = total_ifspe / (n_inputs * n_rules)
    return average_ifspe


def plot_attribute_weights(attribute_weights, feature_names):
    """
    Show a bar chart with one bar per feature.

    Parameters:
    - attribute_weights: bar heights, one value per feature
    - feature_names: tick labels, in the same order as the weights
    """
    plt.figure(figsize=(10, 6))
    positions = np.arange(len(feature_names))
    plt.bar(positions, attribute_weights)
    plt.xticks(positions, feature_names, rotation=45)
    plt.title('Average Attribute Weights over Repeats')
    plt.xlabel('Attributes')
    plt.ylabel('Average Attribute Weights')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_heatmap(anfis_model, feature_names):
    """
    Draw a heatmap of the attribute attention weights (rules x features).

    Cells of pruned attributes (``attribute_mask`` == 0) have value 0 and are
    annotated with an 'X'; all other cells show their sigmoid attention with
    two decimals.

    Parameters:
    - anfis_model: ANFIS model exposing attention_weights, attribute_mask and n_rules
    - feature_names: list of feature names used as x tick labels
    """
    mask = anfis_model.attribute_mask
    mask_np = mask.detach().cpu().numpy()  # (n_rules, n_inputs)

    # Push the logits of pruned attributes to a large negative value, then
    # zero them again via the mask after the sigmoid.
    masked_logits = anfis_model.attention_weights.masked_fill(mask == 0, -1e6)
    attention_np = (torch.sigmoid(masked_logits) * mask).detach().cpu().numpy()

    row_count, col_count = attention_np.shape[0], attention_np.shape[1]
    annotations = [
        [
            "0.00\nX" if mask_np[r, c] == 0 else f"{attention_np[r, c]:.2f}"
            for c in range(col_count)
        ]
        for r in range(row_count)
    ]

    rule_labels = [f'Rule {k+1}' for k in range(anfis_model.n_rules)]

    plt.figure(figsize=(12, 6))
    sns.heatmap(
        attention_np,
        annot=annotations,
        fmt='',
        cmap='viridis',
        xticklabels=feature_names,
        yticklabels=rule_labels,
        cbar_kws={'label': 'Attention Weight'}
    )
    plt.title(f'属性注意力权重 (被剪枝的属性标记为 X)')
    plt.xlabel('输入特征')
    plt.ylabel('规则')
    plt.tight_layout()
    plt.show()

def train_attention_dynamic_attribute_and_rule_anfis(
    X_train_np, y_train_np, X_val_np, y_val_np,
    initial_n_rules=3, epochs=1500, batch_size=32, lr=0.01,
    prune_frequency=190, prune_threshold=0.0001,
    best_model_path='best_model.pth'
):
    """
    Train an AttentionDynamicAttributeAndRuleANFIS model.

    Inputs and targets are standardized with scalers fitted on the training
    split. During training the state with the lowest validation loss is kept
    in memory and written to ``best_model_path``. Rules are grown when the
    validation loss stalls, attributes are pruned every ``prune_frequency``
    epochs and rules every 50 epochs, both only during the first 80% of the
    epochs. After training, the best state is loaded back, diagnostic plots
    are shown and the extracted rules are printed.

    Parameters:
    - X_train_np: training features, shape (num_samples, n_inputs)
    - y_train_np: training targets, shape (num_samples,)
    - X_val_np: validation features, shape (num_val_samples, n_inputs)
    - y_val_np: validation targets, shape (num_val_samples,)
    - initial_n_rules: initial number of rules, default 3
    - epochs: number of training epochs, default 1500
    - batch_size: number of samples per mini-batch, default 32
    - lr: learning rate, default 0.01
    - prune_frequency: attribute pruning interval (in epochs)
    - prune_threshold: threshold used for attribute pruning; also passed to the
      model as ``attention_threshold``
    - best_model_path: file path where the best model is saved,
      default 'best_model.pth'

    Returns:
    - anfis_model: the trained ANFIS model (with the best state loaded, if any)
    - scaler_X: the scaler fitted on the input data
    - scaler_y: the scaler fitted on the output data
    - total_active_attributes: sum of the attribute mask of the returned model
    """
    # Create the directory where results are saved
    os.makedirs('results', exist_ok=True)

    # Standardize inputs and outputs
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)  # reuse the scaler fitted on the training data
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()  # standardize the targets and flatten
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    # Convert to PyTorch tensors
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    # Initialize the model
    n_inputs = X_train_scaled_tensor.shape[1]

    anfis_model = AttentionDynamicAttributeAndRuleANFIS(
        n_inputs=n_inputs,
        n_rules=initial_n_rules,
        X_train=X_train_scaled,
        attention_threshold=prune_threshold
    ).to(device)

    # Initialize the optimizer and the scheduler
    optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
    # Cosine-annealing learning-rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    # Attach the optimizer to the model
    anfis_model.optimizer = optimizer

    # Per-epoch training history
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'active_rules': [],
        'total_rules': [],
        'val_rmse': [],
        'attribute_weights': [],
        'rule_attention_weights': [],
        'pruned_attributes': [],
        'total_active_attributes': []  # newly added
    }

    # Track the best validation loss and the corresponding model state
    best_val_loss = float('inf')
    best_model_state = None

    # Parameters controlling rule growth and pruning
    patience = 25  # number of stalled epochs before a rule is grown
    grow_threshold = 0.0001  # minimum validation-loss decrease that counts as an improvement
    no_improve_epochs = 0
    prev_val_loss = float('inf')

    max_rules = 9  # upper bound on the number of rules, prevents unbounded growth
    attention_threshold_final = 0.05  # rule-attention threshold for counting a rule as active

    # Feature names (adjust to your dataset)
    # NOTE(review): generic names are used here regardless of any module-level
    # feature_labels, so the plots/rules printed by this function show 'Input i'.
    feature_labels = [f'Input {i+1}' for i in range(n_inputs)]

    # Total active attribute count per epoch (filled below, not read in this function)
    total_attributes_list = []

    # Train the model
    for epoch in tqdm(range(epochs), desc="Training"):
        anfis_model.train()
        # Mini-batch training over a fresh random permutation
        permutation = torch.randperm(X_train_scaled_tensor.size()[0])
        epoch_loss = 0
        num_batches = 0
        for i in range(0, X_train_scaled_tensor.size()[0], batch_size):
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            loss_train, _ = anfis_model.train_step(
                batch_x,
                batch_y,
                optimizer,
                lambda_attention=1e-7,
                lambda_rule_attention=1e-8,
                lambda_diversity=1e-4
            )
            epoch_loss += loss_train
            num_batches += 1

        epoch_loss /= num_batches

        anfis_model.eval()

        with torch.no_grad():
            output, firing_strength, attention, rule_attention = anfis_model.forward(X_val_scaled_tensor)
            loss_val = F.mse_loss(output, y_val_scaled_tensor)
            # Map predictions and targets back to the original scale
            y_val_pred = scaler_y.inverse_transform(output.cpu().numpy().reshape(-1, 1)).flatten()
            y_val_true = scaler_y.inverse_transform(y_val_scaled_tensor.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

        # Adjust the learning rate
        scheduler.step()

        # Count the currently active rules
        rule_attention_np = torch.sigmoid(anfis_model.rule_attention_weights).detach().cpu().numpy()
        num_active_rules = np.sum(rule_attention_np >= attention_threshold_final)

        # Total number of active attributes (for the current epoch only)
        attribute_mask = anfis_model.attribute_mask.detach().cpu().numpy()
        num_active_attributes_per_rule = np.sum(attribute_mask, axis=1)  # active attributes per rule
        total_active_attributes = np.sum(num_active_attributes_per_rule)  # active attributes over all rules

        # Record the training history
        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(loss_val.item())
        training_info['active_rules'].append(num_active_rules)
        training_info['total_rules'].append(anfis_model.n_rules)
        training_info['val_rmse'].append(val_rmse)
        training_info['total_active_attributes'].append(total_active_attributes)  # newly added

        # Extract the attention weights
        attention_weights = torch.sigmoid(anfis_model.attention_weights).detach().cpu().numpy()
        rule_attention_weights = torch.sigmoid(anfis_model.rule_attention_weights).detach().cpu().numpy()

        # Average attribute weights
        avg_attribute_weights = attention_weights.mean(axis=0)  # Average over rules

        # Record the attention weights
        training_info['attribute_weights'].append(avg_attribute_weights)
        training_info['rule_attention_weights'].append(rule_attention_weights)

        # Record the total number of active attributes
        total_attributes_list.append(total_active_attributes)

        # Check for best validation loss
        if loss_val.item() < best_val_loss:
            best_val_loss = loss_val.item()
            best_model_state = copy.deepcopy(anfis_model.state_dict())
            torch.save(anfis_model.state_dict(), best_model_path)
            print(f"Epoch {epoch+1}: New best validation loss: {loss_val.item():.4f}. Model saved.")

        # Periodic progress report (first epoch and every 10th epoch)
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {loss_val.item():.4f} - Val RMSE: {val_rmse:.4f} - Total Rules: {anfis_model.n_rules} - Active Rules: {num_active_rules}")

        # Check whether the validation loss improved
        if loss_val.item() < prev_val_loss - grow_threshold:
            no_improve_epochs = 0
            prev_val_loss = loss_val.item()
        else:
            no_improve_epochs += 1

        # Grow a rule when the validation loss has not improved significantly for `patience` epochs
        if no_improve_epochs >= patience and anfis_model.n_rules < max_rules:
            print(f"Epoch {epoch+1}: No significant improvement in validation loss, growing a new rule. Current rules: {anfis_model.n_rules}")
            # Pick the validation points with the largest errors to initialize the new rule
            residuals = (y_val_scaled_tensor.cpu().numpy() - output.cpu().numpy())
            high_error_indices = np.argsort(np.abs(residuals))[-int(0.1 * len(residuals)):]  # the 10% of samples with the largest error
            X_new_rule = X_val_scaled[high_error_indices]
            # Add the new rule
            anfis_model.grow_rule(X_new_rule)
            print(f"Epoch {epoch+1}: Added new rule. Total rules: {anfis_model.n_rules}")
            no_improve_epochs = 0  # reset the counter

            # Re-create the optimizer and the scheduler
            optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - epoch - 1)
            # Attach the optimizer to the model
            anfis_model.optimizer = optimizer
            print("Optimizer and scheduler re-initialized after growing a new rule.")

        # Epoch after which pruning stops
        pruning_stop_epoch = int(epochs * 0.8)  # prune only during the first 80% of training

        # Prune attributes every prune_frequency epochs
        if (epoch + 1) % prune_frequency == 0 and epoch < pruning_stop_epoch:
            pruned_dict = anfis_model.prune_attributes_per_rule(
                threshold=prune_threshold,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01  # tolerated performance drop, adjust as needed
            )
            training_info['pruned_attributes'].append(pruned_dict)
            if pruned_dict:
                print(f"Epoch {epoch+1}: Pruned attributes per rule: {pruned_dict}")
                # Re-create the optimizer and the scheduler
                optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - epoch - 1)
                # Attach the optimizer to the model
                anfis_model.optimizer = optimizer
                print("Optimizer and scheduler re-initialized after pruning attributes.")
        elif epoch >= pruning_stop_epoch and (epoch + 1) % prune_frequency == 0:
            print(f"Epoch {epoch+1}: Pruning has been stopped to stabilize the model structure.")

        # Prune rules every 50 epochs
        if (epoch + 1) % 50 == 0 and epoch < pruning_stop_epoch:
            pruned = anfis_model.prune_rules(threshold=attention_threshold_final)
            if pruned:
                print(f"Epoch {epoch+1}: Pruned rules. Total rules: {anfis_model.n_rules}")
                # Re-create the optimizer and the scheduler
                optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs - epoch - 1)
                # Attach the optimizer to the model
                anfis_model.optimizer = optimizer
                print("Optimizer and scheduler re-initialized after pruning rules.")
        elif epoch >= pruning_stop_epoch and (epoch + 1) % 50 == 0:
            print(f"Epoch {epoch+1}: Rule pruning has been stopped to stabilize the model structure.")

    # After training, load the state dict of the best model
    if best_model_state is not None:
        # NOTE(review): the snapshot may predate later rule growth/pruning;
        # confirm load_state_dict copes with a different rule count.
        anfis_model.load_state_dict(best_model_state)
        print("Loaded the best model based on validation loss.")

        # Show the membership functions after training
        anfis_model.plot_membership_functions(feature_names=feature_labels)

        # Draw the heatmap of the attribute weights
        plot_heatmap(anfis_model, feature_names=feature_labels)

        # Extract and print the rules
        rules = extract_fuzzy_rules(anfis_model, scaler_X, feature_names=feature_labels)
        for rule in rules:
            print(rule)
        # Save the rules to a file (currently disabled)
        # with open(f'results/rules_nrules{anfis_model.n_rules}_lr{lr}_final.txt', 'w') as f:
        #     for rule in rules:
        #         f.write(rule + '\n')

        # Total number of active attributes of the best model
        attribute_mask_np = anfis_model.attribute_mask.detach().cpu().numpy()
        total_active_attributes = np.sum(attribute_mask_np)
        print(f"Total Number of Attributes Included in All Rules: {total_active_attributes:.2f}")
    else:
        print("No improvement during training. Using the final model.")

        # Total number of active attributes of the final model
        attribute_mask_np = anfis_model.attribute_mask.detach().cpu().numpy()
        total_active_attributes = np.sum(attribute_mask_np)
        print(f"Total Number of Attributes Included in All Rules: {total_active_attributes:.2f}")

    # Visualize the training process
    plt.figure(figsize=(10, 6))
    plt.plot(training_info['epoch'], training_info['train_loss'], label='Train Loss')
    plt.plot(training_info['epoch'], training_info['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Visualize the RMSE on the validation set
    plt.figure(figsize=(10, 6))
    plt.plot(training_info['epoch'], training_info['val_rmse'], label='Validation RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Validation RMSE over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Return the model, the scalers and the total attribute count
    return anfis_model, scaler_X, scaler_y, total_active_attributes

# Experiment parameters
learning_rates = [0.01]
repeats = 5  # number of repetitions per configuration

# Create the directory where results are saved
os.makedirs('results', exist_ok=True)

# Collected experiment results (one dict per learning rate)
results = []

# X and y are assumed to be defined already
# (pandas DataFrame/Series; their .values are used below)
# For example:
# import pandas as pd
# X = pd.read_csv('features.csv')
# y = pd.read_csv('targets.csv').values.flatten()

# Split the data into a training set and a test set
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

n_rules = 9
for lr in learning_rates:
    test_rmse_list = []  # List to store test set RMSEs for each repeat
    val_rmse_list = []   # List to store validation set RMSEs for each repeat
    time_list = []
    attribute_weights_list = []
    overlap_indices_list = []
    position_indices_list = []
    total_attributes_list_experiment = []
    print(f"\nStarting experiments for n_rules={n_rules}, learning_rate={lr}")
    for repeat in range(repeats):
        # Record start time
        start_time = time.time()

        # Further split the training set into training and validation sets
        # (the repeat index is the seed, so each repeat uses a different split)
        X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
            X_train_np, y_train_np, test_size=0.2, random_state=repeat
        )

        # Define a unique path for saving the best model for this repeat
        best_model_path = f'results/best_model_nrules{n_rules}_lr{lr}_repeat{repeat+1}.pth'

        # Train the model
        anfis_model, scaler_X, scaler_y, total_active_attributes = train_attention_dynamic_attribute_and_rule_anfis(
            X_train_sub, y_train_sub, X_val_sub, y_val_sub,
            initial_n_rules=n_rules,
            epochs=1500,
            batch_size=512,
            lr=lr,
            prune_frequency=25,
            prune_threshold=0.25,
            best_model_path=best_model_path  # where the best model is saved
        )

        # The training function has already loaded the best state (when one
        # was recorded); only switch to evaluation mode here.
        anfis_model.eval()  # evaluation mode
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        anfis_model.to(device)

        # Test the model on the test set using the best model
        X_test_scaled = scaler_X.transform(X_test_np)  # X_test_np is the held-out test set
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
        y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32).to(device)
        with torch.no_grad():
            y_pred_scaled = anfis_model.infer(X_test_tensor)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
            y_true = y_test_np  # Original unstandardized y_test
            test_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            test_rmse_list.append(test_rmse)

        # Compute RMSE on the validation set
        X_val_scaled = scaler_X.transform(X_val_sub)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
        y_val_tensor = torch.tensor(y_val_sub, dtype=torch.float32).to(device)
        with torch.no_grad():
            y_val_pred_scaled = anfis_model.infer(X_val_tensor)
            y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
            y_val_true = y_val_sub  # the original, unstandardized y_val_sub is the ground truth
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
            val_rmse_list.append(val_rmse)

        # Record end time
        end_time = time.time()
        time_taken = end_time - start_time
        time_list.append(time_taken)

        # Extract attribute weights
        attention_weights = torch.sigmoid(anfis_model.attention_weights).detach().cpu().numpy()
        avg_attribute_weights = attention_weights.mean(axis=0)  # Average over rules
        attribute_weights_list.append(avg_attribute_weights)

        # Compute interpretability indices
        overlap_index = compute_iov(anfis_model)
        position_index = compute_ifspe(anfis_model)
        overlap_indices_list.append(overlap_index)
        position_indices_list.append(position_index)

        # Collect total active attributes from best model
        total_attributes_list_experiment.append(total_active_attributes)

        print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Val RMSE={val_rmse:.4f}, Time={time_taken:.2f}s")

        # Extract and print fuzzy rules
        rules = extract_fuzzy_rules(anfis_model, scaler_X, feature_names=features_to_use)
        for rule in rules:
            print(rule)
        # # Save rules to file
        # with open(f'results/rules_nrules{n_rules}_lr{lr}_repeat{repeat+1}.txt', 'w') as f:
        #     for rule in rules:
        #         f.write(rule + '\n')

    # Compute RMSE mean and std for test set and validation set
    test_rmse_mean = np.mean(test_rmse_list)
    test_rmse_std = np.std(test_rmse_list)
    val_rmse_mean = np.mean(val_rmse_list)
    val_rmse_std = np.std(val_rmse_list)

    # Compute average attribute weights over repeats
    avg_attribute_weights_over_repeats = np.mean(attribute_weights_list, axis=0)

    # Compute average total number of attributes included in all rules
    average_total_attributes = np.mean(total_attributes_list_experiment)

    # Compute average interpretability indices
    avg_overlap_index = np.mean(overlap_indices_list)
    avg_position_index = np.mean(position_indices_list)
    std_overlap_index = np.std(overlap_indices_list)
    std_position_index = np.std(position_indices_list)
    # Print the results
    print(f"\nResults for n_rules={n_rules}, learning_rate={lr}:")
    print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
    print(f"Validation RMSE: {val_rmse_mean:.4f} ± {val_rmse_std:.4f}")
    print(f"Time: {np.mean(time_list):.2f}s ± {np.std(time_list):.2f}s")
    print(f"Average Overlap Index (Iov): {avg_overlap_index:.4f} ± {std_overlap_index:.4f}")
    print(f"Average Fuzzy Set Position Index (Ifspe): {avg_position_index:.4f} ± {std_position_index:.4f} ")
    print(f"Average Attribute Weights over Repeats: {avg_attribute_weights_over_repeats}")
    print(f"Average Total Number of Attributes Included in All Rules: {average_total_attributes:.2f}")

    # Save results
    result = {
        'n_rules': n_rules,
        'learning_rate': lr,
        'test_rmse_mean': test_rmse_mean,
        'test_rmse_std': test_rmse_std,
        'val_rmse_mean': val_rmse_mean,
        'val_rmse_std': val_rmse_std,
        'time_mean': np.mean(time_list),
        'time_std': np.std(time_list),
        'attribute_weights': avg_attribute_weights_over_repeats,
        'overlap_index': avg_overlap_index,
        'position_index': avg_position_index,
        'average_total_attributes': average_total_attributes
    }
    results.append(result)

    # Save result data
    np.save(f'results/attribute_weights_nrules{n_rules}_lr{lr}.npy', avg_attribute_weights_over_repeats)
    np.save(f'results/overlap_index_nrules{n_rules}_lr{lr}.npy', avg_overlap_index)
    np.save(f'results/position_index_nrules{n_rules}_lr{lr}.npy', avg_position_index)
    np.save(f'results/average_total_attributes_nrules{n_rules}_lr{lr}.npy', average_total_attributes)

    # Visualize the overlap index and the position index
    plt.figure(figsize=(10, 6))
    plt.bar(['Overlap Index (Iov)', 'Position Index (Ifspe)'], [avg_overlap_index, avg_position_index], color=['skyblue', 'salmon'])
    plt.ylabel('Index Value')
    plt.title('Average Interpretability Indices')
    plt.grid(axis='y')
    plt.show()

    # Visualize the average total number of attributes
    plt.figure(figsize=(6, 4))
    plt.bar(['Average Total Attributes'], [average_total_attributes], color=['lightgreen'])
    plt.ylabel('Number of Attributes')
    plt.title('Average Total Number of Attributes Included in All Rules')
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

# 注意：
# - 请确保您已经定义了 X 和 y 数据。
# - X 应该是一个包含特征的 DataFrame 或 numpy 数组。
# - y 应该是一个包含目标变量的 Series 或 numpy 数组。
# - 例如：
# import pandas as pd
# X = pd.read_csv('features.csv')
# y = pd.read_csv('targets.csv').values.flatten()


# ADAR-ANFIS

In [None]:

# Load the Auto MPG dataset from the UCI ML repository
auto_mpg_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column names (the raw file has no header row)
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']

# Read the whitespace-separated file; '?' marks a missing value.
# sep=r'\s+' is the documented replacement for delim_whitespace=True,
# which pandas deprecated in version 2.2.
data = pd.read_csv(auto_mpg_url, sep=r'\s+', names=column_names, na_values='?')

# Drop the samples that contain missing values
data = data.dropna()

# Feature selection:
# everything except 'mpg' (the target) and 'car_name' is an input feature
features_to_use = [
    'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin'
]

# Separate the features and the target variable
X = data[features_to_use]
y = data['mpg']

# The categorical variable 'origin' could be one-hot encoded.
# It is kept numeric here to keep the ANFIS input simple.
# To one-hot encode it instead, uncomment the two lines below:
# X = pd.get_dummies(X, columns=['origin'], drop_first=True)
# features_to_use = X.columns.tolist()

# Missing values were already removed by the dropna() call above

# Split the data into a training set and a test set
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Feature names used by the later cells
feature_labels = features_to_use

In [None]:
# ============================
# 导入必要的库
# ============================
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm, trange
from scipy.stats import norm  # 新增

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# ============================
# 数据加载与预处理
# ============================
from sklearn.datasets import fetch_openml
# Download location of the zipped YearPredictionMSD file
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'

# The first column is the target year, followed by 90 numbered features
col_names = ['Year', *(f'Feature_{i}' for i in range(1, 91))]

# The file has no header row, so the names above are applied
data = pd.read_csv(data_url, names=col_names, header=None)

# Separate the target from the features
y = data['Year']
X = data.drop(columns='Year')

# Feature names used later; both names refer to the same list
feature_labels = list(X.columns)
features_to_use = feature_labels

# Split the data into a training set and a test set
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, random_state=42, test_size=0.2
)

# ============================
# 定义 ADAR-ANFIS 模型
# ============================

class AttentionDynamicAttributeAndRuleANFIS(nn.Module):
    def __init__(self, n_inputs, n_rules, X_train=None, attention_threshold=0.1, init_from_checkpoint=False):
        super(AttentionDynamicAttributeAndRuleANFIS, self).__init__()
        self.n_inputs = n_inputs
        self.attention_threshold = attention_threshold  # 属性注意力阈值
        self.n_rules = n_rules  # 初始规则数量

        if not init_from_checkpoint:
            if X_train is None:
                raise ValueError("X_train must be provided for initializing rules.")

            # 使用 KMeans 聚类初始化隶属函数中心
            kmeans = KMeans(n_clusters=n_rules, random_state=42)
            kmeans.fit(X_train)
            cluster_centers = kmeans.cluster_centers_  # 形状：(n_rules, n_inputs)

            # 初始化隶属函数参数
            self.mu = nn.Parameter(torch.tensor(cluster_centers, dtype=torch.float32))  # 均值，形状：(n_rules, n_inputs)
            self.sigma = nn.Parameter(torch.ones(n_rules, n_inputs))  # 标准差

            # 初始化属性注意力权重参数
            self.attention_weights = nn.Parameter(torch.randn(n_rules, n_inputs))

            # 初始化规则注意力权重参数
            self.rule_attention_weights = nn.Parameter(torch.ones(n_rules))

            # 初始化后件参数（对于回归任务）
            self.consequents = nn.Parameter(torch.randn(n_rules, n_inputs))

            # 初始化属性掩码（1表示活跃，0表示被剪除），针对每个规则
            device = self.mu.device  # 获取设备
            self.register_buffer('attribute_mask', torch.ones(n_rules, n_inputs, device=device))
        else:
            # 初始化占位参数，实际参数将在加载后设置
            self.mu = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.sigma = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.attention_weights = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.rule_attention_weights = nn.Parameter(torch.empty(n_rules))
            self.consequents = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.register_buffer('attribute_mask', torch.empty(n_rules, n_inputs))

    def forward(self, x):
        batch_size = x.size(0)

        # 将被剪枝的属性的 attention_weights 设置为一个大负数
        masked_attention_weights = self.attention_weights.clone()
        masked_attention_weights[self.attribute_mask == 0] = -1e6

        # 计算属性注意力权重（使用 sigmoid 激活函数并应用属性掩码）
        attention = torch.sigmoid(masked_attention_weights) * self.attribute_mask  # 形状：(n_rules, n_inputs)

        # 计算规则注意力权重（使用 sigmoid 激活函数）
        rule_attention = torch.sigmoid(self.rule_attention_weights)  # 形状：(n_rules,)

        # 扩展维度以匹配批次大小
        x_expanded = x.unsqueeze(1)  # 形状：(batch_size, 1, n_inputs)
        mu_expanded = self.mu.unsqueeze(0)  # 形状：(1, n_rules, n_inputs)
        sigma_expanded = self.sigma.unsqueeze(0)  # 形状：(1, n_rules, n_inputs)

        # 确保 sigma 为正数，避免除以零
        sigma_expanded = torch.clamp(sigma_expanded, min=1e-3)

        # 计算高斯隶属度函数的对数
        log_gauss = -0.5 * ((x_expanded - mu_expanded) ** 2) / (sigma_expanded ** 2)

        # 使用属性注意力权重并应用属性掩码
        log_gauss_weighted = log_gauss * attention.unsqueeze(0)  # 形状：(batch_size, n_rules, n_inputs)

        # 对输入属性维度求和
        sum_log_gauss = log_gauss_weighted.sum(dim=2)  # 形状：(batch_size, n_rules)

        # 计算规则的激活度
        firing_strength = torch.exp(sum_log_gauss)  # 形状：(batch_size, n_rules)

        # 使用规则注意力权重调整规则的激活度
        firing_strength_weighted = firing_strength * rule_attention.unsqueeze(0)  # 形状：(batch_size, n_rules)

        # 计算归一化的激活度
        sum_firing_strength = firing_strength_weighted.sum(dim=1, keepdim=True) + 1e-8
        norm_firing_strength = firing_strength_weighted / sum_firing_strength  # 形状：(batch_size, n_rules)

        # 计算后件部分（使用属性注意力权重）
        consequents_weighted = self.consequents * attention  # 形状：(n_rules, n_inputs)
        consequents_weighted_expanded = consequents_weighted.unsqueeze(0)  # 形状：(1, n_rules, n_inputs)

        # 计算规则的输出（对于每个规则，后件为被选中属性的线性组合）
        rule_outputs = torch.sum(consequents_weighted_expanded * x_expanded, dim=2)  # 形状：(batch_size, n_rules)

        # 计算总输出
        output = torch.sum(norm_firing_strength * rule_outputs, dim=1)  # 形状：(batch_size,)

        return output, firing_strength, attention, rule_attention

    def train_step(self, x, target, optimizer, lambda_attention=1e-7, lambda_rule_attention=1e-8, lambda_diversity=1e-4):
        """
        执行一次训练步骤。

        参数：
        - x: 输入数据，形状：(batch_size, n_inputs)
        - target: 目标数据，形状：(batch_size,)
        - optimizer: 优化器实例
        - lambda_attention: 属性注意力权重的正则化系数
        - lambda_rule_attention: 规则注意力权重的正则化系数
        - lambda_diversity: 多样性正则化的系数

        返回：
        - loss.item(): 当前批次的总损失
        - output: 模型的输出
        """
        self.train()
        optimizer.zero_grad()
        output, firing_strength, attention, rule_attention = self.forward(x)
        # 计算预测损失（均方误差损失）
        loss_pred = F.mse_loss(output, target)

        # 添加属性注意力正则化损失（L1 正则化）
        loss_attention = lambda_attention * attention.abs().sum()

        # 计算规则注意力正则化损失（L1 正则化）
        loss_rule_attention = lambda_rule_attention * rule_attention.abs().sum()

        # 添加多样性正则化损失（鼓励不同规则的注意力权重不同）
        if self.n_rules > 1:
            # 计算注意力权重的余弦相似度矩阵
            attention_norm = attention / (attention.norm(dim=1, keepdim=True) + 1e-8)
            similarity_matrix = torch.matmul(attention_norm, attention_norm.t())
            # 计算非对角线的平均相似度
            diversity_loss = torch.sum(similarity_matrix) - torch.diag(similarity_matrix).sum()
            diversity_loss = diversity_loss / (self.n_rules * (self.n_rules - 1))
        else:
            diversity_loss = torch.tensor(0.0).to(attention.device)

        loss_diversity = lambda_diversity * diversity_loss

        # 总损失
        loss = loss_pred + loss_attention + loss_rule_attention + loss_diversity

        if torch.isnan(loss):
            print("Loss is NaN. Stopping training.")
            return loss.item(), output

        loss.backward()

        optimizer.step()

        return loss.item(), output

    def prune_attributes_per_rule(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """
        剪除每个规则中注意力权重低于阈值的属性，并冻结其相关参数。
        如果剪枝后模型在验证集上的性能下降超过容忍度，则不执行剪枝。

        参数：
        - threshold: 剪枝阈值，默认为0.1
        - X_val: 验证集特征，形状：(num_val_samples, n_inputs)
        - y_val: 验证集目标，形状：(num_val_samples,)
        - performance_drop_tolerance: 性能下降容忍度，默认为0.01（即1%）
        - best_val_loss: 之前的最佳验证损失

        返回：
        - pruned_dict: 字典，键为规则索引，值为被剪除的属性索引列表
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")

        # 创建模型的副本
        model_copy = copy.deepcopy(self)

        # 执行剪枝操作
        pruned_dict = {}
        with torch.no_grad():
            attention = torch.sigmoid(model_copy.attention_weights)  # 形状：(n_rules, n_inputs)

            for rule_idx in range(model_copy.n_rules):
                if torch.all(model_copy.attribute_mask[rule_idx] == 0):
                    continue  # 跳过已被完全剪除的规则

                prune_indices = torch.where((attention[rule_idx] < threshold) & (model_copy.attribute_mask[rule_idx] == 1))[0].tolist()

                if prune_indices:
                    # 更新属性掩码
                    model_copy.attribute_mask[rule_idx, prune_indices] = 0.0

                    # 冻结被剪除属性的相关参数
                    model_copy.attention_weights[rule_idx, prune_indices].requires_grad = False
                    model_copy.consequents[rule_idx, prune_indices].requires_grad = False

                    pruned_dict[rule_idx] = prune_indices

        # 剪枝后的验证损失
        model_copy.eval()
        with torch.no_grad():
            output_after = model_copy.infer(X_val)
            loss_after = F.mse_loss(output_after, y_val)

        # 判断性能是否下降超过容忍度
        performance_drop = (loss_after - best_val_loss) / best_val_loss

        if performance_drop > performance_drop_tolerance:
            # 性能下降超过容忍度，不执行剪枝
            print(f"Attribute pruning was not performed due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            pruned_dict = {}  # 清空剪枝记录
            pruned = False
        else:
            # 性能未下降，更新原始模型的参数
            self.attribute_mask = model_copy.attribute_mask.clone()
            self.attention_weights.data = model_copy.attention_weights.data.clone()
            self.consequents.data = model_copy.consequents.data.clone()
            print(f"Attribute pruning successful. Performance drop: {performance_drop * 100:.2f}%")
            pruned = True

        return pruned_dict

    def prune_rules_with_recovery(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """
        剪除规则注意力权重低于阈值的规则，并从模型中完全移除这些规则。
        如果剪枝后模型在验证集上的性能下降超过容忍度，则不执行剪枝。

        参数：
        - threshold: 剪枝阈值，默认值为0.1
        - X_val: 验证集特征，形状为 (num_val_samples, n_inputs)
        - y_val: 验证集目标，形状为 (num_val_samples,)
        - performance_drop_tolerance: 性能下降容忍度，默认为0.01（即1%）
        - best_val_loss: 之前的最佳验证损失

        返回：
        - pruned: 布尔值，指示是否实际移除了规则
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")

        # 创建模型的副本
        model_copy = copy.deepcopy(self)

        # 执行规则剪枝操作
        pruned = False
        with torch.no_grad():
            # 获取规则注意力权重
            rule_attention = torch.sigmoid(model_copy.rule_attention_weights)
            # 找到需要移除的规则索引（rule_attention < threshold）
            low_attention_indices = torch.where(rule_attention < threshold)[0]

            if len(low_attention_indices) == 0:
                return pruned  # 没有需要移除的规则

            # 保留的规则索引（rule_attention >= threshold）
            keep_indices = torch.where(rule_attention >= threshold)[0]

            # 更新模型参数，移除低重要性的规则
            model_copy.mu = nn.Parameter(model_copy.mu.data[keep_indices])
            model_copy.sigma = nn.Parameter(model_copy.sigma.data[keep_indices])
            model_copy.attention_weights = nn.Parameter(model_copy.attention_weights.data[keep_indices])
            model_copy.rule_attention_weights = nn.Parameter(model_copy.rule_attention_weights.data[keep_indices])
            model_copy.consequents = nn.Parameter(model_copy.consequents.data[keep_indices])
            model_copy.attribute_mask = model_copy.attribute_mask.data[keep_indices].clone()

            # 更新规则数量
            model_copy.n_rules = len(keep_indices)

        # 剪枝后的验证损失
        model_copy.eval()
        with torch.no_grad():
            output_after = model_copy.infer(X_val)
            loss_after = F.mse_loss(output_after, y_val)

        # 判断性能是否下降超过容忍度
        performance_drop = (loss_after - best_val_loss) / best_val_loss

        if performance_drop > performance_drop_tolerance:
            # 性能下降超过容忍度，不执行剪枝
            print(f"Rule pruning was not performed due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            pruned = False
        else:
            # 性能未下降，更新原始模型的参数
            self.mu = nn.Parameter(model_copy.mu.data.clone())
            self.sigma = nn.Parameter(model_copy.sigma.data.clone())
            self.attention_weights = nn.Parameter(model_copy.attention_weights.data.clone())
            self.rule_attention_weights = nn.Parameter(model_copy.rule_attention_weights.data.clone())
            self.consequents = nn.Parameter(model_copy.consequents.data.clone())
            self.attribute_mask = model_copy.attribute_mask.clone()
            self.n_rules = model_copy.n_rules  # 更新规则数量
            print(f"Rules pruned successfully. Performance drop: {performance_drop * 100:.2f}%")
            pruned = True

        return pruned

    def prune_rules(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """
        剪除规则注意力权重低于阈值的规则，并从模型中完全移除这些规则。
        如果剪枝后模型在验证集上的性能下降超过容忍度，则撤销剪枝操作。

        参数：
        - threshold: 剪枝阈值，默认值为0.1
        - X_val: 验证集特征，形状为 (num_val_samples, n_inputs)
        - y_val: 验证集目标，形状为 (num_val_samples,)
        - performance_drop_tolerance: 性能下降容忍度，默认为0.01（即1%）
        - best_val_loss: 之前的最佳验证损失

        返回：
        - pruned: 布尔值，指示是否实际移除了规则
        """
        return self.prune_rules_with_recovery(threshold, X_val, y_val, performance_drop_tolerance, best_val_loss)

    def grow_rule_with_performance_check(self, X_new, X_train, y_train, X_val, y_val, best_val_loss, device, lr, grow_epochs=10):
        """
        添加一个新的规则，并进行性能检查。如果性能没有提升，则撤销规则生长。

        参数：
        - X_new: 新规则的初始数据，形状：(num_samples, n_inputs)
        - X_train: 训练集特征，形状：(num_train_samples, n_inputs)
        - y_train: 训练集目标，形状：(num_train_samples,)
        - X_val: 验证集特征，形状：(num_val_samples, n_inputs)
        - y_val: 验证集目标，形状：(num_val_samples,)
        - best_val_loss: 当前最佳验证损失
        - device: 设备
        - lr: 学习率
        - grow_epochs: 在规则生长后训练的 epoch 数，默认值为10

        返回：
        - improved: 布尔值，指示是否保留了新规则
        """
        # 创建模型的副本
        model_copy = copy.deepcopy(self).to(device)

        # 添加新规则到副本
        model_copy.grow_rule(X_new)

        # 初始化优化器和调度器
        optimizer_copy = optim.AdamW(model_copy.parameters(), lr=lr)
        scheduler_copy = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_copy, T_max=grow_epochs)

        # 将训练数据转换为张量
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
        y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
        X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
        y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)

        # 训练模型副本
        for epoch in range(grow_epochs):
            model_copy.train()
            optimizer_copy.zero_grad()
            output, _, _, _ = model_copy.forward(X_train_tensor)
            loss = F.mse_loss(output, y_train_tensor)
            loss.backward()
            optimizer_copy.step()
            scheduler_copy.step()

        # 评估副本模型的性能
        model_copy.eval()
        with torch.no_grad():
            output = model_copy.forward(X_val_tensor)[0]
            loss = F.mse_loss(output, y_val_tensor).item()

        print(f"After growing rule and training for {grow_epochs} epochs: Validation Loss = {loss:.4f}")

        if loss < best_val_loss:
            # 性能有所提升，保留新规则
            # 手动更新原始模型的参数
            self.mu = nn.Parameter(model_copy.mu.data.clone())
            self.sigma = nn.Parameter(model_copy.sigma.data.clone())
            self.attention_weights = nn.Parameter(model_copy.attention_weights.data.clone())
            self.rule_attention_weights = nn.Parameter(model_copy.rule_attention_weights.data.clone())
            self.consequents = nn.Parameter(model_copy.consequents.data.clone())
            self.attribute_mask = model_copy.attribute_mask.clone()
            self.n_rules = model_copy.n_rules  # 更新规则数量
            print("Performance improved after growing rule. New rule retained.")
            return True
        else:
            # 性能未提升，丢弃副本，保持主模型不变
            print("Performance did not improve after growing rule. Rule growth reverted.")
            return False

    def grow_rule(self, X_new):
        """
        添加一个新的规则。

        参数：
        - X_new: 新规则的初始数据，形状：(num_samples, n_inputs)
        """
        # 获取设备和数据类型
        device = self.mu.device
        dtype = self.mu.dtype

        # 使用 X_new 计算新的规则中心和标准差
        new_mu = torch.tensor(X_new.mean(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, n_inputs)
        new_sigma = torch.tensor(X_new.std(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, n_inputs)

        # 计算现有规则的属性注意力权重的平均值
        if self.n_rules > 0:
            existing_attention_weights = torch.sigmoid(self.attention_weights).data  # (n_rules, n_inputs)
            attention_mean = existing_attention_weights.mean(dim=0, keepdim=True)  # (1, n_inputs)
        else:
            attention_mean = torch.ones(1, self.n_inputs, dtype=dtype).to(device)  # 初始化为1

        # 将新规则的属性注意力权重初始化为平均值并加入随机扰动
        noise = torch.randn_like(attention_mean) * 0.05  # 调整扰动大小以控制多样性
        new_attention_weights = (attention_mean + noise).clamp(0, 1).detach()  # 保持在[0,1]范围内

        # 将新规则的规则注意力权重初始化为与现有权重的均值 logit 相同，并加入随机扰动
        if self.n_rules > 0:
            existing_rule_attention_logits = self.rule_attention_weights.data  # (n_rules,)
            rule_attention_mean_logit = existing_rule_attention_logits.mean().unsqueeze(0)  # (1,)
            rule_attention_noise = torch.randn_like(rule_attention_mean_logit) * 0.05  # 调整扰动大小
            new_rule_attention_weight = (rule_attention_mean_logit + rule_attention_noise).detach()
        else:
            rule_attention_mean_logit = torch.tensor([0.0], dtype=dtype).to(device)  # 中性 logit
            new_rule_attention_weight = rule_attention_mean_logit.clone().detach()  # (1,)

        # 初始化后件参数为小的随机值
        new_consequents = torch.randn(1, self.n_inputs, dtype=dtype).to(device) * 0.01  # (1, n_inputs)

        # 将新的参数添加到模型中
        self.mu = nn.Parameter(torch.cat([self.mu.data, new_mu], dim=0))  # (n_rules + 1, n_inputs)
        self.sigma = nn.Parameter(torch.cat([self.sigma.data, new_sigma], dim=0))  # (n_rules + 1, n_inputs)
        self.attention_weights = nn.Parameter(torch.cat([self.attention_weights.data, new_attention_weights], dim=0))  # (n_rules + 1, n_inputs)
        self.rule_attention_weights = nn.Parameter(torch.cat([self.rule_attention_weights.data, new_rule_attention_weight], dim=0))  # (n_rules + 1,)
        self.consequents = nn.Parameter(torch.cat([self.consequents.data, new_consequents], dim=0))  # (n_rules + 1, n_inputs)

        # 更新属性掩码，添加新规则的掩码行
        new_attribute_mask = torch.ones(1, self.n_inputs, dtype=self.attribute_mask.dtype).to(device)  # (1, n_inputs)
        self.attribute_mask = torch.cat([self.attribute_mask, new_attribute_mask], dim=0)  # (n_rules + 1, n_inputs)

        # 更新规则数量
        self.n_rules += 1

        # 确保 attribute_mask 的维度与 n_rules 一致
        assert self.attribute_mask.shape[0] == self.n_rules, \
            f"After growing, attribute_mask has shape {self.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"New rule added. Total rules: {self.n_rules}")

    def infer(self, x, targets=None):
        """
        执行推理。

        参数：
        - x: 输入数据，形状：(batch_size, n_inputs)
        - targets: 目标数据，形状：(batch_size,)，可选

        返回：
        - 如果 targets 为 None，返回模型输出。
        - 否则，返回模型输出和损失值。
        """
        with torch.no_grad():
            self.eval()
            output, _, _, _ = self.forward(x)
            if targets is None:
                return output
            else:
                loss = F.mse_loss(output, targets)
                return output, loss.item()

    def save_model_with_architecture(self, scaler_X, scaler_y, path):
        """
        保存模型的状态字典和架构信息。

        参数：
        - scaler_X: 输入数据的标准化器
        - scaler_y: 输出数据的标准化器
        - path: 保存路径
        """
        torch.save({
            'n_rules': self.n_rules,
            'mu': self.mu.detach().cpu().numpy(),
            'sigma': self.sigma.detach().cpu().numpy(),
            'attention_weights': self.attention_weights.detach().cpu().numpy(),
            'rule_attention_weights': self.rule_attention_weights.detach().cpu().numpy(),
            'consequents': self.consequents.detach().cpu().numpy(),
            'attribute_mask': self.attribute_mask.detach().cpu().numpy(),
            'scaler_X_mean': scaler_X.mean_,
            'scaler_X_scale': scaler_X.scale_,
            'scaler_y_mean': scaler_y.mean_,
            'scaler_y_scale': scaler_y.scale_
        }, path)
        print(f"Model and architecture saved to {path}")

    @staticmethod
    def load_model_with_architecture(path, device='cpu'):
        """
        Rebuild a model and its scalers from a file written by ``save_model_with_architecture``.

        Args:
            path: checkpoint location passed to ``torch.load``.
            device: device the parameters are created on.

        Returns:
            Tuple ``(model, scaler_X, scaler_y)``. The scalers only have their
            ``mean_`` and ``scale_`` attributes restored.
        """
        # The checkpoint stores NumPy arrays, which the restricted unpickler used
        # by default since PyTorch 2.6 (weights_only=True) rejects, so full
        # unpickling is requested explicitly.
        # SECURITY: full unpickling can execute arbitrary code; only load
        # checkpoints from a trusted source.
        try:
            checkpoint = torch.load(path, map_location=device, weights_only=False)
        except TypeError:
            # Older PyTorch releases do not accept the weights_only argument
            checkpoint = torch.load(path, map_location=device)

        n_rules = checkpoint['n_rules']
        n_inputs = checkpoint['mu'].shape[1]

        # init_from_checkpoint=True skips the KMeans initialisation
        model = AttentionDynamicAttributeAndRuleANFIS(
            n_inputs=n_inputs,
            n_rules=n_rules,
            X_train=None,
            attention_threshold=0.1,
            init_from_checkpoint=True
        ).to(device)

        # Replace the placeholder parameters with the stored values
        with torch.no_grad():
            model.mu = nn.Parameter(torch.tensor(checkpoint['mu'], dtype=torch.float32, device=device))
            model.sigma = nn.Parameter(torch.tensor(checkpoint['sigma'], dtype=torch.float32, device=device))
            model.attention_weights = nn.Parameter(torch.tensor(checkpoint['attention_weights'], dtype=torch.float32, device=device))
            model.rule_attention_weights = nn.Parameter(torch.tensor(checkpoint['rule_attention_weights'], dtype=torch.float32, device=device))
            model.consequents = nn.Parameter(torch.tensor(checkpoint['consequents'], dtype=torch.float32, device=device))

            # copy_ fills the registered buffer in place, on its current device
            model.attribute_mask.copy_(torch.tensor(checkpoint['attribute_mask'], dtype=torch.float32, device=device))

        # Restore the scaler statistics
        scaler_X = StandardScaler()
        scaler_X.mean_ = checkpoint['scaler_X_mean']
        scaler_X.scale_ = checkpoint['scaler_X_scale']

        scaler_y = StandardScaler()
        scaler_y.mean_ = checkpoint['scaler_y_mean']
        scaler_y.scale_ = checkpoint['scaler_y_scale']

        print(f"Model and architecture loaded from {path} with n_rules={n_rules}")

        return model, scaler_X, scaler_y

    def plot_membership_functions(self, feature_names=None):
        """
        Plot, one figure per rule, the Gaussian membership function of every active attribute.

        Curves are drawn over the fixed range [-3, 3]; pruned attributes
        (mask value 0) are skipped. The attention value only appears in the
        legend label.

        Args:
            feature_names: optional list of attribute names; defaults to
                'Input 1', 'Input 2', ...
        """
        centers = self.mu.detach().cpu().numpy()
        widths = self.sigma.detach().cpu().numpy()
        attr_attention = torch.sigmoid(self.attention_weights).detach().cpu().numpy()
        rule_attention = torch.sigmoid(self.rule_attention_weights).detach().cpu().numpy()
        grid = np.linspace(-3, 3, 1000)

        input_count = self.n_inputs
        if feature_names is None:
            feature_names = [f'Input {i+1}' for i in range(input_count)]

        for rule_idx in range(self.n_rules):
            this_rule_attention = rule_attention[rule_idx]
            plt.figure(figsize=(10, 6))
            plt.title(f"Rule {rule_idx + 1}, Rule Attention: {this_rule_attention:.4f}")

            # Only attributes that are still active for this rule
            active_inputs = (j for j in range(input_count) if not self.attribute_mask[rule_idx, j] == 0)
            for j in active_inputs:
                center = centers[rule_idx, j]
                width = widths[rule_idx, j]
                membership = np.exp(-0.5 * ((grid - center) ** 2) / (width ** 2 + 1e-8))
                plt.plot(grid, membership, label=f"{feature_names[j]} (Attn: {attr_attention[rule_idx, j]:.4f})")

            plt.legend()
            plt.xlabel('Input')
            plt.ylabel('Membership degree')
            plt.grid(True)
            plt.show()

# ============================
# 定义辅助函数
# ============================

def extract_fuzzy_rules(anfis_model, scaler_X, feature_names=None):
    """
    Render every rule of an ANFIS model as a human-readable string.

    Centres and widths are mapped back to the original input scale with
    ``scaler_X.mean_`` / ``scaler_X.scale_``; consequent coefficients and
    attention values are printed as stored. Pruned attributes (mask value 0)
    are left out of both the antecedent and the consequent.

    Args:
        anfis_model: trained model exposing mu, sigma, attention_weights,
            rule_attention_weights, consequents and attribute_mask tensors.
        scaler_X: input scaler providing ``mean_`` and ``scale_``.
        feature_names: optional attribute names; defaults to 'Input 1', ...

    Returns:
        list of str, one entry per rule.
    """
    def to_numpy(tensor):
        return tensor.detach().cpu().numpy()

    centers = to_numpy(anfis_model.mu)  # (n_rules, n_inputs)
    widths = to_numpy(anfis_model.sigma)  # (n_rules, n_inputs)
    attr_attention = to_numpy(torch.sigmoid(anfis_model.attention_weights))
    rule_attention = to_numpy(torch.sigmoid(anfis_model.rule_attention_weights))
    coefficients = to_numpy(anfis_model.consequents)

    # Undo the input standardisation for the membership parameters
    centers_orig = centers * scaler_X.scale_ + scaler_X.mean_
    widths_orig = widths * scaler_X.scale_

    active = to_numpy(anfis_model.attribute_mask)

    rule_count, input_count = centers.shape
    if feature_names is None:
        feature_names = [f'Input {i+1}' for i in range(input_count)]

    rules = []
    for i in range(rule_count):
        kept = [j for j in range(input_count) if active[i, j] != 0]

        antecedent_parts = [
            f"[{feature_names[j]} (Attn: {attr_attention[i, j]:.4f}) "
            f"is Gaussian(c={centers_orig[i, j]:.4f}, σ={widths_orig[i, j]:.4f})]"
            for j in kept
        ]
        consequent_parts = [
            f"({coefficients[i, j]:.4f} * {feature_names[j]} "
            f"(Attn: {attr_attention[i, j]:.4f}))"
            for j in kept
        ]

        header = (f"Rule {i+1} (Rule Attention: "
                  f"{rule_attention[i]:.4f}): IF ")
        antecedent_text = " AND ".join(antecedent_parts) if antecedent_parts else "True"
        consequent_text = " + ".join(consequent_parts) if consequent_parts else "0"
        rules.append(header + antecedent_text + " THEN Output = " + consequent_text)

    return rules

def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """
    Closed-form overlap score of two Gaussian membership functions.

    The score is ``2 * Phi(-|c1 - c2| / sqrt(sigma1**2 + sigma2**2))`` with
    ``Phi`` the standard normal CDF: 1 when the centres coincide, decreasing
    toward 0 as they move apart.

    Args:
        c1, sigma1: centre and standard deviation of the first Gaussian.
        c2, sigma2: centre and standard deviation of the second Gaussian.

    Returns:
        The overlap score, or 0 when both standard deviations are zero.
    """
    pooled_width = np.sqrt(sigma1 ** 2 + sigma2 ** 2)
    if pooled_width == 0:
        return 0

    separation = np.abs(c1 - c2) / pooled_width
    return 2 * norm.cdf(-separation)

def compute_iov(model):
    """
    Compute the Average Overlap Index (Iov) of a model.

    For every attribute that is active in at least two rules, the largest
    pairwise overlap (``compute_overlap_analytic``) among those rules'
    membership functions is taken; the index is the mean of these maxima.

    Args:
        model: trained ANFIS model exposing mu, sigma and attribute_mask tensors.

    Returns:
        The average overlap index, or 0 when no attribute is active in two rules.
    """
    centers = model.mu.detach().cpu().numpy()  # (n_rules, n_inputs)
    widths = model.sigma.detach().cpu().numpy()  # (n_rules, n_inputs)
    active_mask = model.attribute_mask.detach().cpu().numpy()  # (n_rules, n_inputs)
    n_rules, n_inputs = centers.shape

    overlap_total = 0
    attribute_count = 0

    for j in range(n_inputs):
        # Rules in which attribute j is still active
        active = np.flatnonzero(active_mask[:, j] == 1)
        if active.size < 2:
            continue  # an overlap needs at least two membership functions

        best = -np.inf
        for position, rule_a in enumerate(active[:-1]):
            for rule_b in active[position + 1:]:
                candidate = compute_overlap_analytic(
                    centers[rule_a, j], widths[rule_a, j],
                    centers[rule_b, j], widths[rule_b, j],
                )
                if candidate > best:
                    best = candidate

        if best != -np.inf:
            overlap_total += best
            attribute_count += 1

    if attribute_count == 0:
        return 0  # avoid dividing by zero

    return overlap_total / attribute_count

def compute_ifspe(model):
    """
    Compute the Average Fuzzy Set Position Index (Ifspe) of a model.

    For every attribute, the membership functions of the rules in which it is
    active are sorted by centre. For each adjacent pair with centres
    ``v_l <= v_{l+1}`` and widths ``s_l, s_{l+1}`` the term
    ``2 * |0.5 - phi| + psi`` is accumulated, where

        phi = exp(-0.5 * ((v_{l+1} - v_l) / (s_l + s_{l+1})) ** 2)
        psi = exp(-0.5 * ((v_{l+1} - v_l) / (s_l - s_{l+1})) ** 2)

    are the membership values at the two points where the Gaussians intersect
    (psi is 0 when the widths are equal, as there is no second intersection).
    Both depend on the distance between the centres, so the index does not
    change when the inputs are shifted.

    Args:
        model: trained ANFIS model exposing mu, sigma and attribute_mask tensors.

    Returns:
        The accumulated terms divided by ``n_inputs * n_rules`` (non-negative),
        or 0 when no attribute is active in at least two rules.
    """
    mus = model.mu.detach().cpu().numpy()  # (n_rules, n_inputs)
    sigmas = model.sigma.detach().cpu().numpy()  # (n_rules, n_inputs)
    attribute_mask = model.attribute_mask.detach().cpu().numpy()  # (n_rules, n_inputs)
    n_rules, n_inputs = mus.shape

    total_ifspe = 0
    valid_terms = 0

    for j in range(n_inputs):
        # Rules in which attribute j is still active
        active_rules = np.flatnonzero(attribute_mask[:, j] == 1)
        if active_rules.size < 2:
            continue  # adjacent pairs need at least two membership functions

        # Sort the active membership functions by centre
        order = active_rules[np.argsort(mus[active_rules, j])]
        sorted_centers = mus[order, j]
        sorted_sigma = sigmas[order, j]

        for l in range(len(sorted_centers) - 1):
            # Distance between adjacent centres (not their sum, which would make
            # the index depend on where the data happens to be located)
            gap = sorted_centers[l + 1] - sorted_centers[l]
            width_sum = sorted_sigma[l] + sorted_sigma[l + 1]
            width_diff = sorted_sigma[l] - sorted_sigma[l + 1]

            # Degenerate widths: treat the corresponding intersection as absent
            phi = np.exp(-0.5 * (gap / width_sum) ** 2) if width_sum != 0 else 0
            psi = np.exp(-0.5 * (gap / width_diff) ** 2) if width_diff != 0 else 0

            # The absolute value keeps every term non-negative
            total_ifspe += 2 * abs(0.5 - phi) + psi
            valid_terms += 1

    if valid_terms == 0:
        return 0  # avoid dividing by zero

    # Normalisation factor L * D (number of rules times number of inputs)
    return total_ifspe / (n_inputs * n_rules)

def plot_attribute_weights(attribute_weights, feature_names):
    """
    Draw a bar chart of one weight per attribute.

    Args:
        attribute_weights: bar heights, one per entry of ``feature_names``.
        feature_names: attribute names used as x tick labels.
    """
    bar_positions = np.arange(len(feature_names))

    plt.figure(figsize=(10, 6))
    plt.bar(bar_positions, attribute_weights)
    plt.xticks(bar_positions, feature_names, rotation=45)

    plt.xlabel('Attributes')
    plt.ylabel('Average Attribute Weights')
    plt.title('Average Attribute Weights over Repeats')

    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_heatmap(anfis_model, feature_names):
    """
    Draw a heat map of the attribute attention of every rule.

    The plotted value is the attention the model actually uses in ``forward``:
    one sigmoid of the raw attention weights, multiplied by the attribute mask.
    Pruned attributes are shown as 0 and annotated with an 'X'.

    Args:
        anfis_model: trained ANFIS model exposing attention_weights,
            attribute_mask and n_rules.
        feature_names: attribute names used as x tick labels.
    """
    with torch.no_grad():
        mask = anfis_model.attribute_mask
        # Pruned entries get a huge negative logit so their attention is 0
        # whatever the raw weight holds. The sigmoid is applied exactly once:
        # applying it to an already-squashed value would compress every
        # attention into (0.5, 0.73).
        masked_logits = anfis_model.attention_weights.masked_fill(mask == 0, -1e6)
        attention = torch.sigmoid(masked_logits) * mask

    attention_np = attention.detach().cpu().numpy()  # (n_rules, n_inputs)
    attribute_mask_np = mask.detach().cpu().numpy()  # (n_rules, n_inputs)

    # Cell annotations: the value, or "0.00 / X" for pruned attributes
    annotations = [
        [
            "0.00\nX" if attribute_mask_np[r, a] == 0 else f"{attention_np[r, a]:.2f}"
            for a in range(attention_np.shape[1])
        ]
        for r in range(attention_np.shape[0])
    ]

    plt.figure(figsize=(12, 6))
    sns.heatmap(
        attention_np,
        annot=annotations,
        fmt='',
        cmap='viridis',
        xticklabels=feature_names,
        yticklabels=[f'Rule {i+1}' for i in range(anfis_model.n_rules)],
        cbar_kws={'label': 'Attention Weight'}
    )
    plt.title(f'属性注意力权重 (被剪枝的属性标记为 X)')
    plt.xlabel('输入特征')
    plt.ylabel('规则')
    plt.tight_layout()
    plt.show()

# ============================
# 定义训练函数
# ============================

def train_attention_dynamic_attribute_and_rule_anfis(
    X_train_np, y_train_np, X_val_np, y_val_np,
    initial_n_rules=3, epochs=1500, batch_size=32, lr=0.01,
    prune_frequency=25, prune_threshold=0.1,
    best_model_path='best_model.pth'
):
    """
    Train an AttentionDynamicAttributeAndRuleANFIS model with mini-batches and
    checkpoint it every time the validation loss improves.

    During training the structure may change: a rule can be grown when the
    validation loss stalls, and attributes / rules are pruned periodically
    during the first 80% of the epochs. After training, the checkpoint written
    by this run (if any) is reloaded and returned.

    Parameters:
    - X_train_np: training features, array of shape (num_samples, n_inputs)
    - y_train_np: training targets, array of shape (num_samples,)
    - X_val_np: validation features, array of shape (num_val_samples, n_inputs)
    - y_val_np: validation targets, array of shape (num_val_samples,)
    - initial_n_rules: number of rules the model starts with
    - epochs: number of training epochs
    - batch_size: mini-batch size
    - lr: learning rate used for every (re-)created AdamW optimizer
    - prune_frequency: attribute pruning is attempted every this many epochs
    - prune_threshold: attention threshold used for attribute pruning
    - best_model_path: file the best checkpoint is written to

    Returns:
    - anfis_model: the model (the reloaded best checkpoint when one was saved)
    - scaler_X: StandardScaler fitted on the training features
    - scaler_y: StandardScaler fitted on the training targets
    - total_active_attributes: sum of the returned model's attribute mask
    - training_info: dict of per-epoch lists collected during training
    """
    os.makedirs('results', exist_ok=True)

    # Standardize inputs and targets; both scalers are fitted on the training
    # split only and then applied to the validation split.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    n_train = X_train_scaled_tensor.size(0)
    n_inputs = X_train_scaled_tensor.shape[1]

    # Validation targets back in original units. This never changes, so it is
    # computed once instead of once per epoch.
    y_val_true = scaler_y.inverse_transform(
        y_val_scaled_tensor.cpu().numpy().reshape(-1, 1)
    ).flatten()

    anfis_model = AttentionDynamicAttributeAndRuleANFIS(
        n_inputs=n_inputs,
        n_rules=initial_n_rules,
        X_train=X_train_scaled,
        attention_threshold=prune_threshold
    ).to(device)

    def _fresh_optimizer(remaining_epochs):
        # Build AdamW + cosine schedule over the epochs that are still left,
        # and keep the model's `optimizer` attribute pointing at the live
        # optimizer rather than a discarded one.
        new_optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
        new_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            new_optimizer, T_max=max(1, remaining_epochs)
        )
        anfis_model.optimizer = new_optimizer
        return new_optimizer, new_scheduler

    optimizer, scheduler = _fresh_optimizer(epochs)

    # Per-epoch history.
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'active_rules': [],
        'total_rules': [],
        'val_rmse': [],
        'attribute_weights': [],
        'rule_attention_weights': [],
        'pruned_attributes': [],
        'total_active_attributes': []
    }

    best_val_loss = float('inf')
    # Only a checkpoint written by THIS run may be reloaded at the end; a file
    # left over from an earlier run at the same path must not be picked up.
    checkpoint_saved = False

    # Rule-growth settings.
    patience = 25            # stalled epochs before a rule is grown
    grow_threshold = 0.0001  # minimum validation-loss decrease that counts as progress
    no_improve_epochs = 0
    prev_val_loss = float('inf')

    max_rules = 3                     # upper bound on the number of rules
    attention_threshold_final = 0.05  # rule-attention level that counts as "active"

    # Epoch index from which structural pruning is disabled.
    pruning_stop_epoch = int(epochs * 0.8)

    # Feature names for rule extraction: use the module-level `feature_labels`
    # when it exists and matches the input width, otherwise generic names.
    labels = globals().get('feature_labels')
    if labels is not None and len(labels) == n_inputs:
        feature_names = labels
    else:
        feature_names = [f'Input {i+1}' for i in range(n_inputs)]

    for epoch in trange(epochs, desc="Training"):
        # ---- one pass over the shuffled training data ----
        anfis_model.train()
        permutation = torch.randperm(n_train)
        epoch_loss = 0
        num_batches = 0
        for start in range(0, n_train, batch_size):
            indices = permutation[start:start + batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            loss_train, _ = anfis_model.train_step(
                batch_x,
                batch_y,
                optimizer,
                lambda_attention=1e-7,
                lambda_rule_attention=1e-8,
                lambda_diversity=1e-4
            )
            epoch_loss += loss_train
            num_batches += 1

        epoch_loss /= num_batches

        # ---- validation ----
        anfis_model.eval()
        with torch.no_grad():
            output, firing_strength, attention, rule_attention = anfis_model.forward(X_val_scaled_tensor)
            val_loss = F.mse_loss(output, y_val_scaled_tensor).item()
            # RMSE is reported in original target units.
            y_val_pred = scaler_y.inverse_transform(output.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

        scheduler.step()

        # ---- bookkeeping ----
        attention_weights = torch.sigmoid(anfis_model.attention_weights).detach().cpu().numpy()
        rule_attention_weights = torch.sigmoid(anfis_model.rule_attention_weights).detach().cpu().numpy()
        num_active_rules = np.sum(rule_attention_weights >= attention_threshold_final)

        # Total number of unpruned (rule, attribute) pairs at this epoch.
        attribute_mask = anfis_model.attribute_mask.detach().cpu().numpy()
        total_active_attributes = np.sum(np.sum(attribute_mask, axis=1))

        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(val_loss)
        training_info['active_rules'].append(num_active_rules)
        training_info['total_rules'].append(anfis_model.n_rules)
        training_info['val_rmse'].append(val_rmse)
        training_info['total_active_attributes'].append(total_active_attributes)
        training_info['attribute_weights'].append(attention_weights.mean(axis=0))  # averaged over rules
        training_info['rule_attention_weights'].append(rule_attention_weights)

        # ---- checkpoint on improvement ----
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            anfis_model.save_model_with_architecture(scaler_X, scaler_y, best_model_path)
            checkpoint_saved = True
            print(f"\nEpoch {epoch+1}: New best validation loss: {val_loss:.6f}. Model saved.")

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.6f} - Val Loss: {val_loss:.6f} - Val RMSE: {val_rmse:.6f} - Total Rules: {anfis_model.n_rules} - Active Rules: {num_active_rules}")

        # ---- stall detection ----
        if val_loss < prev_val_loss - grow_threshold:
            no_improve_epochs = 0
            prev_val_loss = val_loss
        else:
            no_improve_epochs += 1

        # ---- rule growth when validation loss has stalled ----
        if no_improve_epochs >= patience and anfis_model.n_rules < max_rules:
            print(f"\nEpoch {epoch+1}: No significant improvement in validation loss, growing a new rule. Current rules: {anfis_model.n_rules}")
            # Seed the new rule from the worst-predicted ~10% of validation
            # samples. At least one sample is always taken: with fewer than
            # ten samples `[-0:]` would otherwise select every sample.
            residuals = y_val_scaled_tensor.cpu().numpy() - output.cpu().numpy()
            n_high_error = max(1, int(0.1 * len(residuals)))
            high_error_indices = np.argsort(np.abs(residuals))[-n_high_error:]
            X_new_rule = X_val_scaled[high_error_indices]

            improved = anfis_model.grow_rule_with_performance_check(
                X_new_rule,
                X_train_scaled,
                y_train_scaled_tensor.cpu().numpy(),
                X_val_scaled,
                y_val_scaled_tensor.cpu().numpy(),
                best_val_loss,
                device,
                lr,
                grow_epochs=100  # epochs of training performed after growing
            )

            if improved:
                prev_val_loss = best_val_loss
                # The parameter set changed, so the optimizer must be rebuilt;
                # the schedule covers the remaining epochs, matching what the
                # pruning branches below do.
                optimizer, scheduler = _fresh_optimizer(epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after growing a new rule.")
            # Otherwise nothing to do here: per the original author's note the
            # growth is rolled back inside grow_rule_with_performance_check
            # (that method is not visible from this function).

            no_improve_epochs = 0

        # ---- attribute pruning every `prune_frequency` epochs ----
        if (epoch + 1) % prune_frequency == 0 and epoch < pruning_stop_epoch:
            pruned_dict = anfis_model.prune_attributes_per_rule(
                threshold=prune_threshold,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01,
                best_val_loss=best_val_loss
            )
            training_info['pruned_attributes'].append(pruned_dict)
            if pruned_dict:
                print(f"Epoch {epoch+1}: Pruned attributes per rule: {pruned_dict}")
                optimizer, scheduler = _fresh_optimizer(epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after pruning attributes.")
        elif epoch >= pruning_stop_epoch and (epoch + 1) % prune_frequency == 0:
            print(f"Epoch {epoch+1}: Pruning has been stopped to stabilize the model structure.")

        # ---- rule pruning every 50 epochs ----
        if (epoch + 1) % 50 == 0 and epoch < pruning_stop_epoch:
            pruned = anfis_model.prune_rules(
                threshold=attention_threshold_final,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01,
                best_val_loss=best_val_loss
            )
            if pruned:
                print(f"Epoch {epoch+1}: Pruned rules. Total rules: {anfis_model.n_rules}")
                optimizer, scheduler = _fresh_optimizer(epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after pruning rules.")
        elif epoch >= pruning_stop_epoch and (epoch + 1) % 50 == 0:
            print(f"Epoch {epoch+1}: Rule pruning has been stopped to stabilize the model structure.")

    # ---- restore the best checkpoint written by this run ----
    if checkpoint_saved and os.path.exists(best_model_path):
        anfis_model, scaler_X, scaler_y = AttentionDynamicAttributeAndRuleANFIS.load_model_with_architecture(
            best_model_path, device=device
        )
        print("\nLoaded the best model based on validation loss.")

        rules = extract_fuzzy_rules(anfis_model, scaler_X, feature_names=feature_names)
        print(f"\n=== ADAR-ANFIS Extracted Fuzzy Rules ===")
        for rule in rules:
            print(rule)
            print()
    else:
        print("No improvement during training. Using the final model.")

    # Active-attribute count of whichever model is being returned.
    attribute_mask_np = anfis_model.attribute_mask.detach().cpu().numpy()
    total_active_attributes = np.sum(attribute_mask_np)
    print(f"Total Number of Attributes Included in All Rules: {total_active_attributes:.2f}")

    # ---- validation loss and rule count on a shared epoch axis ----
    fig, ax1 = plt.subplots(figsize=(10, 6))

    color = 'tab:blue'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Validation Loss', color=color)
    ax1.plot(training_info['epoch'], training_info['val_loss'], color=color, label='Validation Loss')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # second y axis sharing the x axis
    color = 'tab:red'
    ax2.set_ylabel('Number of Rules', color=color)
    ax2.plot(training_info['epoch'], training_info['total_rules'], color=color, label='Total Rules', linestyle='--')
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()
    plt.title('Validation Loss and Number of Rules over Epochs')
    plt.grid(True)
    plt.show()

    return anfis_model, scaler_X, scaler_y, total_active_attributes, training_info

# ============================
# 定义实验函数
# ============================

def run_experiment(
    X, y, feature_names, n_rules=9, learning_rates=(0.01,), repeats=5
):
    """
    Run repeated train/evaluate cycles and report RMSE, Iov and Ifspe.

    The data is split once into train/test (80/20, random_state=42). For
    every learning rate and every repeat the training part is split again
    into train/validation (80/20, random_state=repeat). The validation part
    drives checkpoint selection and pruning inside the trainer; the test
    part is used ONLY for the final test RMSE, so that the reported test
    error is not biased by model selection.

    Parameters:
    - X: feature data, pandas DataFrame or numpy array
    - y: target data, pandas Series or numpy array
    - feature_names: list of feature names used when printing rules
    - n_rules: initial number of rules
    - learning_rates: iterable of learning rates to evaluate
    - repeats: number of repeats per learning rate

    Returns:
    - results: list with one result dict per learning rate

    Side effects: writes checkpoints, .npy files and a CSV summary under
    'results_sofenn/' and shows matplotlib figures.
    """
    # Accept pandas input as promised above; the trainer needs numpy arrays
    # (it calls .reshape on the targets).
    X = np.asarray(X)
    y = np.asarray(y)

    X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    os.makedirs('results_sofenn', exist_ok=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    results_sofenn = []

    for lr in learning_rates:
        test_rmse_list = []  # test RMSE of each repeat
        val_rmse_list = []   # validation RMSE of each repeat
        time_list = []
        attribute_weights_list = []
        overlap_indices_list = []
        position_indices_list = []
        total_attributes_list_experiment = []
        print(f"\nStarting experiments for n_rules={n_rules}, learning_rate={lr}")
        for repeat in range(repeats):
            start_time = time.time()

            # Carve a validation set out of the training data for this repeat.
            X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
                X_train_np, y_train_np, test_size=0.2, random_state=repeat
            )

            # Each repeat gets its own checkpoint file.
            best_model_path = f'results_sofenn/best_model_nrules{n_rules}_lr{lr}_repeat{repeat+1}.pth'

            # Train with the VALIDATION split as the model-selection set.
            # Passing the test split here (as was done before) lets checkpoint
            # selection and pruning see the test data.
            anfis_model, scaler_X, scaler_y, total_active_attributes, training_info = train_attention_dynamic_attribute_and_rule_anfis(
                X_train_sub, y_train_sub, X_val_sub, y_val_sub,
                initial_n_rules=n_rules,
                epochs=1500,
                batch_size=512,
                lr=lr,
                prune_frequency=25,
                prune_threshold=0.1,
                best_model_path=best_model_path
            )

            anfis_model.eval()
            anfis_model.to(device)

            # Test RMSE in original target units.
            X_test_scaled = scaler_X.transform(X_test_np)
            X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
            with torch.no_grad():
                y_pred_scaled = anfis_model.infer(X_test_tensor)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
            test_rmse = np.sqrt(mean_squared_error(y_test_np, y_pred))
            test_rmse_list.append(test_rmse)

            # Validation RMSE in original target units.
            X_val_scaled = scaler_X.transform(X_val_sub)
            X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
            with torch.no_grad():
                y_val_pred_scaled = anfis_model.infer(X_val_tensor)
            y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_sub, y_val_pred))
            val_rmse_list.append(val_rmse)

            time_taken = time.time() - start_time
            time_list.append(time_taken)

            # Attribute attention averaged over rules.
            attention_weights = torch.sigmoid(anfis_model.attention_weights).detach().cpu().numpy()
            attribute_weights_list.append(attention_weights.mean(axis=0))

            # Interpretability indices.
            overlap_indices_list.append(compute_iov(anfis_model))
            position_indices_list.append(compute_ifspe(anfis_model))

            # Active-attribute count reported by the trainer for the returned model.
            total_attributes_list_experiment.append(total_active_attributes)

            print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Val RMSE={val_rmse:.4f}, Time={time_taken:.2f}s")

            rules = extract_fuzzy_rules(anfis_model, scaler_X, feature_names=feature_names)
            print(f"\n=== Fuzzy Rules for Repeat {repeat+1} ===")
            for rule in rules:
                print(rule)
                print()

        # Aggregate over repeats.
        test_rmse_mean = np.mean(test_rmse_list)
        test_rmse_std = np.std(test_rmse_list)
        val_rmse_mean = np.mean(val_rmse_list)
        val_rmse_std = np.std(val_rmse_list)

        avg_attribute_weights_over_repeats = np.mean(attribute_weights_list, axis=0)
        average_total_attributes = np.mean(total_attributes_list_experiment)

        avg_overlap_index = np.mean(overlap_indices_list)
        avg_position_index = np.mean(position_indices_list)
        std_overlap_index = np.std(overlap_indices_list)
        std_position_index = np.std(position_indices_list)

        print(f"\nResults for n_rules={n_rules}, learning_rate={lr}:")
        print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
        print(f"Validation RMSE: {val_rmse_mean:.4f} ± {val_rmse_std:.4f}")
        print(f"Time: {np.mean(time_list):.2f}s ± {np.std(time_list):.2f}s")
        print(f"Average Overlap Index (Iov): {avg_overlap_index:.4f} ± {std_overlap_index:.4f}")
        print(f"Average Fuzzy Set Position Index (Ifspe): {avg_position_index:.4f} ± {std_position_index:.4f} ")
        print(f"Average Attribute Weights over Repeats: {avg_attribute_weights_over_repeats}")
        print(f"Average Total Number of Attributes Included in All Rules: {average_total_attributes:.2f}")

        result = {
            'n_rules': n_rules,
            'learning_rate': lr,
            'test_rmse_mean': test_rmse_mean,
            'test_rmse_std': test_rmse_std,
            'val_rmse_mean': val_rmse_mean,
            'val_rmse_std': val_rmse_std,
            'time_mean': np.mean(time_list),
            'time_std': np.std(time_list),
            'attribute_weights': avg_attribute_weights_over_repeats,
            'overlap_index': avg_overlap_index,
            'position_index': avg_position_index,
            'average_total_attributes': average_total_attributes
        }
        results_sofenn.append(result)

        # Persist the aggregated arrays/scalars.
        np.save(f'results_sofenn/attribute_weights_nrules{n_rules}_lr{lr}.npy', avg_attribute_weights_over_repeats)
        np.save(f'results_sofenn/overlap_index_nrules{n_rules}_lr{lr}.npy', avg_overlap_index)
        np.save(f'results_sofenn/position_index_nrules{n_rules}_lr{lr}.npy', avg_position_index)
        np.save(f'results_sofenn/average_total_attributes_nrules{n_rules}_lr{lr}.npy', average_total_attributes)

        # Bar chart of the two interpretability indices.
        plt.figure(figsize=(10, 6))
        plt.bar(['Overlap Index (Iov)', 'Position Index (Ifspe)'], [avg_overlap_index, avg_position_index], color=['skyblue', 'salmon'])
        plt.ylabel('Index Value')
        plt.title('Average Interpretability Indices')
        plt.grid(axis='y')
        plt.show()

        # Bar chart of the average number of active attributes.
        plt.figure(figsize=(6, 4))
        plt.bar(['Average Total Attributes'], [average_total_attributes], color=['lightgreen'])
        plt.ylabel('Number of Attributes')
        plt.title('Average Total Number of Attributes Included in All Rules')
        plt.grid(axis='y')
        plt.tight_layout()
        plt.show()

    # One CSV row per learning rate.
    results_df = pd.DataFrame(results_sofenn)
    results_df.to_csv('results_sofenn/experiment_results_boston.csv', index=False)
    print("\n所有实验结果已保存到 'results_sofenn/experiment_results_boston.csv' 文件中。")

    return results_sofenn

# ============================
# 执行实验
# ============================

# Split the training set further into training and validation subsets.
# NOTE(review): these four variables are not passed to run_experiment below,
# which receives X_train_np / y_train_np and performs its own splits — confirm
# whether anything else still reads them.
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
    X_train_np, y_train_np, test_size=0.2, random_state=42
)

# Experiment settings
n_rules = 3
learning_rates = [0.01]
repeats = 5  # number of repeats per configuration

# Run the experiment (module-level side effect: training starts as soon as
# this cell/script is executed).
results = run_experiment(
    X=X_train_np,
    y=y_train_np,
    feature_names=feature_labels,
    n_rules=n_rules,
    learning_rates=learning_rates,
    repeats=repeats
)


In [None]:
# ============================
# 导入必要的库
# ============================
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from interpret.glassbox import ExplainableBoostingRegressor, APLRRegressor
from tqdm import tqdm, trange

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# ============================
# 数据加载与预处理
# ============================

# Download the Appliances Energy Prediction dataset from the UCI ML repository
# (network access happens at this point, when pd.read_csv is given the URL).
energy_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'

# Read the dataset
data = pd.read_csv(energy_url)

# Feature selection:
# the columns listed below are used as input features; 'date' and
# 'Appliances' are left out ('Appliances' is the target, see below).
features_to_use = [
    'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
    'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
    'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
    'Tdewpoint', 'rv1', 'rv2'
]

# Separate the input features from the target column
X = data[features_to_use]
y = data['Appliances']

# Drop every row that has a missing value in any feature or in the target.
# Features and target are concatenated first so both stay row-aligned.
data = pd.concat([X, y], axis=1).dropna()
X = data[features_to_use]
y = data['Appliances']

# Split into training and test sets (80/20, fixed seed), as numpy arrays
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Feature names kept in a module-level variable for later use
feature_labels = features_to_use
# feature_names = features_to_use

# ============================
# 定义 ADAR-ANFIS 模型
# ============================

class AttentionDynamicAttributeAndRuleANFIS(nn.Module):
    def __init__(self, n_inputs, n_rules, X_train=None, attention_threshold=0.1, init_from_checkpoint=False):
        super(AttentionDynamicAttributeAndRuleANFIS, self).__init__()
        self.n_inputs = n_inputs
        self.attention_threshold = attention_threshold  # 属性注意力阈值
        self.n_rules = n_rules  # 初始规则数量

        if not init_from_checkpoint:
            if X_train is None:
                raise ValueError("X_train must be provided for initializing rules.")

            # 使用 KMeans 聚类初始化隶属函数中心
            kmeans = KMeans(n_clusters=n_rules, random_state=42)
            kmeans.fit(X_train)
            cluster_centers = kmeans.cluster_centers_  # 形状：(n_rules, n_inputs)

            # 初始化隶属函数参数
            self.mu = nn.Parameter(torch.tensor(cluster_centers, dtype=torch.float32))  # 均值，形状：(n_rules, n_inputs)
            self.sigma = nn.Parameter(torch.ones(n_rules, n_inputs))  # 标准差

            # 初始化属性注意力权重参数
            self.attention_weights = nn.Parameter(torch.randn(n_rules, n_inputs))

            # 初始化规则注意力权重参数
            self.rule_attention_weights = nn.Parameter(torch.ones(n_rules))

            # 初始化后件参数（对于回归任务）
            self.consequents = nn.Parameter(torch.randn(n_rules, n_inputs))

            # 初始化属性掩码（1表示活跃，0表示被剪除），针对每个规则
            device = self.mu.device  # 获取设备
            self.register_buffer('attribute_mask', torch.ones(n_rules, n_inputs, device=device))
        else:
            # 初始化占位参数，实际参数将在加载后设置
            self.mu = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.sigma = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.attention_weights = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.rule_attention_weights = nn.Parameter(torch.empty(n_rules))
            self.consequents = nn.Parameter(torch.empty(n_rules, n_inputs))
            self.register_buffer('attribute_mask', torch.empty(n_rules, n_inputs))

    def forward(self, x):
        batch_size = x.size(0)

        # 将被剪枝的属性的 attention_weights 设置为一个大负数
        masked_attention_weights = self.attention_weights.clone()
        masked_attention_weights[self.attribute_mask == 0] = -1e6

        # 计算属性注意力权重（使用 sigmoid 激活函数并应用属性掩码）
        attention = torch.sigmoid(masked_attention_weights) * self.attribute_mask  # 形状：(n_rules, n_inputs)

        # 计算规则注意力权重（使用 sigmoid 激活函数）
        rule_attention = torch.sigmoid(self.rule_attention_weights)  # 形状：(n_rules,)

        # 扩展维度以匹配批次大小
        x_expanded = x.unsqueeze(1)  # 形状：(batch_size, 1, n_inputs)
        mu_expanded = self.mu.unsqueeze(0)  # 形状：(1, n_rules, n_inputs)
        sigma_expanded = self.sigma.unsqueeze(0)  # 形状：(1, n_rules, n_inputs)

        # 确保 sigma 为正数，避免除以零
        sigma_expanded = torch.clamp(sigma_expanded, min=1e-3)

        # 计算高斯隶属度函数的对数
        log_gauss = -0.5 * ((x_expanded - mu_expanded) ** 2) / (sigma_expanded ** 2)

        # 使用属性注意力权重并应用属性掩码
        log_gauss_weighted = log_gauss * attention.unsqueeze(0)  # 形状：(batch_size, n_rules, n_inputs)

        # 对输入属性维度求和
        sum_log_gauss = log_gauss_weighted.sum(dim=2)  # 形状：(batch_size, n_rules)

        # 计算规则的激活度
        firing_strength = torch.exp(sum_log_gauss)  # 形状：(batch_size, n_rules)

        # 使用规则注意力权重调整规则的激活度
        firing_strength_weighted = firing_strength * rule_attention.unsqueeze(0)  # 形状：(batch_size, n_rules)

        # 计算归一化的激活度
        sum_firing_strength = firing_strength_weighted.sum(dim=1, keepdim=True) + 1e-8
        norm_firing_strength = firing_strength_weighted / sum_firing_strength  # 形状：(batch_size, n_rules)

        # 计算后件部分（使用属性注意力权重）
        consequents_weighted = self.consequents * attention  # 形状：(n_rules, n_inputs)
        consequents_weighted_expanded = consequents_weighted.unsqueeze(0)  # 形状：(1, n_rules, n_inputs)

        # 计算规则的输出（对于每个规则，后件为被选中属性的线性组合）
        rule_outputs = torch.sum(consequents_weighted_expanded * x_expanded, dim=2)  # 形状：(batch_size, n_rules)

        # 计算总输出
        output = torch.sum(norm_firing_strength * rule_outputs, dim=1)  # 形状：(batch_size,)

        return output, firing_strength, attention, rule_attention

    def train_step(self, x, target, optimizer, lambda_attention=1e-7, lambda_rule_attention=1e-8, lambda_diversity=1e-4):
        """
        执行一次训练步骤。

        参数：
        - x: 输入数据，形状：(batch_size, n_inputs)
        - target: 目标数据，形状：(batch_size,)
        - optimizer: 优化器实例
        - lambda_attention: 属性注意力权重的正则化系数
        - lambda_rule_attention: 规则注意力权重的正则化系数
        - lambda_diversity: 多样性正则化的系数

        返回：
        - loss.item(): 当前批次的总损失
        - output: 模型的输出
        """
        self.train()
        optimizer.zero_grad()
        output, firing_strength, attention, rule_attention = self.forward(x)
        # 计算预测损失（均方误差损失）
        loss_pred = F.mse_loss(output, target)

        # 添加属性注意力正则化损失（L1 正则化）
        loss_attention = lambda_attention * attention.abs().sum()

        # 计算规则注意力正则化损失（L1 正则化）
        loss_rule_attention = lambda_rule_attention * rule_attention.abs().sum()

        # 添加多样性正则化损失（鼓励不同规则的注意力权重不同）
        if self.n_rules > 1:
            # 计算注意力权重的余弦相似度矩阵
            attention_norm = attention / (attention.norm(dim=1, keepdim=True) + 1e-8)
            similarity_matrix = torch.matmul(attention_norm, attention_norm.t())
            # 计算非对角线的平均相似度
            diversity_loss = torch.sum(similarity_matrix) - torch.diag(similarity_matrix).sum()
            diversity_loss = diversity_loss / (self.n_rules * (self.n_rules - 1))
        else:
            diversity_loss = torch.tensor(0.0).to(attention.device)

        loss_diversity = lambda_diversity * diversity_loss

        # 总损失
        loss = loss_pred + loss_attention + loss_rule_attention + loss_diversity

        if torch.isnan(loss):
            print("Loss is NaN. Stopping training.")
            return loss.item(), output

        loss.backward()

        optimizer.step()

        return loss.item(), output

    def prune_attributes_per_rule(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """
        Mask out, per rule, every active attribute whose attention is below ``threshold``.

        The pruning is first applied to a deep copy of the model. It is committed
        to ``self`` only if the copy's validation MSE is finite and has not risen,
        relative to ``best_val_loss``, by more than ``performance_drop_tolerance``.

        Args:
            threshold: Attributes whose ``sigmoid(attention_weights)`` is below
                this value are pruned. Defaults to 0.1.
            X_val: Validation inputs handed to ``forward``.
            y_val: Validation targets compared with the model output via MSE.
            performance_drop_tolerance: Largest accepted relative increase of
                the validation loss (0.01 means 1%).
            best_val_loss: Reference validation loss the pruned model is
                compared against.

        Returns:
            dict: rule index -> list of pruned attribute indices. Empty when
            nothing qualified for pruning or the pruning was rejected.

        Raises:
            ValueError: If ``X_val``, ``y_val`` or ``best_val_loss`` is missing.
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")
        if best_val_loss is None:
            raise ValueError("best_val_loss must be provided for validation performance check.")

        # Trial the pruning on a copy so a rejected attempt leaves ``self`` untouched.
        trial_model = copy.deepcopy(self)

        pruned_dict = {}
        with torch.no_grad():
            attention = torch.sigmoid(trial_model.attention_weights)  # (n_rules, n_inputs)

            for rule_idx in range(trial_model.n_rules):
                still_active = trial_model.attribute_mask[rule_idx] == 1
                prune_indices = torch.where((attention[rule_idx] < threshold) & still_active)[0].tolist()

                if prune_indices:
                    # Only the mask changes. Setting ``requires_grad`` on an indexed
                    # slice of a parameter acts on a temporary copy and freezes
                    # nothing, so no such attempt is made here.
                    trial_model.attribute_mask[rule_idx, prune_indices] = 0.0
                    pruned_dict[rule_idx] = prune_indices

        if not pruned_dict:
            # Nothing fell below the threshold: skip the evaluation (and the
            # misleading "successful" log line) altogether.
            return pruned_dict

        # Validation loss of the pruned copy.
        trial_model.eval()
        with torch.no_grad():
            output_after, _, _, _ = trial_model.forward(X_val)
            loss_tensor = F.mse_loss(output_after, y_val)
        loss_is_finite = bool(torch.isfinite(loss_tensor).item())
        loss_after = loss_tensor.item()

        if best_val_loss > 0:
            performance_drop = (loss_after - best_val_loss) / best_val_loss
        else:
            # A zero reference loss cannot be used as a divisor.
            performance_drop = 0.0 if loss_after <= best_val_loss else float('inf')

        # A NaN/inf loss must count as a failure; ``nan > tolerance`` is False
        # and would otherwise be accepted silently.
        if not loss_is_finite or performance_drop > performance_drop_tolerance:
            print(f"Attribute pruning was not performed due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            return {}

        # Accepted: the mask is the only state the trial changed, so it is the
        # only thing that has to be copied back.
        self.attribute_mask = trial_model.attribute_mask.clone()
        print(f"Attribute pruning successful. Performance drop: {performance_drop * 100:.2f}%")
        return pruned_dict

    def prune_rules_with_recovery(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """
        Remove every rule whose rule attention is below ``threshold``.

        The removal is trialled on a deep copy. It is committed to ``self`` only
        if the copy's validation MSE is finite and has not risen, relative to
        ``best_val_loss``, by more than ``performance_drop_tolerance``. At least
        one rule is always kept.

        On success the rule-indexed parameters (``mu``, ``sigma``,
        ``attention_weights``, ``rule_attention_weights``, ``consequents``) are
        replaced by new ``nn.Parameter`` objects, so an optimizer built on the
        old parameters has to be re-created by the caller.

        Args:
            threshold: Rules whose ``sigmoid(rule_attention_weights)`` is below
                this value are removed. Defaults to 0.1.
            X_val: Validation inputs handed to ``forward``.
            y_val: Validation targets compared with the model output via MSE.
            performance_drop_tolerance: Largest accepted relative increase of
                the validation loss (0.01 means 1%).
            best_val_loss: Reference validation loss the pruned model is
                compared against.

        Returns:
            bool: True when rules were actually removed from ``self``.

        Raises:
            ValueError: If ``X_val``, ``y_val`` or ``best_val_loss`` is missing.
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")
        if best_val_loss is None:
            raise ValueError("best_val_loss must be provided for validation performance check.")

        with torch.no_grad():
            rule_attention = torch.sigmoid(self.rule_attention_weights)
            low_attention_indices = torch.where(rule_attention < threshold)[0]
            keep_indices = torch.where(rule_attention >= threshold)[0]

        if len(low_attention_indices) == 0:
            return False  # nothing to remove

        if len(keep_indices) == 0:
            # Removing every rule would leave a model with no rules at all.
            print("Rule pruning was not performed: it would remove every rule.")
            return False

        # Trial the removal on a copy so a rejected attempt leaves ``self`` untouched.
        trial_model = copy.deepcopy(self)
        with torch.no_grad():
            trial_model.mu = nn.Parameter(trial_model.mu.data[keep_indices])
            trial_model.sigma = nn.Parameter(trial_model.sigma.data[keep_indices])
            trial_model.attention_weights = nn.Parameter(trial_model.attention_weights.data[keep_indices])
            trial_model.rule_attention_weights = nn.Parameter(trial_model.rule_attention_weights.data[keep_indices])
            trial_model.consequents = nn.Parameter(trial_model.consequents.data[keep_indices])
            trial_model.attribute_mask = trial_model.attribute_mask.data[keep_indices].clone()
            trial_model.n_rules = len(keep_indices)

        # Validation loss of the pruned copy.
        trial_model.eval()
        with torch.no_grad():
            output_after, _, _, _ = trial_model.forward(X_val)
            loss_tensor = F.mse_loss(output_after, y_val)
        loss_is_finite = bool(torch.isfinite(loss_tensor).item())
        loss_after = loss_tensor.item()

        if best_val_loss > 0:
            performance_drop = (loss_after - best_val_loss) / best_val_loss
        else:
            # A zero reference loss cannot be used as a divisor.
            performance_drop = 0.0 if loss_after <= best_val_loss else float('inf')

        # A NaN/inf loss must count as a failure; ``nan > tolerance`` is False
        # and would otherwise be accepted silently.
        if not loss_is_finite or performance_drop > performance_drop_tolerance:
            print(f"Rule pruning was not performed due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            return False

        # Accepted: move the reduced parameter set onto the live model.
        self.mu = nn.Parameter(trial_model.mu.data.clone())
        self.sigma = nn.Parameter(trial_model.sigma.data.clone())
        self.attention_weights = nn.Parameter(trial_model.attention_weights.data.clone())
        self.rule_attention_weights = nn.Parameter(trial_model.rule_attention_weights.data.clone())
        self.consequents = nn.Parameter(trial_model.consequents.data.clone())
        self.attribute_mask = trial_model.attribute_mask.clone()
        self.n_rules = trial_model.n_rules
        print(f"Rules pruned successfully. Performance drop: {performance_drop * 100:.2f}%")
        return True

    def prune_rules(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """
        Remove low-attention rules; a thin alias for ``prune_rules_with_recovery``.

        Args:
            threshold: Rule-attention threshold below which a rule is removed.
            X_val: Validation inputs.
            y_val: Validation targets.
            performance_drop_tolerance: Accepted relative increase of the
                validation loss (0.01 means 1%).
            best_val_loss: Reference validation loss.

        Returns:
            Whatever ``prune_rules_with_recovery`` returns (a bool telling
            whether rules were actually removed).
        """
        return self.prune_rules_with_recovery(
            threshold=threshold,
            X_val=X_val,
            y_val=y_val,
            performance_drop_tolerance=performance_drop_tolerance,
            best_val_loss=best_val_loss,
        )

    def grow_rule_with_performance_check(self, X_new, X_train, y_train, X_val, y_val, best_val_loss, device, lr, grow_epochs=10):
        """
        Try adding one rule and keep it only if the validation loss improves.

        A deep copy of the model receives the new rule (via ``grow_rule``) and is
        trained full-batch with plain MSE for ``grow_epochs`` steps using AdamW
        and cosine annealing. If the copy's validation MSE is below
        ``best_val_loss`` its parameters, attribute mask and rule count are
        copied onto ``self``; otherwise ``self`` is left as it was.

        Args:
            X_new: Samples used to initialise the new rule.
            X_train: Training inputs (converted with ``torch.tensor``).
            y_train: Training targets (converted with ``torch.tensor``).
            X_val: Validation inputs (converted with ``torch.tensor``).
            y_val: Validation targets (converted with ``torch.tensor``).
            best_val_loss: Validation loss the grown copy has to beat.
            device: Device the copy and the tensors are moved to.
            lr: Learning rate of the temporary optimizer.
            grow_epochs: Number of training steps after growing. Defaults to 10.

        Returns:
            bool: True if the new rule was kept, False if growth was discarded.
        """
        candidate = copy.deepcopy(self).to(device)
        candidate.grow_rule(X_new)

        trial_optimizer = optim.AdamW(candidate.parameters(), lr=lr)
        trial_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(trial_optimizer, T_max=grow_epochs)

        def _on_device(values):
            # float32 copy of the data on the requested device
            return torch.tensor(values, dtype=torch.float32).to(device)

        train_inputs = _on_device(X_train)
        train_targets = _on_device(y_train)
        val_inputs = _on_device(X_val)
        val_targets = _on_device(y_val)

        # Short fine-tuning of the grown copy.
        for _ in range(grow_epochs):
            candidate.train()
            trial_optimizer.zero_grad()
            predictions, _, _, _ = candidate.forward(train_inputs)
            F.mse_loss(predictions, train_targets).backward()
            trial_optimizer.step()
            trial_scheduler.step()

        # Score the grown copy on the validation data.
        candidate.eval()
        with torch.no_grad():
            val_predictions, _, _, _ = candidate.forward(val_inputs)
            loss_after = F.mse_loss(val_predictions, val_targets).item()

        print(f"After growing rule and training for {grow_epochs} epochs: Validation Loss = {loss_after:.4f}")

        if not loss_after < best_val_loss:
            # No gain: drop the copy, the live model stays as it is.
            print("Performance did not improve after growing rule. Rule growth reverted.")
            return False

        # Gain: take over the copy's state as fresh parameters.
        for name in ('mu', 'sigma', 'attention_weights', 'rule_attention_weights', 'consequents'):
            setattr(self, name, nn.Parameter(getattr(candidate, name).data.clone()))
        self.attribute_mask = candidate.attribute_mask.clone()
        self.n_rules = candidate.n_rules
        print("Performance improved after growing rule. New rule retained.")
        return True

    def grow_rule(self, X_new, min_sigma=1e-3):
        """
        Append one new rule whose membership functions are fitted to ``X_new``.

        The new centre and width are the per-feature mean and standard deviation
        of ``X_new``. The new attribute attention starts at the mean attention of
        the existing rules and the new rule attention at the mean existing rule
        logit, each with a small Gaussian perturbation. The consequent
        coefficients start as small random values and the new attribute-mask row
        is all ones.

        All rule-indexed parameters are replaced by new ``nn.Parameter`` objects,
        so an optimizer built on the old ones has to be re-created by the caller.

        Args:
            X_new: Samples the new rule is initialised from, shape
                (num_samples, n_inputs); must support ``.mean(axis=0)`` and
                ``.std(axis=0)``.
            min_sigma: Lower bound applied to the new widths so that a feature
                that is constant within ``X_new`` does not yield a zero-width
                membership function. Defaults to 1e-3.

        Raises:
            ValueError: If ``X_new`` contains no samples.
        """
        if len(X_new) == 0:
            raise ValueError("X_new must contain at least one sample to initialise a new rule.")

        device = self.mu.device
        dtype = self.mu.dtype

        # Centre and width of the new rule, each of shape (1, n_inputs).
        new_mu = torch.as_tensor(X_new.mean(axis=0), dtype=dtype, device=device).unsqueeze(0)
        new_sigma = torch.as_tensor(X_new.std(axis=0), dtype=dtype, device=device).unsqueeze(0)
        new_sigma = new_sigma.clamp(min=min_sigma)

        # Mean attribute attention (a probability) of the existing rules.
        if self.n_rules > 0:
            attention_mean = torch.sigmoid(self.attention_weights).data.mean(dim=0, keepdim=True)
        else:
            attention_mean = torch.ones(1, self.n_inputs, dtype=dtype).to(device)

        # ``attention_weights`` holds logits (it is always read through
        # ``torch.sigmoid``), so the perturbed mean attention has to be mapped
        # back to logit space before it is stored. The clamp keeps the logit finite.
        noise = torch.randn_like(attention_mean) * 0.05
        attention_eps = 1e-3
        new_attention = (attention_mean + noise).clamp(attention_eps, 1.0 - attention_eps)
        new_attention_weights = (torch.log(new_attention) - torch.log1p(-new_attention)).detach()

        # Rule attention: mean existing logit plus a small perturbation, or a
        # neutral logit of 0 when there are no rules yet.
        if self.n_rules > 0:
            rule_attention_mean_logit = self.rule_attention_weights.data.mean().unsqueeze(0)  # (1,)
            rule_attention_noise = torch.randn_like(rule_attention_mean_logit) * 0.05
            new_rule_attention_weight = (rule_attention_mean_logit + rule_attention_noise).detach()
        else:
            new_rule_attention_weight = torch.tensor([0.0], dtype=dtype).to(device)

        # Consequent coefficients start as small random values.
        new_consequents = torch.randn(1, self.n_inputs, dtype=dtype).to(device) * 0.01

        # Append the new row to every rule-indexed parameter.
        self.mu = nn.Parameter(torch.cat([self.mu.data, new_mu], dim=0))
        self.sigma = nn.Parameter(torch.cat([self.sigma.data, new_sigma], dim=0))
        self.attention_weights = nn.Parameter(torch.cat([self.attention_weights.data, new_attention_weights], dim=0))
        self.rule_attention_weights = nn.Parameter(torch.cat([self.rule_attention_weights.data, new_rule_attention_weight], dim=0))
        self.consequents = nn.Parameter(torch.cat([self.consequents.data, new_consequents], dim=0))

        # Every attribute of the new rule starts active.
        new_attribute_mask = torch.ones(1, self.n_inputs, dtype=self.attribute_mask.dtype).to(device)
        self.attribute_mask = torch.cat([self.attribute_mask, new_attribute_mask], dim=0)

        self.n_rules += 1

        # Sanity check: the mask must have one row per rule.
        assert self.attribute_mask.shape[0] == self.n_rules, \
            f"After growing, attribute_mask has shape {self.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"New rule added. Total rules: {self.n_rules}")

    def infer(self, x, targets=None):
        """
        Run a gradient-free forward pass in evaluation mode.

        The model is switched to ``eval()`` and is left in that mode.

        Args:
            x: Input batch handed to ``forward``.
            targets: Optional targets; when given, the MSE against the model
                output is computed as well.

        Returns:
            The model output when ``targets`` is None, otherwise a tuple
            ``(output, mse_loss_as_float)``.
        """
        with torch.no_grad():
            self.eval()
            predictions, _, _, _ = self.forward(x)
            if targets is not None:
                return predictions, F.mse_loss(predictions, targets).item()
            return predictions

    def save_model_with_architecture(self, scaler_X, scaler_y, path):
        """
        Write the model parameters and both scalers' statistics to ``path``.

        The checkpoint is a dict saved with ``torch.save``. It holds the rule
        count, the rule parameters and attribute mask as NumPy arrays, and the
        ``mean_`` / ``scale_`` attributes of the two scalers.

        Args:
            scaler_X: Fitted input scaler (``mean_`` and ``scale_`` are stored).
            scaler_y: Fitted target scaler (``mean_`` and ``scale_`` are stored).
            path: Destination file path.
        """
        tensor_fields = (
            'mu', 'sigma', 'attention_weights',
            'rule_attention_weights', 'consequents', 'attribute_mask',
        )

        checkpoint = {'n_rules': self.n_rules}
        for field in tensor_fields:
            checkpoint[field] = getattr(self, field).detach().cpu().numpy()
        checkpoint['scaler_X_mean'] = scaler_X.mean_
        checkpoint['scaler_X_scale'] = scaler_X.scale_
        checkpoint['scaler_y_mean'] = scaler_y.mean_
        checkpoint['scaler_y_scale'] = scaler_y.scale_

        torch.save(checkpoint, path)
        print(f"Model and architecture saved to {path}")

    @staticmethod
    def load_model_with_architecture(path, device='cpu'):
        checkpoint = torch.load(path, map_location=device)
        n_rules = checkpoint['n_rules']
        n_inputs = checkpoint['mu'].shape[1]

        # 初始化模型，设置 init_from_checkpoint=True 以跳过 KMeans 初始化
        model = AttentionDynamicAttributeAndRuleANFIS(
            n_inputs=n_inputs,
            n_rules=n_rules,
            X_train=None,  # 跳过 KMeans 初始化
            attention_threshold=0.1,
            init_from_checkpoint=True
        ).to(device)

        # 手动设置参数
        with torch.no_grad():
            model.mu = nn.Parameter(torch.tensor(checkpoint['mu'], dtype=torch.float32, device=device))
            model.sigma = nn.Parameter(torch.tensor(checkpoint['sigma'], dtype=torch.float32, device=device))
            model.attention_weights = nn.Parameter(torch.tensor(checkpoint['attention_weights'], dtype=torch.float32, device=device))
            model.rule_attention_weights = nn.Parameter(torch.tensor(checkpoint['rule_attention_weights'], dtype=torch.float32, device=device))
            model.consequents = nn.Parameter(torch.tensor(checkpoint['consequents'], dtype=torch.float32, device=device))

            # 使用 copy_ 复制到 attribute_mask buffer，确保在正确的设备上
            model.attribute_mask.copy_(torch.tensor(checkpoint['attribute_mask'], dtype=torch.float32, device=device))


        # 加载标准化器参数
        scaler_X = StandardScaler()
        scaler_X.mean_ = checkpoint['scaler_X_mean']
        scaler_X.scale_ = checkpoint['scaler_X_scale']

        scaler_y = StandardScaler()
        scaler_y.mean_ = checkpoint['scaler_y_mean']
        scaler_y.scale_ = checkpoint['scaler_y_scale']

        print(f"Model and architecture loaded from {path} with n_rules={n_rules}")

        return model, scaler_X, scaler_y

    def plot_membership_functions(self, feature_names=None):
        """
        Draw the Gaussian membership functions of every rule, one figure per rule.

        Curves are evaluated on a fixed grid of 1000 points in [-3, 3].
        Attributes whose ``attribute_mask`` entry is 0 are left out. Each curve
        is labelled with its attribute attention and each figure title shows the
        rule attention.

        Args:
            feature_names: Labels for the inputs; defaults to
                ``Input 1 .. Input n`` when None.
        """
        def _as_numpy(tensor):
            return tensor.detach().cpu().numpy()

        centers = _as_numpy(self.mu)
        widths = _as_numpy(self.sigma)
        attribute_attention = _as_numpy(torch.sigmoid(self.attention_weights))
        rule_attention = _as_numpy(torch.sigmoid(self.rule_attention_weights))
        grid = np.linspace(-3, 3, 1000)

        n_inputs = self.n_inputs
        labels = feature_names
        if labels is None:
            labels = [f'Input {i+1}' for i in range(n_inputs)]

        for r in range(self.n_rules):
            plt.figure(figsize=(10, 6))
            plt.title(f"Rule {r + 1}, Rule Attention: {rule_attention[r]:.4f}")

            # Only attributes that are still active in this rule are drawn.
            active_inputs = (j for j in range(n_inputs) if not self.attribute_mask[r, j] == 0)
            for j in active_inputs:
                # Gaussian membership; the 1e-8 guards against a zero width.
                membership = np.exp(-0.5 * ((grid - centers[r, j]) ** 2) / (widths[r, j] ** 2 + 1e-8))
                plt.plot(grid, membership, label=f"{labels[j]} (Attn: {attribute_attention[r, j]:.4f})")

            plt.legend()
            plt.xlabel('Input')
            plt.ylabel('Membership degree')
            plt.grid(True)
            plt.show()

# ============================
# 定义辅助函数
# ============================

def extract_fuzzy_rules(anfis_model, scaler_X, feature_names=None):
    """
    Render every rule of an ANFIS model as a human-readable string.

    Centres and widths are mapped back to the original feature scale with
    ``scaler_X.scale_`` / ``scaler_X.mean_``. Attributes whose mask entry is 0
    are left out of both the antecedent and the consequent. A rule with no
    active attribute reads ``IF True THEN Output = 0``.

    Args:
        anfis_model: Model exposing ``mu``, ``sigma``, ``attention_weights``,
            ``rule_attention_weights``, ``consequents`` and ``attribute_mask``.
        scaler_X: Input scaler providing ``scale_`` and ``mean_``.
        feature_names: Labels for the inputs; defaults to
            ``Input 1 .. Input n`` when None.

    Returns:
        list[str]: One formatted string per rule.
    """
    def _as_numpy(tensor):
        return tensor.detach().cpu().numpy()

    # Membership parameters in the original (unscaled) feature space.
    centers = _as_numpy(anfis_model.mu) * scaler_X.scale_ + scaler_X.mean_
    widths = _as_numpy(anfis_model.sigma) * scaler_X.scale_

    attribute_attention = _as_numpy(torch.sigmoid(anfis_model.attention_weights))
    rule_attention = _as_numpy(torch.sigmoid(anfis_model.rule_attention_weights))
    coefficients = _as_numpy(anfis_model.consequents)
    mask = _as_numpy(anfis_model.attribute_mask)

    n_rules = centers.shape[0]
    input_dim = centers.shape[1]
    if feature_names is None:
        feature_names = [f'Input {i+1}' for i in range(input_dim)]

    rules = []
    for i in range(n_rules):
        # Pruned attributes (mask == 0) take no part in the rule text.
        active = [j for j in range(input_dim) if not mask[i, j] == 0]

        antecedent_parts = [
            f"[{feature_names[j]} (Attn: {attribute_attention[i, j]:.4f}) "
            f"is Gaussian(c={centers[i, j]:.4f}, σ={widths[i, j]:.4f})]"
            for j in active
        ]
        consequent_parts = [
            f"({coefficients[i, j]:.4f} * {feature_names[j]} "
            f"(Attn: {attribute_attention[i, j]:.4f}))"
            for j in active
        ]

        antecedent_text = " AND ".join(antecedent_parts) if antecedent_parts else "True"
        consequent_text = " + ".join(consequent_parts) if consequent_parts else "0"

        rules.append(
            f"Rule {i+1} (Rule Attention: {rule_attention[i]:.4f}): IF "
            + antecedent_text + " THEN Output = " + consequent_text
        )

    return rules

def compute_overlap_index(mus, sigmas):
    """
    Compute the mean pairwise overlap of the rules' membership functions.

    For every input feature and every pair of rules (i, k) the overlap is
    ``max(0, (s_i + s_k - |mu_i - mu_k|) / (s_i + s_k))`` where ``s`` is the
    absolute width. The result is the mean over all pairs and all features.

    Widths are used by magnitude: a Gaussian membership depends only on
    ``sigma ** 2``, so a negative learned ``sigma`` describes the same curve
    as its absolute value. A pair whose widths are both zero contributes an
    overlap of 0.

    Args:
        mus: Array of shape (n_rules, n_inputs) with the membership centres.
        sigmas: Array of shape (n_rules, n_inputs) with the membership widths.

    Returns:
        float: Mean overlap index; 0.0 when there are fewer than two rules or
        no input features.
    """
    mus = np.asarray(mus, dtype=float)
    widths = np.abs(np.asarray(sigmas, dtype=float))
    n_rules, n_inputs = mus.shape
    if n_rules < 2 or n_inputs == 0:
        return 0.0

    # All unordered rule pairs (i < k), evaluated for every feature at once.
    first, second = np.triu_indices(n_rules, k=1)
    distance = np.abs(mus[first] - mus[second])      # (n_pairs, n_inputs)
    width_sum = widths[first] + widths[second]       # (n_pairs, n_inputs)

    # Divide only where the combined width is positive; other entries stay 0.
    overlap = np.zeros_like(distance)
    np.divide(width_sum - distance, width_sum, out=overlap, where=width_sum > 0)
    overlap = np.maximum(overlap, 0.0)

    # Every feature has the same number of pairs, so the mean of the
    # per-feature means equals the overall mean.
    return float(overlap.mean(axis=0).mean())

def compute_fuzzy_set_position_index(mus):
    """
    Measure how spread out the membership centres are.

    For each input feature the standard deviation of the centres across rules
    is taken; the index is the mean of those per-feature values.

    Args:
        mus: Array of shape (n_rules, n_inputs) with the membership centres.

    Returns:
        The mean per-feature standard deviation of the centres.
    """
    _, n_inputs = mus.shape
    per_feature_spread = [np.std(mus[:, j]) for j in range(n_inputs)]
    return np.mean(per_feature_spread)

def plot_attribute_weights(attribute_weights, feature_names):
    """
    Show a bar chart of the average attribute weight of each feature.

    Args:
        attribute_weights: One bar height per feature.
        feature_names: Feature labels used for the x-axis ticks.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(feature_names))

    ax.bar(positions, attribute_weights)
    ax.set_xticks(positions)
    ax.set_xticklabels(feature_names, rotation=45)
    ax.set_xlabel('Attributes')
    ax.set_ylabel('Average Attribute Weights')
    ax.set_title('Average Attribute Weights over Repeats')
    ax.grid(True)

    plt.tight_layout()
    plt.show()

def plot_heatmap(anfis_model, feature_names):
    """
    Draw a rules-by-features heat map of the attribute attention.

    Each cell shows ``sigmoid(attention_weights)`` for one (rule, feature)
    pair. Pruned attributes (mask entry 0) are shown with value 0 and marked
    with an ``X``.

    Args:
        anfis_model: Model exposing ``attention_weights``, ``attribute_mask``
            and ``n_rules``.
        feature_names: Feature labels used for the x-axis ticks.
    """
    attribute_mask_np = anfis_model.attribute_mask.detach().cpu().numpy()  # (n_rules, n_inputs)

    # Apply the sigmoid exactly once. The earlier version passed already
    # squashed values through a second sigmoid, which compressed every active
    # cell into roughly [0.5, 0.73]. Multiplying by the mask zeroes pruned cells.
    with torch.no_grad():
        attention = torch.sigmoid(anfis_model.attention_weights) * anfis_model.attribute_mask
    attention_np = attention.detach().cpu().numpy()  # (n_rules, n_inputs)

    # Cell annotations: two-decimal value, or "0.00 / X" for pruned attributes.
    annotations = [
        [
            "0.00\nX" if attribute_mask_np[r, a] == 0 else f"{attention_np[r, a]:.2f}"
            for a in range(attention_np.shape[1])
        ]
        for r in range(attention_np.shape[0])
    ]

    plt.figure(figsize=(12, 6))
    sns.heatmap(
        attention_np,
        annot=annotations,
        fmt='',
        cmap='viridis',
        xticklabels=feature_names,
        yticklabels=[f'Rule {i+1}' for i in range(anfis_model.n_rules)],
        cbar_kws={'label': 'Attention Weight'}
    )
    plt.title('属性注意力权重 (被剪枝的属性标记为 X)')
    plt.xlabel('输入特征')
    plt.ylabel('规则')
    plt.tight_layout()
    plt.show()

# ============================
# 定义训练函数
# ============================

def train_attention_dynamic_attribute_and_rule_anfis(
    X_train_np, y_train_np, X_val_np, y_val_np,
    initial_n_rules=3, epochs=1500, batch_size=32, lr=0.01,
    prune_frequency=25, prune_threshold=0.1,
    best_model_path='best_model.pth'
):
    """
    Train an AttentionDynamicAttributeAndRuleANFIS model with rule growth and pruning.

    Inputs and targets are standardised, the model is trained in mini-batches,
    and after every epoch it is scored on the validation data. Whenever the
    validation loss reaches a new minimum the model is written to
    ``best_model_path``. A new rule is tried when the validation loss stalls;
    attributes are pruned every ``prune_frequency`` epochs and rules every 50
    epochs, both only during the first 80% of the epochs. After training the
    checkpoint written during this run is loaded back, the rules are printed
    and written to ``results/rules_lr{lr}_final.txt``, and the validation-loss /
    rule-count curves are plotted.

    The function reads the module-level name ``feature_labels`` for the
    feature names used in the rule text.

    Args:
        X_train_np: Training inputs, shape (num_samples, n_inputs).
        y_train_np: Training targets, shape (num_samples,).
        X_val_np: Validation inputs, shape (num_val_samples, n_inputs).
        y_val_np: Validation targets, shape (num_val_samples,).
        initial_n_rules: Number of rules the model starts with. Defaults to 3.
        epochs: Number of training epochs. Defaults to 1500.
        batch_size: Mini-batch size. Defaults to 32.
        lr: Learning rate. Defaults to 0.01.
        prune_frequency: Attribute pruning is attempted every this many epochs.
        prune_threshold: Attention threshold for attribute pruning.
        best_model_path: Where the best model is saved. Defaults to
            'best_model.pth'.

    Returns:
        tuple: ``(anfis_model, scaler_X, scaler_y, total_active_attributes,
        training_info)`` where ``anfis_model`` is the reloaded best model (or
        the final model if no checkpoint was written in this run),
        ``total_active_attributes`` is the sum of its attribute mask and
        ``training_info`` is a dict of per-epoch lists.
    """
    os.makedirs('results', exist_ok=True)

    # Standardise inputs and targets; the validation data reuses the
    # statistics fitted on the training data.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    n_inputs = X_train_scaled_tensor.shape[1]

    anfis_model = AttentionDynamicAttributeAndRuleANFIS(
        n_inputs=n_inputs,
        n_rules=initial_n_rules,
        X_train=X_train_scaled,
        attention_threshold=prune_threshold
    ).to(device)

    def _new_optimizer_and_scheduler():
        # Growth and rule pruning replace the model's nn.Parameter objects, so
        # the optimizer has to be rebuilt on the current parameters. The
        # model's own ``optimizer`` attribute is kept pointing at the live one.
        new_optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
        new_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(new_optimizer, T_max=epochs)
        anfis_model.optimizer = new_optimizer
        return new_optimizer, new_scheduler

    optimizer, scheduler = _new_optimizer_and_scheduler()

    # Per-epoch training history.
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'active_rules': [],
        'total_rules': [],
        'val_rmse': [],
        'attribute_weights': [],
        'rule_attention_weights': [],
        'pruned_attributes': [],
        'total_active_attributes': []
    }

    best_val_loss = float('inf')
    best_model_saved = False  # True once this run has written best_model_path

    # Rule-growth settings.
    patience = 5             # stalled epochs before a new rule is tried
    grow_threshold = 0.0001  # minimum validation-loss gain that counts as progress
    no_improve_epochs = 0
    prev_val_loss = float('inf')

    max_rules = 50                    # upper bound on the number of rules
    attention_threshold_final = 0.25  # rule attention below this counts as inactive

    # Pruning only happens during the first 80% of the epochs.
    pruning_stop_epoch = int(epochs * 0.8)

    feature_names = feature_labels if feature_labels else [f'Input {i+1}' for i in range(n_inputs)]

    for epoch in trange(epochs, desc="Training"):
        anfis_model.train()
        # Mini-batch training over a fresh random permutation.
        permutation = torch.randperm(X_train_scaled_tensor.size()[0])
        epoch_loss = 0
        num_batches = 0
        for i in range(0, X_train_scaled_tensor.size()[0], batch_size):
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            loss_train, _ = anfis_model.train_step(
                batch_x,
                batch_y,
                optimizer,
                lambda_attention=1e-7,
                lambda_rule_attention=1e-8,
                lambda_diversity=1e-4
            )
            epoch_loss += loss_train
            num_batches += 1

        epoch_loss /= num_batches

        anfis_model.eval()

        with torch.no_grad():
            output, _, _, _ = anfis_model.forward(X_val_scaled_tensor)
            loss_val = F.mse_loss(output, y_val_scaled_tensor)
            # RMSE is reported on the original target scale.
            y_val_pred = scaler_y.inverse_transform(output.cpu().numpy().reshape(-1, 1)).flatten()
            y_val_true = scaler_y.inverse_transform(y_val_scaled_tensor.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

        scheduler.step()

        # Attention snapshots for this epoch.
        attention_weights = torch.sigmoid(anfis_model.attention_weights).detach().cpu().numpy()
        rule_attention_weights = torch.sigmoid(anfis_model.rule_attention_weights).detach().cpu().numpy()
        num_active_rules = np.sum(rule_attention_weights >= attention_threshold_final)

        # Number of attributes still active, summed over all rules.
        attribute_mask = anfis_model.attribute_mask.detach().cpu().numpy()
        total_active_attributes = np.sum(np.sum(attribute_mask, axis=1))

        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(loss_val.item())
        training_info['active_rules'].append(num_active_rules)
        training_info['total_rules'].append(anfis_model.n_rules)
        training_info['val_rmse'].append(val_rmse)
        training_info['total_active_attributes'].append(total_active_attributes)
        training_info['attribute_weights'].append(attention_weights.mean(axis=0))  # mean over rules
        training_info['rule_attention_weights'].append(rule_attention_weights)

        # New validation minimum: persist the model.
        if loss_val.item() < best_val_loss:
            best_val_loss = loss_val.item()
            anfis_model.save_model_with_architecture(scaler_X, scaler_y, best_model_path)
            best_model_saved = True
            print(f"\nEpoch {epoch+1}: New best validation loss: {loss_val.item():.6f}. Model saved.")

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.6f} - Val Loss: {loss_val.item():.6f} - Val RMSE: {val_rmse:.6f} - Total Rules: {anfis_model.n_rules} - Active Rules: {num_active_rules}")

        # Track whether the validation loss is still making progress.
        if loss_val.item() < prev_val_loss - grow_threshold:
            no_improve_epochs = 0
            prev_val_loss = loss_val.item()
        else:
            no_improve_epochs += 1

        # Stalled for `patience` epochs: try to grow a rule.
        if no_improve_epochs >= patience and anfis_model.n_rules < max_rules:
            print(f"\nEpoch {epoch+1}: No significant improvement in validation loss, growing a new rule. Current rules: {anfis_model.n_rules}")

            # Seed the new rule with the 10% of validation samples that have
            # the largest absolute error.
            # NOTE: with fewer than 10 validation samples int(...) is 0 and the
            # slice [-0:] selects every sample.
            residuals = (y_val_scaled_tensor.cpu().numpy() - output.cpu().numpy())
            high_error_indices = np.argsort(np.abs(residuals))[-int(0.1 * len(residuals)):]
            X_new_rule = X_val_scaled[high_error_indices]

            improved = anfis_model.grow_rule_with_performance_check(
                X_new_rule,
                X_train_scaled,
                y_train_scaled_tensor.cpu().numpy(),
                X_val_scaled,
                y_val_scaled_tensor.cpu().numpy(),
                best_val_loss,
                device,
                lr,
                grow_epochs=100
            )

            if improved:
                # The grown model beat the previous best on the validation
                # data, so record that loss and persist the model right away;
                # otherwise the improvement would be lost unless a later epoch
                # happened to beat the old best again.
                _, grown_val_loss = anfis_model.infer(X_val_scaled_tensor, y_val_scaled_tensor)
                if grown_val_loss < best_val_loss:
                    best_val_loss = grown_val_loss
                    anfis_model.save_model_with_architecture(scaler_X, scaler_y, best_model_path)
                    best_model_saved = True
                    print(f"\nEpoch {epoch+1}: New best validation loss after rule growth: {grown_val_loss:.6f}. Model saved.")
                prev_val_loss = best_val_loss
                optimizer, scheduler = _new_optimizer_and_scheduler()
                print("Optimizer and scheduler re-initialized after growing a new rule.")

            no_improve_epochs = 0

        # Attribute pruning every `prune_frequency` epochs.
        if (epoch + 1) % prune_frequency == 0 and epoch < pruning_stop_epoch:
            pruned_dict = anfis_model.prune_attributes_per_rule(
                threshold=prune_threshold,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01,
                best_val_loss=best_val_loss
            )
            training_info['pruned_attributes'].append(pruned_dict)
            if pruned_dict:
                print(f"Epoch {epoch+1}: Pruned attributes per rule: {pruned_dict}")
                optimizer, scheduler = _new_optimizer_and_scheduler()
                print("Optimizer and scheduler re-initialized after pruning attributes.")
        elif epoch >= pruning_stop_epoch and (epoch + 1) % prune_frequency == 0:
            print(f"Epoch {epoch+1}: Pruning has been stopped to stabilize the model structure.")

        # Rule pruning every 50 epochs.
        if (epoch + 1) % 50 == 0 and epoch < pruning_stop_epoch:
            pruned = anfis_model.prune_rules(
                threshold=attention_threshold_final,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01,
                best_val_loss=best_val_loss
            )
            if pruned:
                print(f"Epoch {epoch+1}: Pruned rules. Total rules: {anfis_model.n_rules}")
                optimizer, scheduler = _new_optimizer_and_scheduler()
                print("Optimizer and scheduler re-initialized after pruning rules.")
        elif epoch >= pruning_stop_epoch and (epoch + 1) % 50 == 0:
            print(f"Epoch {epoch+1}: Rule pruning has been stopped to stabilize the model structure.")

    # Reload the best model, but only a checkpoint written during this run;
    # a file left over from an earlier run must not be picked up.
    if best_model_saved and os.path.exists(best_model_path):
        anfis_model, scaler_X, scaler_y = AttentionDynamicAttributeAndRuleANFIS.load_model_with_architecture(best_model_path, device=device)
        print("\nLoaded the best model based on validation loss.")

        # Optional visualisations:
        # anfis_model.plot_membership_functions(feature_names=feature_labels)
        # plot_heatmap(anfis_model, feature_labels)

        rules = extract_fuzzy_rules(anfis_model, scaler_X, feature_names=feature_names)
        print(f"\n=== ADAR-ANFIS Extracted Fuzzy Rules ===")
        for rule in rules:
            print(rule)
            print()
        # The rule text contains the non-ASCII character 'σ'; write UTF-8
        # explicitly so the platform default encoding cannot make this fail.
        with open(f'results/rules_lr{lr}_final.txt', 'w', encoding='utf-8') as f:
            for rule in rules:
                f.write(rule + '\n')
    else:
        print("No improvement during training. Using the final model.")

    # Total number of active attributes of the returned model.
    attribute_mask_np = anfis_model.attribute_mask.detach().cpu().numpy()
    total_active_attributes = np.sum(attribute_mask_np)
    print(f"Total Number of Attributes Included in All Rules: {total_active_attributes:.2f}")

    # Validation loss and rule count over the epochs, on twin y-axes.
    fig, ax1 = plt.subplots(figsize=(10, 6))

    color = 'tab:blue'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Validation Loss', color=color)
    ax1.plot(training_info['epoch'], training_info['val_loss'], color=color, label='Validation Loss')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # shares the x-axis with ax1
    color = 'tab:red'
    ax2.set_ylabel('Number of Rules', color=color)
    ax2.plot(training_info['epoch'], training_info['total_rules'], color=color, label='Total Rules', linestyle='--')
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()
    plt.title('Validation Loss and Number of Rules over Epochs')
    plt.grid(True)
    plt.show()

    return anfis_model, scaler_X, scaler_y, total_active_attributes, training_info

# ============================
# 执行实验
# ============================

# Split the training set once more into a training part and a validation part
# (80% / 20%, fixed seed for reproducibility).
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
    X_train_np, y_train_np, test_size=0.2, random_state=42
)

# Experiment settings
initial_n_rules = 2  # number of rules the model starts with
epochs = 500
batch_size = 64
learning_rate = 0.01
prune_frequency = 25
prune_threshold = 0.1

# Train the model; the best checkpoint is written to 'best_model_dynamic.pth'.
anfis_model, scaler_X, scaler_y, total_active_attributes, training_info = train_attention_dynamic_attribute_and_rule_anfis(
    X_train_sub, y_train_sub, X_val_sub, y_val_sub,
    initial_n_rules=initial_n_rules,
    epochs=epochs,
    batch_size=batch_size,
    lr=learning_rate,
    prune_frequency=prune_frequency,
    prune_threshold=prune_threshold,
    best_model_path='best_model_dynamic.pth'
)


In [None]:
# ============================
# 导入必要的库
# ============================
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
from scipy.stats import norm

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# ============================
# Data loading and preprocessing
# ============================

# Appliances Energy Prediction dataset, read directly from the UCI ML
# Repository CSV (requires network access).
energy_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
data = pd.read_csv(energy_url)

# Feature selection: per the original note, these are the columns that remain
# once 'date' and the target 'Appliances' are excluded.
features_to_use = [
    'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
    'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
    'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
    'Tdewpoint', 'rv1', 'rv2'
]

# Keep only the modelling columns and drop every row with a missing value,
# then split the frame into inputs X and target y.
data = data[features_to_use + ['Appliances']].dropna()
X, y = data[features_to_use], data['Appliances']

# Train / test split (80% / 20%, fixed seed)
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=42,
)

# Feature names reused later (e.g. when printing rules)
feature_labels = features_to_use
# feature_names = features_to_use


# ============================
# 定义模型组件
# ============================

# 定义扩展后的 FuzzyLayer 类，加入属性注意力机制
class AttentionFuzzyLayer(nn.Module):
    """Gaussian membership layer with a per-rule, per-attribute attention gate.

    Each rule owns a centre ``c`` and a width ``sigma`` for every input
    attribute.  The squared, width-normalised distance to the centre is scaled
    by ``sigmoid(attention_weights) * attribute_mask`` before being summed over
    the attributes, so an attribute whose mask entry is 0 contributes nothing
    to the rule's firing strength.
    """

    def __init__(self, input_dim, n_rules):
        super().__init__()
        self.input_dim, self.n_rules = input_dim, n_rules
        shape = (n_rules, input_dim)

        # Centres (random) and widths (all ones)
        self.c = nn.Parameter(torch.randn(*shape))
        self.sigma = nn.Parameter(torch.ones(*shape))

        # Attention logits; they are passed through a sigmoid in forward()
        self.attention_weights = nn.Parameter(torch.randn(*shape))

        # Attribute mask: 1 = active, 0 = pruned.  Registered as a buffer so
        # it is saved in the state dict but is not a trainable parameter.
        self.register_buffer('attribute_mask', torch.ones(*shape))

    def forward(self, x):
        """Return ``(phi, attention)`` for a batch ``x`` of shape (batch, input_dim).

        ``phi`` has shape (batch, n_rules); ``attention`` is the masked
        attention of shape (n_rules, input_dim).
        """
        # Masked attention in [0, 1]
        gate = self.attribute_mask * torch.sigmoid(self.attention_weights)

        # Widths are clamped from below to avoid dividing by zero
        width = self.sigma.unsqueeze(0).clamp(min=1e-3)  # (1, n_rules, input_dim)

        # Width-normalised distance of every sample to every rule centre
        z = (x.unsqueeze(1) - self.c.unsqueeze(0)) / width  # (batch, n_rules, input_dim)

        # Attention-weighted Gaussian exponent, summed over the attributes
        log_phi = (-0.5 * z.pow(2) * gate.unsqueeze(0)).sum(dim=2)  # (batch, n_rules)

        return log_phi.exp(), gate

# 定义扩展后的 NormalizedLayer 类，加入规则注意力机制
class AttentionNormalizedLayer(nn.Module):
    """Normalise rule firing strengths after scaling them by a rule-level attention."""

    def __init__(self, n_rules):
        super().__init__()
        self.n_rules = n_rules

        # One attention logit per rule, initialised to 1
        self.rule_attention_weights = nn.Parameter(torch.ones(n_rules))

    def forward(self, phi):
        """Return ``(psi, rule_attention)`` for ``phi`` of shape (batch, n_rules).

        ``psi`` is the attention-scaled firing strength divided by its sum
        over the rules of each sample.
        """
        gate = torch.sigmoid(self.rule_attention_weights)  # (n_rules,)
        gated = phi * gate.unsqueeze(0)                    # (batch, n_rules)

        # The small constant keeps the division defined when every rule is silent
        total = gated.sum(dim=1, keepdim=True) + 1e-8
        return gated / total, gate

# 定义扩展后的 WeightedLayer 类
class AttentionWeightedLayer(nn.Module):
    """First-order consequent layer: one linear function of the inputs per rule."""

    def __init__(self, input_dim, n_rules):
        super().__init__()
        self.input_dim, self.n_rules = input_dim, n_rules

        # Consequent coefficients; column 0 is the bias term
        self.a = nn.Parameter(torch.randn(n_rules, input_dim + 1))

    def forward(self, x, psi):
        """Return each rule's linear output scaled by its normalised strength.

        x:   (batch, input_dim)
        psi: (batch, n_rules)
        Result: (batch, n_rules)
        """
        # Prepend a column of ones so the bias is handled like any coefficient
        bias_column = torch.ones(x.size(0), 1).to(x.device)
        augmented = torch.cat([bias_column, x], dim=1)  # (batch, input_dim + 1)

        # Broadcast inputs against coefficients and reduce over the feature axis
        rule_outputs = (augmented[:, None, :] * self.a[None, :, :]).sum(dim=2)  # (batch, n_rules)

        return psi * rule_outputs

# 定义 OutputLayer 类
class OutputLayer(nn.Module):
    """Collapse the per-rule contributions into one prediction per sample."""

    def forward(self, f):
        # f: (batch, n_rules) -> (batch,)
        return torch.sum(f, dim=1)

# ============================
# 定义完整的 SOFENN 模型，加入规则生长和剪枝功能
# ============================
class AttentionDynamicAttributeAndRuleSOFENN(nn.Module):
    """Attention-based fuzzy network whose rule base can grow and be pruned.

    The forward pass chains four stages: attention-gated Gaussian firing
    strengths, attention-weighted normalisation, per-rule linear consequents
    and a sum over the rules.

    Adding or removing rules changes the leading dimension of every
    rule-indexed tensor.  ``load_state_dict`` is therefore overridden to resize
    that dimension before values are copied; this is what allows a pruned or
    grown copy (or a checkpoint with a different rule count) to be loaded back.
    Whenever such a resize happens the ``nn.Parameter`` objects are replaced,
    so an optimizer built on the old parameters must be re-created by the
    caller.
    """

    def __init__(self, input_dim, n_rules, attention_threshold=0.1):
        super().__init__()
        self.input_dim = input_dim
        self.n_rules = n_rules
        # Stored as an attribute only; no method of this class reads it.
        self.attention_threshold = attention_threshold

        self.fuzzy_layer = AttentionFuzzyLayer(input_dim, n_rules)
        self.normalized_layer = AttentionNormalizedLayer(n_rules)
        self.weighted_layer = AttentionWeightedLayer(input_dim, n_rules)
        self.output_layer = OutputLayer()

    def forward(self, x):
        """Return ``(output, phi, attention, rule_attention)`` for a batch ``x``."""
        phi, attention = self.fuzzy_layer(x)
        psi, rule_attention = self.normalized_layer(phi)
        f = self.weighted_layer(x, psi)
        output = self.output_layer(f)
        return output, phi, attention, rule_attention

    # ------------------------------------------------------------------
    # Structural helpers
    # ------------------------------------------------------------------
    def _set_rule_count(self, n_rules):
        """Record ``n_rules`` on the model and on every layer that tracks it."""
        self.n_rules = n_rules
        self.fuzzy_layer.n_rules = n_rules
        self.normalized_layer.n_rules = n_rules
        self.weighted_layer.n_rules = n_rules

    def _resize_rules(self, n_rules):
        """Re-allocate all rule-indexed tensors for ``n_rules`` rules.

        The new tensors only carry placeholder values; callers are expected to
        overwrite them (``load_state_dict`` does).
        """
        ref = self.fuzzy_layer.c
        mask = self.fuzzy_layer.attribute_mask

        def blank(*shape):
            return nn.Parameter(torch.zeros(*shape, dtype=ref.dtype, device=ref.device))

        self.fuzzy_layer.c = blank(n_rules, self.input_dim)
        self.fuzzy_layer.sigma = blank(n_rules, self.input_dim)
        self.fuzzy_layer.attention_weights = blank(n_rules, self.input_dim)
        self.normalized_layer.rule_attention_weights = blank(n_rules)
        self.weighted_layer.a = blank(n_rules, self.input_dim + 1)
        self.fuzzy_layer.attribute_mask = torch.ones(
            n_rules, self.input_dim, dtype=mask.dtype, device=mask.device
        )
        self._set_rule_count(n_rules)

    def _keep_rules(self, keep_indices):
        """Drop every rule whose index is not in ``keep_indices`` (1-D index tensor)."""
        fuzzy, norm, weighted = self.fuzzy_layer, self.normalized_layer, self.weighted_layer
        fuzzy.c = nn.Parameter(fuzzy.c.data[keep_indices])
        fuzzy.sigma = nn.Parameter(fuzzy.sigma.data[keep_indices])
        fuzzy.attention_weights = nn.Parameter(fuzzy.attention_weights.data[keep_indices])
        norm.rule_attention_weights = nn.Parameter(norm.rule_attention_weights.data[keep_indices])
        weighted.a = nn.Parameter(weighted.a.data[keep_indices])
        fuzzy.attribute_mask = fuzzy.attribute_mask.data[keep_indices].clone()
        self._set_rule_count(len(keep_indices))

    def load_state_dict(self, state_dict, *args, **kwargs):
        """Load ``state_dict``, first matching this model's rule count to it.

        The stock implementation raises a size-mismatch error when the saved
        tensors have a different number of rules than the live model, which is
        exactly the situation after rule growth or rule pruning.
        """
        saved = state_dict.get('normalized_layer.rule_attention_weights')
        current = self.normalized_layer.rule_attention_weights.shape[0]
        if saved is not None and saved.shape[0] != current:
            self._resize_rules(saved.shape[0])
        return super().load_state_dict(state_dict, *args, **kwargs)

    @staticmethod
    def _relative_loss_increase(loss_after, best_val_loss):
        """Return ``(loss_after - best) / best`` as a float, safe for ``best == 0``."""
        reference = max(float(best_val_loss), 1e-12)
        return (float(loss_after) - reference) / reference

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train_step(self, x, target, optimizer, lambda_attention=1e-7, lambda_rule_attention=1e-8, lambda_diversity=1e-4):
        """Run one optimisation step on a batch.

        The loss is the MSE plus three weighted penalties: L1 on the attribute
        attention, L1 on the rule attention, and the mean off-diagonal cosine
        similarity between the rules' attention vectors (diversity term).

        Returns:
            ``(loss_value, output)``.  If the loss is NaN the step is skipped
            (no backward pass, no optimizer update) and the NaN is returned.
        """
        self.train()
        optimizer.zero_grad()
        output, _, attention, rule_attention = self.forward(x)

        # Prediction loss
        loss_pred = nn.functional.mse_loss(output, target)

        # L1 penalties on both attention levels
        loss_attention = lambda_attention * attention.abs().sum()
        loss_rule_attention = lambda_rule_attention * rule_attention.abs().sum()

        # Diversity penalty: discourage rules from attending to the same attributes
        if self.n_rules > 1:
            attention_norm = attention / (attention.norm(dim=1, keepdim=True) + 1e-8)
            similarity_matrix = torch.matmul(attention_norm, attention_norm.t())
            # Mean of the off-diagonal entries
            diversity_loss = torch.sum(similarity_matrix) - torch.diag(similarity_matrix).sum()
            diversity_loss = diversity_loss / (self.n_rules * (self.n_rules - 1))
        else:
            diversity_loss = torch.tensor(0.0).to(attention.device)

        loss_diversity = lambda_diversity * diversity_loss

        loss = loss_pred + loss_attention + loss_rule_attention + loss_diversity

        if torch.isnan(loss):
            print("Loss is NaN. Stopping training.")
            return loss.item(), output

        loss.backward()
        optimizer.step()

        return loss.item(), output

    # ------------------------------------------------------------------
    # Pruning
    # ------------------------------------------------------------------
    def prune_attributes_per_rule(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """Mask out, per rule, the attributes whose attention is below ``threshold``.

        The mask change is kept only if the validation MSE does not exceed
        ``best_val_loss`` by more than ``performance_drop_tolerance`` (relative);
        otherwise the previous mask is restored.

        Only the antecedent (membership) contribution of a pruned attribute is
        removed.  The consequent coefficients in ``weighted_layer.a`` are left
        untouched and are still used by ``forward``.

        Returns:
            dict mapping rule index -> list of attribute indices that were
            pruned; empty when nothing qualified or the pruning was reverted.

        Raises:
            ValueError: if ``X_val``, ``y_val`` or ``best_val_loss`` is missing.
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")
        if best_val_loss is None:
            raise ValueError("best_val_loss must be provided for validation performance check.")

        mask = self.fuzzy_layer.attribute_mask
        with torch.no_grad():
            attention = torch.sigmoid(self.fuzzy_layer.attention_weights)  # (n_rules, input_dim)
            # Still-active attributes whose attention fell below the threshold
            candidates = (attention < threshold) & (mask == 1)

        pruned_dict = {}
        for rule_idx in range(self.n_rules):
            prune_indices = torch.where(candidates[rule_idx])[0].tolist()
            if prune_indices:
                pruned_dict[rule_idx] = prune_indices

        if not pruned_dict:
            # Nothing to prune: skip the validation pass and the status message.
            return pruned_dict

        # Apply the mask in place, evaluate, and roll back if it hurts too much.
        previous_mask = mask.clone()
        with torch.no_grad():
            mask[candidates] = 0.0

        _, loss_after = self.infer(X_val, y_val)
        performance_drop = self._relative_loss_increase(loss_after, best_val_loss)

        if performance_drop > performance_drop_tolerance:
            with torch.no_grad():
                mask.copy_(previous_mask)
            print(f"Pruning was reverted due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            return {}

        print(f"Attribute pruning successful. Performance drop: {performance_drop * 100:.2f}%")
        return pruned_dict

    def prune_rules(self, threshold=0.1, X_val=None, y_val=None, performance_drop_tolerance=0.01, best_val_loss=None):
        """Remove the rules whose rule attention is below ``threshold``.

        The removal is tried on a deep copy first and adopted only if the
        validation MSE does not exceed ``best_val_loss`` by more than
        ``performance_drop_tolerance`` (relative).  At least one rule is always
        kept.  After a successful prune the parameters are new objects, so the
        caller must rebuild its optimizer.

        Returns:
            bool: True when rules were actually removed.

        Raises:
            ValueError: if ``X_val``, ``y_val`` or ``best_val_loss`` is missing.
        """
        if X_val is None or y_val is None:
            raise ValueError("X_val and y_val must be provided for validation performance check.")
        if best_val_loss is None:
            raise ValueError("best_val_loss must be provided for validation performance check.")

        with torch.no_grad():
            rule_attention = torch.sigmoid(self.normalized_layer.rule_attention_weights)
            prune_indices = torch.where(rule_attention < threshold)[0]
            keep_indices = torch.where(rule_attention >= threshold)[0]

        if len(prune_indices) == 0:
            # No rule is below the threshold
            return False
        if len(keep_indices) == 0:
            # Removing every rule would leave a model that can only output zero
            print("Rule pruning skipped: every rule is below the attention threshold.")
            return False

        # Try the removal on a copy so the live model is untouched on failure
        candidate = copy.deepcopy(self)
        candidate._keep_rules(keep_indices)

        _, loss_after = candidate.infer(X_val, y_val)
        performance_drop = self._relative_loss_increase(loss_after, best_val_loss)

        if performance_drop > performance_drop_tolerance:
            print(f"Rule pruning was not performed due to performance degradation: Loss increased by {performance_drop * 100:.2f}%")
            return False

        # load_state_dict resizes this model to the candidate's rule count
        self.load_state_dict(candidate.state_dict())
        print(f"Rules pruned successfully. Performance drop: {performance_drop * 100:.2f}%")
        return True

    # ------------------------------------------------------------------
    # Growth
    # ------------------------------------------------------------------
    def grow_rule(self, X_new):
        """Append one rule initialised from the samples in ``X_new``.

        ``X_new`` is an array of shape (num_samples, input_dim) in the same
        (scaled) space the model is trained in.  Its per-feature mean and
        standard deviation become the new rule's centre and width.  The new
        parameters are fresh ``nn.Parameter`` objects, so an existing optimizer
        has to be re-created afterwards.
        """
        device = self.fuzzy_layer.c.device
        dtype = self.fuzzy_layer.c.dtype

        # Centre and width from the supplied samples
        new_c = torch.tensor(X_new.mean(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, input_dim)
        new_sigma = torch.tensor(X_new.std(axis=0), dtype=dtype).unsqueeze(0).to(device)  # (1, input_dim)

        # Mean attribute attention (in probability space) of the existing rules
        if self.n_rules > 0:
            existing_attention = torch.sigmoid(self.fuzzy_layer.attention_weights).data  # (n_rules, input_dim)
            attention_mean = existing_attention.mean(dim=0, keepdim=True)  # (1, input_dim)
        else:
            attention_mean = torch.ones(1, self.input_dim, dtype=dtype).to(device)

        # Perturb the mean slightly so the new rule is not a clone.
        # `attention_weights` stores logits (forward() applies the sigmoid), so
        # the perturbed probability is mapped back through the logit; `eps`
        # clamps it into (0, 1) first to keep the logit finite.
        noise = torch.randn_like(attention_mean) * 0.05
        new_attention_weights = torch.logit((attention_mean + noise).detach(), eps=1e-4)

        # Rule attention: mean of the existing logits plus a small perturbation
        if self.n_rules > 0:
            existing_rule_attention_logits = self.normalized_layer.rule_attention_weights.data  # (n_rules,)
            rule_attention_mean_logit = existing_rule_attention_logits.mean().unsqueeze(0)  # (1,)
            rule_attention_noise = torch.randn_like(rule_attention_mean_logit) * 0.05
            new_rule_attention_weight = (rule_attention_mean_logit + rule_attention_noise).detach()
        else:
            # Neutral logit (sigmoid -> 0.5)
            new_rule_attention_weight = torch.tensor([0.0], dtype=dtype).to(device)

        # Consequent coefficients start as small random values
        new_a = torch.randn(1, self.input_dim + 1, dtype=dtype).to(device) * 0.01  # (1, input_dim + 1)

        # Append the new row to every rule-indexed tensor
        fuzzy, norm, weighted = self.fuzzy_layer, self.normalized_layer, self.weighted_layer
        fuzzy.c = nn.Parameter(torch.cat([fuzzy.c.data, new_c], dim=0))
        fuzzy.sigma = nn.Parameter(torch.cat([fuzzy.sigma.data, new_sigma], dim=0))
        fuzzy.attention_weights = nn.Parameter(torch.cat([fuzzy.attention_weights.data, new_attention_weights], dim=0))
        norm.rule_attention_weights = nn.Parameter(torch.cat([norm.rule_attention_weights.data, new_rule_attention_weight], dim=0))
        weighted.a = nn.Parameter(torch.cat([weighted.a.data, new_a], dim=0))

        # The new rule starts with every attribute active
        new_attribute_mask = torch.ones(1, self.input_dim, dtype=fuzzy.attribute_mask.dtype).to(device)
        fuzzy.attribute_mask = torch.cat([fuzzy.attribute_mask, new_attribute_mask], dim=0)

        self._set_rule_count(self.n_rules + 1)

        # Sanity check: the mask must have one row per rule
        assert self.fuzzy_layer.attribute_mask.shape[0] == self.n_rules, \
            f"After growing, attribute_mask has shape {self.fuzzy_layer.attribute_mask.shape}, but n_rules={self.n_rules}"

        print(f"New rule added. Total rules: {self.n_rules}")

    def grow_rule_with_performance_check(self, X_new, X_train, y_train, X_val, y_val, best_val_loss, device, lr, grow_epochs=10):
        """Try adding a rule on a copy and keep it only if validation improves.

        A deep copy receives the new rule and is trained full-batch with plain
        MSE for ``grow_epochs`` steps.  If its validation MSE is below
        ``best_val_loss`` the copy's state is loaded into this model (which
        replaces the parameters, so the caller must rebuild its optimizer).

        Args:
            X_new: samples used to initialise the new rule, (num_samples, input_dim).
            X_train, y_train, X_val, y_val: arrays or tensors in the same
                (scaled) space the model is trained in.
            best_val_loss: validation MSE the grown model has to beat.
            device: device on which the trial copy is trained.
            lr: learning rate for the trial optimizer.
            grow_epochs: number of full-batch training steps for the trial.

        Returns:
            bool: True when the new rule was kept.
        """
        # Grow on a copy so a failed attempt leaves this model untouched
        model_copy = copy.deepcopy(self).to(device)
        model_copy.grow_rule(X_new)

        optimizer_copy = optim.AdamW(model_copy.parameters(), lr=lr)
        scheduler_copy = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_copy, T_max=grow_epochs)

        X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32).to(device)
        y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).to(device)
        X_val_tensor = torch.as_tensor(X_val, dtype=torch.float32).to(device)
        y_val_tensor = torch.as_tensor(y_val, dtype=torch.float32).to(device)

        # Short full-batch fine-tuning of the grown copy
        for _ in range(grow_epochs):
            model_copy.train()
            optimizer_copy.zero_grad()
            output, _, _, _ = model_copy.forward(X_train_tensor)
            loss = nn.functional.mse_loss(output, y_train_tensor)
            loss.backward()
            optimizer_copy.step()
            scheduler_copy.step()

        _, loss_after = model_copy.infer(X_val_tensor, y_val_tensor)

        print(f"After growing rule and training for {grow_epochs} epochs: Validation Loss = {loss_after:.4f}")

        if loss_after < best_val_loss:
            # load_state_dict resizes this model to the copy's rule count
            self.load_state_dict(model_copy.state_dict())
            print("Performance improved after growing rule. New rule retained.")
            return True

        # Discard the copy; this model is unchanged
        print("Performance did not improve after growing rule. Rule growth reverted.")
        return False

    # ------------------------------------------------------------------
    # Inference, rule extraction, persistence
    # ------------------------------------------------------------------
    def infer(self, x, targets=None):
        """Predict without tracking gradients (switches the model to eval mode).

        Returns the output tensor, or ``(output, mse)`` with ``mse`` as a
        Python float when ``targets`` is given.
        """
        with torch.no_grad():
            self.eval()
            output, _, _, _ = self.forward(x)
            if targets is None:
                return output
            loss = nn.functional.mse_loss(output, targets)
            return output, loss.item()

    def extract_rules(self, scaler_X, scaler_y, feature_names=None):
        """Render every rule as a human-readable string in original data units.

        Centres, widths and consequent coefficients are mapped back from the
        standardised space using the ``mean_`` / ``scale_`` attributes of
        ``scaler_X`` and ``scaler_y``.  Attributes whose mask entry is 0 are
        left out of both the antecedent and the consequent text.
        NOTE: ``forward`` still applies the consequent coefficients of masked
        attributes, so the printed consequent omits terms the model uses.

        Returns:
            list[str]: one string per rule.
        """
        if feature_names is None:
            feature_names = [f'Input {i+1}' for i in range(self.input_dim)]

        c = self.fuzzy_layer.c.detach().cpu().numpy()  # (n_rules, input_dim)
        sigma = self.fuzzy_layer.sigma.detach().cpu().numpy()  # (n_rules, input_dim)
        attention_weights = torch.sigmoid(self.fuzzy_layer.attention_weights).detach().cpu().numpy()
        attribute_mask = self.fuzzy_layer.attribute_mask.detach().cpu().numpy()
        rule_attention_weights = torch.sigmoid(self.normalized_layer.rule_attention_weights).detach().cpu().numpy()
        a = self.weighted_layer.a.detach().cpu().numpy()  # (n_rules, input_dim + 1)

        # Undo the standardisation
        x_scale = scaler_X.scale_.reshape(1, -1)
        x_mean = scaler_X.mean_.reshape(1, -1)
        y_scale = scaler_y.scale_[0]

        c_orig = c * x_scale + x_mean
        sigma_orig = sigma * x_scale
        a_orig = a.copy()
        a_orig[:, 1:] = a[:, 1:] / x_scale * y_scale
        # The bias absorbs the target mean and the shift introduced by centring the inputs
        a_orig[:, 0] = y_scale * a[:, 0] + scaler_y.mean_[0] - np.sum(
            a[:, 1:] * x_mean / x_scale * y_scale,
            axis=1
        )

        rules = []
        for j in range(self.n_rules):
            rule_str = (f"Rule {j+1} (Rule Attention: {rule_attention_weights[j]:.4f}): IF ")

            # Antecedent: one Gaussian clause per active attribute
            antecedents = []
            for i in range(self.input_dim):
                if attribute_mask[j, i] == 0:
                    continue  # pruned attribute
                attention_value = attention_weights[j, i]
                mu = c_orig[j, i]
                sigma_i = sigma_orig[j, i]
                antecedents.append(
                    f"[{feature_names[i]} (Attn: {attention_value:.4f}) is Gaussian(c={mu:.4f}, σ={sigma_i:.4f})]"
                )
            antecedent_str = " AND ".join(antecedents) if antecedents else "True"
            rule_str += antecedent_str + " THEN Output = "

            # Consequent: bias followed by one signed term per active attribute
            a_j = a_orig[j, :]  # (input_dim + 1,)
            consequent_terms = [f"{a_j[0]:.4f}"]
            for idx, coef in enumerate(a_j[1:]):
                if attribute_mask[j, idx] == 0:
                    continue  # pruned attribute
                attention_value = attention_weights[j, idx]
                if coef >= 0:
                    term = f"+ {coef:.4f} * {feature_names[idx]} (Attn: {attention_value:.4f})"
                else:
                    term = f"- {abs(coef):.4f} * {feature_names[idx]} (Attn: {attention_value:.4f})"
                consequent_terms.append(term)
            rule_str += " ".join(consequent_terms)
            rules.append(rule_str)
        return rules

    def save_model(self, path):
        """Write the model's state dict to ``path``."""
        torch.save(self.state_dict(), path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        """Load a state dict from ``path`` onto this model's current device.

        ``map_location`` lets a checkpoint saved on another device (e.g. a GPU)
        be loaded on this one; the overridden ``load_state_dict`` adapts the
        rule count to the checkpoint.
        """
        device = self.fuzzy_layer.c.device
        self.load_state_dict(torch.load(path, map_location=device))
        print(f"Model loaded from {path}")

# ============================
# 定义训练函数
# ============================
def train_attention_dynamic_attribute_and_rule_sofenn(
    X_train_np, y_train_np, X_val_np, y_val_np,
    initial_n_rules=3, epochs=1500, batch_size=32, lr=0.01,
    prune_frequency=190, prune_threshold=0.1,
    best_model_path='best_sofenn_model.pth'
):
    """Train the SOFENN model with rule growth and attribute/rule pruning.

    Inputs and target are standardised with statistics fitted on the training
    arrays; the model works entirely in that standardised space.  After every
    epoch the validation MSE is computed and the best model so far is kept
    (and its state dict written to ``best_model_path``).  A new rule is tried
    when the validation loss has not improved for a fixed number of epochs;
    attributes are pruned every ``prune_frequency`` epochs and rules every 50
    epochs, both only during the first 95% of the run.

    Args:
        X_train_np, y_train_np: training inputs (n, d) and targets (n,), raw units.
        X_val_np, y_val_np: validation inputs and targets, raw units.
        initial_n_rules: number of rules the model starts with.
        epochs: number of training epochs.
        batch_size: mini-batch size.
        lr: learning rate for AdamW.
        prune_frequency: attribute pruning is attempted every this many epochs.
        prune_threshold: attention threshold used for attribute pruning.
        best_model_path: file that receives the best state dict.

    Returns:
        ``(model, scaler_X, scaler_y)`` where ``model`` is the snapshot with
        the lowest validation loss (or the final model if none was recorded).
    """
    # Standardise inputs and target (validation reuses the training statistics)
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    # Convert to tensors on the training device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    # Build the model
    input_dim = X_train_scaled_tensor.shape[1]
    model = AttentionDynamicAttributeAndRuleSOFENN(
        input_dim=input_dim,
        n_rules=initial_n_rules,
        attention_threshold=prune_threshold
    ).to(device)

    def _new_optimizer(remaining_epochs):
        """Create an AdamW optimizer and cosine schedule over the model's current parameters."""
        opt = optim.AdamW(model.parameters(), lr=lr)
        # T_max is kept >= 1 so a rebuild on the very last epoch stays valid
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=max(1, remaining_epochs))
        return opt, sched

    optimizer, scheduler = _new_optimizer(epochs)

    # Per-epoch bookkeeping (local only; not part of the return value)
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'active_rules': [],
        'total_rules': [],
        'val_rmse': [],
        'attribute_weights': [],
        'rule_attention_weights': [],
        'pruned_attributes': [],
        'total_active_attributes': []
    }

    # Best validation loss and a full snapshot of the model that achieved it.
    # A model snapshot (rather than a state dict) is kept because the number
    # of rules may change later, which would make the state dict unloadable.
    best_val_loss = float('inf')
    best_model = None

    # Rule growth / pruning settings
    patience = 25            # epochs without improvement before a rule is grown
    grow_threshold = 0.0001  # minimum decrease of the validation loss that counts as improvement
    no_improve_epochs = 0
    prev_val_loss = float('inf')

    max_rules = 9               # upper bound on the number of rules
    attention_threshold = 0.05  # rule attention below this marks a rule as inactive / prunable

    # Pruning is only attempted during the first 95% of the epochs
    pruning_stop_epoch = int(epochs * 0.95)

    for epoch in range(epochs):
        model.train()

        # Mini-batch pass over a fresh permutation of the training set
        permutation = torch.randperm(X_train_scaled_tensor.size()[0])
        epoch_loss = 0
        num_batches = 0
        for i in range(0, X_train_scaled_tensor.size()[0], batch_size):
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            loss_train, _ = model.train_step(
                batch_x,
                batch_y,
                optimizer,
                lambda_attention=1e-7,
                lambda_rule_attention=1e-8,
                lambda_diversity=1e-4
            )
            epoch_loss += loss_train
            num_batches += 1

        epoch_loss /= num_batches

        # Validation pass
        model.eval()
        with torch.no_grad():
            output, phi, attention, rule_attention = model.forward(X_val_scaled_tensor)
            loss_val = nn.functional.mse_loss(output, y_val_scaled_tensor)
            # RMSE in the original target units
            y_val_pred = scaler_y.inverse_transform(output.cpu().numpy().reshape(-1, 1)).flatten()
            y_val_true = scaler_y.inverse_transform(y_val_scaled_tensor.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

        scheduler.step()

        # Attention snapshots for this epoch
        attention_weights = torch.sigmoid(model.fuzzy_layer.attention_weights).detach().cpu().numpy()
        rule_attention_weights = torch.sigmoid(model.normalized_layer.rule_attention_weights).detach().cpu().numpy()
        num_active_rules = np.sum(rule_attention_weights >= attention_threshold)

        # Number of (rule, attribute) pairs that are still active
        attribute_mask = model.fuzzy_layer.attribute_mask.detach().cpu().numpy()
        total_active_attributes = np.sum(attribute_mask)

        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(loss_val.item())
        training_info['active_rules'].append(num_active_rules)
        training_info['total_rules'].append(model.n_rules)
        training_info['val_rmse'].append(val_rmse)
        training_info['attribute_weights'].append(attention_weights.mean(axis=0))  # averaged over rules
        training_info['rule_attention_weights'].append(rule_attention_weights)
        training_info['total_active_attributes'].append(total_active_attributes)

        # Track the best model
        if loss_val.item() < best_val_loss:
            best_val_loss = loss_val.item()
            best_model = copy.deepcopy(model)
            torch.save(model.state_dict(), best_model_path)
            print(f"Epoch {epoch+1}: New best validation loss: {loss_val.item():.4f}. Model saved.")

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {loss_val.item():.4f} - Val RMSE: {val_rmse:.4f} - Total Rules: {model.n_rules} - Active Rules: {num_active_rules}")

        # Count epochs without a significant validation improvement
        if loss_val.item() < prev_val_loss - grow_threshold:
            no_improve_epochs = 0
            prev_val_loss = loss_val.item()
        else:
            no_improve_epochs += 1

        # Stagnation: try to grow a rule
        if no_improve_epochs >= patience and model.n_rules < max_rules:
            print(f"Epoch {epoch+1}: No significant improvement in validation loss, growing a new rule. Current rules: {model.n_rules}")
            # Seed the new rule with the 10% of validation samples that have
            # the largest error (at least one sample, otherwise the slice
            # `[-0:]` would silently select everything).
            residuals = (y_val_scaled_tensor.cpu().numpy() - output.cpu().numpy())
            n_high_error = max(1, int(0.1 * len(residuals)))
            high_error_indices = np.argsort(np.abs(residuals))[-n_high_error:]
            # Everything handed to the model must be in the standardised
            # space it is trained in, hence the *scaled* arrays.
            X_new_rule = X_val_scaled[high_error_indices]
            improved = model.grow_rule_with_performance_check(
                X_new_rule,
                X_train_scaled,
                y_train_scaled,
                X_val_scaled,
                y_val_scaled,
                best_val_loss,
                device,
                lr,
                grow_epochs=100  # training steps applied to the grown copy
            )
            if improved:
                # Restart the stagnation reference from the best loss so far
                prev_val_loss = best_val_loss
                # The model's parameters were replaced; rebuild the optimizer
                optimizer, scheduler = _new_optimizer(epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after growing a new rule.")
            # On failure the growth was already discarded inside the method
            no_improve_epochs = 0

        # Periodic attribute pruning
        if (epoch + 1) % prune_frequency == 0 and epoch < pruning_stop_epoch:
            pruned_dict = model.prune_attributes_per_rule(
                threshold=prune_threshold,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01,
                best_val_loss=best_val_loss
            )
            training_info['pruned_attributes'].append(pruned_dict)
            if pruned_dict:
                print(f"Epoch {epoch+1}: Pruned attributes per rule: {pruned_dict}")
                optimizer, scheduler = _new_optimizer(epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after pruning attributes.")

        # Periodic rule pruning
        if (epoch + 1) % 50 == 0 and epoch < pruning_stop_epoch:
            pruned = model.prune_rules(
                threshold=attention_threshold,
                X_val=X_val_scaled_tensor,
                y_val=y_val_scaled_tensor,
                performance_drop_tolerance=0.01,
                best_val_loss=best_val_loss
            )
            if pruned:
                print(f"Epoch {epoch+1}: Pruned rules. Total rules: {model.n_rules}")
                optimizer, scheduler = _new_optimizer(epochs - epoch - 1)
                print("Optimizer and scheduler re-initialized after pruning rules.")

    # Hand back the best snapshot rather than the last state
    if best_model is not None:
        model = best_model
        print("Loaded the best model based on validation loss.")
    else:
        print("No improvement during training. Using the final model.")

    return model, scaler_X, scaler_y

# ============================
# Train and test the updated model
# ============================
# Experiment parameters
initial_n_rules = 3
learning_rate = 0.01
epochs = 500
batch_size = 512
prune_frequency = 25
prune_threshold = 0.1
repeats = 1  # number of repetitions

# Per-repeat results
results_sofenn = []
test_rmse_list = []
time_list = []
for repeat in range(repeats):
    start_time = time.time()

    # Split the training data again into train / validation (seed = repeat index)
    X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
        X_train_np, y_train_np, test_size=0.2, random_state=repeat
    )

    # Where the best state dict of this repeat is written
    best_model_path = f'best_sofenn_model_repeat{repeat+1}.pth'

    # Train the model.  Model selection, rule growth and pruning are driven by
    # the validation split; the test set is deliberately NOT passed in here so
    # the test RMSE reported below is not biased by selection on test data.
    sofenn_model, scaler_X, scaler_y = train_attention_dynamic_attribute_and_rule_sofenn(
        X_train_sub, y_train_sub, X_val_sub, y_val_sub,
        initial_n_rules=initial_n_rules,
        epochs=epochs,
        batch_size=batch_size,
        lr=learning_rate,
        prune_frequency=prune_frequency,
        prune_threshold=prune_threshold,
        best_model_path=best_model_path
    )

    # Evaluate on the held-out test set
    sofenn_model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_test_scaled = scaler_X.transform(X_test_np)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
    with torch.no_grad():
        y_pred_scaled = sofenn_model.infer(X_test_tensor)
        # Predictions are mapped back to original units and compared with the raw targets
        y_pred = scaler_y.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
        y_true = y_test_np
        test_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        test_rmse_list.append(test_rmse)

    # Wall-clock time of this repeat (training + test evaluation)
    end_time = time.time()
    time_taken = end_time - start_time
    time_list.append(time_taken)
    print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")

    # Extract and print the fuzzy rules
    rules = sofenn_model.extract_rules(scaler_X, scaler_y, feature_names=features_to_use)
    print(f"\nFuzzy Rules for Repeat={repeat+1}:")
    for rule in rules:
        print(rule)
        print()

    # Save the final model of this repeat
    torch.save(sofenn_model.state_dict(), f'sofenn_model_repeat{repeat+1}.pth')

    # Record the results
    result = {
        'repeat': repeat + 1,
        'test_rmse': test_rmse,
        'time_taken': time_taken,
        'total_active_attributes': np.sum(sofenn_model.fuzzy_layer.attribute_mask.detach().cpu().numpy())
    }
    results_sofenn.append(result)

# Print the result of every repeat
for res in results_sofenn:
    print(f"Repeat {res['repeat']}: Test RMSE={res['test_rmse']:.4f}, Time={res['time_taken']:.2f}s, Total Active Attributes={res['total_active_attributes']}")

# Mean and standard deviation of RMSE and run time
test_rmse_mean = np.mean(test_rmse_list)
test_rmse_std = np.std(test_rmse_list)
time_mean = np.mean(time_list)
time_std = np.std(time_list)

print(f"\nResults:")
print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")

# Average number of active (rule, attribute) pairs over the repeats
total_attributes_list = [res['total_active_attributes'] for res in results_sofenn]
average_total_attributes = np.mean(total_attributes_list)
print(f"\nAverage Total Active Attributes over {repeats} repeats: {average_total_attributes:.2f}")


In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# 从 UCI ML Repo 导入数据集
# from ucimlrepo import fetch_ucirepo

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# Appliances Energy Prediction dataset, read as CSV straight from the UCI ML repository.
energy_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
data = pd.read_csv(energy_url)

# Model inputs: the columns listed below; 'date' and the target 'Appliances' are left out.
features_to_use = [
    'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
    'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
    'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
    'Tdewpoint', 'rv1', 'rv2'
]

# Keep only the modelling columns and drop every row with a missing value,
# so that inputs and target stay aligned row by row.
data = data[features_to_use + ['Appliances']].dropna()
X = data[features_to_use]
y = data['Appliances']

# Hold out 20% of the rows as the test set (fixed seed for a reproducible split).
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=42
)

# Alias used by later cells when labelling features.
feature_labels = features_to_use
# FuBiNFS: fuzzy bi-clustering followed by a Gaussian rule base with a linear read-out
def fubinfs(X_train_np, y_train_np, X_test_np, y_test_np, n_clusters=3, max_iter=100, tol=1e-5, m=2,
            feature_names=None, random_state=42):
    """
    Run the FuBiNFS procedure and evaluate it on the test set.

    The standardised training matrix is fuzzily bi-clustered: every cell
    (sample k, attribute d) receives a membership U[c, k, d] in each cluster c,
    with one centre per attribute (V_k) and one centre per sample (V_d).  The
    per-attribute centres V_k then serve as centres of Gaussian membership
    functions, the normalised rule activations are fed to an ordinary
    least-squares model fitted on the standardised target, and predictions are
    mapped back to the original target scale.

    Parameters:
    - X_train_np: training inputs, shape (K, D)
    - y_train_np: training targets, shape (K,)
    - X_test_np: test inputs, shape (N_test, D)
    - y_test_np: test targets, shape (N_test,)
    - n_clusters: number of clusters / rules C (>= 1)
    - max_iter: maximum number of clustering iterations
    - tol: stop when the objective changes by less than this between iterations
    - m: fuzzifier; must be > 1 because the membership update uses 1 / (m - 1)
    - feature_names: optional sequence of D names used in the rule strings.
      When omitted, the module-level ``features_to_use`` list is used if it has
      exactly D entries; otherwise generic names ``x1`` ... ``xD`` are generated.
    - random_state: seed handed to ``np.random.seed`` before the random
      initialisation (this reseeds NumPy's global generator, as before).
      ``None`` leaves the global generator untouched.  With a fixed seed,
      repeated calls on the same data return the same result.

    Returns:
    - y_pred_test: test-set predictions on the original target scale
    - test_rmse: RMSE on the test set
    - dict with keys 'V_k', 'sigma_k', 'lin_reg', 'scaler_X', 'scaler_y',
      'activation_test' (un-normalised firing strengths) and 'rules'
      (human-readable rule strings)

    Raises:
    - ValueError: if ``m <= 1``, ``n_clusters < 1`` or ``feature_names`` does
      not have D entries
    """
    if m <= 1:
        raise ValueError(f"m must be greater than 1, got {m}")
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1, got {n_clusters}")

    K, D = X_train_np.shape

    if feature_names is None:
        # Backward compatibility: earlier versions read the notebook-level list directly.
        fallback_names = globals().get('features_to_use')
        if fallback_names is not None and len(fallback_names) == D:
            feature_names = list(fallback_names)
        else:
            feature_names = [f"x{d + 1}" for d in range(D)]
    elif len(feature_names) != D:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the data has {D} attributes"
        )

    # Standardise inputs and target; the scalers are fitted on the training split only.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_test_scaled = scaler_X.transform(X_test_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()

    # Random initial memberships, normalised so they sum to 1 over the clusters.
    if random_state is not None:
        np.random.seed(random_state)
    U = np.random.rand(n_clusters, K, D)
    U = U / U.sum(axis=0, keepdims=True)

    # Placeholder centres; both are recomputed from U in the first iteration.
    V_k = np.random.rand(n_clusters, D)
    V_d = np.random.rand(n_clusters, K)

    exponent = 1 / (m - 1)
    previous_J = np.inf
    for iteration in range(max_iter):
        # Steps 1-2: membership-weighted centres per attribute (V_k) and per sample (V_d).
        for c in range(n_clusters):
            weights = U[c] ** m
            weighted_X = weights * X_train_scaled
            V_k[c] = np.sum(weighted_X, axis=0) / (np.sum(weights, axis=0) + 1e-8)
            V_d[c] = np.sum(weighted_X, axis=1) / (np.sum(weights, axis=1) + 1e-8)

        # Squared distance of every cell to every cluster, shape (C, K, D).  The
        # centres stay fixed for the rest of the iteration, so compute it once
        # and reuse it for the membership update and the objective.
        dist = np.stack([
            (X_train_scaled - V_k[c]) ** 2 + (X_train_scaled - V_d[c][:, np.newaxis]) ** 2
            for c in range(n_clusters)
        ])

        # Step 3: membership update from the distance ratios.
        for c in range(n_clusters):
            denom = np.zeros((K, D))
            for cc in range(n_clusters):
                denom += (dist[c] / (dist[cc] + 1e-8)) ** exponent
            U[c] = 1 / (denom + 1e-8)

        # Re-normalise so the memberships of each cell sum to 1 over the clusters.
        U = U / U.sum(axis=0, keepdims=True)

        # Step 4: objective value for the convergence test.
        J = 0
        for c in range(n_clusters):
            J += np.sum((U[c] ** m) * dist[c])

        if abs(J - previous_J) < tol:
            print(f"Converged at iteration {iteration + 1}")
            break
        previous_J = J

    else:
        print("Reached maximum iterations without convergence.")

    # Step 5: Gaussian membership functions centred on V_k.  The width of each
    # attribute is the standard deviation of its centres across the clusters,
    # shared by all rules (the small constant keeps it non-zero).
    sigma_k = np.std(V_k, axis=0) + 1e-8

    def _rule_activations(X_scaled):
        """Return (raw, row-normalised) rule firing strengths, each (n_samples, n_clusters)."""
        raw = np.zeros((X_scaled.shape[0], n_clusters))
        for c in range(n_clusters):
            mu_k = np.exp(-0.5 * ((X_scaled - V_k[c]) ** 2) / (sigma_k ** 2))
            # NOTE: a product of D values in (0, 1] can underflow to 0 for large D;
            # such rows end up with all-zero normalised activations.
            raw[:, c] = np.prod(mu_k, axis=1)
        return raw, raw / (raw.sum(axis=1, keepdims=True) + 1e-8)

    # Step 6: fit the linear read-out on the normalised training activations.
    from sklearn.linear_model import LinearRegression

    _, normalized_activation_train = _rule_activations(X_train_scaled)
    lin_reg = LinearRegression()
    lin_reg.fit(normalized_activation_train, y_train_scaled)

    # Step 7: predict on the test set and undo the target standardisation.
    activation_test, normalized_activation_test = _rule_activations(X_test_scaled)
    y_pred_test_scaled = lin_reg.predict(normalized_activation_test)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled.reshape(-1, 1)).flatten()

    test_rmse = np.sqrt(mean_squared_error(y_test_np, y_pred_test))

    # Human-readable rules with centres and widths mapped back to the original input scale.
    V_k_orig = V_k * scaler_X.scale_ + scaler_X.mean_
    sigma_k_orig = sigma_k * scaler_X.scale_
    intercept = lin_reg.intercept_
    rules = []
    for c in range(n_clusters):
        antecedent_str = " AND ".join(
            f"{name} is Gaussian(c={V_k_orig[c, d]:.4f}, σ={sigma_k_orig[d]:.4f})"
            for d, name in enumerate(feature_names)
        )
        # The consequent reports this rule's coefficient in the linear read-out.
        consequent_str = f"{lin_reg.coef_[c]:.4f} * Activation_{c+1} + {intercept:.4f}"
        rules.append(f"Rule {c+1}: IF {antecedent_str} THEN Output = {consequent_str}")

    return y_pred_test, test_rmse, {
        'V_k': V_k,
        'sigma_k': sigma_k,
        'lin_reg': lin_reg,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'activation_test': activation_test,
        'rules': rules
    }

# Experiment configuration
n_clusters_list = [3, 5, 7, 9]  # cluster counts to evaluate
max_iters = 100
tol = 1e-5
repeats = 5  # runs per configuration

# Output directory for the extracted rule files
os.makedirs('results_fubinfs', exist_ok=True)

# One summary dict per cluster count
results_fubinfs = []

for n_clusters in n_clusters_list:
    test_rmse_list = []
    time_list = []
    print(f"\nStarting experiments for n_clusters={n_clusters}")
    for repeat in range(repeats):
        # NOTE: fubinfs seeds NumPy with a fixed value by default, so the repeats
        # differ in measured time only, not in RMSE.
        start_time = time.time()
        y_pred_test, test_rmse, intermediate_results = fubinfs(
            X_train_np, y_train_np, X_test_np, y_test_np,
            n_clusters=n_clusters, max_iter=max_iters, tol=tol, m=2
        )
        end_time = time.time()
        time_taken = end_time - start_time
        time_list.append(time_taken)
        test_rmse_list.append(test_rmse)
        print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")

        # Rule strings produced by fubinfs for this run
        rules = intermediate_results['rules']

        print(f"\nFuzzy Rules for n_clusters={n_clusters}, Repeat={repeat+1}:")
        for rule in rules:
            print(rule)
            print()

        # Persist the rules.  The encoding is explicit because the rule text
        # contains the non-ASCII character 'σ', which the platform default
        # encoding cannot always represent.
        with open(f'results_fubinfs/rules_nclusters{n_clusters}_repeat{repeat+1}.txt', 'w',
                  encoding='utf-8') as f:
            for rule in rules:
                f.write(rule + '\n')

    # Mean / population standard deviation of RMSE and run time over the repeats
    test_rmse_mean = np.mean(test_rmse_list)
    test_rmse_std = np.std(test_rmse_list)
    time_mean = np.mean(time_list)
    time_std = np.std(time_list)

    print(f"\nResults for n_clusters={n_clusters}:")
    print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
    print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")

    # Record the summary for this configuration
    result = {
        'n_clusters': n_clusters,
        'test_rmse_mean': test_rmse_mean,
        'test_rmse_std': test_rmse_std,
        'time_mean': time_mean,
        'time_std': time_std
    }
    results_fubinfs.append(result)


In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# 从 UCI ML Repo 导入数据集
# from ucimlrepo import fetch_ucirepo

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# Appliances Energy Prediction dataset, read as CSV straight from the UCI ML repository.
energy_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
data = pd.read_csv(energy_url)

# Model inputs: the columns listed below; 'date' and the target 'Appliances' are left out.
features_to_use = [
    'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
    'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9',
    'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
    'Tdewpoint', 'rv1', 'rv2'
]

# Keep only the modelling columns and drop every row with a missing value,
# so that inputs and target stay aligned row by row.
data = data[features_to_use + ['Appliances']].dropna()
X = data[features_to_use]
y = data['Appliances']

# Hold out 20% of the rows as the test set (fixed seed for a reproducible split).
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=42
)

# Alias used by later cells when labelling features.
feature_labels = features_to_use
# FuBiNFS: fuzzy bi-clustering followed by a Gaussian rule base with a linear read-out
def fubinfs(X_train_np, y_train_np, X_test_np, y_test_np, n_clusters=3, max_iter=100, tol=1e-5, m=2,
            feature_names=None, random_state=42):
    """
    Run the FuBiNFS procedure and evaluate it on the test set.

    The standardised training matrix is fuzzily bi-clustered: every cell
    (sample k, attribute d) receives a membership U[c, k, d] in each cluster c,
    with one centre per attribute (V_k) and one centre per sample (V_d).  The
    per-attribute centres V_k then serve as centres of Gaussian membership
    functions, the normalised rule activations are fed to an ordinary
    least-squares model fitted on the standardised target, and predictions are
    mapped back to the original target scale.

    Parameters:
    - X_train_np: training inputs, shape (K, D)
    - y_train_np: training targets, shape (K,)
    - X_test_np: test inputs, shape (N_test, D)
    - y_test_np: test targets, shape (N_test,)
    - n_clusters: number of clusters / rules C (>= 1)
    - max_iter: maximum number of clustering iterations
    - tol: stop when the objective changes by less than this between iterations
    - m: fuzzifier; must be > 1 because the membership update uses 1 / (m - 1)
    - feature_names: optional sequence of D names used in the rule strings.
      When omitted, the module-level ``features_to_use`` list is used if it has
      exactly D entries; otherwise generic names ``x1`` ... ``xD`` are generated.
    - random_state: seed handed to ``np.random.seed`` before the random
      initialisation (this reseeds NumPy's global generator, as before).
      ``None`` leaves the global generator untouched.  With a fixed seed,
      repeated calls on the same data return the same result.

    Returns:
    - y_pred_test: test-set predictions on the original target scale
    - test_rmse: RMSE on the test set
    - dict with keys 'V_k', 'sigma_k', 'lin_reg', 'scaler_X', 'scaler_y',
      'activation_test' (un-normalised firing strengths) and 'rules'
      (human-readable rule strings)

    Raises:
    - ValueError: if ``m <= 1``, ``n_clusters < 1`` or ``feature_names`` does
      not have D entries
    """
    if m <= 1:
        raise ValueError(f"m must be greater than 1, got {m}")
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1, got {n_clusters}")

    K, D = X_train_np.shape

    if feature_names is None:
        # Backward compatibility: earlier versions read the notebook-level list directly.
        fallback_names = globals().get('features_to_use')
        if fallback_names is not None and len(fallback_names) == D:
            feature_names = list(fallback_names)
        else:
            feature_names = [f"x{d + 1}" for d in range(D)]
    elif len(feature_names) != D:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the data has {D} attributes"
        )

    # Standardise inputs and target; the scalers are fitted on the training split only.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_test_scaled = scaler_X.transform(X_test_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()

    # Random initial memberships, normalised so they sum to 1 over the clusters.
    if random_state is not None:
        np.random.seed(random_state)
    U = np.random.rand(n_clusters, K, D)
    U = U / U.sum(axis=0, keepdims=True)

    # Placeholder centres; both are recomputed from U in the first iteration.
    V_k = np.random.rand(n_clusters, D)
    V_d = np.random.rand(n_clusters, K)

    exponent = 1 / (m - 1)
    previous_J = np.inf
    for iteration in range(max_iter):
        # Steps 1-2: membership-weighted centres per attribute (V_k) and per sample (V_d).
        for c in range(n_clusters):
            weights = U[c] ** m
            weighted_X = weights * X_train_scaled
            V_k[c] = np.sum(weighted_X, axis=0) / (np.sum(weights, axis=0) + 1e-8)
            V_d[c] = np.sum(weighted_X, axis=1) / (np.sum(weights, axis=1) + 1e-8)

        # Squared distance of every cell to every cluster, shape (C, K, D).  The
        # centres stay fixed for the rest of the iteration, so compute it once
        # and reuse it for the membership update and the objective.
        dist = np.stack([
            (X_train_scaled - V_k[c]) ** 2 + (X_train_scaled - V_d[c][:, np.newaxis]) ** 2
            for c in range(n_clusters)
        ])

        # Step 3: membership update from the distance ratios.
        for c in range(n_clusters):
            denom = np.zeros((K, D))
            for cc in range(n_clusters):
                denom += (dist[c] / (dist[cc] + 1e-8)) ** exponent
            U[c] = 1 / (denom + 1e-8)

        # Re-normalise so the memberships of each cell sum to 1 over the clusters.
        U = U / U.sum(axis=0, keepdims=True)

        # Step 4: objective value for the convergence test.
        J = 0
        for c in range(n_clusters):
            J += np.sum((U[c] ** m) * dist[c])

        if abs(J - previous_J) < tol:
            print(f"Converged at iteration {iteration + 1}")
            break
        previous_J = J

    else:
        print("Reached maximum iterations without convergence.")

    # Step 5: Gaussian membership functions centred on V_k.  The width of each
    # attribute is the standard deviation of its centres across the clusters,
    # shared by all rules (the small constant keeps it non-zero).
    sigma_k = np.std(V_k, axis=0) + 1e-8

    def _rule_activations(X_scaled):
        """Return (raw, row-normalised) rule firing strengths, each (n_samples, n_clusters)."""
        raw = np.zeros((X_scaled.shape[0], n_clusters))
        for c in range(n_clusters):
            mu_k = np.exp(-0.5 * ((X_scaled - V_k[c]) ** 2) / (sigma_k ** 2))
            # NOTE: a product of D values in (0, 1] can underflow to 0 for large D;
            # such rows end up with all-zero normalised activations.
            raw[:, c] = np.prod(mu_k, axis=1)
        return raw, raw / (raw.sum(axis=1, keepdims=True) + 1e-8)

    # Step 6: fit the linear read-out on the normalised training activations.
    from sklearn.linear_model import LinearRegression

    _, normalized_activation_train = _rule_activations(X_train_scaled)
    lin_reg = LinearRegression()
    lin_reg.fit(normalized_activation_train, y_train_scaled)

    # Step 7: predict on the test set and undo the target standardisation.
    activation_test, normalized_activation_test = _rule_activations(X_test_scaled)
    y_pred_test_scaled = lin_reg.predict(normalized_activation_test)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled.reshape(-1, 1)).flatten()

    test_rmse = np.sqrt(mean_squared_error(y_test_np, y_pred_test))

    # Human-readable rules with centres and widths mapped back to the original input scale.
    V_k_orig = V_k * scaler_X.scale_ + scaler_X.mean_
    sigma_k_orig = sigma_k * scaler_X.scale_
    intercept = lin_reg.intercept_
    rules = []
    for c in range(n_clusters):
        antecedent_str = " AND ".join(
            f"{name} is Gaussian(c={V_k_orig[c, d]:.4f}, σ={sigma_k_orig[d]:.4f})"
            for d, name in enumerate(feature_names)
        )
        # The consequent reports this rule's coefficient in the linear read-out.
        consequent_str = f"{lin_reg.coef_[c]:.4f} * Activation_{c+1} + {intercept:.4f}"
        rules.append(f"Rule {c+1}: IF {antecedent_str} THEN Output = {consequent_str}")

    return y_pred_test, test_rmse, {
        'V_k': V_k,
        'sigma_k': sigma_k,
        'lin_reg': lin_reg,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'activation_test': activation_test,
        'rules': rules
    }

# Experiment configuration
n_clusters_list = [3, 5, 7, 9]  # cluster counts to evaluate
max_iters = 100
tol = 1e-5
repeats = 5  # runs per configuration

# Output directory (still created, although this cell does not write rule files)
os.makedirs('results_fubinfs', exist_ok=True)

# One summary dict per cluster count
results_fubinfs = []

for n_clusters in n_clusters_list:
    test_rmse_list, time_list = [], []
    print(f"\nStarting experiments for n_clusters={n_clusters}")
    for repeat in range(repeats):
        # Time a single FuBiNFS fit-and-evaluate call
        start_time = time.time()
        y_pred_test, test_rmse, intermediate_results = fubinfs(
            X_train_np, y_train_np, X_test_np, y_test_np,
            n_clusters=n_clusters, max_iter=max_iters, tol=tol, m=2
        )
        end_time = time.time()
        time_taken = end_time - start_time
        test_rmse_list.append(test_rmse)
        time_list.append(time_taken)
        print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")

        # Show the rule strings of this run, one blank line after each rule
        rules = intermediate_results['rules']
        print(f"\nFuzzy Rules for n_clusters={n_clusters}, Repeat={repeat+1}:")
        for rule in rules:
            print(rule, end='\n\n')

    # Mean / population standard deviation of RMSE and run time over the repeats
    test_rmse_mean, test_rmse_std = np.mean(test_rmse_list), np.std(test_rmse_list)
    time_mean, time_std = np.mean(time_list), np.std(time_list)

    print(f"\nResults for n_clusters={n_clusters}:")
    print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
    print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")

    # Record the summary for this configuration
    result = dict(
        n_clusters=n_clusters,
        test_rmse_mean=test_rmse_mean,
        test_rmse_std=test_rmse_std,
        time_mean=time_mean,
        time_std=time_std,
    )
    results_fubinfs.append(result)


# ANFIS

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from scipy.stats import norm  # 新增
import warnings
warnings.filterwarnings('ignore')
# !pip install ucimlrepo

# Auto MPG dataset from the UCI ML repository
auto_mpg_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# The file has no header row, so the column names are supplied here
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']

# Fields are separated by runs of whitespace and '?' marks a missing value.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
data = pd.read_csv(auto_mpg_url, sep=r'\s+', names=column_names, na_values='?')

# Drop every row that has a missing value
data = data.dropna()

# Model inputs: every column except the target 'mpg' and the free-text 'car_name'
features_to_use = [
    'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin'
]

# Inputs and target
X = data[features_to_use]
y = data['mpg']

# 'origin' is categorical but is kept as a numeric column to keep the ANFIS
# input handling simple.  To one-hot encode it instead, enable the two lines below.
# X = pd.get_dummies(X, columns=['origin'], drop_first=True)
# features_to_use = X.columns.tolist()

# Rows with missing values were already removed above, so no further filtering is needed.

# Hold out 20% of the rows as the test set (fixed seed for a reproducible split)
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Alias used by later cells when labelling features
feature_labels = features_to_use
# ANFIS model definition
class ANFIS(nn.Module):
    """
    Neuro-fuzzy regressor with Gaussian memberships and constant rule outputs.

    Each of the ``n_rules`` rules holds one Gaussian membership function per
    input (centre ``mu``, width ``sigma``) and one scalar consequent ``y``.
    The firing strength of a rule is the product of its memberships, and the
    model output is the firing-strength-weighted sum of the consequents divided
    by the total firing strength.

    Parameters:
    - n_inputs: number of input features
    - n_rules: number of fuzzy rules
    - learning_rate: accepted for backward compatibility; not used by the
      class itself, because the optimizer is assigned from outside
    - param_count: number of independent (mu, sigma, y) parameter sets; the
      memberships of all sets are multiplied together
    """

    def __init__(self, n_inputs, n_rules, learning_rate=1e-2, param_count=1):
        super(ANFIS, self).__init__()
        self.n = n_inputs
        self.m = n_rules
        self.param_count = param_count

        # Random initial centres, non-negative initial widths and consequents
        self.mu = nn.ParameterList([nn.Parameter(torch.randn(n_rules, n_inputs)) for _ in range(param_count)])
        self.sigma = nn.ParameterList([nn.Parameter(torch.abs(torch.randn(n_rules, n_inputs))) for _ in range(param_count)])
        self.y = nn.ParameterList([nn.Parameter(torch.randn(1, n_rules)) for _ in range(param_count)])

        # Must be set by the caller before train_step is used
        self.optimizer = None

    def forward(self, x):
        """Return predictions of shape (batch_size,) for inputs of shape (batch_size, n_inputs)."""
        # (batch, 1, n_inputs) broadcasts against the (n_rules, n_inputs) parameters
        x_expanded = x.unsqueeze(1)

        # Multiply the memberships of all parameter sets out of place, so autograd
        # never sees a tensor modified after it was recorded.
        membership = None
        for mu, sigma in zip(self.mu, self.sigma):
            term = torch.exp(-0.5 * ((x_expanded - mu) ** 2) / (sigma ** 2 + 1e-8))
            membership = term if membership is None else membership * term

        # Rule firing strength: product of the memberships over the inputs
        rul = membership.prod(dim=2)

        # Firing-strength-weighted consequents, normalised by the total firing strength
        num = sum(rul * y_i for y_i in self.y)
        den = rul.sum(dim=1, keepdim=True).clamp(min=1e-12)
        out = num.sum(dim=1, keepdim=True) / den
        # Drop only the trailing singleton axis: a bare squeeze() would also
        # collapse a batch of one to a 0-d tensor and mismatch a (1,) target.
        return out.squeeze(-1)

    def train_step(self, x, target):
        """Run one optimizer step on the MSE loss; return (loss value, model output)."""
        if self.optimizer is None:
            raise ValueError("Optimizer not set. Please set the optimizer before training.")
        self.optimizer.zero_grad()
        output = self.forward(x)
        loss = nn.functional.mse_loss(output, target)
        loss.backward()
        self.optimizer.step()
        return loss.item(), output

    def infer(self, x, targets=None):
        """Predict without gradients; with ``targets`` also return the MSE as a float."""
        with torch.no_grad():
            output = self.forward(x)
            if targets is None:
                return output
            loss = nn.functional.mse_loss(output, targets)
            return output, loss.item()

    def extract_rules(self, scaler_X, scaler_y, feature_names=None):
        """
        Describe the learned rules as strings on the original data scale.

        Parameters:
        - scaler_X: fitted input scaler exposing ``scale_`` and ``mean_``
        - scaler_y: fitted target scaler exposing ``scale_`` and ``mean_``
        - feature_names: optional list of input names (defaults to 'Input i')

        Returns:
        - rules: list with one rule string per rule.  Centres, widths and
          consequents are averaged over the parameter sets.  Widths are
          reported as absolute values, since the model only uses sigma ** 2.
        """
        if feature_names is None:
            feature_names = [f'Input {i+1}' for i in range(self.n)]

        x_scale = np.asarray(scaler_X.scale_)[:self.n]
        x_mean = np.asarray(scaler_X.mean_)[:self.n]
        y_scale = scaler_y.scale_[0]
        y_mean = scaler_y.mean_[0]

        # Undo the standardisation for each parameter set, then average the sets
        centres = np.mean(
            [p.detach().cpu().numpy() * x_scale + x_mean for p in self.mu], axis=0
        )
        widths = np.mean(
            [np.abs(p.detach().cpu().numpy()) * x_scale for p in self.sigma], axis=0
        )
        outputs = np.mean(
            [p.detach().cpu().numpy()[0] * y_scale + y_mean for p in self.y], axis=0
        )

        rules = []
        for r in range(self.m):
            antecedent_str = " AND ".join(
                f"{feature_names[j]} is Gaussian(c={centres[r, j]:.4f}, σ={widths[r, j]:.4f})"
                for j in range(self.n)
            )
            rules.append(f"Rule {r+1}: IF {antecedent_str} THEN Output = {outputs[r]:.4f}")
        return rules

    def save_model(self, path):
        """Write the state dict to ``path``."""
        torch.save(self.state_dict(), path)
        print(f"Model saved to {path}")

    def load_model(self, path):
        """Load a state dict from ``path`` onto the device the model currently lives on."""
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
        first_param = next(self.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        self.load_state_dict(torch.load(path, map_location=device))
        print(f"Model loaded from {path}")

# Helper functions for the Iov and Ifspe interpretability indices
def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """
    Closed-form overlap score of two Gaussian membership functions.

    Parameters:
    - c1, sigma1: centre and standard deviation of the first Gaussian
    - c2, sigma2: centre and standard deviation of the second Gaussian

    Returns:
    - overlap_area: 2 * Phi(-|c1 - c2| / sqrt(sigma1^2 + sigma2^2)), where Phi
      is the standard normal CDF; 0 when both widths are zero
    """
    combined_spread = np.sqrt(sigma1**2 + sigma2**2)
    if combined_spread != 0:
        # Centre distance measured in units of the combined spread
        separation = np.abs(c1 - c2) / combined_spread
        return 2 * norm.cdf(-separation)
    # Two zero-width sets: the ratio is undefined, report no overlap
    return 0

def compute_iov(mu_orig, sigma_orig):
    """
    Average Overlap Index (Iov) of a rule base.

    Parameters:
    - mu_orig: de-standardised centres, shape (n_rules, n_inputs)
    - sigma_orig: de-standardised widths, shape (n_rules, n_inputs)

    Returns:
    - average_iov: mean pairwise overlap over every input and every unordered
      pair of rules; 0 when there is no pair to compare
    """
    n_rules, n_inputs = mu_orig.shape
    # Index pairs (a, b) with a < b, in the same order as a nested j / k loop
    first_rule, second_rule = np.triu_indices(n_rules, k=1)
    pairwise_overlaps = [
        compute_overlap_analytic(
            mu_orig[a, col], sigma_orig[a, col],
            mu_orig[b, col], sigma_orig[b, col],
        )
        for col in range(n_inputs)
        for a, b in zip(first_rule, second_rule)
    ]
    if not pairwise_overlaps:
        return 0
    return np.mean(pairwise_overlaps)

def compute_ifspe(mu_orig, sigma_orig):
    """
    Average Fuzzy Set Position Index (Ifspe) of a rule base.

    Parameters:
    - mu_orig: de-standardised centres, shape (n_rules, n_inputs)
    - sigma_orig: de-standardised widths, shape (n_rules, n_inputs); not used
      by the computation

    Returns:
    - average_ifspe: mean, over the inputs, of the standard deviation of the
      rule centres of that input (non-negative); 0 when there are fewer than
      two rules or no inputs
    """
    n_rules, n_inputs = mu_orig.shape
    # The spread of centres is only considered when at least two rules exist
    if n_rules < 2 or n_inputs == 0:
        return 0

    spread_sum = 0
    for centres_of_input in mu_orig.T:
        # Standard deviation of the rule centres for one input feature
        spread_sum += np.std(centres_of_input)
    return spread_sum / n_inputs

# Training routine for the ANFIS model
def train_anfis(X_train_np, y_train_np, X_val_np, y_val_np, n_rules=3, epochs=500, batch_size=32, lr=0.01):
    """
    Train an ANFIS model on standardised data and plot its learning curves.

    Parameters:
    - X_train_np: training inputs, shape (num_samples, n_inputs)
    - y_train_np: training targets, shape (num_samples,)
    - X_val_np: validation inputs, shape (num_samples, n_inputs)
    - y_val_np: validation targets, shape (num_samples,)
    - n_rules: number of fuzzy rules
    - epochs: number of training epochs
    - batch_size: mini-batch size
    - lr: initial learning rate of the AdamW optimizer

    Returns:
    - anfis_model: the trained model
    - scaler_X: input scaler fitted on the training inputs
    - scaler_y: target scaler fitted on the training targets
    """
    # Standardise inputs and targets; the scalers are fitted on the training split only
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    # Validation targets on the original scale never change, so they are taken
    # once from the input instead of being rebuilt from float32 tensors each epoch
    y_val_true = np.asarray(y_val_np).reshape(-1)

    # Move everything to the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    n_train, n_inputs = X_train_scaled_tensor.shape

    anfis_model = ANFIS(
        n_inputs=n_inputs,
        n_rules=n_rules,
        learning_rate=lr,
        param_count=1
    ).to(device)

    # The optimizer is attached to the model because ANFIS.train_step uses it.
    # `verbose` is deprecated on the scheduler, so learning-rate reductions are
    # reported manually inside the loop instead.
    anfis_model.optimizer = optim.AdamW(anfis_model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        anfis_model.optimizer, mode='min', factor=0.99, patience=100
    )

    # Per-epoch history used for the plots below
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'val_rmse': []
    }

    for epoch in range(epochs):
        anfis_model.train()
        # Mini-batch training over a fresh random permutation of the samples
        permutation = torch.randperm(n_train)
        epoch_loss = 0
        num_batches = 0
        for i in range(0, n_train, batch_size):
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            loss_train, _ = anfis_model.train_step(batch_x, batch_y)
            epoch_loss += loss_train
            num_batches += 1

        # Mean of the per-batch losses
        epoch_loss /= num_batches

        anfis_model.eval()

        with torch.no_grad():
            y_val_pred_scaled, loss_val = anfis_model.infer(X_val_scaled_tensor, y_val_scaled_tensor)
        # Validation RMSE on the original target scale
        y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
        val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

        # Let the scheduler react to the validation loss and report any reduction
        lr_before = anfis_model.optimizer.param_groups[0]['lr']
        scheduler.step(loss_val)
        lr_after = anfis_model.optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f"Epoch {epoch+1}: reducing learning rate to {lr_after:.4e}.")

        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(loss_val)  # already a float (infer returns loss.item())
        training_info['val_rmse'].append(val_rmse)

        # Progress line on the first epoch and every tenth epoch
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {loss_val:.4f} - Val RMSE: {val_rmse:.4f}")

    # Training and validation loss curves
    plt.figure(figsize=(10, 6))
    plt.plot(training_info['epoch'], training_info['train_loss'], label='Train Loss')
    plt.plot(training_info['epoch'], training_info['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Validation RMSE curve
    plt.figure(figsize=(10, 6))
    plt.plot(training_info['epoch'], training_info['val_rmse'], label='Validation RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Validation RMSE over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

    return anfis_model, scaler_X, scaler_y

# Experiment configuration
n_rules_list = [3]        # rule counts to evaluate
learning_rates = [0.01]   # learning rates to evaluate
repeats = 5               # runs per (n_rules, learning_rate) configuration

# Output directory for model checkpoints and the CSV summary
os.makedirs('results_anfis', exist_ok=True)

# One summary dict per configuration
results_anfis = []

for n_rules in n_rules_list:
    for lr in learning_rates:
        test_rmse_list, time_list = [], []
        iov_list, ifspe_list = [], []
        total_attributes_list = []  # attribute count of each run
        print(f"\nStarting experiments for n_rules={n_rules}, learning_rate={lr}")
        for repeat in range(repeats):
            start_time = time.time()
            # Carve a validation split out of the training data; the seed varies per repeat
            X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
                X_train_np, y_train_np, test_size=0.2, random_state=repeat
            )

            anfis_model, scaler_X, scaler_y = train_anfis(
                X_train_sub, y_train_sub, X_val_sub, y_val_sub,
                n_rules=n_rules,
                epochs=1500,
                batch_size=512,
                lr=lr
            )

            # Evaluate on the held-out test set, scaled with the training scaler
            anfis_model.eval()
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            X_test_scaled = scaler_X.transform(X_test_np)
            X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
            y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32).to(device)
            with torch.no_grad():
                y_pred_scaled = anfis_model.infer(X_test_tensor)
            # Back to the original target scale before computing the RMSE
            y_pred = scaler_y.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
            y_true = y_test_np
            test_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            test_rmse_list.append(test_rmse)

            # Wall-clock time of training plus evaluation
            end_time = time.time()
            time_taken = end_time - start_time
            time_list.append(time_taken)

            # Learned membership parameters, each of shape (n_rules, n_inputs)
            mu = anfis_model.mu[0].detach().cpu().numpy()
            sigma = anfis_model.sigma[0].detach().cpu().numpy()
            # Undo the input standardisation (per-feature scale and mean broadcast over the rules)
            mu_orig = mu * scaler_X.scale_ + scaler_X.mean_
            sigma_orig = sigma * scaler_X.scale_

            # Interpretability indices of this run
            Iov = compute_iov(mu_orig, sigma_orig)
            Ifspe = compute_ifspe(mu_orig, sigma_orig)
            iov_list.append(Iov)
            ifspe_list.append(Ifspe)

            # Every rule uses every input attribute (no attribute pruning in this model)
            total_attributes = n_rules * len(features_to_use)
            total_attributes_list.append(total_attributes)

            print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")
            print(f"Overlap Index (Iov): {Iov:.4f}, Fuzzy Set Position Index (Ifspe): {Ifspe:.4f}")

            # Show the extracted rules, one blank line after each rule
            rules = anfis_model.extract_rules(scaler_X, scaler_y, feature_names=features_to_use)
            print(f"\nFuzzy Rules for n_rules={n_rules}, learning_rate={lr}, Repeat={repeat+1}:")
            for rule in rules:
                print(rule, end='\n\n')

            # Checkpoint of this run
            torch.save(anfis_model.state_dict(), f'results_anfis/anfis_model_nrules{n_rules}_lr{lr}_repeat{repeat+1}.pth')

        # Mean / population standard deviation over the repeats
        test_rmse_mean, test_rmse_std = np.mean(test_rmse_list), np.std(test_rmse_list)
        time_mean, time_std = np.mean(time_list), np.std(time_list)
        Iov_mean, Iov_std = np.mean(iov_list), np.std(iov_list)
        Ifspe_mean, Ifspe_std = np.mean(ifspe_list), np.std(ifspe_list)
        average_total_attributes = np.mean(total_attributes_list)

        print(f"\nResults for n_rules={n_rules}, learning_rate={lr}:")
        print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
        print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")
        print(f"Average Overlap Index (Iov): {Iov_mean:.4f} ± {Iov_std:.4f}")
        print(f"Average Fuzzy Set Position Index (Ifspe): {Ifspe_mean:.4f} ± {Ifspe_std:.4f}")
        print(f"Average Total Number of Attributes Included in All Rules: {average_total_attributes:.2f}")

        # Record the summary for this configuration
        result = dict(
            n_rules=n_rules,
            learning_rate=lr,
            test_rmse_mean=test_rmse_mean,
            test_rmse_std=test_rmse_std,
            time_mean=time_mean,
            time_std=time_std,
            Iov_mean=Iov_mean,
            Iov_std=Iov_std,
            Ifspe_mean=Ifspe_mean,
            Ifspe_std=Ifspe_std,
            average_total_attributes=average_total_attributes,
        )
        results_anfis.append(result)

        # Rewrite the CSV with all configurations finished so far
        results_df = pd.DataFrame(results_anfis)
        results_df.to_csv('results_anfis/anfis_experiments_results.csv', index=False)
        print("\n所有实验的结果已保存到 'results_anfis/anfis_experiments_results.csv'。")

        # Bar chart of the two interpretability indices with their standard deviations
        plt.figure(figsize=(10, 6))
        plt.bar(
            ['Overlap Index (Iov)', 'Fuzzy Set Position Index (Ifspe)'],
            [Iov_mean, Ifspe_mean],
            yerr=[Iov_std, Ifspe_std],
            capsize=5,
            color=['skyblue', 'salmon'],
        )
        plt.ylabel('Index Value')
        plt.title('Average Interpretability Indices')
        plt.grid(axis='y')
        plt.show()

        # Bar chart of the average attribute count
        plt.figure(figsize=(6, 4))
        plt.bar(['Average Total Attributes'], [average_total_attributes], color=['lightgreen'])
        plt.ylabel('Number of Attributes')
        plt.title('Average Total Number of Attributes Included in All Rules')
        plt.grid(axis='y')
        plt.tight_layout()
        plt.show()


# SOFENN

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
from scipy.stats import norm  # 确保导入了 norm 函数

# 从 UCI ML Repo 导入数据集
# 注意：请确保安装了 `ucimlrepo` 包，如果没有，请使用以下命令安装：
# !pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
# 从 UCI ML Repo 下载 Appliances Energy Prediction 数据集
from sklearn.datasets import fetch_openml
# Load the YearPredictionMSD dataset from the UCI ML repository.
# Dataset page: https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd
# NOTE: the fetch_ucirepo / fetch_openml imports above are not called in this
# cell; the data is read directly from the zip archive URL below.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'

# Column names: the target 'Year' followed by Feature_1 ... Feature_90
col_names = ['Year'] + [f'Feature_{i}' for i in range(1, 91)]

# Read the dataset; header=None means the first row is treated as data and
# the names defined above are used as column labels
data = pd.read_csv(data_url, header=None, names=col_names)

# Split into the feature frame and the regression target
X = data.drop('Year', axis=1)
y = data['Year']

# Keep the feature names for later use (e.g. when printing fuzzy rules)
feature_labels = X.columns.tolist()

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

features_to_use=feature_labels
# features_to_use is an alias of feature_labels (same list object); the
# experiment loop further down passes it to extract_rules as feature_names.
# FuzzyLayer: Gaussian membership layer (PyTorch implementation)
class FuzzyLayer(nn.Module):
    """Compute rule firing strengths from per-feature Gaussian memberships.

    Each of the ``output_dim`` rules owns one Gaussian (center ``c`` and width
    ``sigma``) per input feature. The firing strength of a rule is the product
    of its membership values over all features.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        # Centers are drawn from a standard normal; widths start at one.
        self.c = nn.Parameter(torch.randn(input_dim, output_dim))
        self.sigma = nn.Parameter(torch.ones(input_dim, output_dim))

    def forward(self, x):
        """Map ``x`` of shape (batch, input_dim) to strengths (batch, output_dim)."""
        # Add singleton axes so samples broadcast against every rule.
        samples = x[:, :, None]             # (batch, input_dim, 1)
        centers = self.c[None, :, :]        # (1, input_dim, output_dim)
        # Small offset keeps the division defined when a width is exactly zero.
        widths = self.sigma[None, :, :] + 1e-8

        scaled_gap = (samples - centers) / widths
        membership = torch.exp(-0.5 * scaled_gap ** 2)

        # Product over the feature axis gives one strength per rule.
        return membership.prod(dim=1)

# NormalizedLayer: rescale firing strengths so they sum to (almost) one
class NormalizedLayer(nn.Module):
    """Divide each rule's firing strength by the per-sample total."""

    def forward(self, phi):
        """Normalize ``phi`` of shape (batch, n_rules) along the rule axis."""
        # The 1e-8 offset avoids a division by zero when every rule is silent.
        row_total = torch.sum(phi, dim=1, keepdim=True) + 1e-8
        return phi / row_total

# WeightedLayer: first-order (linear) consequents weighted by rule strength
class WeightedLayer(nn.Module):
    """Apply one linear consequent per rule and weight it by ``psi``.

    ``a`` has shape (input_dim + 1, n_rules); row 0 holds the bias of every
    rule and the remaining rows hold the per-feature coefficients.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Consequent parameters, bias row included, drawn from a standard normal.
        self.a = nn.Parameter(torch.randn(input_dim + 1, output_dim))

    def forward(self, x, psi):
        """Return ``psi * ([1, x] @ a)`` with shape (batch, n_rules).

        ``x`` is (batch, input_dim) and ``psi`` is (batch, n_rules).
        """
        # Prepend a column of ones so the first row of ``a`` acts as the bias.
        bias_column = torch.ones(x.size(0), 1).to(x.device)
        augmented = torch.cat([bias_column, x], dim=1)   # (batch, input_dim + 1)
        rule_outputs = torch.matmul(augmented, self.a)   # (batch, n_rules)
        return psi * rule_outputs

# OutputLayer: collapse the weighted rule outputs into one prediction
class OutputLayer(nn.Module):
    """Sum the per-rule contributions of every sample."""

    def forward(self, f):
        """Reduce ``f`` of shape (batch, n_rules) to a vector of shape (batch,)."""
        return torch.sum(f, dim=1)

# SOFENN model: fuzzy layer -> normalization -> weighted consequents -> sum
class SOFENN(nn.Module):
    """Fuzzy network with Gaussian antecedents and linear consequents.

    The forward pass chains FuzzyLayer, NormalizedLayer, WeightedLayer and
    OutputLayer and yields one scalar prediction per sample.
    """

    def __init__(self, input_dim, output_dim, n_rules):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_rules = n_rules

        self.fuzzy_layer = FuzzyLayer(input_dim, n_rules)
        self.normalized_layer = NormalizedLayer()
        self.weighted_layer = WeightedLayer(input_dim, n_rules)
        self.output_layer = OutputLayer()

    def forward(self, x):
        """Return predictions of shape (batch,) for ``x`` of shape (batch, input_dim)."""
        strengths = self.fuzzy_layer(x)
        normalized = self.normalized_layer(strengths)
        weighted = self.weighted_layer(x, normalized)
        return self.output_layer(weighted)

    def extract_rules(self, scaler_X, scaler_y, feature_names):
        """
        Render the learned fuzzy rules as human-readable strings.

        Centers, widths and consequent coefficients are mapped from the
        standardized space back to the original units using the ``scale_`` and
        ``mean_`` attributes of the two scalers.

        Parameters:
        - scaler_X: scaler fitted on the inputs
        - scaler_y: scaler fitted on the target
        - feature_names: list of feature names, one per input dimension

        Returns:
        - rules: list with one rule string per rule
        """
        centers = self.fuzzy_layer.c.detach().cpu().numpy()      # (input_dim, n_rules)
        widths = self.fuzzy_layer.sigma.detach().cpu().numpy()   # (input_dim, n_rules)
        a = self.weighted_layer.a.detach().cpu().numpy()         # (input_dim + 1, n_rules)

        x_scale = scaler_X.scale_.reshape(-1, 1)
        x_mean = scaler_X.mean_.reshape(-1, 1)
        y_scale = scaler_y.scale_[0]
        y_mean = scaler_y.mean_[0]

        # Undo the input standardization for the antecedent parameters.
        centers_orig = centers * x_scale + x_mean
        widths_orig = widths * x_scale

        # Undo the standardization for the consequents: slopes are rescaled,
        # and the bias absorbs the target mean and the shift of the input means.
        a_orig = a.copy()
        a_orig[1:, :] = a[1:, :] / x_scale * y_scale
        mean_shift = np.sum((a[1:, :] * x_mean / x_scale), axis=0)
        a_orig[0, :] = a_orig[0, :] * y_scale + y_mean - mean_shift * y_scale

        rules = []
        for rule_idx in range(self.n_rules):
            antecedent_str = " AND ".join(
                f"{feature_names[i]} is Gaussian(c={centers_orig[i, rule_idx]:.4f}, σ={widths_orig[i, rule_idx]:.4f})"
                for i in range(self.input_dim)
            )

            # Consequent: bias first, then one signed term per feature.
            coefficients = a_orig[:, rule_idx]  # (input_dim + 1,)
            pieces = [f"{coefficients[0]:.4f}"]
            for idx, coef in enumerate(coefficients[1:]):
                if coef >= 0:
                    pieces.append(f"+ {coef:.4f} * {feature_names[idx]}")
                else:
                    pieces.append(f"- {abs(coef):.4f} * {feature_names[idx]}")
            consequent_str = " ".join(pieces)

            rules.append(f"Rule {rule_idx+1}: IF {antecedent_str} THEN Output = {consequent_str}")
        return rules

# Revised helpers for the Iov and Ifspe interpretability indices
def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """
    Closed-form overlap measure between two Gaussian membership functions.

    The value is 2 * Phi(-d), where Phi is the standard normal CDF and
    d = |c1 - c2| / sqrt(sigma1^2 + sigma2^2).

    Parameters:
    - c1, sigma1: center and width of the first Gaussian
    - c2, sigma2: center and width of the second Gaussian

    Returns:
    - overlap_area: the overlap measure; 0 when both widths are zero
    """
    pooled_width = np.sqrt(sigma1**2 + sigma2**2)
    if pooled_width == 0:
        # Two zero-width sets: report no overlap instead of dividing by zero.
        return 0
    separation = np.abs(c1 - c2) / pooled_width
    return 2 * norm.cdf(-separation)

def compute_iov(c_orig, sigma_orig):
    """
    Compute the Average Overlap Index (Iov).

    For every input attribute the largest pairwise overlap between the
    Gaussians of any two rules is taken; Iov is the mean of those maxima.
    All rules are treated as active.

    Parameters:
    - c_orig: de-standardized centers, shape (input_dim, n_rules)
    - sigma_orig: de-standardized widths, shape (input_dim, n_rules)

    Returns:
    - average_iov: the mean maximum overlap, or 0 when no attribute
      contributed (e.g. fewer than two rules)
    """
    input_dim, n_rules = c_orig.shape
    if n_rules < 2:
        # A pairwise overlap needs at least two rules.
        return 0

    overlap_sum = 0
    counted = 0
    for attr in range(input_dim):
        centers = c_orig[attr, :]
        widths = sigma_orig[attr, :]

        best = -np.inf
        for first in range(n_rules):
            for second in range(first + 1, n_rules):
                candidate = compute_overlap_analytic(
                    centers[first], widths[first], centers[second], widths[second]
                )
                if candidate > best:
                    best = candidate

        # Attributes for which no comparison succeeded are left out of the mean.
        if best != -np.inf:
            overlap_sum += best
            counted += 1

    if counted == 0:
        return 0  # avoid dividing by zero

    return overlap_sum / counted

def compute_ifspe(c_orig, sigma_orig):
    """
    Compute the Average Fuzzy Set Position Index (Ifspe).

    For every attribute the rules are sorted by center and each pair of
    neighbouring sets contributes ``2 * |0.5 - phi| + psi``. Both ``phi`` and
    ``psi`` are Gaussians of the sum of the two centers, scaled by the sum
    and by the difference of the two widths respectively. The accumulated
    total is divided by ``input_dim * n_rules``.

    Parameters:
    - c_orig: de-standardized centers, shape (input_dim, n_rules)
    - sigma_orig: de-standardized widths, shape (input_dim, n_rules)

    Returns:
    - average_ifspe: the index value, or 0 when no neighbouring pair exists
    """
    input_dim, n_rules = c_orig.shape
    accumulated = 0
    n_terms = 0

    for attr in range(input_dim):
        # Order the fuzzy sets of this attribute by their center.
        order = np.argsort(c_orig[attr, :])
        centers = c_orig[attr, :][order]
        widths = sigma_orig[attr, :][order]

        if len(centers) < 2:
            continue  # a neighbouring pair needs at least two rules

        for left in range(len(centers) - 1):
            right = left + 1
            center_sum = centers[left] + centers[right]

            # The 1e-8 offset keeps the ratio defined for zero widths.
            width_sum = widths[left] + widths[right] + 1e-8
            phi = np.exp(-0.5 * (center_sum / width_sum) ** 2)

            width_gap = widths[left] - widths[right]
            if abs(width_gap) < 1e-8:
                psi = 0
            else:
                psi = np.exp(-0.5 * (center_sum / width_gap) ** 2)

            # The absolute value keeps every contribution non-negative.
            accumulated += 2 * abs(0.5 - phi) + psi
            n_terms += 1

    if n_terms == 0:
        return 0  # avoid dividing by zero

    return accumulated / (input_dim * n_rules)

# Training routine for the SOFENN model
def train_sofenn(X_train_np, y_train_np, X_val_np, y_val_np, n_rules=3, epochs=500, batch_size=32, lr=0.01):
    """
    Train a SOFENN model with mini-batch AdamW and plot the learning curves.

    Inputs and targets are standardized with scalers fitted on the training
    split only. After every epoch the model is evaluated on the validation
    split and the validation loss drives a ReduceLROnPlateau scheduler. Two
    matplotlib figures (loss curves and validation RMSE) are shown at the end.

    Parameters:
    - X_train_np: training inputs, shape (num_samples, n_inputs)
    - y_train_np: training targets, shape (num_samples,)
    - X_val_np: validation inputs, shape (num_samples, n_inputs)
    - y_val_np: validation targets, shape (num_samples,)
    - n_rules: number of fuzzy rules
    - epochs: number of training epochs
    - batch_size: mini-batch size
    - lr: initial learning rate

    Returns:
    - sofenn_model: the trained model
    - scaler_X: scaler fitted on the training inputs
    - scaler_y: scaler fitted on the training targets
    """
    # Standardize inputs and targets (statistics come from the training split)
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    # Convert to PyTorch tensors on the available device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_scaled_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
    y_val_scaled_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)

    # Build the model
    input_dim = X_train_scaled_tensor.shape[1]
    sofenn_model = SOFENN(
        input_dim=input_dim,
        output_dim=1,
        n_rules=n_rules
    ).to(device)

    # Optimizer and learning-rate scheduler
    optimizer = optim.AdamW(sofenn_model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.9, patience=10, verbose=True
    )

    # Per-epoch history used for the plots below
    training_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'val_rmse': []
    }

    # Loss function
    criterion = nn.MSELoss()

    n_samples = X_train_scaled_tensor.size()[0]

    # Training loop
    for epoch in range(epochs):
        sofenn_model.train()
        # Visit the training samples in a fresh random order every epoch
        permutation = torch.randperm(n_samples)
        epoch_loss = 0
        n_batches = 0
        for i in range(0, n_samples, batch_size):
            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X_train_scaled_tensor[indices], y_train_scaled_tensor[indices]
            optimizer.zero_grad()
            output = sofenn_model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            n_batches += 1

        # Average over the batches that actually ran. The former divisor
        # `n_samples // batch_size + 1` counted one batch too many whenever
        # n_samples was an exact multiple of batch_size.
        epoch_loss /= max(n_batches, 1)

        sofenn_model.eval()
        with torch.no_grad():
            y_val_pred_scaled = sofenn_model(X_val_scaled_tensor)
            loss_val = criterion(y_val_pred_scaled, y_val_scaled_tensor)
            # Map predictions and targets back to the original units
            y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
            y_val_true = scaler_y.inverse_transform(y_val_scaled_tensor.cpu().numpy().reshape(-1, 1)).flatten()
            val_rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
        scheduler.step(loss_val)

        # Record the epoch statistics
        training_info['epoch'].append(epoch + 1)
        training_info['train_loss'].append(epoch_loss)
        training_info['val_loss'].append(loss_val.item())
        training_info['val_rmse'].append(val_rmse)

        # Report on the first epoch and then every tenth epoch
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {loss_val:.4f} - Val RMSE: {val_rmse:.4f}")

    # Plot the training and validation loss curves
    plt.figure(figsize=(10, 6))
    plt.plot(training_info['epoch'], training_info['train_loss'], label='Train Loss')
    plt.plot(training_info['epoch'], training_info['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot the validation RMSE (original units)
    plt.figure(figsize=(10, 6))
    plt.plot(training_info['epoch'], training_info['val_rmse'], label='Validation RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Validation RMSE over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Return the model together with both scalers
    return sofenn_model, scaler_X, scaler_y

# Experiment parameters
n_rules_list = [3, 5, 7, 9]  # rule counts to sweep
learning_rates = [0.01]
repeats = 5  # repetitions per configuration

# Directory that receives the saved model checkpoints
os.makedirs('results_sofenn', exist_ok=True)

# One summary dict per (n_rules, learning_rate) configuration is appended here.
# NOTE: nothing in this cell writes results_sofenn to disk.
results_sofenn = []

for n_rules in n_rules_list:
    for lr in learning_rates:
        test_rmse_list = []
        time_list = []
        iov_list = []
        ifspe_list = []
        print(f"\nStarting experiments for n_rules={n_rules}, learning_rate={lr}")
        for repeat in range(repeats):
            start_time = time.time()
            # Split the training set again into train / validation parts;
            # the repeat index is the seed, so every repeat uses a different split
            X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
                X_train_np, y_train_np, test_size=0.2, random_state=repeat
            )

            # Train the model
            sofenn_model, scaler_X, scaler_y = train_sofenn(
                X_train_sub, y_train_sub, X_val_sub, y_val_sub,
                n_rules=n_rules,
                epochs=1500,
                batch_size=512,
                lr=lr
            )

            # Evaluate on the held-out test set
            sofenn_model.eval()
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            X_test_scaled = scaler_X.transform(X_test_np)
            X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
            y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32).to(device)
            with torch.no_grad():
                y_pred_scaled = sofenn_model(X_test_tensor)
                y_pred = scaler_y.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).flatten()
                y_true = y_test_np  # Original unstandardized y_test
                test_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                test_rmse_list.append(test_rmse)

            # Record the elapsed time; it covers the split, train_sofenn
            # (including the plots it shows) and the test-set evaluation
            end_time = time.time()
            time_taken = end_time - start_time
            time_list.append(time_taken)

            # Pull out c and sigma and map them back to the original units
            c = sofenn_model.fuzzy_layer.c.detach().cpu().numpy()  # (input_dim, n_rules)
            sigma = sofenn_model.fuzzy_layer.sigma.detach().cpu().numpy()  # (input_dim, n_rules)
            c_orig = c * scaler_X.scale_.reshape(-1, 1) + scaler_X.mean_.reshape(-1, 1)
            sigma_orig = sigma * scaler_X.scale_.reshape(-1, 1)

            # Compute the Iov and Ifspe interpretability indices
            Iov = compute_iov(c_orig, sigma_orig)
            Ifspe = compute_ifspe(c_orig, sigma_orig)

            iov_list.append(Iov)
            ifspe_list.append(Ifspe)

            print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")
            print(f"Overlap Index (Iov): {Iov:.4f}, Fuzzy Set Position Index (Ifspe): {Ifspe:.4f}")

            # Extract the fuzzy rules as strings
            rules = sofenn_model.extract_rules(scaler_X, scaler_y, feature_names=features_to_use)
            # Print the fuzzy rules, one blank line after each
            print(f"\nFuzzy Rules for n_rules={n_rules}, learning_rate={lr}, Repeat={repeat+1}:")
            for rule in rules:
                print(rule)
                print()

            # Save the model weights for this repeat
            torch.save(sofenn_model.state_dict(), f'results_sofenn/sofenn_model_nrules{n_rules}_lr{lr}_repeat{repeat+1}.pth')

        # Mean and standard deviation of the test RMSE and the run time
        test_rmse_mean = np.mean(test_rmse_list)
        test_rmse_std = np.std(test_rmse_list)
        time_mean = np.mean(time_list)
        time_std = np.std(time_list)

        # Mean and standard deviation of Iov and Ifspe
        Iov_mean = np.mean(iov_list)
        Iov_std = np.std(iov_list)
        Ifspe_mean = np.mean(ifspe_list)
        Ifspe_std = np.std(ifspe_list)

        # Print the summary for this configuration
        print(f"\nResults for n_rules={n_rules}, learning_rate={lr}:")
        print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
        print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")
        print(f"Average Overlap Index (Iov): {Iov_mean:.4f} ± {Iov_std:.4f}")
        print(f"Average Fuzzy Set Position Index (Ifspe): {Ifspe_mean:.4f} ± {Ifspe_std:.4f}")

        # Store the summary
        result = {
            'n_rules': n_rules,
            'learning_rate': lr,
            'test_rmse_mean': test_rmse_mean,
            'test_rmse_std': test_rmse_std,
            'time_mean': time_mean,
            'time_std': time_std,
            'Iov_mean': Iov_mean,
            'Iov_std': Iov_std,
            'Ifspe_mean': Ifspe_mean,
            'Ifspe_std': Ifspe_std
        }
        results_sofenn.append(result)


In [None]:
!pip install scikit-fuzzy

In [None]:
# ============================
# 导入必要的库
# ============================
import numpy as np
import pandas as pd
import time
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import skfuzzy as fuzz
from sklearn.cluster import KMeans
from scipy.stats import norm

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# ============================
# 数据加载与预处理
# ============================
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo

# Fetch UCI dataset id 381 through ucimlrepo's fetch_ucirepo
# (presumably the Beijing PM2.5 dataset, going by the variable name — confirm).
# If fetch_ucirepo is unavailable, the dataset can be downloaded from the UCI
# repository and loaded manually instead.
beijing_pm2_5 = fetch_ucirepo(id=381)

# Features and targets as returned by the fetch result
X = beijing_pm2_5.data.features
y = beijing_pm2_5.data.targets

# Feature columns used as model inputs
features_to_use = ['year', 'month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir']

# Keep only the selected features and the 'pm2.5' target column
X = X[features_to_use]
y = y['pm2.5']

# Drop every sample that has a missing value in a feature or in the target
data = pd.concat([X, y], axis=1).dropna()
X = data[features_to_use]
y = data['pm2.5']

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# ============================
# 定义 Neuro-Fuzzy RVFL 模型
# ============================

class NeuroFuzzyRVFL:
    """Neuro-fuzzy random vector functional link (RVFL) regressor.

    Rule centers come from clustering the training inputs. The fuzzy rule
    outputs feed a random, fixed hidden layer, and the output weights ``beta``
    are obtained in closed form by ridge regression on the concatenation of
    the fuzzy features, the hidden activations and the raw inputs.

    Parameters:
    - input_dim: number of input features
    - output_dim: stored on the instance; not used by fit/predict
    - NumFuzzyRule: number of fuzzy rules (cluster centers)
    - NumHiddenNodes: number of random hidden nodes
    - activation_function: integer code 1-6 selecting the hidden activation
      (1 sigmoid, 2 sine, 3 tribas, 4 radbas, 5 tansig, 6 ReLU)
    - C: regularization constant; ``I / C`` is added to the Gram matrix
    - cluster_method: 'kmeans', 'fcm', or any other value for centers picked
      at random from the training samples
    """

    def __init__(self, input_dim, output_dim, NumFuzzyRule, NumHiddenNodes, activation_function, C, cluster_method='fcm'):
        # Configuration
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.NumFuzzyRule = NumFuzzyRule
        self.NumHiddenNodes = NumHiddenNodes
        self.activation_function = activation_function
        self.C = C
        self.cluster_method = cluster_method
        # Random, untrained weights: rule consequents and hidden layer
        self.Alpha = np.random.rand(input_dim, NumFuzzyRule)
        self.WeightHidden = np.random.rand(NumFuzzyRule + 1, NumHiddenNodes)
        # Set during fit
        self.center = None
        self.std = 1  # kept at 1, matching the MATLAB code this was ported from
        self.beta = None  # output-layer weights

    def activation(self, H):
        """Apply the activation selected by ``activation_function`` to ``H``."""
        if self.activation_function == 1:
            # Sigmoid
            return 1 / (1 + np.exp(-H))
        elif self.activation_function == 2:
            return np.sin(H)
        elif self.activation_function == 3:
            # Tribas
            return np.maximum(1 - np.abs(H), 0)
        elif self.activation_function == 4:
            # Radbas
            return np.exp(-H ** 2)
        elif self.activation_function == 5:
            # Tansig
            return (2 / (1 + np.exp(-2 * H))) - 1
        elif self.activation_function == 6:
            # ReLU
            return np.maximum(0, H)
        else:
            raise ValueError("Invalid activation function")

    def _design_matrix(self, x):
        """Build [fuzzy features | hidden activations | raw inputs] for ``x``.

        Shared by fit and predict so both use exactly the same features.
        """
        # Membership of every sample in every rule: product over the features
        # of exp(-(x - center)^2 / std).
        diff = x[:, np.newaxis, :] - self.center[np.newaxis, :, :]
        MF = np.prod(np.exp(-np.square(diff) / self.std), axis=2)
        MF_sum = np.sum(MF, axis=1, keepdims=True)
        # A sample far from every center can underflow to an all-zero row;
        # dividing by 1 there keeps the row at zero instead of producing NaN.
        MF = MF / np.where(MF_sum == 0, 1.0, MF_sum)

        F = MF * (x @ self.Alpha)

        # Constant 0.1 column acts as the bias input of the hidden layer
        F1 = np.hstack((F, 0.1 * np.ones((F.shape[0], 1))))
        H = self.activation(F1 @ self.WeightHidden)

        # Direct links: the raw inputs are appended after the hidden outputs
        return np.hstack((F, H, x))

    def fit(self, train_x, train_y):
        """Choose the rule centers and solve for the output weights ``beta``."""
        # Cluster the inputs to obtain the centers
        if self.cluster_method == 'kmeans':
            kmeans = KMeans(n_clusters=self.NumFuzzyRule, random_state=0).fit(train_x)
            self.center = kmeans.cluster_centers_
        elif self.cluster_method == 'fcm':
            # Fuzzy c-means via skfuzzy (called with the transposed data)
            cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
                train_x.T, self.NumFuzzyRule, 2, error=0.005, maxiter=1000, init=None)
            self.center = cntr
        else:
            # Random centers drawn from the training samples
            indices = np.random.choice(train_x.shape[0], self.NumFuzzyRule, replace=False)
            self.center = train_x[indices, :]

        M = self._design_matrix(train_x)

        # Ridge solution; the form with the smaller linear system is used.
        # solve() replaces the explicit matrix inverse for numerical stability.
        if M.shape[1] < train_x.shape[0]:
            gram = M.T @ M + np.eye(M.shape[1]) / self.C
            self.beta = np.linalg.solve(gram, M.T @ train_y)
        else:
            kernel = np.eye(M.shape[0]) / self.C + M @ M.T
            self.beta = M.T @ np.linalg.solve(kernel, train_y)

    def predict(self, test_x):
        """Return predictions for ``test_x``.

        Raises:
        - RuntimeError: if called before ``fit``
        """
        if self.center is None or self.beta is None:
            raise RuntimeError("NeuroFuzzyRVFL.predict called before fit")
        PredictedTestLabel = self._design_matrix(test_x) @ self.beta
        return PredictedTestLabel

# ============================
# 定义训练函数
# ============================

def train_nf_rvfl(X_train_np, y_train_np, X_val_np, y_val_np, X_test_np, y_test_np,
                  NumFuzzyRule=15, NumHiddenNodes=203, activation_function=5, C=0.001,
                  cluster_method='fcm'):
    """
    Fit a NeuroFuzzyRVFL model and score it on the test set.

    The scalers are fitted on the training split only. The model has no
    iterative training, so the standardized training and validation splits are
    stacked and used together for the closed-form fit.

    Parameters:
    - X_train_np, y_train_np: training inputs and targets
    - X_val_np, y_val_np: validation inputs and targets (merged into the fit)
    - X_test_np, y_test_np: test inputs and targets
    - NumFuzzyRule, NumHiddenNodes, activation_function, C, cluster_method:
      forwarded to NeuroFuzzyRVFL

    Returns:
    - nf_rvfl_model: the fitted model
    - test_rmse: RMSE on the test set in the original target units
    - elapsed: fit time plus prediction time, in seconds
    - scaler_X: scaler fitted on the training inputs
    - scaler_y: scaler fitted on the training targets
    """
    # Standardize the data
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    X_test_scaled = scaler_X.transform(X_test_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()
    # The test targets are compared in original units below, so they are not
    # standardized (the former y_test_scaled value was never used).

    # Merge the training and validation data for the fit
    X_train_combined = np.vstack((X_train_scaled, X_val_scaled))
    y_train_combined = np.hstack((y_train_scaled, y_val_scaled))

    input_dim = X_train_scaled.shape[1]
    output_dim = 1

    # Build and fit the model
    nf_rvfl_model = NeuroFuzzyRVFL(input_dim=input_dim, output_dim=output_dim,
                                   NumFuzzyRule=NumFuzzyRule, NumHiddenNodes=NumHiddenNodes,
                                   activation_function=activation_function, C=C,
                                   cluster_method=cluster_method)

    start_time = time.time()
    nf_rvfl_model.fit(X_train_combined, y_train_combined)
    training_time = time.time() - start_time

    # Predict on the test set
    start_time = time.time()
    y_pred_scaled = nf_rvfl_model.predict(X_test_scaled)
    testing_time = time.time() - start_time

    # Map the predictions back to the original target units
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    test_rmse = np.sqrt(mean_squared_error(y_test_np, y_pred))

    return nf_rvfl_model, test_rmse, training_time + testing_time, scaler_X, scaler_y

# ============================
# 定义计算指标的函数
# ============================

from scipy.stats import norm

def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """Closed-form overlap measure between two Gaussian membership functions.

    Returns 2 * Phi(-d) with d = |c1 - c2| / sqrt(sigma1^2 + sigma2^2), where
    Phi is the standard normal CDF; returns 0 when both widths are zero.
    """
    pooled_width = np.sqrt(sigma1 ** 2 + sigma2 ** 2)
    if pooled_width == 0:
        # Two zero-width sets: report no overlap instead of dividing by zero.
        return 0
    separation = np.abs(c1 - c2) / pooled_width
    return 2 * norm.cdf(-separation)

def compute_iov(model):
    """Average Overlap Index (Iov) of a fitted model.

    ``model.center`` has shape (n_rules, input_dim) and the scalar
    ``model.std`` is used as the width of every fuzzy set. For each attribute
    the largest pairwise overlap between two rules is taken, and the mean of
    those maxima is returned (0 when no pair exists).
    """
    centers = model.center  # (n_rules, input_dim)
    width = model.std       # scalar shared by all fuzzy sets
    n_rules, input_dim = centers.shape

    overlap_sum = 0
    counted = 0

    for attr in range(input_dim):
        column = centers[:, attr]
        best = -np.inf
        for first in range(n_rules):
            for second in range(first + 1, n_rules):
                candidate = compute_overlap_analytic(column[first], width, column[second], width)
                if candidate > best:
                    best = candidate
        # Attributes without any successful comparison are left out of the mean.
        if best != -np.inf:
            overlap_sum += best
            counted += 1

    if counted == 0:
        return 0

    return overlap_sum / counted

def compute_ifspe(model):
    """Average Fuzzy Set Position Index (Ifspe) of a fitted model.

    ``model.center`` has shape (n_rules, input_dim) and the scalar
    ``model.std`` is used as the width of every fuzzy set. For each attribute
    the centers are sorted and every neighbouring pair contributes
    ``2 * |0.5 - phi| + psi``; the total is divided by
    ``n_rules * input_dim``. Returns 0 when there is no neighbouring pair.
    """
    centers = model.center  # (n_rules, input_dim)
    width = model.std       # scalar shared by all fuzzy sets
    n_rules, input_dim = centers.shape

    accumulated = 0
    n_terms = 0

    for attr in range(input_dim):
        # Centers of this attribute in ascending order
        column = centers[:, attr]
        ordered = column[np.argsort(column)]

        for left in range(n_rules - 1):
            center_sum = ordered[left] + ordered[left + 1]
            left_width = width
            right_width = width

            phi = np.exp(-0.5 * (center_sum / (left_width + right_width)) ** 2)

            # Both widths come from the same scalar, so the gap is normally 0
            # and psi drops out.
            width_gap = left_width - right_width
            if width_gap == 0:
                psi = 0
            else:
                psi = np.exp(-0.5 * (center_sum / width_gap) ** 2)

            accumulated += 2 * abs(0.5 - phi) + psi
            n_terms += 1

    if n_terms == 0:
        return 0

    return accumulated / (n_rules * input_dim)

# ============================
# 实验设置与运行
# ============================

# Hyperparameters
NumFuzzyRule = [3, 5, 7, 9]  # rule counts to sweep
NumHiddenNodes = 203
activation_function = 5  # Tansig
C = 0.001
cluster_method = 'fcm'
repeats = 5
for i in NumFuzzyRule:
    # Per-configuration accumulators, reset for every rule count
    test_rmse_list = []
    time_list = []
    results_nf_rvfl = []

    for repeat in range(repeats):
        # Kept for parity with the other experiment cells; the reported time
        # is the value returned by train_nf_rvfl.
        start_time = time.time()

        # Split the training set again into train / validation parts
        X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
            X_train_np, y_train_np, test_size=0.2, random_state=repeat)

        # Pass the current rule count `i`. The whole NumFuzzyRule list was
        # passed here before, which makes the model constructor fail.
        nf_rvfl_model, test_rmse, time_taken, scaler_X, scaler_y = train_nf_rvfl(
            X_train_sub, y_train_sub, X_val_sub, y_val_sub, X_test_np, y_test_np,
            NumFuzzyRule=i, NumHiddenNodes=NumHiddenNodes,
            activation_function=activation_function, C=C,
            cluster_method=cluster_method)

        test_rmse_list.append(test_rmse)
        time_list.append(time_taken)

        # Compute the Iov and Ifspe interpretability indices
        average_iov = compute_iov(nf_rvfl_model)
        average_ifspe = compute_ifspe(nf_rvfl_model)

        print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s, Iov={average_iov:.4f}, Ifspe={average_ifspe:.4f}")

        # Store the result of this repeat
        result = {
            'repeat': repeat + 1,
            'test_rmse': test_rmse,
            'time_taken': time_taken,
            'average_iov': average_iov,
            'average_ifspe': average_ifspe
        }
        results_nf_rvfl.append(result)

    # Mean and standard deviation over the repeats
    test_rmse_mean = np.mean(test_rmse_list)
    test_rmse_std = np.std(test_rmse_list)
    time_mean = np.mean(time_list)
    time_std = np.std(time_list)

    average_iov_list = [res['average_iov'] for res in results_nf_rvfl]
    average_ifspe_list = [res['average_ifspe'] for res in results_nf_rvfl]
    average_iov_mean = np.mean(average_iov_list)
    average_iov_std = np.std(average_iov_list)
    average_ifspe_mean = np.mean(average_ifspe_list)
    average_ifspe_std = np.std(average_ifspe_list)

    # Print the summary for this rule count
    print(f"\nResults:",'rule=',i)
    print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
    print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")
    print(f"Average Overlap Index (Iov): {average_iov_mean:.4f} ± {average_iov_std:.4f}")
    print(f"Average Fuzzy Set Position Index (Ifspe): {average_ifspe_mean:.4f} ± {average_ifspe_std:.4f}")


# RVFL

In [None]:
!pip install scikit-fuzzy

In [None]:
# ============================
# 导入必要的库
# ============================
import numpy as np
import pandas as pd
import time
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import skfuzzy as fuzz
from sklearn.cluster import KMeans
from scipy.stats import norm

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import fetch_openml
# ============================
# 数据加载与预处理
# ============================
# 安装 ucimlrepo 库（仅在第一次运行时需要）
try:
    from ucimlrepo import fetch_ucirepo
except ImportError:
    !pip install ucimlrepo
    from ucimlrepo import fetch_ucirepo

# 从 UCI ML Repo 导入数据集
# 注意：请确保安装了 `ucimlrepo` 包，如果没有，请使用以下命令安装：
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
# 从 UCI ML Repo 下载 Appliances Energy Prediction 数据集

# 获取数据集


# Fetch the Boston housing dataset from OpenML as pandas objects
boston = fetch_openml(name='boston', version=1, as_frame=True)

# Features and target as returned by the fetch result
X = boston.data
y = boston.target

# Feature columns used as model inputs
features_to_use = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',
                  'RM', 'AGE', 'DIS', 'RAD', 'TAX',
                  'PTRATIO', 'B', 'LSTAT']

# Prepare the target variable
y = y.astype(float)  # make sure the target is floating point

# Drop every sample that has a missing value in a feature or in the target
data = pd.concat([X[features_to_use], y.rename('MEDV')], axis=1).dropna()
X = data[features_to_use].astype(float).values  # cast every feature to float
y = data['MEDV'].astype(float).values  # cast the target to float

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep the feature names for later use
feature_labels = features_to_use
# ============================
# 定义 Neuro-Fuzzy RVFL 模型
# ============================

class NeuroFuzzyRVFL:
    """Random-weight network with a fuzzy rule layer, a hidden layer and direct links.

    For an input matrix ``x`` of shape ``(n_samples, input_dim)`` the model builds
    the design matrix ``[F, H, x]`` where

    * ``F = MF * (x @ Alpha)`` — normalised Gaussian rule memberships ``MF``
      (one column per rule centre) scaled by a random linear map of the input,
    * ``H = activation([F, 0.1] @ WeightHidden)`` — random hidden layer fed by
      ``F`` plus a constant bias column of 0.1,
    * ``x`` — the raw inputs (direct link).

    Only the output weights ``beta`` are learned, by a pseudo-inverse
    least-squares solve in :meth:`fit`.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Stored on the instance; not used by the computations in this class.
    NumFuzzyRule : int
        Number of rule centres (clusters).
    NumHiddenNodes : int
        Number of random hidden nodes.
    activation_function : int
        1 sigmoid, 2 sine, 3 tribas, 4 radbas, 5 tansig, 6 ReLU.
    C : float
        Stored on the instance; ``fit`` uses a plain pseudo-inverse and does
        not read it.
    cluster_method : str
        ``'kmeans'`` or ``'fcm'``; any other value selects random training
        samples as centres.
    """

    def __init__(self, input_dim, output_dim, NumFuzzyRule, NumHiddenNodes, activation_function, C, cluster_method='fcm'):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.NumFuzzyRule = NumFuzzyRule
        self.NumHiddenNodes = NumHiddenNodes
        self.activation_function = activation_function
        self.C = C
        self.cluster_method = cluster_method
        # Random, untrained projection weights drawn uniformly from [0, 1).
        self.Alpha = np.random.rand(input_dim, NumFuzzyRule)
        self.WeightHidden = np.random.rand(NumFuzzyRule + 1, NumHiddenNodes)
        # Set during fit().
        self.center = None
        self.std = 1  # fixed membership width (original comment: consistent with the MATLAB code)
        self.beta = None  # output-layer weights

    def activation(self, H):
        """Apply the configured activation element-wise to ``H``.

        ``H`` is clipped to [-500, 500] first to keep the exponentials finite.

        Raises
        ------
        ValueError
            If ``activation_function`` is not one of 1..6.
        """
        H = np.clip(H, -500, 500)
        kind = self.activation_function
        if kind == 1:
            # Sigmoid
            return 1 / (1 + np.exp(-H))
        if kind == 2:
            return np.sin(H)
        if kind == 3:
            # Tribas (triangular basis)
            return np.maximum(1 - np.abs(H), 0)
        if kind == 4:
            # Radbas (radial basis)
            return np.exp(-H ** 2)
        if kind == 5:
            # Tansig: 2 / (1 + exp(-2H)) - 1 is exactly tanh(H); np.tanh avoids
            # the exp() overflow the explicit formula hits for large negative H.
            return np.tanh(H)
        if kind == 6:
            # ReLU
            return np.maximum(0, H)
        raise ValueError("Invalid activation function")

    def _design_matrix(self, x):
        """Return the ``[F, H, x]`` feature matrix shared by fit() and predict()."""
        # Squared distance of every sample to every centre, summed over features:
        # exp(-sum(d^2)/std) equals the product of the per-feature Gaussians.
        diff = x[:, np.newaxis, :] - self.center[np.newaxis, :, :]
        MF = np.exp(-np.square(diff).sum(axis=2) / self.std)
        # Normalise the memberships per sample; 1e-10 guards against division by zero.
        MF = MF / (MF.sum(axis=1, keepdims=True) + 1e-10)

        F = MF * (x @ self.Alpha)
        # Constant bias column (0.1) appended before the random hidden projection.
        F1 = np.hstack((F, 0.1 * np.ones((F.shape[0], 1))))
        H = self.activation(F1 @ self.WeightHidden)
        # Fuzzy features, hidden features, then the direct link to the raw inputs.
        return np.hstack((F, H, x))

    def fit(self, train_x, train_y):
        """Choose the rule centres from ``train_x`` and solve for ``beta``."""
        if self.cluster_method == 'kmeans':
            kmeans = KMeans(n_clusters=self.NumFuzzyRule, random_state=0).fit(train_x)
            self.center = kmeans.cluster_centers_
        elif self.cluster_method == 'fcm':
            # skfuzzy's cmeans takes the data as (features, samples); only the
            # returned centres are kept.
            cntr, _, _, _, _, _, _ = fuzz.cluster.cmeans(
                train_x.T, self.NumFuzzyRule, 2, error=0.005, maxiter=1000, init=None)
            self.center = cntr
        else:
            # Any other value: pick distinct training samples as centres.
            indices = np.random.choice(train_x.shape[0], self.NumFuzzyRule, replace=False)
            self.center = train_x[indices, :]

        M = self._design_matrix(train_x)
        # Least-squares output weights via the Moore-Penrose pseudo-inverse.
        self.beta = np.linalg.pinv(M) @ train_y

    def predict(self, test_x):
        """Return predictions for ``test_x``.

        NaN predictions are reported with a printed warning and replaced by
        ``np.nan_to_num``.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.center is None or self.beta is None:
            raise RuntimeError("NeuroFuzzyRVFL.predict() called before fit()")

        PredictedTestLabel = self._design_matrix(test_x) @ self.beta

        if np.isnan(PredictedTestLabel).any():
            print("Warning: Predicted labels contain NaN. Replacing NaN with zero.")
            PredictedTestLabel = np.nan_to_num(PredictedTestLabel)

        return PredictedTestLabel

# ============================
# 定义训练函数
# ============================

def train_nf_rvfl(X_train_np, y_train_np, X_val_np, y_val_np, X_test_np, y_test_np,
                  NumFuzzyRule=15, NumHiddenNodes=203, activation_function=5, C=0.001,
                  cluster_method='fcm'):
    """Train a NeuroFuzzyRVFL on train+validation data and score it on the test set.

    Inputs and targets are standardised with scalers fitted on the training
    split only. The validation split is transformed with those scalers and then
    stacked onto the training split, so the model is fitted on both.

    Returns
    -------
    tuple
        ``(model, test_rmse, elapsed, scaler_X, scaler_y)`` where ``test_rmse``
        is computed on the original (unscaled) target values and ``elapsed`` is
        fit time plus prediction time in seconds.
    """
    # Standardise; statistics come from the training split only.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_val_scaled = scaler_X.transform(X_val_np)
    X_test_scaled = scaler_X.transform(X_test_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    y_val_scaled = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    # The closed-form fit has no early stopping, so validation data is used for training too.
    X_train_combined = np.vstack((X_train_scaled, X_val_scaled))
    y_train_combined = np.hstack((y_train_scaled, y_val_scaled))

    nf_rvfl_model = NeuroFuzzyRVFL(input_dim=X_train_scaled.shape[1], output_dim=1,
                                   NumFuzzyRule=NumFuzzyRule, NumHiddenNodes=NumHiddenNodes,
                                   activation_function=activation_function, C=C,
                                   cluster_method=cluster_method)

    # perf_counter is monotonic, unlike the wall clock, so short intervals are reliable.
    start_time = time.perf_counter()
    nf_rvfl_model.fit(X_train_combined, y_train_combined)
    training_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    y_pred_scaled = nf_rvfl_model.predict(X_test_scaled)
    testing_time = time.perf_counter() - start_time

    # Map predictions back to the original target scale.
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Keep the RMSE computable even if the model produced NaN.
    if np.isnan(y_pred).any():
        print("Error: Predicted values contain NaN. Please check the model implementation.")
        y_pred = np.nan_to_num(y_pred)

    test_rmse = np.sqrt(mean_squared_error(y_test_np, y_pred))

    return nf_rvfl_model, test_rmse, training_time + testing_time, scaler_X, scaler_y

# ============================
# 定义计算指标的函数
# ============================

def compute_overlap_analytic(c1, sigma1, c2, sigma2):
    """Closed-form overlap score of two Gaussians given centres and widths.

    Returns ``2 * Phi(-|c1 - c2| / sqrt(sigma1**2 + sigma2**2))`` with ``Phi``
    the standard normal CDF, or ``0`` when both widths are zero.
    """
    combined_width = np.sqrt(sigma1 ** 2 + sigma2 ** 2)
    # Two zero-width sets: the ratio below is undefined, report no overlap.
    if combined_width == 0:
        return 0
    separation = np.abs(c1 - c2) / combined_width
    return 2 * norm.cdf(-separation)

def compute_iov(model):
    """Average, over input attributes, of the largest pairwise rule overlap.

    Reads ``model.center`` (indexed as ``[rule, attribute]``) and the scalar
    ``model.std``, which is used as the width of every fuzzy set. For each
    attribute the overlap of every pair of rule centres is computed with
    ``compute_overlap_analytic`` and the maximum is kept. Attributes with no
    usable pair (e.g. a single rule) are skipped. Returns ``0`` when no
    attribute contributes.
    """
    centers = model.center
    width = model.std
    n_rules, input_dim = centers.shape

    best_per_attribute = []
    for attr in range(input_dim):
        column = centers[:, attr]
        pair_overlaps = [
            compute_overlap_analytic(column[i], width, column[j], width)
            for i in range(n_rules)
            for j in range(i + 1, n_rules)
        ]
        # Only values that compare greater than -inf can become the running
        # maximum (this drops NaN), so filter before taking the max.
        usable = [value for value in pair_overlaps if value > -np.inf]
        if usable:
            best_per_attribute.append(max(usable))

    if not best_per_attribute:
        return 0

    return sum(best_per_attribute) / len(best_per_attribute)

def compute_ifspe(model):
    """Fuzzy-set position index computed from adjacent rule centres.

    Reads ``model.center`` (indexed as ``[rule, attribute]``) and the scalar
    ``model.std``, used as the width of every set. For each attribute the
    centres are sorted and, for every adjacent pair ``(v_l, v_l+1)``, the term
    ``2 * |0.5 - phi| + psi`` is accumulated, where

    * ``phi = exp(-0.5 * ((v_l + v_l+1) / (s_l + s_l+1)) ** 2)``
    * ``psi = exp(-0.5 * ((v_l + v_l+1) / (s_l - s_l+1)) ** 2)``, taken as 0
      when the two widths are equal (always the case with a shared scalar width).

    The accumulated sum is divided by ``n_rules * input_dim``. Returns ``0``
    when there are no adjacent pairs.

    NOTE(review): the divisor is n_rules * input_dim, not the
    (n_rules - 1) * input_dim terms actually summed — confirm against the
    index definition this was taken from.
    """
    centers = model.center
    width = model.std
    n_rules, input_dim = centers.shape

    accumulated = 0
    n_terms = 0

    for attr in range(input_dim):
        ordered = np.sort(centers[:, attr])
        # Walk neighbouring centres in ascending order.
        for left, right in zip(ordered[:-1], ordered[1:]):
            phi = np.exp(-0.5 * ((left + right) / (width + width)) ** 2)
            width_gap = width - width
            if width_gap == 0:
                psi = 0
            else:
                psi = np.exp(-0.5 * ((left + right) / width_gap) ** 2)
            accumulated += 2 * abs(0.5 - phi) + psi
            n_terms += 1

    if n_terms == 0:
        return 0

    return accumulated / (n_rules * input_dim)

# ============================
# 实验设置与运行
# ============================

# Hyper-parameters for the Neuro-Fuzzy RVFL experiment.
NumFuzzyRules = [3,5,7,9]  # rule counts to sweep (original note: values matching the original MATLAB code are recommended)
NumHiddenNodes = 21
activation_function =  5 # Tansig
C = 0.001
cluster_method = 'fcm'
repeats = 5
# One full set of repeats per rule count; the accumulators are reset for each count.
for NumFuzzyRule in NumFuzzyRules:
  test_rmse_list = []
  time_list = []
  results_nf_rvfl = []

  for repeat in range(repeats):
      start_time = time.time()

      # Split the training set again into train/validation parts; the seed
      # changes with the repeat index so every repeat sees a different split.
      X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
          X_train_np, y_train_np, test_size=0.2, random_state=repeat)

      nf_rvfl_model, test_rmse, time_taken, scaler_X, scaler_y = train_nf_rvfl(
          X_train_sub, y_train_sub, X_val_sub, y_val_sub, X_test_np, y_test_np,
          NumFuzzyRule=NumFuzzyRule, NumHiddenNodes=NumHiddenNodes,
          activation_function=activation_function, C=C,
          cluster_method=cluster_method)

      test_rmse_list.append(test_rmse)
      time_list.append(time_taken)

      # Interpretability indices computed from the fitted rule centres.
      average_iov = compute_iov(nf_rvfl_model)
      average_ifspe = compute_ifspe(nf_rvfl_model)

      print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s, Iov={average_iov:.4f}, Ifspe={average_ifspe:.4f}")

      # Record this repeat's metrics.
      result = {
          'repeat': repeat + 1,
          'test_rmse': test_rmse,
          'time_taken': time_taken,
          'average_iov': average_iov,
          'average_ifspe': average_ifspe
      }
      results_nf_rvfl.append(result)

  # Mean and (population) standard deviation across the repeats.
  test_rmse_mean = np.mean(test_rmse_list)
  test_rmse_std = np.std(test_rmse_list)
  time_mean = np.mean(time_list)
  time_std = np.std(time_list)

  average_iov_list = [res['average_iov'] for res in results_nf_rvfl]
  average_ifspe_list = [res['average_ifspe'] for res in results_nf_rvfl]
  average_iov_mean = np.mean(average_iov_list)
  average_iov_std = np.std(average_iov_list)
  average_ifspe_mean = np.mean(average_ifspe_list)
  average_ifspe_std = np.std(average_ifspe_list)

  # Summary for this rule count.
  print(f"\nResults:Rule=",NumFuzzyRule)
  print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
  print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")
  print(f"Average Overlap Index (Iov): {average_iov_mean:.4f} ± {average_iov_std:.4f}")
  print(f"Average Fuzzy Set Position Index (Ifspe): {average_ifspe_mean:.4f} ± {average_ifspe_std:.4f}")


# ANFIS-PSO

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import time
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
# !pip install ucimlrepo
# 从 UCI ML Repo 导入数据集
from ucimlrepo import fetch_ucirepo
from sklearn.datasets import fetch_openml
# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')
# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# ============================
# 数据加载与预处理
# ============================
# 安装 ucimlrepo 库（仅在第一次运行时需要）
try:
    from ucimlrepo import fetch_ucirepo
except ImportError:
    !pip install ucimlrepo
    from ucimlrepo import fetch_ucirepo


# Load the Auto MPG dataset directly from the UCI archive.
auto_mpg_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# The raw file has no header row, so the column names are supplied here.
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']

# Whitespace-separated file; '?' marks a missing value.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
data = pd.read_csv(auto_mpg_url, sep=r'\s+', names=column_names, na_values='?')

# Drop rows that contain missing values.
data = data.dropna()

# Feature selection: everything except the target 'mpg' and the free-text 'car_name'.
features_to_use = [
    'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin'
]

# Inputs and target.
X = data[features_to_use]
y = data['mpg']

# 'origin' is categorical but is kept numeric here to keep the ANFIS model simple.
# To one-hot encode it instead, uncomment the two lines below.
# X = pd.get_dummies(X, columns=['origin'], drop_first=True)
# features_to_use = X.columns.tolist()

# Hold out 20% of the samples as the test set (fixed seed for reproducibility).
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Keep the feature names around for later use.
feature_labels = features_to_use
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# 定义高斯隶属函数
def gmf(x, c, sigma):
    """Gaussian membership degree ``exp(-0.5 * ((x - c) / sigma) ** 2)``."""
    standardized = (x - c) / sigma
    return torch.exp(-0.5 * standardized ** 2)

# 定义 ANFIS 模型的前向传播
def anfis_get_output(params, inputs, num_of_mf_terms):
    """Forward pass of a first-order (linear-consequent) ANFIS with a full rule grid.

    Parameters
    ----------
    params : 1-D tensor
        Flat parameter vector. The first ``2 * n_inputs * num_of_mf_terms``
        entries are ``(centre, sigma)`` pairs, ordered input by input and,
        within an input, membership function by membership function. The
        remaining entries are one block of ``n_inputs + 1`` linear-consequent
        coefficients (weights then bias) per rule.
    inputs : 2-D tensor, one row per sample and one column per input variable.
    num_of_mf_terms : int
        Gaussian membership functions per input; the rule base is every
        combination of one membership function per input
        (``num_of_mf_terms ** n_inputs`` rules, last input varying fastest).

    Returns
    -------
    1-D tensor with one output per sample, on the same device as ``inputs``.
    """
    from itertools import product

    n_samples, num_of_inputs = inputs.shape
    num_of_rules = num_of_mf_terms ** num_of_inputs
    n_premise = 2 * num_of_inputs * num_of_mf_terms
    n_consequent = num_of_inputs + 1
    dev = inputs.device

    # Premise part: Gaussian membership degree of every sample for every
    # (input, membership function) pair -> (samples, inputs, mf).
    premise = params[:n_premise].reshape(num_of_inputs, num_of_mf_terms, 2)
    centers = premise[..., 0]
    sigmas = premise[..., 1]
    membership = torch.exp(-0.5 * ((inputs.unsqueeze(2) - centers) / sigmas) ** 2)

    # Rule grid: row r lists, for each input, which membership function rule r uses.
    combos = torch.tensor(
        list(product(range(num_of_mf_terms), repeat=num_of_inputs)),
        dtype=torch.long, device=dev,
    )
    input_idx = torch.arange(num_of_inputs, device=dev)
    # Firing strength = product of the selected membership degrees -> (samples, rules).
    firing = membership[:, input_idx, combos].prod(dim=2)

    # Normalise across rules; 1e-8 guards against division by zero.
    normalized = firing / (firing.sum(dim=1, keepdim=True) + 1e-8)

    # Consequent part: each rule is a linear function of [inputs, 1]. The
    # augmented input is the same for every rule, so it is built once.
    consequents = params[n_premise:n_premise + num_of_rules * n_consequent].reshape(
        num_of_rules, n_consequent)
    augmented = torch.cat(
        (inputs, torch.ones(n_samples, 1, device=dev, dtype=inputs.dtype)), dim=1)
    rule_values = augmented @ consequents.T  # (samples, rules)

    # Output = firing-strength-weighted sum of the rule outputs.
    return (normalized * rule_values).sum(dim=1)

# 初始化粒子群
def init_swarm(n_particles, Lb, Ub):
    """Draw ``n_particles`` positions uniformly between the bounds ``Lb`` and ``Ub``.

    ``Lb`` and ``Ub`` are 1-D tensors of per-dimension bounds. The result has
    shape ``(n_particles, len(Lb))`` and is created on the device of ``Lb``, so
    it always matches the bounds it is combined with.
    """
    unit = torch.rand(n_particles, len(Lb), device=Lb.device, dtype=Lb.dtype)
    return Lb + unit * (Ub - Lb)

# 初始化速度
def init_velocity(n_particles, ndim, vel_clamping_factor, Ub):
    """Draw initial velocities uniformly in ``[-v_max, v_max]``.

    ``v_max = Ub * vel_clamping_factor`` per dimension. The result has shape
    ``(n_particles, ndim)`` and is created on the device of ``Ub``.
    """
    v_max = Ub * vel_clamping_factor
    v_min = -v_max
    unit = torch.rand(n_particles, ndim, device=Ub.device, dtype=Ub.dtype)
    return v_min + (v_max - v_min) * unit

# 更新速度
def pso_velocity(velocity, gbest, pbest, particles, w, social_const, cognitive_const, vel_clamping_factor, Ub):
    """Standard PSO velocity update with per-dimension velocity clamping.

    ``new_v = w * velocity + cognitive_const * r1 * (pbest - particles)
    + social_const * r2 * (gbest - particles)`` with ``r1`` and ``r2`` drawn
    uniformly from [0, 1) for every particle and dimension. The result is
    clamped to ``[-Ub * vel_clamping_factor, Ub * vel_clamping_factor]``.
    The random factors are created on the device of ``particles``.
    """
    n_particles, ndim = particles.shape
    v_max = Ub * vel_clamping_factor
    v_min = -v_max
    r1 = torch.rand(n_particles, ndim, device=particles.device, dtype=particles.dtype)
    r2 = torch.rand(n_particles, ndim, device=particles.device, dtype=particles.dtype)
    velocity = (w * velocity +
                cognitive_const * r1 * (pbest - particles) +
                social_const * r2 * (gbest - particles))
    # Keep every component within the allowed speed range.
    return torch.clamp(velocity, v_min, v_max)

# 更新粒子位置
def pso_move(particles, velocity, Lb, Ub):
    """Advance the particles by their velocity and clamp them to ``[Lb, Ub]``."""
    moved = particles + velocity
    return torch.clamp(moved, min=Lb, max=Ub)

# PSO 训练函数
def pso_train(X_train, y_train, X_val, y_val, total_param, num_of_mf_terms, max_iterations=20, err_tolerance=0.05):
    """Fit the flat ANFIS parameter vector with particle swarm optimisation.

    A swarm of 15 particles searches the box ``[-1, 1] ** total_param``; the
    fitness of a particle is the training MSE of ``anfis_get_output``. The
    inertia weight decreases linearly from 0.9 to 0.4 over the run. Training
    stops once the best training MSE is at or below ``err_tolerance`` or after
    ``max_iterations`` iterations. Progress is printed every iteration.

    Returns
    -------
    tuple
        ``(training_mse, best_params, iterations, training_losses,
        validation_losses)``; ``best_params`` is a NumPy array and the two loss
        lists hold one entry per completed iteration.
    """
    # Create every swarm tensor on the same device as the training data.
    dev = X_train.device
    itr = 1
    # float32 max stands in for "infinitely bad" as the initial best values.
    fbest = torch.finfo(torch.float32).max  # global best fitness
    total_particles = 15
    Ub = torch.ones(total_param, device=dev)
    Lb = -torch.ones(total_param, device=dev)

    min_inertia_weight = 0.4
    max_inertia_weight = 0.9
    social_const = 2
    cognitive_const = 2
    vel_clamping_factor = 2
    pbestval = torch.full((total_particles,), torch.finfo(torch.float32).max, device=dev)  # personal best fitness
    pbest = torch.zeros((total_particles, total_param), device=dev)  # personal best positions

    particles = init_swarm(total_particles, Lb, Ub)
    velocity = init_velocity(total_particles, total_param, vel_clamping_factor, Ub)
    # Seed the global best so it is defined even when no particle ever produces
    # a comparable MSE (e.g. all NaN) or when the loop body never runs.
    gbest = particles[0].clone()

    training_losses = []
    validation_losses = []

    print("Starting PSO training...")
    while (fbest > err_tolerance) and (itr <= max_iterations):
        for i in range(total_particles):
            outputs = anfis_get_output(particles[i], X_train, num_of_mf_terms)
            mse = torch.mean((y_train - outputs) ** 2).item()

            # Personal best.
            if mse <= pbestval[i].item():
                pbestval[i] = mse
                pbest[i] = particles[i].clone()

            # Global best.
            if mse <= fbest:
                fbest = mse
                gbest = particles[i].clone()

        # Linearly decreasing inertia weight; with a single iteration there is
        # nothing to interpolate, so use the starting weight (avoids dividing by zero).
        if max_iterations > 1:
            w = ((max_iterations - itr) * (max_inertia_weight - min_inertia_weight)) / (max_iterations - 1) + min_inertia_weight
        else:
            w = max_inertia_weight
        velocity = pso_velocity(velocity, gbest, pbest, particles, w, social_const, cognitive_const, vel_clamping_factor, Ub)
        particles = pso_move(particles, velocity, Lb, Ub)

        # Validation loss of the current global best.
        outputs_val = anfis_get_output(gbest, X_val, num_of_mf_terms)
        val_mse = torch.mean((y_val - outputs_val) ** 2).item()

        training_losses.append(fbest)
        validation_losses.append(val_mse)

        print(f"Iteration {itr}/{max_iterations} - Training MSE: {fbest:.6f} - Validation MSE: {val_mse:.6f}")

        itr += 1

    TrainingMSE = fbest
    bestParams = gbest.cpu().numpy()
    Iterations = itr - 1
    return TrainingMSE, bestParams, Iterations, training_losses, validation_losses

# 训练 ANFIS 模型
def train_anfis_pso(X_train_np, y_train_np, X_val_np, y_val_np, num_of_mf_terms=2, max_iterations=20):
    """Standardise the data, size the ANFIS parameter vector and run PSO training.

    Scalers are fitted on the training split and applied to the validation
    split. The data is converted to float32 tensors on the module-level
    ``device`` before being handed to ``pso_train``.

    Returns
    -------
    tuple
        ``(training_mse, best_params, training_info, scaler_X, scaler_y)``;
        ``training_info`` is a dict with the keys ``'TrainingMSE'``,
        ``'Iterations'``, ``'training_losses'`` and ``'validation_losses'``.
    """
    def as_device_tensor(array):
        # float32 tensor on the device selected at module level
        return torch.tensor(array, dtype=torch.float32).to(device)

    # Standardise inputs and targets.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    train_inputs_np = scaler_X.fit_transform(X_train_np)
    val_inputs_np = scaler_X.transform(X_val_np)
    train_targets_np = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()
    val_targets_np = scaler_y.transform(y_val_np.reshape(-1, 1)).flatten()

    train_inputs = as_device_tensor(train_inputs_np)
    val_inputs = as_device_tensor(val_inputs_np)
    train_targets = as_device_tensor(train_targets_np)
    val_targets = as_device_tensor(val_targets_np)

    # Parameter count: (centre, sigma) for every membership function, plus
    # (n_inputs + 1) linear-consequent coefficients for every rule of the full grid.
    n_inputs = train_inputs.shape[1]
    n_premise_params = n_inputs * num_of_mf_terms * 2
    n_consequent_params = (num_of_mf_terms ** n_inputs) * (n_inputs + 1)
    total_param = n_premise_params + n_consequent_params

    TrainingMSE, bestParams, Iterations, training_losses, validation_losses = pso_train(
        train_inputs, train_targets, val_inputs, val_targets,
        total_param, num_of_mf_terms, max_iterations
    )

    training_info = dict(
        TrainingMSE=TrainingMSE,
        Iterations=Iterations,
        training_losses=training_losses,
        validation_losses=validation_losses,
    )

    return TrainingMSE, bestParams, training_info, scaler_X, scaler_y

# Experiment parameters.
num_of_mf_terms_list = [2]  # original note: corresponds to n_rules_list in the SOFENN experiment
repeats = 5  # repeats per configuration
max_iterations = 1500  # maximum number of PSO iterations

# Directory for the saved model parameters.
os.makedirs('results_anfis_pso', exist_ok=True)

# One summary dict per configuration.
results_anfis_pso = []

for num_of_mf_terms in num_of_mf_terms_list:
    test_rmse_list = []
    time_list = []
    print(f"\nStarting experiments for num_of_mf_terms={num_of_mf_terms}")
    for repeat in range(repeats):
        start_time = time.time()
        # Split the training set again into train/validation parts; the seed
        # changes with the repeat index so every repeat sees a different split.
        X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
            X_train_np, y_train_np, test_size=0.2, random_state=repeat
        )

        # Train the model.
        TrainingMSE, bestParams, training_info, scaler_X, scaler_y = train_anfis_pso(
            X_train_sub, y_train_sub, X_val_sub, y_val_sub,
            num_of_mf_terms=num_of_mf_terms,
            max_iterations=max_iterations
        )

        # Evaluate on the test set, using the scalers fitted during training.
        X_test_scaled_np = scaler_X.transform(X_test_np)
        X_test_scaled = torch.tensor(X_test_scaled_np, dtype=torch.float32).to(device)
        # bestParams is a NumPy array; turn it back into a tensor on the compute device.
        bestParams_tensor = torch.tensor(bestParams, dtype=torch.float32).to(device)
        outputs_test_scaled = anfis_get_output(bestParams_tensor, X_test_scaled, num_of_mf_terms)
        y_test_pred_scaled = outputs_test_scaled.cpu().detach().numpy()
        y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()
        y_test_true = y_test_np  # original, unscaled test targets
        test_rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
        test_rmse_list.append(test_rmse)

        # Elapsed time covers the split, training and test evaluation.
        end_time = time.time()
        time_taken = end_time - start_time
        time_list.append(time_taken)

        print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")

        # Save the best parameter vector of this repeat.
        np.savez(f'results_anfis_pso/anfis_pso_params_nmf{num_of_mf_terms}_repeat{repeat+1}.npz', bestParams=bestParams)

        # Plot the training and validation loss curves.
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(training_info['training_losses']) + 1), training_info['training_losses'], label='Training MSE')
        plt.plot(range(1, len(training_info['validation_losses']) + 1), training_info['validation_losses'], label='Validation MSE')
        plt.xlabel('Iteration')
        plt.ylabel('MSE')
        plt.title(f'Training and Validation MSE (num_of_mf_terms={num_of_mf_terms}, Repeat={repeat+1})')
        plt.legend()
        plt.grid(True)
        plt.show()

    # Mean and (population) standard deviation of RMSE and time across repeats.
    test_rmse_mean = np.mean(test_rmse_list)
    test_rmse_std = np.std(test_rmse_list)
    time_mean = np.mean(time_list)
    time_std = np.std(time_list)

    # Summary for this configuration.
    print(f"\nResults for num_of_mf_terms={num_of_mf_terms}:")
    print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
    print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")

    # Record the summary.
    result = {
        'num_of_mf_terms': num_of_mf_terms,
        'test_rmse_mean': test_rmse_mean,
        'test_rmse_std': test_rmse_std,
        'time_mean': time_mean,
        'time_std': time_std
    }
    results_anfis_pso.append(result)

# Plot mean test RMSE against the number of membership functions per input.
num_of_mf_terms_values = [result['num_of_mf_terms'] for result in results_anfis_pso]
test_rmse_means = [result['test_rmse_mean'] for result in results_anfis_pso]

plt.figure(figsize=(10, 6))
plt.plot(num_of_mf_terms_values, test_rmse_means, marker='o')
plt.xlabel('Number of Membership Functions per Input')
plt.ylabel('Test RMSE')
plt.title('Test RMSE vs. Number of Membership Functions per Input')
plt.grid(True)
plt.show()


# FuBiNFIS

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# 从 UCI ML Repo 导入数据集
from ucimlrepo import fetch_ucirepo

# 禁用不必要的警告
import warnings
warnings.filterwarnings('ignore')

# Load the Auto MPG dataset directly from the UCI archive.
auto_mpg_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# The raw file has no header row, so the column names are supplied here.
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin', 'car_name']

# Whitespace-separated file; '?' marks a missing value.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
data = pd.read_csv(auto_mpg_url, sep=r'\s+', names=column_names, na_values='?')

# Drop rows that contain missing values.
data = data.dropna()

# Feature selection: everything except the target 'mpg' and the free-text 'car_name'.
features_to_use = [
    'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin'
]

# Inputs and target.
X = data[features_to_use]
y = data['mpg']

# 'origin' is categorical but is kept numeric here to keep the model simple.
# To one-hot encode it instead, uncomment the two lines below.
# X = pd.get_dummies(X, columns=['origin'], drop_first=True)
# features_to_use = X.columns.tolist()

# Hold out 20% of the samples as the test set (fixed seed for reproducibility).
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)

# Keep the feature names around for later use.
feature_labels = features_to_use
# 定义 FuBiNFS 算法的实现
def fubinfs(X_train_np, y_train_np, X_test_np, y_test_np, n_clusters=3, max_iter=100, tol=1e-5, m=2,
            feature_names=None):
    """Run the FuBiNFS pipeline and evaluate it on the test set.

    Steps: standardise the data, run a fuzzy bi-clustering loop whose
    membership tensor has one entry per (cluster, sample, attribute), turn the
    object-wise cluster centres into Gaussian fuzzy rules, fit a linear model
    on the normalised rule activations, and predict the test targets.

    Parameters
    ----------
    X_train_np : array, shape (K, D)
        Training inputs.
    y_train_np : array, shape (K,)
        Training targets.
    X_test_np : array, shape (N_test, D)
        Test inputs.
    y_test_np : array, shape (N_test,)
        Test targets.
    n_clusters : int
        Number of clusters C (one fuzzy rule per cluster).
    max_iter : int
        Maximum number of clustering iterations.
    tol : float
        Stop when the objective changes by less than this between iterations.
    m : float
        Fuzzifier. The membership update uses the exponent ``1 / (m - 1)``, so
        ``m`` must not equal 1.
    feature_names : sequence of str, optional
        Attribute names used in the textual rules. Defaults to the module-level
        ``features_to_use`` list.

    Returns
    -------
    tuple
        ``(y_pred_test, test_rmse, info)`` where ``y_pred_test`` is on the
        original target scale and ``info`` is a dict with the keys ``'V_k'``,
        ``'sigma_k'``, ``'lin_reg'``, ``'scaler_X'``, ``'scaler_y'``,
        ``'activation_test'`` and ``'rules'``.

    Notes
    -----
    Initialisation uses a private ``RandomState(42)``, so results are
    reproducible from call to call without reseeding NumPy's global generator.
    """
    from sklearn.linear_model import LinearRegression

    K, D = X_train_np.shape
    if feature_names is None:
        feature_names = features_to_use

    # Standardise inputs and targets using training statistics.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train_np)
    X_test_scaled = scaler_X.transform(X_test_np)
    y_train_scaled = scaler_y.fit_transform(y_train_np.reshape(-1, 1)).flatten()

    # Local generator: same stream as np.random.seed(42) followed by
    # np.random.rand, but without touching the global RNG state.
    rng = np.random.RandomState(42)
    U = rng.rand(n_clusters, K, D)
    # Memberships of each (sample, attribute) cell sum to 1 across clusters.
    U = U / U.sum(axis=0, keepdims=True)

    # Initial centres; both are recomputed from U at the start of every iteration.
    V_k = rng.rand(n_clusters, D)
    V_d = rng.rand(n_clusters, K)

    def cell_distances():
        # Squared distance of every cell to its object-wise and attribute-wise
        # centre, for all clusters at once -> (C, K, D).
        return ((X_train_scaled[np.newaxis] - V_k[:, np.newaxis, :]) ** 2
                + (X_train_scaled[np.newaxis] - V_d[:, :, np.newaxis]) ** 2)

    exponent = 1 / (m - 1)
    previous_J = np.inf
    for iteration in range(max_iter):
        weights = U ** m

        # Step 1: object-wise centres V^(k) — weighted mean over samples, (C, D).
        V_k = (weights * X_train_scaled).sum(axis=1) / (weights.sum(axis=1) + 1e-8)

        # Step 2: attribute-wise centres V^(d) — weighted mean over attributes, (C, K).
        V_d = (weights * X_train_scaled).sum(axis=2) / (weights.sum(axis=2) + 1e-8)

        # Step 3: membership update. The distances depend only on the centres,
        # so they are computed once and reused for every cluster and for J.
        dist = cell_distances()
        U = np.stack([
            1 / (((dist[c] / (dist + 1e-8)) ** exponent).sum(axis=0) + 1e-8)
            for c in range(n_clusters)
        ])
        U = U / U.sum(axis=0, keepdims=True)

        # Step 4: objective value for the updated memberships.
        J = sum(np.sum((U[c] ** m) * dist[c]) for c in range(n_clusters))

        if abs(J - previous_J) < tol:
            print(f"Converged at iteration {iteration + 1}")
            break
        previous_J = J

    else:
        print("Reached maximum iterations without convergence.")

    # Step 5: Gaussian rule widths — the spread of the cluster centres per
    # attribute (1e-8 keeps the width non-zero).
    sigma_k = np.std(V_k, axis=0) + 1e-8

    def rule_activation(X_scaled):
        # Product over attributes of the Gaussian membership to each rule -> (N, C).
        mu = np.exp(-0.5 * ((X_scaled[:, np.newaxis, :] - V_k[np.newaxis]) ** 2) / (sigma_k ** 2))
        return mu.prod(axis=2)

    # Step 6: fit a linear model on the normalised training activations.
    activation_train = rule_activation(X_train_scaled)
    normalized_activation_train = activation_train / (activation_train.sum(axis=1, keepdims=True) + 1e-8)

    lin_reg = LinearRegression()
    lin_reg.fit(normalized_activation_train, y_train_scaled)

    # Step 7: infer on the test set and map predictions back to the original scale.
    activation_test = rule_activation(X_test_scaled)
    normalized_activation_test = activation_test / (activation_test.sum(axis=1, keepdims=True) + 1e-8)

    y_pred_test_scaled = lin_reg.predict(normalized_activation_test)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled.reshape(-1, 1)).flatten()

    test_rmse = np.sqrt(mean_squared_error(y_test_np, y_pred_test))

    # Human-readable rules, with centres and widths expressed in original units.
    rules = []
    V_k_orig = V_k * scaler_X.scale_ + scaler_X.mean_
    sigma_k_orig = sigma_k * scaler_X.scale_
    intercept = lin_reg.intercept_
    for c in range(n_clusters):
        antecedent_str = " AND ".join(
            f"{name} is Gaussian(c={V_k_orig[c, d]:.4f}, σ={sigma_k_orig[d]:.4f})"
            for d, name in enumerate(feature_names)
        )
        # The consequent shows this rule's linear-model coefficient and the shared intercept.
        coef = lin_reg.coef_[c]
        consequent_str = f"{coef:.4f} * Activation_{c+1} + {intercept:.4f}"
        rules.append(f"Rule {c+1}: IF {antecedent_str} THEN Output = {consequent_str}")

    return y_pred_test, test_rmse, {
        'V_k': V_k,
        'sigma_k': sigma_k,
        'lin_reg': lin_reg,
        'scaler_X': scaler_X,
        'scaler_y': scaler_y,
        'activation_test': activation_test,
        'rules': rules
    }

# Experiment parameters.
n_clusters_list = [3, 5, 7, 9]  # cluster counts to sweep
max_iters = 100
tol = 1e-5
repeats = 5  # repeats per configuration

# Directory for saved results.
os.makedirs('results_fubinfs', exist_ok=True)

# One summary dict per cluster count.
results_fubinfs = []

for n_clusters in n_clusters_list:
    test_rmse_list = []
    time_list = []
    print(f"\nStarting experiments for n_clusters={n_clusters}")
    # NOTE(review): every repeat passes the same data split and fubinfs uses a
    # fixed random seed, so the repeats are expected to differ only in timing — confirm intent.
    for repeat in range(repeats):
        start_time = time.time()
        # Run the FuBiNFS algorithm.
        y_pred_test, test_rmse, intermediate_results = fubinfs(
            X_train_np, y_train_np, X_test_np, y_test_np,
            n_clusters=n_clusters, max_iter=max_iters, tol=tol, m=2
        )
        end_time = time.time()
        time_taken = end_time - start_time
        time_list.append(time_taken)
        test_rmse_list.append(test_rmse)
        print(f"Repeat {repeat+1}/{repeats}: Test RMSE={test_rmse:.4f}, Time={time_taken:.2f}s")

        # Extract the fuzzy rules.
        rules = intermediate_results['rules']

        # Print the fuzzy rules, one per paragraph.
        print(f"\nFuzzy Rules for n_clusters={n_clusters}, Repeat={repeat+1}:")
        for rule in rules:
            print(rule)
            print()

        # # Save the rules to a file
        # with open(f'results_fubinfs/rules_nclusters{n_clusters}_repeat{repeat+1}.txt', 'w') as f:
        #     for rule in rules:
        #         f.write(rule + '\n')

    # Mean and (population) standard deviation of RMSE and time across repeats.
    test_rmse_mean = np.mean(test_rmse_list)
    test_rmse_std = np.std(test_rmse_list)
    time_mean = np.mean(time_list)
    time_std = np.std(time_list)

    # Summary for this cluster count.
    print(f"\nResults for n_clusters={n_clusters}:")
    print(f"Test RMSE: {test_rmse_mean:.4f} ± {test_rmse_std:.4f}")
    print(f"Time: {time_mean:.2f}s ± {time_std:.2f}s")

    # Record the summary.
    result = {
        'n_clusters': n_clusters,
        'test_rmse_mean': test_rmse_mean,
        'test_rmse_std': test_rmse_std,
        'time_mean': time_mean,
        'time_std': time_std
    }
    results_fubinfs.append(result)
