### Utility functions

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import torch
from tqdm import tqdm


# Z-score every feature column of a DataFrame
def standard_input(X):
    """Return a standardized copy of ``X`` as a DataFrame.

    A fresh ``StandardScaler`` is fitted on ``X`` itself, so the scaling
    statistics come only from the rows that are passed in.

    Args:
        X: DataFrame of numeric features.

    Returns:
        DataFrame with the scaled values, carrying over the column names and
        the row index of ``X`` so later index-based lookups keep working.
    """
    standardized = StandardScaler().fit_transform(X)
    return pd.DataFrame(standardized, index=X.index, columns=X.columns)


def load_data_DEF(random_state=42, path='/home/gehongfei/project/TabGNN/dataset/DEF.csv'):
    """Load the DEF CSV, split it 70/10/20 and build per-node boolean masks.

    Args:
        random_state: Seed forwarded to both ``train_test_split`` calls.
        path: Location of the CSV file. Defaults to the path that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        An 11-tuple ``(X, y, X_train, X_valid, X_test, y_train, y_valid,
        y_test, train_mask, val_mask, test_mask)``. The feature frames are
        standardized; the masks are ``torch.bool`` tensors with one entry per
        CSV row. If the ``kredit`` target column is missing, an error is
        printed and a tuple of 11 ``None`` values is returned instead.
    """
    df = pd.read_csv(path, sep=',')

    target_col = 'kredit'
    if target_col not in df.columns:
        print(f"Error: '{target_col}' column not found in the dataset.")
        return (None,) * 11

    y = df[target_col]
    # The optional "ID" column is an identifier, not a feature.
    excluded_columns = [target_col] + (["ID"] if "ID" in df.columns else [])
    X = df.drop(columns=excluded_columns)

    # Stratified 70% train / 30% hold-out, then the hold-out is split 1:2
    # into validation (10% overall) and test (20% overall).
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp
    )

    # One graph node per CSV row. read_csv was called without index_col, so
    # the index labels are the row positions and can address the masks.
    num_nodes = len(df)
    masks = []
    for split in (X_train, X_valid, X_test):
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        # Index with an explicit long tensor instead of a raw pandas Index.
        mask[torch.as_tensor(split.index.to_numpy(), dtype=torch.long)] = True
        masks.append(mask)
    train_mask, val_mask, test_mask = masks

    # NOTE(review): standard_input fits a new scaler on every call, so the
    # full frame and each split are scaled with their own statistics (the
    # full frame therefore also sees test rows). Kept as-is to preserve
    # results; confirm whether train-only statistics were intended.
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test = standard_input(X_test)

    return X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask


# ---------------------- Build the joint adjacency matrix (no extra prototype nodes are added) ----------------------
def compute_adjacency_matrix_by_prototypes(X_train, X_valid, X_test, y_train, y_valid,
                                           n_clusters=1000, n_estimators=50, max_depth=None,
                                           random_state=42, cluster_threshold=0, proto_threshold=0):
    """
    Build an N x N adjacency matrix over all samples from KMeans clusters and
    random-forest leaf co-occurrence.

    Main steps:
      1. Run KMeans on train+validation to obtain n_clusters centres and print the per-cluster sample counts;
      2. Assign every test sample to the centre with the highest cosine similarity;
      3. Build the full set X_all (all samples, sorted by index) and cluster_assignments (length N, aligned with X_all rows);
      4. For each cluster pick as representative (prototype) the member closest to the centre, preferring
         train+validation members, and record its position (prototype_indices);
      5. Inside each cluster, train an RF on the cluster's train+validation members and compute the RF
         leaf-co-occurrence similarity between all members (S_local), filtered by cluster_threshold;
      6. For each cluster, force an edge of weight 1 between the prototype and every member (S_connect);
      7. Train an RF on the prototypes' features and compute the similarity between prototypes (S_proto);
         pairs above proto_threshold form S_proto_edges;
      8. The final matrix is the element-wise maximum of S_local, S_connect and S_proto_edges, of size N x N.

    Returns:
      - adj_matrix: csr_matrix of shape (N, N)
      - cluster_assignments: ndarray of length N with the cluster id of every sample
      - prototype_indices: ndarray of length n_clusters holding, for each cluster, the prototype's
        position in X_all (these are positional indices, not index labels)
      - n_clusters: number of clusters

    NOTE(review): S_local, S_connect, S_proto_edges and A are each allocated as dense
    (N, N) float arrays before the result is converted to CSR.
    """
    # ------------- (1) Cluster on train + validation -------------
    X_tv = pd.concat([X_train, X_valid])
    y_tv = pd.concat([y_train, y_valid])
    X_tv_np = X_tv.values if isinstance(X_tv, pd.DataFrame) else np.array(X_tv)
    
    print("开始对训练+验证集进行 KMeans 聚类 ...")
    # n_init=10 is set explicitly to avoid a FutureWarning
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    kmeans.fit(X_tv_np)
    cluster_labels_tv = kmeans.labels_   # length = len(X_train)+len(X_valid)
    centers = kmeans.cluster_centers_      # shape = (n_clusters, n_features)
    
    # Print how many train+validation samples fall into each cluster
    cluster_counts = Counter(cluster_labels_tv)
    print("训练+验证集聚类样本数分布：")
    for cid, cnt in sorted(cluster_counts.items()):
        print(f"  Cluster {cid}: {cnt} samples")
    
    # ------------- (2) Assign test samples to clusters -------------
    # Test rows go to the centre with the highest cosine similarity
    # (the clustering itself was fitted by KMeans on train+validation only).
    X_test_np = X_test.values if isinstance(X_test, pd.DataFrame) else np.array(X_test)
    sims = cosine_similarity(X_test_np, centers)   # shape = (n_test, n_clusters)
    test_cluster_assignments = np.argmax(sims, axis=1)
    
    # ------------- (3) Build the full data set X_all and the matching labels -------------
    # reset_index is deliberately not called so the original index labels survive;
    # all later indexing uses positions (.iloc / numpy indices).
    X_all = pd.concat([X_train, X_valid, X_test]).sort_index()
    # Test labels are never used for training: they are replaced by the placeholder -1
    dummy_y_test = pd.Series([-1] * len(X_test), index=X_test.index)
    y_all = pd.concat([y_train, y_valid, dummy_y_test]).sort_index()
    N = len(X_all)
    
    # Map every original index label to its cluster id
    cluster_dict = {}
    for idx, label in zip(X_tv.index, cluster_labels_tv):
        cluster_dict[idx] = label
    for idx, label in zip(X_test.index, test_cluster_assignments):
        cluster_dict[idx] = label
    # Lay the cluster ids out in X_all row order
    cluster_assignments = np.array([cluster_dict[idx] for idx in X_all.index])
    
    # Boolean array marking which rows of X_all belong to train+validation (used for RF training).
    # X_all is sorted by original index, so the train+validation labels are converted to positions.
    mask_tv = np.zeros(N, dtype=bool)
    tv_indices = list(X_train.index) + list(X_valid.index)
    tv_pos = [X_all.index.get_loc(idx) for idx in tv_indices]
    mask_tv[tv_pos] = True

    # ------------- (4) Pick one representative (prototype) per cluster -------------
    prototype_indices = np.zeros(n_clusters, dtype=int)
    prototype_labels = []
    X_all_np = X_all.values  # shape = (N, n_features)
    for i in range(n_clusters):
        idx_in_cluster = np.where(cluster_assignments == i)[0]
        if len(idx_in_cluster) == 0:
            # Empty cluster: fall back to position 0 as a placeholder prototype
            # (no special handling happens later, so row 0 then acts as this cluster's prototype).
            prototype_indices[i] = 0
            prototype_labels.append(y_all.iloc[0])
            continue
        # Prefer members that come from train+validation
        idx_in_tv = [j for j in idx_in_cluster if mask_tv[j]]
        if len(idx_in_tv) == 0:
            idx_in_tv = idx_in_cluster  # no train/validation member: consider every member
        # Euclidean distance of each candidate to the cluster centre
        center = centers[i]
        candidates = X_all_np[idx_in_tv]
        dists = np.linalg.norm(candidates - center, axis=1)
        best_local_idx = idx_in_tv[np.argmin(dists)]
        prototype_indices[i] = best_local_idx
        # Cluster label by majority vote over the same candidates
        # (with the fallback above these are test rows, whose label is the placeholder -1)
        labels = y_all.iloc[idx_in_tv].values
        most_common = Counter(labels).most_common(1)[0][0]
        prototype_labels.append(most_common)
    prototype_labels = np.array(prototype_labels)
    
    # ------------- (5) Within-cluster similarity S_local -------------
    S_local = np.zeros((N, N))
    print("计算各簇内部相似度 S_local ...")
    for c in tqdm(range(n_clusters), desc="簇内 S_local"):
        idx = np.where(cluster_assignments == c)[0]
        if len(idx) < 2:
            continue
        # The RF is trained on the cluster's train+validation members only
        idx_tv = [j for j in idx if mask_tv[j]]
        if len(idx_tv) < 1:
            continue
        # Positional indexing via .iloc
        X_cluster_train = X_all.iloc[idx_tv].values
        y_cluster_train = y_all.iloc[idx_tv].values
        rf_cluster = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
        rf_cluster.fit(X_cluster_train, y_cluster_train)
        # Leaf ids for every member of the cluster (test rows included)
        X_cluster_all = X_all.iloc[idx].values
        leaves = rf_cluster.apply(X_cluster_all)  # shape = (n_samples_in_cluster, n_estimators)
        # Similarity = fraction of trees in which two samples land in the same leaf
        sim_matrix = np.zeros((len(idx), len(idx)))
        for tree in range(n_estimators):
            tree_leaves = leaves[:, tree]
            sim_matrix += (tree_leaves[:, None] == tree_leaves[None, :]).astype(float)
        sim_matrix /= n_estimators
        # Zero out similarities that are not strictly above cluster_threshold
        sim_matrix = np.where(sim_matrix > cluster_threshold, sim_matrix, 0)
        # Write the block into the matching sub-matrix of S_local
        # (the diagonal of sim_matrix is 1 because every sample shares a leaf with itself)
        S_local[np.ix_(idx, idx)] = sim_matrix
    
    # ------------- (6) Prototype-member links S_connect -------------
    S_connect = np.zeros((N, N))
    for c in range(n_clusters):
        idx = np.where(cluster_assignments == c)[0]
        p = prototype_indices[c]
        if len(idx) == 0:
            continue
        # Connect every member of the cluster to its prototype with weight 1
        S_connect[p, idx] = 1.0
        S_connect[idx, p] = 1.0
        S_connect[p, p] = 0.0  # no self-loop in S_connect
    
    # ------------- (7) Similarity between prototypes S_proto_edges -------------
    # prototype_indices holds positions, hence .iloc
    X_proto = X_all.iloc[prototype_indices].values
    rf_proto = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf_proto.fit(X_proto, prototype_labels)
    leaves_proto = rf_proto.apply(X_proto)  # shape = (n_clusters, n_estimators)
    S_proto = np.zeros((n_clusters, n_clusters))
    for tree in range(n_estimators):
        tree_leaves = leaves_proto[:, tree]
        S_proto += (tree_leaves[:, None] == tree_leaves[None, :]).astype(float)
    S_proto /= n_estimators
    # Keep prototype pairs strictly above proto_threshold and map them back to node positions
    S_proto_edges = np.zeros((N, N))
    for i in range(n_clusters):
        for j in range(i+1, n_clusters):
            if S_proto[i, j] > proto_threshold:
                u = prototype_indices[i]
                v = prototype_indices[j]
                S_proto_edges[u, v] = S_proto[i, j]
                S_proto_edges[v, u] = S_proto[i, j]
    
    # ------------- (8) Final joint adjacency matrix A -------------
    # Element-wise maximum of the three components
    A = np.maximum(S_local, S_connect)
    A = np.maximum(A, S_proto_edges)
    
    adj_matrix = csr_matrix(A)
    print(f"联合邻接矩阵构造完成，尺寸为：{adj_matrix.shape}")
    return adj_matrix, cluster_assignments, prototype_indices, n_clusters

    
# ---------------------- Convert the adjacency matrix to edge_index ----------------------
def adjacency_to_edge_index(adj_matrix, prototype_indices, proto_threshold=0.05, cluster_threshold=0.15):
    """Turn the joint adjacency matrix into a ``[2, num_edges]`` edge list.

    The threshold applied to an entry depends on its endpoints:
      - prototype / prototype: kept when the weight is > ``proto_threshold``;
      - non-prototype / non-prototype: kept when the weight is > ``cluster_threshold``;
      - prototype / non-prototype: kept when the weight stored in the
        prototype's row is > 0; the reverse edge is always emitted with it.

    Only the stored entries of the sparse matrix are inspected, so no dense
    N x N array is materialised. Pairs with no stored weight (weight 0) are
    never emitted, which matches the dense formulation for any non-negative
    threshold.

    Args:
        adj_matrix: Square adjacency matrix (scipy sparse, or anything
            ``csr_matrix`` accepts).
        prototype_indices: Positions of the prototype nodes.
        proto_threshold: Strict lower bound for prototype-prototype weights.
        cluster_threshold: Strict lower bound for weights between two
            non-prototype nodes.

    Returns:
        ``torch.long`` tensor of shape ``[2, num_edges]`` with edges sorted by
        source node, then by target node.
    """
    coo = csr_matrix(adj_matrix).tocoo()
    coo.sum_duplicates()  # repeated (row, col) entries count once, with summed weight
    rows, cols, weights = coo.row, coo.col, coo.data

    is_proto = np.zeros(coo.shape[0], dtype=bool)
    is_proto[np.asarray(prototype_indices, dtype=int)] = True
    row_is_proto = is_proto[rows]
    col_is_proto = is_proto[cols]

    keep_proto_pair = row_is_proto & col_is_proto & (weights > proto_threshold)
    keep_member_pair = ~row_is_proto & ~col_is_proto & (weights > cluster_threshold)
    # Mixed edges are decided from the prototype's row and then mirrored, so
    # both directions always appear together.
    keep_proto_member = row_is_proto & ~col_is_proto & (weights > 0)

    src = np.concatenate([rows[keep_proto_pair], rows[keep_member_pair],
                          rows[keep_proto_member], cols[keep_proto_member]])
    dst = np.concatenate([cols[keep_proto_pair], cols[keep_member_pair],
                          cols[keep_proto_member], rows[keep_proto_member]])

    # Row-major ordering: primary key src, secondary key dst.
    order = np.lexsort((dst, src))
    edge_index = torch.tensor(np.vstack((src[order], dst[order])), dtype=torch.long)
    print("邻接矩阵转换完成！Edge index 维度:", edge_index.shape)
    return edge_index



In [None]:
# Load the features, labels, splits and node masks
X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_data_DEF(random_state=22)

# Build the joint adjacency matrix (the final node count should equal the number of rows in X, e.g. 30000×30000)
print("开始计算联合邻接矩阵（不增加额外原型节点） ...")
adj_matrix, cluster_assignments, prototype_indices, n_clusters = compute_adjacency_matrix_by_prototypes(
    X_train, X_valid, X_test, y_train, y_valid,
    n_clusters=1000, n_estimators=50, max_depth=None, random_state=42,
    cluster_threshold=0, proto_threshold=0
)

# Convert the joint adjacency matrix to an edge_index tensor
edge_index = adjacency_to_edge_index(adj_matrix, prototype_indices, proto_threshold=0, cluster_threshold=0.4)

### Batch-Based Optimization

In [None]:
import itertools
import random
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv, GATConv, GCNConv  # 增加 GCNConv 的导入
from torch_geometric.utils import k_hop_subgraph

# Compute device: prefer the second GPU, fall back to the first GPU when only
# one is visible ("cuda:1" would be an invalid ordinal there), else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

#########################################
# 1. 定义模型
#########################################
# GraphSAGE model with residual connections, dropout and a per-layer decay on
# the aggregated neighbour signal. Hidden layers keep the width of the layer
# before them (the width factor applied in __init__ is 1).
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, dropout_rate=0.5, agg_decay=1.0):
        """
        Args:
          in_channels: input feature dimension
          hidden_channels: width of the first graph layer
          out_channels: number of output classes
          num_layers: total number of graph layers (at least one is always built)
          dropout_rate: dropout probability
          agg_decay: decay applied to the aggregated neighbour output; layer i
                     (0-based) is scaled by agg_decay ** i
        """
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.residuals = torch.nn.ModuleList()

        width_in, width_out = in_channels, hidden_channels
        for layer_no in range(1 + max(0, num_layers - 1)):
            if layer_no > 0:
                # Layers after the first derive their width from the previous one
                width_out = max(1, int(width_in * (1)))
            self.convs.append(SAGEConv(width_in, width_out))
            # The skip path needs a projection only when the width changes
            self.residuals.append(
                torch.nn.Identity() if width_in == width_out else torch.nn.Linear(width_in, width_out)
            )
            width_in = width_out

        # Classification head on top of the last hidden representation
        self.fc = torch.nn.Linear(width_in, out_channels)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.agg_decay = agg_decay

    def encode(self, x, edge_index):
        """Run every layer: decayed conv output + residual, then ReLU and dropout."""
        for depth, (conv, skip) in enumerate(zip(self.convs, self.residuals)):
            aggregated = conv(x, edge_index)
            shortcut = skip(x)
            scale = self.agg_decay ** depth  # 1 for the first layer, agg_decay for the second, ...
            x = self.dropout(torch.relu(scale * aggregated + shortcut))
        return x

    def forward(self, data):
        """Return class logits for every node of ``data``."""
        return self.fc(self.encode(data.x, data.edge_index))

# GAT model with residual connections, batch norm, dropout and a per-layer
# decay on the aggregated neighbour signal. Each layer after the first uses a
# per-head width of one third of its input width (rounded down, at least 1).
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, dropout_rate=0.5, heads=1, concat=True, agg_decay=1.0):
        """
        Args:
          in_channels: input feature dimension
          hidden_channels: per-head width of the first attention layer
          out_channels: number of output classes
          num_layers: total number of attention layers (at least one is always built)
          dropout_rate: dropout probability (also passed to every GATConv)
          heads: number of attention heads per layer
          concat: passed to GATConv; when true the layer output width is
                  per-head width * heads
          agg_decay: layer i (0-based) scales its conv output by agg_decay ** i
        """
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.residuals = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        width_in, head_width = in_channels, hidden_channels
        for layer_no in range(1 + max(0, num_layers - 1)):
            if layer_no > 0:
                head_width = max(1, int(width_in * (1/3)))
            width_out = head_width * heads if concat else head_width
            self.convs.append(GATConv(width_in, head_width, heads=heads, dropout=dropout_rate, concat=concat))
            self.batch_norms.append(torch.nn.BatchNorm1d(width_out))
            # The skip path needs a projection only when the width changes
            self.residuals.append(
                torch.nn.Identity() if width_in == width_out else torch.nn.Linear(width_in, width_out)
            )
            width_in = width_out

        self.fc = torch.nn.Linear(width_in, out_channels)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.agg_decay = agg_decay

    def encode(self, x, edge_index):
        """Run every layer: decayed conv output + residual, ReLU, batch norm, dropout."""
        for depth, (conv, skip, norm) in enumerate(zip(self.convs, self.residuals, self.batch_norms)):
            aggregated = conv(x, edge_index)
            shortcut = skip(x)
            scale = self.agg_decay ** depth
            x = self.dropout(norm(torch.relu(scale * aggregated + shortcut)))
        return x

    def forward(self, data):
        """Return class logits for every node of ``data``."""
        return self.fc(self.encode(data.x, data.edge_index))

class GCN(torch.nn.Module):
    """GCN with residual connections, batch norm, dropout and per-layer decay.

    Each layer after the first is one third as wide as the layer before it
    (rounded down, at least 1).
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2, dropout_rate=0.5, agg_decay=1.0):
        """
        Args:
          in_channels: input feature dimension
          hidden_channels: width of the first graph layer
          out_channels: number of output classes
          num_layers: total number of graph layers (at least one is always built)
          dropout_rate: dropout probability
          agg_decay: layer i (0-based) scales its conv output by agg_decay ** i
        """
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.residuals = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        width_in, width_out = in_channels, hidden_channels
        for layer_no in range(1 + max(0, num_layers - 1)):
            if layer_no > 0:
                width_out = max(1, int(width_in * (1/3)))
            self.convs.append(GCNConv(width_in, width_out))
            self.batch_norms.append(torch.nn.BatchNorm1d(width_out))
            # The skip path needs a projection only when the width changes
            self.residuals.append(
                torch.nn.Identity() if width_in == width_out else torch.nn.Linear(width_in, width_out)
            )
            width_in = width_out

        self.fc = torch.nn.Linear(width_in, out_channels)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.agg_decay = agg_decay

    def encode(self, x, edge_index):
        """Run every layer: decayed conv output + residual, ReLU, batch norm, dropout."""
        for depth, (conv, skip, norm) in enumerate(zip(self.convs, self.residuals, self.batch_norms)):
            aggregated = conv(x, edge_index)
            shortcut = skip(x)
            scale = self.agg_decay ** depth
            x = self.dropout(norm(torch.relu(scale * aggregated + shortcut)))
        return x

    def forward(self, data):
        """Return class logits for every node of ``data``."""
        return self.fc(self.encode(data.x, data.edge_index))


#########################################
# 2. 定义损失函数
#########################################
# Focal loss (down-weights easy examples; used against class imbalance)
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss built on top of cross-entropy.

    Per sample the loss is ``alpha_t * (1 - p_t) ** gamma * CE`` where ``p_t``
    is the predicted probability of the true class.

    Args:
        gamma: Focusing exponent.
        alpha: Optional class weighting. A list, tuple, ndarray or tensor is
            read as one weight per class and looked up with the target ids;
            a scalar is applied as a uniform factor to every sample.
        reduction: ``"mean"`` averages over samples, ``"none"`` returns the
            per-sample losses, and any other value sums them (as before).
    """

    def __init__(self, gamma=2, alpha=None, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        self.ce = torch.nn.CrossEntropyLoss(reduction="none")

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against class ids ``targets``."""
        ce_loss = self.ce(inputs, targets)
        # p_t must come from the unweighted cross-entropy, so compute it
        # before alpha is applied.
        pt = torch.exp(-ce_loss)
        if self.alpha is not None:
            # Bring alpha onto the logits' device/dtype whatever form it was given in.
            alpha = torch.as_tensor(self.alpha, dtype=inputs.dtype, device=inputs.device)
            at = alpha if alpha.dim() == 0 else alpha.gather(0, targets)
            ce_loss = at * ce_loss
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        if self.reduction == "mean":
            return focal_loss.mean()
        if self.reduction == "none":
            return focal_loss
        return focal_loss.sum()

# Plain multi-view contrastive loss (no label mask is used)
class SupConLoss(torch.nn.Module):
    def __init__(self, temperature=0.07):
        """
        Args:
            temperature: softmax temperature dividing the cosine similarities
        """
        super().__init__()
        self.temperature = temperature

    def forward(self, features):
        """
        Args:
            features: [batch_size, n_views, feature_dim]. Views of the same
                      sample are positives for each other; every other row
                      in the batch is a negative.
        Returns:
            Scalar contrastive (InfoNCE) loss averaged over all views.
        Raises:
            ValueError: if ``features`` has fewer than three dimensions.
        """
        if features.dim() < 3:
            raise ValueError('`features` 需要形状为 [batch_size, n_views, feature_dim]')
        batch_size, n_views, feature_dim = features.shape
        total = batch_size * n_views
        dev = features.device

        # Flatten the views into rows and L2-normalise, so dot products are cosines
        flat = torch.nn.functional.normalize(features.view(total, feature_dim), p=2, dim=1)
        logits = torch.matmul(flat, flat.T) / self.temperature

        # Rows that originate from the same sample share an id
        sample_ids = torch.arange(batch_size, device=dev).repeat_interleave(n_views)
        identity = torch.eye(total, device=dev)
        positives = torch.eq(sample_ids.unsqueeze(1), sample_ids.unsqueeze(0)).float() - identity

        # Softmax denominator over every other row (self-similarity excluded)
        denom = (torch.exp(logits) * (1 - identity)).sum(dim=1, keepdim=True) + 1e-8
        log_prob = logits - torch.log(denom)

        # Mean log-probability of the positives of each anchor
        per_anchor = (positives * log_prob).sum(dim=1) / (positives.sum(dim=1) + 1e-8)
        return (-per_anchor).mean()

#########################################
# 3. 数据增强方法
#########################################
def perturb_features(features, noise_level=0.1):
    """Return ``features`` plus Gaussian noise scaled by ``noise_level`` (an augmented view)."""
    return features + torch.randn_like(features) * noise_level

def augment_node_drop(features, edge_index, drop_prob=0.1):
    """Drop each node with probability ``drop_prob`` together with its edges.

    Dropped nodes keep their row but have their features zeroed; an edge
    survives only when both of its endpoints survive. ``drop_prob`` may be a
    list/tuple, in which case its first element is used.

    Returns:
        ``(features_aug, edge_index_aug)``.
    """
    if isinstance(drop_prob, (list, tuple)):
        drop_prob = float(drop_prob[0])
    survives = torch.rand(features.shape[0], device=features.device) > drop_prob
    masked_features = features * survives.unsqueeze(1).float()
    sources, targets = edge_index
    both_alive = survives[sources] & survives[targets]
    return masked_features, edge_index[:, both_alive]

def augment_edge_drop(features, edge_index, drop_prob=0.1):
    """Remove each edge with probability ``drop_prob``; nodes and features are untouched.

    ``drop_prob`` may be a list/tuple, in which case its first element is used.

    Returns:
        ``(features, edge_index_aug)`` where ``features`` is the input object itself.
    """
    if isinstance(drop_prob, (list, tuple)):
        drop_prob = float(drop_prob[0])
    edge_kept = torch.rand(edge_index.shape[1], device=edge_index.device) > drop_prob
    return features, edge_index[:, edge_kept]

def augment_edge_perturb(features, edge_index, drop_prob=0.1):
    """
    Remove each edge with probability ``drop_prob``, then append as many
    uniformly random edges as were removed, so the edge count is unchanged.

    ``drop_prob`` may be a list/tuple, in which case its first element is used.
    Returns ``(features, edge_index_aug)``; features are not modified.
    """
    if isinstance(drop_prob, (list, tuple)):
        drop_prob = float(drop_prob[0])
    edge_total = edge_index.shape[1]
    node_total = features.shape[0]
    edge_kept = torch.rand(edge_total, device=edge_index.device) > drop_prob
    surviving = edge_index[:, edge_kept]
    removed = edge_total - edge_kept.sum().item()
    if removed <= 0:
        return features, surviving
    # Replacement endpoints are drawn independently over all nodes
    replacements = torch.randint(0, node_total, (2, removed), device=features.device)
    return features, torch.cat([surviving, replacements], dim=1)

def augment_data(data, aug_method="feature", aug_ratio=0.1):
    """
    Apply one augmentation to a graph and return ``(x_aug, edge_index_aug)``.

    ``aug_method`` selects the augmentation: ``"feature"`` adds noise to the
    node features and keeps the structure, while ``"node_drop"``,
    ``"edge_drop"`` and ``"edge_perturb"`` alter the graph structure.
    ``aug_ratio`` is the noise level or the drop probability respectively.
    Any other method name raises ``ValueError``.
    """
    if aug_method == "feature":
        # Only the features change; the edges are passed through unchanged
        return perturb_features(data.x, noise_level=aug_ratio), data.edge_index

    if aug_method == "node_drop":
        structural_augmenter = augment_node_drop
    elif aug_method == "edge_drop":
        structural_augmenter = augment_edge_drop
    elif aug_method == "edge_perturb":
        structural_augmenter = augment_edge_perturb
    else:
        raise ValueError(f"Unknown augmentation method: {aug_method}")
    return structural_augmenter(data.x, data.edge_index, drop_prob=aug_ratio)

#########################################
# 3.1 辅助函数：提取 mini-batch 子图
#########################################
def get_mini_batch_data(data, batch_node_idx, num_hops):
    """
    Extract the ``num_hops``-hop subgraph around ``batch_node_idx`` with relabelled nodes.

    Returns the subgraph as a ``Data`` object (features, labels and whichever
    of train/val/test masks ``data`` carries, all restricted to the
    subgraph's nodes) together with the positions of the original target
    nodes inside that subgraph.
    """
    node_subset, local_edges, seed_positions, _ = k_hop_subgraph(
        node_idx=batch_node_idx, num_hops=num_hops, edge_index=data.edge_index, relabel_nodes=True)
    sub_data = Data(x=data.x[node_subset], edge_index=local_edges, y=data.y[node_subset])
    # Carry over every split mask that the full graph defines
    for mask_name in ('train_mask', 'val_mask', 'test_mask'):
        if hasattr(data, mask_name):
            setattr(sub_data, mask_name, getattr(data, mask_name)[node_subset])
    return sub_data, seed_positions

#########################################
# 4. 训练函数（预训练 + 微调）——mini-batch 版（基于 k_hop_subgraph）
#########################################
def pretrain_model(data, model, optimizer, criterion_contrast, num_epochs=200, aug_method="feature", aug_ratio=0.1, batch_size=64):
    """
    Pre-training stage: optimise the encoder with the contrastive loss only.

    Training nodes are visited in shuffled mini-batches; for each batch the
    k-hop subgraph is extracted (k = number of conv layers), two augmented
    views are encoded and the loss is computed on the batch's target nodes.
    With ``num_epochs == 0`` the stage is skipped. In both cases the model's
    current ``state_dict()`` (i.e. the weights after the last epoch) is returned.
    """
    if num_epochs == 0:
        print("预训练轮次为 0，跳过预训练阶段")
        return model.state_dict()

    train_nodes = data.train_mask.nonzero(as_tuple=False).view(-1).tolist()
    hops = len(model.convs)  # one hop per graph layer

    for epoch_no in range(1, num_epochs + 1):
        model.train()
        loss_sum, seen = 0.0, 0
        for seeds in DataLoader(train_nodes, batch_size=batch_size, shuffle=True):
            seeds = seeds.clone().detach().to(device)
            subgraph, seed_positions = get_mini_batch_data(data, seeds, hops)
            subgraph = subgraph.to(device)
            optimizer.zero_grad()

            # Create both augmented views first, then encode them in the same order
            views = [augment_data(subgraph, aug_method, aug_ratio) for _ in range(2)]
            # Only the embeddings of the batch's target nodes enter the loss
            target_embeddings = [model.encode(x_view, edges_view)[seed_positions]
                                 for x_view, edges_view in views]
            loss = criterion_contrast(torch.stack(target_embeddings, dim=1))
            loss.backward()
            optimizer.step()

            n_seeds = len(seeds)
            loss_sum += loss.item() * n_seeds
            seen += n_seeds
        avg_loss = loss_sum / seen
        print(f"Pretrain Epoch {epoch_no}/{num_epochs}, Contrast Loss: {avg_loss:.4f}")

    return model.state_dict()

def _snapshot_state(model):
    """Return a copy of ``model.state_dict()`` that later optimizer steps cannot change."""
    return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}


def fine_tune_model(data, model, optimizer, criterion_focal, num_epochs=50, batch_size=64):
    """
    Fine-tuning stage: train with the classification (focal) loss only.

    Training nodes are visited in shuffled mini-batches; for each batch the
    k-hop subgraph is extracted (k = number of conv layers) and the loss is
    computed on the batch's target nodes. After every epoch the model is
    evaluated on the full graph; the weights with the highest validation
    accuracy are kept. Test accuracy is only printed (every 10 epochs) and
    never used for model selection.

    Returns:
        A state dict holding a *copy* of the best weights. The copy matters:
        ``model.state_dict()`` returns references to the live parameters, so
        storing it directly would silently track the final weights instead.
        When no epoch is run, a copy of the current weights is returned.
    """
    # Start below any reachable accuracy so the first evaluated epoch is always recorded
    best_val_acc = float("-inf")
    best_model_state = None
    train_idx = data.train_mask.nonzero(as_tuple=False).view(-1).tolist()
    num_hops = len(model.convs)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        count = 0
        loader = DataLoader(train_idx, batch_size=batch_size, shuffle=True)
        for batch in loader:
            batch = batch.clone().detach().to(device)
            sub_data, mapping = get_mini_batch_data(data, batch, num_hops)
            sub_data = sub_data.to(device)
            optimizer.zero_grad()
            out = model(sub_data)
            # Loss on the batch's target nodes only, not on their neighbours
            loss = criterion_focal(out[mapping], sub_data.y[mapping])
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * len(batch)
            count += len(batch)
        # No training nodes means no batches; report NaN instead of dividing by zero
        avg_loss = total_loss / count if count else float("nan")

        # Full-graph evaluation on the validation and test nodes
        model.eval()
        with torch.no_grad():
            out_full = model(data)
            preds_val = out_full[data.val_mask].argmax(dim=1)
            true_val = data.y[data.val_mask]
            val_acc = accuracy_score(true_val.cpu(), preds_val.cpu())

            preds_test = out_full[data.test_mask].argmax(dim=1)
            true_test = data.y[data.test_mask]
            test_acc = accuracy_score(true_test.cpu(), preds_test.cpu())

        if (epoch + 1) % 10 == 0:
            print(f"Fine-tune Epoch {epoch+1}/{num_epochs}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}, Focal Loss: {avg_loss:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = _snapshot_state(model)

    if best_model_state is None:
        # No epoch was run: hand back the current weights rather than None
        best_model_state = _snapshot_state(model)
    return best_model_state


def two_stage_train_model(data, model, optimizer, optimizer_ft, criterion_focal, criterion_contrast,
                          pretrain_epochs, finetune_epochs, aug_method="feature", aug_ratio=0.1,
                          batch_size=64):
    """
    Two-stage training on mini-batches:
      stage 1: contrastive pre-training with ``optimizer`` (skipped unless
               ``pretrain_epochs`` is greater than 0);
      stage 2: fine-tuning with the classification loss and ``optimizer_ft``.

    Returns whatever the fine-tuning stage returns as its best model state.
    """
    run_pretraining = pretrain_epochs > 0
    if run_pretraining:
        print("========== 开始预训练阶段 ==========")
        pretrained_weights = pretrain_model(
            data, model, optimizer, criterion_contrast,
            num_epochs=pretrain_epochs,
            aug_method=aug_method, aug_ratio=aug_ratio,
            batch_size=batch_size,
        )
        model.load_state_dict(pretrained_weights)
    else:
        print("========== 跳过预训练阶段 ==========")

    print("========== 开始微调阶段 ==========")
    return fine_tune_model(
        data, model, optimizer_ft, criterion_focal,
        num_epochs=finetune_epochs, batch_size=batch_size,
    )

#########################################
# 5. 封装随机采样超参数组合的函数
#########################################
def get_continuous_candidates(start, stop, step, decimals):
    """
    Return the values ``start, start + step, ...`` up to and including ``stop``,
    each rounded to ``decimals`` decimal places.

    The number of steps is computed in floating point, where a quotient such
    as ``0.3 / 0.1`` evaluates to ``2.9999999999999996``; truncating that
    would silently drop the endpoint. The quotient is therefore snapped to
    the nearest integer when it lies within 1e-9 of it.
    """
    ratio = (stop - start) / step
    nearest = round(ratio)
    if abs(ratio - nearest) < 1e-9:
        ratio = nearest
    num_steps = int(ratio) + 1
    return [round(start + i * step, decimals) for i in range(num_steps)]

def get_random_hyperparameter_combinations(n_iter):
    """
    Draw ``n_iter`` random hyper-parameter combinations.

    Every combination is a tuple in this order:
      (threshold, random_state, num_layers, hidden_channels, finetune_lr, pretrain_lr,
       gamma, alpha_value, aug_method, aug_ratio, pretrain_epochs, temperature,
       model_type, dropout_rate, agg_decay)
    Each value is picked independently with ``random.choice`` from its candidate list.
    """
    search_space = {
        # Discrete candidates
        'random_state': [3333],
        'num_layers': [2],
        'hidden_channels': [1024],
        'finetune_lr': [0.001],
        'pretrain_lr': [0.001, 0.0001, 0.00001],
        'aug_method': ["feature", "node_drop", "edge_drop", "edge_perturb"],
        'pretrain_epochs': [0],  # 0 is allowed and means "skip pre-training"
        'model_type': ["GCN"],  # other supported values: "GraphSAGE", "GAT"
        'dropout_rate': [0.6],
        # Continuous candidates (fixed values or evenly spaced grids)
        'threshold': [0.8],
        'gamma': [3],
        'alpha_value': [0.25],
        'aug_ratio': get_continuous_candidates(0.05, 0.25, 0.01, 2),
        'temperature': get_continuous_candidates(0.05, 0.1, 0.01, 2),
        'agg_decay': [0.3],  # decay of the aggregated neighbour signal
    }

    # Order in which values are drawn; it determines how the random stream is
    # consumed, so it is kept separate from the order of the output tuple.
    draw_order = (
        'threshold', 'gamma', 'alpha_value', 'aug_ratio', 'temperature', 'agg_decay',
        'random_state', 'num_layers', 'hidden_channels', 'finetune_lr', 'pretrain_lr',
        'aug_method', 'pretrain_epochs', 'model_type', 'dropout_rate',
    )
    tuple_order = (
        'threshold',        # edge threshold
        'random_state',     # random seed
        'num_layers',       # number of graph layers
        'hidden_channels',  # width of the first hidden layer
        'finetune_lr',      # fine-tuning learning rate
        'pretrain_lr',      # pre-training learning rate
        'gamma',            # focal-loss gamma
        'alpha_value',      # focal-loss alpha
        'aug_method',       # augmentation method
        'aug_ratio',        # augmentation strength
        'pretrain_epochs',  # pre-training epochs (0 skips pre-training)
        'temperature',      # contrastive temperature
        'model_type',       # model architecture
        'dropout_rate',     # dropout probability
        'agg_decay',        # neighbour-aggregation decay factor
    )

    combinations = []
    for _ in range(n_iter):
        drawn = {name: random.choice(search_space[name]) for name in draw_order}
        combinations.append(tuple(drawn[name] for name in tuple_order))
    return combinations

#########################################
# 6. 随机搜索超参数并评估模型
#########################################
def grid_search(X, y, train_mask, valid_mask, test_mask, n_iter):
    """Random-search hyperparameters and return the best configuration found.

    Despite the name, this does not walk a grid: it asks
    ``get_random_hyperparameter_combinations`` for ``n_iter`` sampled
    combinations, and for each one builds the edge index, constructs the
    requested GNN, trains it with ``two_stage_train_model`` and scores the
    returned weights.

    The winning combination is chosen by accuracy on the validation nodes
    (``valid_mask``). Test accuracy is still computed and printed for every
    combination, but it is not used for selection, so the held-out test split
    does not leak into the hyperparameter choice.

    Relies on the module-level names ``adj_matrix``, ``prototype_indices`` and
    ``device`` being initialised before the call.

    Args:
        X: Feature table; ``X.values`` is converted to a float tensor and
            ``X.shape[1]`` is used as the model input width.
        y: Labels; ``y.values`` is converted to a long tensor and the number
            of distinct values is used as the model output width.
        train_mask: Node mask stored on the graph as ``train_mask``.
        valid_mask: Node mask stored on the graph as ``val_mask``; used here
            for model selection.
        test_mask: Node mask stored on the graph as ``test_mask``; used here
            for reporting only.
        n_iter: Number of random hyperparameter combinations to evaluate.

    Returns:
        A ``(best_overall_params, best_overall_model_state)`` tuple. The
        params are the 15-element hyperparameter tuple of the winning
        combination and the state is what ``two_stage_train_model`` returned
        for it. Both are ``None`` if no combination was recorded (for
        example when no combinations were generated).

    Raises:
        ValueError: If a sampled ``model_type`` is not one of
            ``"GraphSAGE"``, ``"GAT"`` or ``"GCN"``.
    """
    # Start below any reachable accuracy so the first scored combination is
    # always recorded, even if its accuracy is exactly 0.0.
    best_val_acc = float("-inf")
    best_overall_model_state = None
    best_overall_params = None

    # Model dimensions do not depend on the sampled hyperparameters.
    in_channels = X.shape[1]
    out_channels = len(np.unique(y))

    print("Start random search with {} combinations...".format(n_iter))
    hyperparam_combos = get_random_hyperparameter_combinations(n_iter)
    for i, (cluster_threshold, random_state, num_layers, hidden_channels, finetune_lr,
            pretrain_lr, gamma, alpha_value, aug_method, aug_ratio, pretrain_epochs, temperature,
            model_type, dropout_rate, agg_decay) in enumerate(hyperparam_combos):
        # NOTE(review): random_state is sampled, logged and returned, but nothing
        # in this function seeds an RNG with it - confirm whether seeding was intended.
        print(f"\nTesting combination {i+1}: cluster_threshold={cluster_threshold:.4f}, random_state={random_state}, "
              f"layers={num_layers}, hidden_channels={hidden_channels}, finetune_lr={finetune_lr}, "
              f"pretrain_lr={pretrain_lr}, gamma={gamma:.4f}, alpha={alpha_value:.4f}, aug_method={aug_method}, "
              f"aug_ratio={aug_ratio:.4f}, pretrain_epochs={pretrain_epochs}, temperature={temperature:.4f}, "
              f"model_type={model_type}, dropout_rate={dropout_rate:.4f}, agg_decay={agg_decay:.4f}")

        # The graph depends on the sampled cluster threshold, so it is rebuilt
        # for every combination from the global adjacency matrix and prototypes.
        edge_index = adjacency_to_edge_index(adj_matrix, prototype_indices, proto_threshold=0, cluster_threshold=cluster_threshold).to(device)

        # Fresh tensors per combination so no run can see another run's data object.
        X_tensor = torch.tensor(X.values, dtype=torch.float)
        y_tensor = torch.tensor(y.values, dtype=torch.long)
        data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index,
                    train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask).to(device)

        # All three architectures take the same constructor arguments.
        model_kwargs = dict(in_channels=in_channels, hidden_channels=hidden_channels,
                            out_channels=out_channels, num_layers=num_layers,
                            dropout_rate=dropout_rate, agg_decay=agg_decay)
        if model_type == "GraphSAGE":
            model = GraphSAGE(**model_kwargs).to(device)
        elif model_type == "GAT":
            model = GAT(**model_kwargs).to(device)
        elif model_type == "GCN":
            model = GCN(**model_kwargs).to(device)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

        # Separate optimizers (and learning rates) for the pretraining and fine-tuning stages.
        optimizer = torch.optim.AdamW(model.parameters(), lr=pretrain_lr, weight_decay=1e-3)
        optimizer_ft = torch.optim.AdamW(model.parameters(), lr=finetune_lr, weight_decay=1e-3)

        # Focal-loss class weights [1 - alpha, alpha].
        # NOTE(review): the two-entry layout presumes binary labels - confirm.
        alpha_list = [1 - alpha_value, alpha_value]
        alpha_tensor = torch.tensor(alpha_list, dtype=torch.float).to(device)
        criterion_focal = FocalLoss(gamma=gamma, alpha=alpha_tensor, reduction="mean")
        criterion_contrast = SupConLoss(temperature=temperature)

        best_model_epoch = two_stage_train_model(data, model, optimizer, optimizer_ft, criterion_focal,
                                                 criterion_contrast, pretrain_epochs=pretrain_epochs, finetune_epochs=500,
                                                 aug_method=aug_method, aug_ratio=aug_ratio, batch_size=30000)

        # Score the weights returned by training with a single forward pass.
        model.load_state_dict(best_model_epoch)
        model.eval()
        with torch.no_grad():
            logits = model(data)
            val_preds = logits[data.val_mask].argmax(dim=1)
            val_acc = accuracy_score(data.y[data.val_mask].cpu(), val_preds.cpu())
            preds = logits[data.test_mask].argmax(dim=1)
            test_acc = accuracy_score(data.y[data.test_mask].cpu(), preds.cpu())
        print(f"Validation Accuracy for current combination: {val_acc:.4f}")
        print(f"Test Accuracy for current combination: {test_acc:.4f}")

        # Select on validation accuracy only; the test score above is informational.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_overall_model_state = best_model_epoch
            best_overall_params = (cluster_threshold, random_state, num_layers, hidden_channels,
                                   finetune_lr, pretrain_lr, gamma, alpha_value, aug_method, aug_ratio,
                                   pretrain_epochs, temperature, model_type, dropout_rate, agg_decay)

    return best_overall_params, best_overall_model_state

#########################################
# 7. Main program: load data, random-search hyperparameters, load the best model and evaluate
#########################################
# Assumes X, y, train_mask, valid_mask and test_mask were loaded earlier, and that the
# globals adj_matrix and prototype_indices have already been initialised.
# n_iter=1 evaluates a single randomly sampled hyperparameter combination.
best_params, best_model_state = grid_search(X, y, train_mask, valid_mask, test_mask, n_iter=1)
print("\nBest Hyperparameters:", best_params)





In [None]:
# Baseline: fit a random forest with default settings on the training split.
rf_model = RandomForestClassifier().fit(X_train, y_train)

# Hard label predictions for the test split.
y_pred_rf = rf_model.predict(X_test)

# Print the header and the classification report (4 decimal places), one per line.
print(
    "Test Set Classification Report:",
    classification_report(y_test, y_pred_rf, digits=4),
    sep="\n",
)


# Re-index the raw predictions like y_test so both can be compared element-wise.
y_pred_rf_series = pd.Series(y_pred_rf, index=y_test.index)


def _confusion_cell_indices(actual_label, predicted_label):
    """Return the y_test index labels whose true and predicted labels equal the given pair."""
    in_cell = (y_test == actual_label) & (y_pred_rf_series == predicted_label)
    return y_test[in_cell].index.tolist()


# Index labels falling into each cell of the binary confusion matrix.
tp_idx = _confusion_cell_indices(1, 1)
tn_idx = _confusion_cell_indices(0, 0)
fp_idx = _confusion_cell_indices(0, 1)
fn_idx = _confusion_cell_indices(1, 0)

print("True Positives (1被分为1):", tp_idx)
print("True Negatives (0被分为0):", tn_idx)
print("False Positives (0被分为1):", fp_idx)
print("False Negatives (1被分为0):", fn_idx)


import json

# Bundle the four index lists into one list, ordered TP, TN, FP, FN.
error_idx_list = [tp_idx, tn_idx, fp_idx, fn_idx]

# Name of the output file.
filename = "DEF-RFGNN.json"

# Serialise with 4-space indentation and write the text to disk.
with open(filename, "w") as f:
    f.write(json.dumps(error_idx_list, indent=4))

print(f"索引集合已保存至 {filename}")


# Misclassified samples only (FP and FN), keyed by "[true,predicted]".
error_idx = {}
error_idx["[0,1]"] = fp_idx  # truly 0 but predicted 1
error_idx["[1,0]"] = fn_idx  # truly 1 but predicted 0
print("错误样本的索引集合:", error_idx)

# Read the saved JSON file back and parse it.
with open(filename, "r") as f:
    loaded_error_idx_list = json.loads(f.read())

print("加载的索引集合:", loaded_error_idx_list)
