In [11]:
# util function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix
import torch

def standard_input(X):
    # 标准化输入
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return pd.DataFrame(X_scaled, columns=X.columns)

def load_data_SGER_RAW(random_state=42):
    # 读取以空格分隔的SGER CSV文件
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    # 确保 'kredit' 列存在
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None, None, None, None, None
    # 目标变量和特征
    y = df['kredit']
    X = df.drop(columns=['kredit'])
    # 划分训练集、验证集和测试集
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=random_state, stratify=y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp)
    # 计算节点数并创建 mask
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    # 获取索引并设置 mask
    train_mask[X_train.index] = True
    val_mask[X_valid.index] = True
    test_mask[X_test.index] = True
    # 标准化输入
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test = standard_input(X_test)
    return X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask

X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask = load_data_SGER_RAW()
X.shape


(1000, 20)

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import copy
import itertools
import random
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# ----------------------------
# 1. 构造数据集
# 请确保 load_data_SGER_RAW() 已定义，返回以下变量：
# X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask
X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask = load_data_SGER_RAW()

# ----------------------------
# 2. 定义残差网络模型
# ----------------------------
class ResidualBlock(nn.Module):
    def __init__(self, in_dim, out_dim, dropout_rate):
        """
        一个残差块，计算：
            out = dropout( ReLU( Linear(x) ) )
        并添加（可能经过投影）的残差连接。
        """
        super(ResidualBlock, self).__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        self.dropout = nn.Dropout(dropout_rate)
        # 当输入输出维度不同时，对输入做投影
        if in_dim != out_dim:
            self.residual_transform = nn.Linear(in_dim, out_dim)
        else:
            self.residual_transform = None

    def forward(self, x):
        out = F.relu(self.linear(x))
        out = self.dropout(out)
        if self.residual_transform is not None:
            residual = self.residual_transform(x)
        else:
            residual = x
        out = out + residual
        return out

class ResidualMLP(nn.Module):
    def __init__(self, input_dim, hidden_size, num_layers, dropout_rate):
        """
        Args:
            input_dim: 输入特征数。
            hidden_size: 第一层隐藏层神经元个数。
            num_layers: 总层数，包括输入层与后续残差块。（例如 num_layers=4 表示输入层+3 个残差块）
            dropout_rate: 每个块的 dropout 概率。
        """
        super(ResidualMLP, self).__init__()
        # 输入层：投影 input_dim -> hidden_size
        self.input_layer = nn.Linear(input_dim, hidden_size)
        
        # 构造若干个残差块，每个块使隐藏层维度降低 3/4
        self.hidden_blocks = nn.ModuleList()
        current_dim = hidden_size
        for i in range(num_layers - 1):  # 输入层已计入，总共构造 num_layers-1 个块
            new_dim = max(1, int(current_dim * 0.6))
            block = ResidualBlock(in_dim=current_dim, out_dim=new_dim, dropout_rate=dropout_rate)
            self.hidden_blocks.append(block)
            current_dim = new_dim
        
        # 输出层：从最后一个隐藏层维度投影到 1
        self.output_layer = nn.Linear(current_dim, 1)

    def forward(self, x):
        out = F.relu(self.input_layer(x))
        for block in self.hidden_blocks:
            out = block(out)
        out = torch.sigmoid(self.output_layer(out))
        return out

    def get_hidden_representation(self, x):
        """
        返回经过输入层和残差块后的隐藏层表示（即输出层之前的结果）。
        """
        out = F.relu(self.input_layer(x))
        for block in self.hidden_blocks:
            out = block(out)
        return out

# 定义分类器类，包含训练、评估、预测及不确定性估计等方法
class BinaryClassifier:
    def __init__(self, input_dim, hidden_size=64, num_layers=3, dropout_rate=0.5, learning_rate=0.001):
        self.model = ResidualMLP(input_dim, hidden_size, num_layers, dropout_rate)
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def train(self, X_train, y_train, X_valid, y_valid, epochs=50, batch_size=32):
        train_dataset = TensorDataset(torch.tensor(X_train.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1))
        valid_dataset = TensorDataset(torch.tensor(X_valid.values, dtype=torch.float32), 
                                      torch.tensor(y_valid.values, dtype=torch.float32).unsqueeze(1))
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
        
        best_f1 = 0.0
        best_model_state = None
        best_epoch = 0

        for epoch in range(epochs):
            self.model.train()
            train_loss = 0.0
            for X_batch, y_batch in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()

            valid_loss = self.evaluate(valid_loader)

            # 计算验证集上的 F1 得分
            self.model.eval()
            val_preds = []
            val_targets = []
            with torch.no_grad():
                for X_batch, y_batch in valid_loader:
                    outputs = self.model(X_batch)
                    preds = (outputs.squeeze() > 0.5).int()
                    val_preds.extend(preds.cpu().numpy())
                    val_targets.extend(y_batch.squeeze().cpu().numpy())
            val_f1 = f1_score(val_targets, val_preds)

            if val_f1 > best_f1:
                best_f1 = val_f1
                best_epoch = epoch
                best_model_state = copy.deepcopy(self.model.state_dict())

            # 可根据需要打印每个 epoch 的训练过程，这里注释掉了
            # print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
            #       f"Validation Loss: {valid_loss:.4f}, Validation F1: {val_f1:.4f}")

        # 恢复验证集上 F1 最佳的模型参数
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)
            print(f"Loaded best model from epoch {best_epoch+1} with Validation F1: {best_f1:.4f}")

    def evaluate(self, data_loader):
        self.model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in data_loader:
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                total_loss += loss.item()
        return total_loss / len(data_loader)

    def save_model(self, path):
        torch.save(self.model.state_dict(), path)

    def load_model(self, path):
        self.model.load_state_dict(torch.load(path))
        self.model.eval()

    def predict(self, X_data):
        self.model.eval()
        X_tensor = torch.tensor(X_data.values, dtype=torch.float32)
        with torch.no_grad():
            outputs = self.model(X_tensor)
        predicted_classes = (outputs.squeeze() > 0.5).int()
        return predicted_classes

    def predict_with_uncertainty(self, X_sample, n_iter=100):
        # 启用 dropout 进行不确定性估计
        self.model.train()
        X_sample = torch.tensor(X_sample.values, dtype=torch.float32)
        predictions = torch.zeros(n_iter, X_sample.size(0))

        for i in range(n_iter):
            outputs = self.model(X_sample)
            predictions[i] = outputs.squeeze()

        mean_prediction = predictions.mean(dim=0)
        uncertainty = predictions.std(dim=0)
        predicted_classes = (mean_prediction > 0.5).int()

        return predicted_classes, uncertainty

    import torch
    import torch.nn.functional as F
    
    def compute_similarity_and_uncertainty(self, X_data, n_iter=30, device='cuda'):
        self.model.to(device)  # 确保模型在 GPU 上
        self.model.eval()  # 进入评估模式
    
        X_tensor = torch.tensor(X_data.values, dtype=torch.float32, device=device)
    
        # 计算隐藏层表示
        with torch.no_grad():
            hidden_vectors = self.model.get_hidden_representation(X_tensor)
    
        similarity_matrix = torch.mm(hidden_vectors, hidden_vectors.t())  # 计算余弦相似度
    
        # 启用 dropout 进行不确定性估计
        self.model.train()
    
        # 初始化预测结果张量，形状为 (n_iter, n_samples)
        predictions = torch.zeros(n_iter, X_tensor.size(0), device=device)
    
        for i in range(n_iter):
            outputs = self.model(X_tensor)
            predictions[i] = outputs.squeeze()
    
        # 计算均值和标准差
        mean_prediction = predictions.mean(dim=0)
        uncertainty = predictions.std(dim=0)
    
        # 计算预测类别
        predicted_classes = (mean_prediction > 0.5).int()
    
        # 计算不确定性矩阵（优化，不使用循环）
        pred_match = (predicted_classes.unsqueeze(1) == predicted_classes.unsqueeze(0)).float()
        uncertainty_matrix = pred_match * (uncertainty.unsqueeze(1) + uncertainty.unsqueeze(0))
        
        # 归一化similarity_matrix中的值，确保相似度时范围在 [0,1]
        # 计算 similarity_matrix 的最小值和最大值
        sim_min = similarity_matrix.min()
        sim_max = similarity_matrix.max()
        
        # 归一化到 [0,1]
        similarity_matrix = (similarity_matrix - sim_min) / (sim_max - sim_min + 1e-8)  # 避免除零

        return similarity_matrix, uncertainty_matrix



# ----------------------------
# 3. 随机挑选参数组合进行 Grid Search
# ----------------------------
# 定义超参数空间
param_grid = {
    'hidden_size': list(range(64, 513, 32)),
    'num_layers': [2, 3, 4, 5],
    'dropout_rate': [0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6],
    'learning_rate': [0.01, 0.001, 0.005, 0.0005]
}

# 生成所有可能的参数组合
all_combinations = list(itertools.product(param_grid['hidden_size'],
                                          param_grid['num_layers'],
                                          param_grid['dropout_rate'],
                                          param_grid['learning_rate']))
print(f"共有 {len(all_combinations)} 种参数组合。")

# 随机挑选部分组合进行尝试，比如随机挑选 100 个组合
n_iter = 10
selected_combinations = random.sample(all_combinations, n_iter)
print("随机挑选的参数组合为：{}个。".format(len(selected_combinations)))
for combo in selected_combinations:
    print(f"  hidden_size={combo[0]}, num_layers={combo[1]}, dropout_rate={combo[2]}, learning_rate={combo[3]}")

grid_search_results = []

# 增加打印正在训练第多少个参数组合
for idx, (hidden_size, num_layers, dropout_rate, learning_rate) in enumerate(selected_combinations):
    print("==========================================")
    print(f"Training combination {idx+1}/{len(selected_combinations)}: hidden_size={hidden_size}, num_layers={num_layers}, dropout_rate={dropout_rate}, learning_rate={learning_rate}")
    
    # 根据当前参数组合初始化分类器
    classifier = BinaryClassifier(
        input_dim=X.shape[1],
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        learning_rate=learning_rate
    )
    
    # 训练模型（例如 30 个 epochs，batch_size 为 64）
    classifier.train(X_train, y_train, X_valid, y_valid, epochs=30, batch_size=64)
    
    # 在验证集上计算 F1 分数
    val_preds = classifier.predict(X_valid)
    val_f1 = f1_score(y_valid, val_preds)
    print(f"Validation F1 for this model: {val_f1:.4f}")
    
    grid_search_results.append({
        'model': copy.deepcopy(classifier),  # 保存该模型
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'dropout_rate': dropout_rate,
        'learning_rate': learning_rate,
        'val_f1': val_f1
    })

# 选择验证集 F1 分数最高的模型
best_result = max(grid_search_results, key=lambda x: x['val_f1'])
best_classifier = best_result['model']

print("==========================================")
print("Best Hyperparameters:")
print(f"  hidden_size: {best_result['hidden_size']}")
print(f"  num_layers: {best_result['num_layers']}")
print(f"  dropout_rate: {best_result['dropout_rate']}")
print(f"  learning_rate: {best_result['learning_rate']}")
print(f"Best Validation F1 Score: {best_result['val_f1']:.4f}")

# ----------------------------
# 4. 在测试集上评估最佳模型
# ----------------------------
test_predictions = best_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)

print("--------- Test Metrics ---------")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Recall:   {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision:{test_precision:.4f}")

print("--------- Classification Report ---------")
print(classification_report(y_test, test_predictions))

# ----------------------------
# 5. 示例：对单个样本进行预测（并估计不确定性）
# ----------------------------
sample = X_test.iloc[[0]]
print("Sample to Predict:")
print(sample)

predicted_class, uncertainty = best_classifier.predict_with_uncertainty(sample)
print("Predicted Class:", predicted_class.item())
print("Uncertainty:", uncertainty.item())


共有 1680 种参数组合。
随机挑选的参数组合为：10个。
  hidden_size=192, num_layers=3, dropout_rate=0.3, learning_rate=0.0005
  hidden_size=64, num_layers=4, dropout_rate=0.35, learning_rate=0.01
  hidden_size=160, num_layers=3, dropout_rate=0.45, learning_rate=0.0005
  hidden_size=256, num_layers=5, dropout_rate=0.4, learning_rate=0.005
  hidden_size=384, num_layers=3, dropout_rate=0.5, learning_rate=0.01
  hidden_size=288, num_layers=4, dropout_rate=0.6, learning_rate=0.0005
  hidden_size=512, num_layers=5, dropout_rate=0.45, learning_rate=0.0005
  hidden_size=352, num_layers=4, dropout_rate=0.6, learning_rate=0.01
  hidden_size=416, num_layers=4, dropout_rate=0.2, learning_rate=0.005
  hidden_size=384, num_layers=5, dropout_rate=0.4, learning_rate=0.001
Training combination 1/10: hidden_size=192, num_layers=3, dropout_rate=0.3, learning_rate=0.0005
Loaded best model from epoch 8 with Validation F1: 0.6316
Validation F1 for this model: 0.6316
Training combination 2/10: hidden_size=64, num_layers=4, dropout

In [25]:
# Predict a single sample

sample = X_test.iloc[[0]]
print("Sample to Predict:", sample)

predicted_classes, uncertainty = classifier.predict_with_uncertainty(sample)

print("Predicted Class:", predicted_classes.item())
print("Uncertainty:", uncertainty.item())

# Compute similarity and uncertainty matrices
similarity_matrix, uncertainty_matrix = classifier.compute_similarity_and_uncertainty(pd.concat([X_train, X_valid, X_test]))
print("Similarity Matrix:", similarity_matrix)
print("Uncertainty Matrix:", uncertainty_matrix)


Sample to Predict:    laufkont  laufzeit     moral     verw     hoehe  sparkont   beszeit  \
0 -0.490296 -1.294443 -0.579758  0.66335 -1.009094 -0.721009 -1.067797   

       rate    famges    buerge  wohnzeit      verm     alter  weitkred  \
0  0.008981  1.724882 -0.320641 -1.585393 -0.357607 -1.099408  0.444788   

       wohn  bishkred     beruf      pers    telef   gastarb  
0  0.135954  -0.66218 -1.408326  0.377964 -0.84226  0.175863  
Predicted Class: 0
Uncertainty: 0.04254193231463432
Similarity Matrix: tensor([[0.1640, 0.1559, 0.1615,  ..., 0.1613, 0.1577, 0.1577],
        [0.1559, 0.1612, 0.1571,  ..., 0.1518, 0.1639, 0.1567],
        [0.1615, 0.1571, 0.1652,  ..., 0.1587, 0.1600, 0.1598],
        ...,
        [0.1613, 0.1518, 0.1587,  ..., 0.1665, 0.1485, 0.1555],
        [0.1577, 0.1639, 0.1600,  ..., 0.1485, 0.1723, 0.1589],
        [0.1577, 0.1567, 0.1598,  ..., 0.1555, 0.1589, 0.1603]],
       device='cuda:0')
Uncertainty Matrix: tensor([[0.0718, 0.0000, 0.0807,  ..., 0.0

In [26]:
# 计算邻接矩阵
adj_matrix = classifier.compute_adjacency_matrix(similarity_matrix, uncertainty_matrix, threshold=0.8, epsilon=0.3)

# 打印邻接矩阵形状
print("Adjacency matrix shape:", adj_matrix.shape)

# 计算并打印非零元素个数
num_nonzero = torch.count_nonzero(adj_matrix).item()
print("Number of nonzero elements:", num_nonzero)
similarity_matrix

AttributeError: 'BinaryClassifier' object has no attribute 'compute_adjacency_matrix'

### GPU implement


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import copy
import itertools
import random
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# ----------------------------
# 1. 构造数据集
# 请确保 load_data_SGER_RAW() 已定义，返回以下变量：
# X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask
X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask = load_data_SGER_RAW()

# ----------------------------
# 2. 定义残差网络模型
# ----------------------------
class ResidualBlock(nn.Module):
    def __init__(self, in_dim, out_dim, dropout_rate):
        """
        一个残差块，计算：
            out = dropout( ReLU( Linear(x) ) )
        并添加（可能经过投影）的残差连接。
        """
        super(ResidualBlock, self).__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        self.dropout = nn.Dropout(dropout_rate)
        # 当输入输出维度不同时，对输入做投影
        if in_dim != out_dim:
            self.residual_transform = nn.Linear(in_dim, out_dim)
        else:
            self.residual_transform = None

    def forward(self, x):
        out = F.relu(self.linear(x))
        out = self.dropout(out)
        if self.residual_transform is not None:
            residual = self.residual_transform(x)
        else:
            residual = x
        out = out + residual
        return out

class ResidualMLP(nn.Module):
    def __init__(self, input_dim, hidden_size, num_layers, dropout_rate):
        """
        Args:
            input_dim: 输入特征数。
            hidden_size: 第一层隐藏层神经元个数。
            num_layers: 总层数，包括输入层与后续残差块。（例如 num_layers=4 表示输入层+3 个残差块）
            dropout_rate: 每个块的 dropout 概率。
        """
        super(ResidualMLP, self).__init__()
        # 输入层：投影 input_dim -> hidden_size
        self.input_layer = nn.Linear(input_dim, hidden_size)
        
        # 构造若干个残差块，每个块使隐藏层维度降低 3/4
        self.hidden_blocks = nn.ModuleList()
        current_dim = hidden_size
        for i in range(num_layers - 1):  # 输入层已计入，总共构造 num_layers-1 个块
            new_dim = max(1, int(current_dim * 0.75))
            block = ResidualBlock(in_dim=current_dim, out_dim=new_dim, dropout_rate=dropout_rate)
            self.hidden_blocks.append(block)
            current_dim = new_dim
        
        # 输出层：从最后一个隐藏层维度投影到 1
        self.output_layer = nn.Linear(current_dim, 1)

    def forward(self, x):
        out = F.relu(self.input_layer(x))
        for block in self.hidden_blocks:
            out = block(out)
        out = torch.sigmoid(self.output_layer(out))
        return out

    def get_hidden_representation(self, x):
        """
        返回经过输入层和残差块后的隐藏层表示（即输出层之前的结果）。
        """
        out = F.relu(self.input_layer(x))
        for block in self.hidden_blocks:
            out = block(out)
        return out

# 定义分类器类，包含训练、评估、预测及不确定性估计等方法
class BinaryClassifier:
    def __init__(self, input_dim, hidden_size=64, num_layers=3, dropout_rate=0.5, learning_rate=0.001):
        # 自动检测是否有GPU可用
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = ResidualMLP(input_dim, hidden_size, num_layers, dropout_rate).to(self.device)
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def train(self, X_train, y_train, X_valid, y_valid, epochs=50, batch_size=32):
        train_dataset = TensorDataset(torch.tensor(X_train.values, dtype=torch.float32),
                                      torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1))
        valid_dataset = TensorDataset(torch.tensor(X_valid.values, dtype=torch.float32),
                                      torch.tensor(y_valid.values, dtype=torch.float32).unsqueeze(1))
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
        
        best_acc = 0.0
        best_model_state = None
        best_epoch = 0

        for epoch in range(epochs):
            self.model.train()
            train_loss = 0.0
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)
                self.optimizer.zero_grad()
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()

            valid_loss = self.evaluate(valid_loader)

            # 计算验证集上的 Accuracy
            self.model.eval()
            val_preds = []
            val_targets = []
            with torch.no_grad():
                for X_batch, y_batch in valid_loader:
                    X_batch = X_batch.to(self.device)
                    y_batch = y_batch.to(self.device)
                    outputs = self.model(X_batch)
                    preds = (outputs.squeeze() > 0.5).int()
                    val_preds.extend(preds.cpu().numpy())
                    val_targets.extend(y_batch.squeeze().cpu().numpy())
            val_acc = accuracy_score(val_targets, val_preds)

            if val_acc > best_acc:
                best_acc = val_acc
                best_epoch = epoch
                best_model_state = copy.deepcopy(self.model.state_dict())

            # 可根据需要打印每个 epoch 的训练过程
            # print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
            #       f"Validation Loss: {valid_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

        # 恢复验证集上 Accuracy 最佳的模型参数
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)
            print(f"Loaded best model from epoch {best_epoch+1} with Validation Accuracy: {best_acc:.4f}")

    def evaluate(self, data_loader):
        self.model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in data_loader:
                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)
                outputs = self.model(X_batch)
                loss = self.criterion(outputs, y_batch)
                total_loss += loss.item()
        return total_loss / len(data_loader)

    def save_model(self, path):
        torch.save(self.model.state_dict(), path)

    def load_model(self, path):
        self.model.load_state_dict(torch.load(path))
        self.model.to(self.device)
        self.model.eval()

    def predict(self, X_data):
        self.model.eval()
        X_tensor = torch.tensor(X_data.values, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            outputs = self.model(X_tensor)
        predicted_classes = (outputs.squeeze() > 0.5).int()
        return predicted_classes.cpu()  # 返回 CPU 上的数据

    def predict_with_uncertainty(self, X_sample, n_iter=100):
        # 启用 dropout 进行不确定性估计
        self.model.train()  # 保持 dropout
        X_sample = torch.tensor(X_sample.values, dtype=torch.float32).to(self.device)
        predictions = torch.zeros(n_iter, X_sample.size(0)).to(self.device)

        for i in range(n_iter):
            outputs = self.model(X_sample)
            predictions[i] = outputs.squeeze()

        mean_prediction = predictions.mean(dim=0)
        uncertainty = predictions.std(dim=0)
        predicted_classes = (mean_prediction > 0.5).int()

        return predicted_classes.cpu(), uncertainty.cpu()

    def compute_similarity_and_uncertainty(self, X_data, n_iter=30, device='cuda'):
        self.model.to(device)  # 确保模型在 GPU 上
        self.model.eval()  # 进入评估模式
    
        X_tensor = torch.tensor(X_data.values, dtype=torch.float32, device=device)
    
        # 计算隐藏层表示
        with torch.no_grad():
            hidden_vectors = self.model.get_hidden_representation(X_tensor)
    
        # 归一化隐藏向量，确保计算余弦相似度时范围在 [0,1]
        hidden_vectors = F.normalize(hidden_vectors, p=2, dim=1)  # L2 归一化
        similarity_matrix = torch.mm(hidden_vectors, hidden_vectors.t())  # 计算余弦相似度
    
        # 启用 dropout 进行不确定性估计
        self.model.train()
    
        # 初始化预测结果张量，形状为 (n_iter, n_samples)
        predictions = torch.zeros(n_iter, X_tensor.size(0), device=device)
    
        for i in range(n_iter):
            outputs = self.model(X_tensor)
            predictions[i] = outputs.squeeze()
    
        # 计算均值和标准差
        mean_prediction = predictions.mean(dim=0)
        uncertainty = predictions.std(dim=0)
    
        # 计算预测类别
        predicted_classes = (mean_prediction > 0.5).int()
    
        # 计算不确定性矩阵（优化，不使用循环）
        pred_match = (predicted_classes.unsqueeze(1) == predicted_classes.unsqueeze(0)).float()
        uncertainty_matrix = pred_match * (uncertainty.unsqueeze(1) + uncertainty.unsqueeze(0))
        
        # 计算 similarity_matrix 的最小值和最大值
        sim_min = similarity_matrix.min()
        sim_max = similarity_matrix.max()
        
        # 归一化到 [0,1]
        similarity_matrix = (similarity_matrix - sim_min) / (sim_max - sim_min + 1e-8)  # 避免除零
    
        return similarity_matrix, uncertainty_matrix
    
    
    def compute_adjacency_matrix(self, similarity_matrix, uncertainty_matrix, threshold=0.5, epsilon=0.1, device='cuda'):
        """
        基于相似度矩阵和不确定性矩阵计算邻接矩阵，并进行 GPU 加速优化。
    
        参数：
            similarity_matrix (torch.Tensor): 相似度矩阵，形状为 (n_samples, n_samples)
            uncertainty_matrix (torch.Tensor): 不确定性矩阵，形状为 (n_samples, n_samples)
            threshold (float): 设定相似度的阈值，大于此值的样本对可以相连
            epsilon (float): 设定不确定性的上限，小于此值的样本对可以相连
            device (str): 计算设备 ('cuda' or 'cpu')
    
        返回：
            adj (torch.Tensor): 计算得到的邻接矩阵，形状为 (n_samples, n_samples)
        """
        # 迁移到 GPU（如果可用）
        similarity_matrix = similarity_matrix.to(device)
        uncertainty_matrix = uncertainty_matrix.to(device)
    
        # 创建全零矩阵
        adj = torch.zeros_like(similarity_matrix, device=device)
    
        # 使用布尔索引（避免循环）
        mask = (similarity_matrix > threshold) & (uncertainty_matrix > 0) & (uncertainty_matrix < epsilon)
    
        # 直接赋值，减少计算量
        adj[mask] = 1
    
        return adj


# ----------------------------
# 3. 随机挑选参数组合进行 Grid Search
# ----------------------------
# 定义超参数空间
param_grid = {
    'hidden_size': list(range(64, 513, 32)),
    'num_layers': [2, 3, 4, 5],
    'dropout_rate': [0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6],
    'learning_rate': [0.01, 0.001, 0.005, 0.0005]
}

# 生成所有可能的参数组合
all_combinations = list(itertools.product(param_grid['hidden_size'],
                                          param_grid['num_layers'],
                                          param_grid['dropout_rate'],
                                          param_grid['learning_rate']))
print(f"共有 {len(all_combinations)} 种参数组合。")

# 随机挑选部分组合进行尝试，比如随机挑选 100 个组合
n_iter = 10
selected_combinations = random.sample(all_combinations, n_iter)
print("随机挑选的参数组合为：{}个。".format(len(selected_combinations)))
for combo in selected_combinations:
    print(f"  hidden_size={combo[0]}, num_layers={combo[1]}, dropout_rate={combo[2]}, learning_rate={combo[3]}")

grid_search_results = []

# 增加打印正在训练第多少个参数组合
for idx, (hidden_size, num_layers, dropout_rate, learning_rate) in enumerate(selected_combinations):
    print("==========================================")
    print(f"Training combination {idx+1}/{len(selected_combinations)}: hidden_size={hidden_size}, num_layers={num_layers}, dropout_rate={dropout_rate}, learning_rate={learning_rate}")
    
    # 根据当前参数组合初始化分类器
    classifier = BinaryClassifier(
        input_dim=X.shape[1],
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        learning_rate=learning_rate
    )
    
    # 训练模型（例如 30 个 epochs，batch_size 为 64）
    classifier.train(X_train, y_train, X_valid, y_valid, epochs=30, batch_size=64)
    
    # 在验证集上计算 Accuracy
    val_preds = classifier.predict(X_valid)
    val_acc = accuracy_score(y_valid, val_preds)
    print(f"Validation Accuracy for this model: {val_acc:.4f}")
    
    grid_search_results.append({
        'model': copy.deepcopy(classifier),  # 保存该模型
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'dropout_rate': dropout_rate,
        'learning_rate': learning_rate,
        'val_acc': val_acc
    })

# 选择验证集 Accuracy 最高的模型
best_result = max(grid_search_results, key=lambda x: x['val_acc'])
best_classifier = best_result['model']

print("==========================================")
print("Best Hyperparameters:")
print(f"  hidden_size: {best_result['hidden_size']}")
print(f"  num_layers: {best_result['num_layers']}")
print(f"  dropout_rate: {best_result['dropout_rate']}")
print(f"  learning_rate: {best_result['learning_rate']}")
print(f"Best Validation Accuracy: {best_result['val_acc']:.4f}")

# ----------------------------
# 4. 在测试集上评估最佳模型
# ----------------------------
test_predictions = best_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)

print("--------- Test Metrics ---------")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Recall:   {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision:{test_precision:.4f}")

print("--------- Classification Report ---------")
print(classification_report(y_test, test_predictions))

# ----------------------------
# 5. 示例：对单个样本进行预测（并估计不确定性）
# ----------------------------
sample = X_test.iloc[[0]]
print("Sample to Predict:")
print(sample)

predicted_class, uncertainty = best_classifier.predict_with_uncertainty(sample)
print("Predicted Class:", predicted_class.item())
print("Uncertainty:", uncertainty.item())


共有 1680 种参数组合。
随机挑选的参数组合为：10个。
  hidden_size=64, num_layers=5, dropout_rate=0.4, learning_rate=0.001
  hidden_size=64, num_layers=2, dropout_rate=0.2, learning_rate=0.01
  hidden_size=64, num_layers=4, dropout_rate=0.4, learning_rate=0.001
  hidden_size=384, num_layers=5, dropout_rate=0.3, learning_rate=0.0005
  hidden_size=96, num_layers=5, dropout_rate=0.5, learning_rate=0.001
  hidden_size=352, num_layers=2, dropout_rate=0.35, learning_rate=0.005
  hidden_size=192, num_layers=5, dropout_rate=0.6, learning_rate=0.001
  hidden_size=160, num_layers=3, dropout_rate=0.6, learning_rate=0.001
  hidden_size=448, num_layers=2, dropout_rate=0.3, learning_rate=0.001
  hidden_size=96, num_layers=2, dropout_rate=0.6, learning_rate=0.01
Training combination 1/10: hidden_size=64, num_layers=5, dropout_rate=0.4, learning_rate=0.001
Loaded best model from epoch 9 with Validation Accuracy: 0.8200
Validation Accuracy for this model: 0.8200
Training combination 2/10: hidden_size=64, num_layers=2, dropo

In [20]:
sample = X_test.iloc[[0]]
print("Sample to Predict:", sample)

predicted_classes, uncertainty = classifier.predict_with_uncertainty(sample)

print("Predicted Class:", predicted_classes.item())
print("Uncertainty:", uncertainty.item())

# Compute similarity and uncertainty matrices
similarity_matrix, uncertainty_matrix = classifier.compute_similarity_and_uncertainty(pd.concat([X_train, X_valid, X_test]))
print("Similarity Matrix:", similarity_matrix)
print("Uncertainty Matrix:", uncertainty_matrix)

# 计算邻接矩阵
adj_matrix = classifier.compute_adjacency_matrix(similarity_matrix, uncertainty_matrix, threshold=0.1, epsilon=0.1)

# 打印邻接矩阵形状
print("Adjacency matrix shape:", adj_matrix.shape)

# 计算并打印非零元素个数
num_nonzero = torch.count_nonzero(adj_matrix).item()
print("Number of nonzero elements:", num_nonzero)
similarity_matrix

Sample to Predict:    laufkont  laufzeit     moral     verw     hoehe  sparkont   beszeit  \
0 -0.490296 -1.294443 -0.579758  0.66335 -1.009094 -0.721009 -1.067797   

       rate    famges    buerge  wohnzeit      verm     alter  weitkred  \
0  0.008981  1.724882 -0.320641 -1.585393 -0.357607 -1.099408  0.444788   

       wohn  bishkred     beruf      pers    telef   gastarb  
0  0.135954  -0.66218 -1.408326  0.377964 -0.84226  0.175863  
Predicted Class: 0
Uncertainty: 0.0
Similarity Matrix: tensor([[1.0000, 1.0000, 0.9988,  ..., 1.0000, 0.9999, 0.9999],
        [1.0000, 1.0000, 0.9990,  ..., 1.0000, 1.0000, 1.0000],
        [0.9988, 0.9990, 1.0000,  ..., 0.9990, 0.9992, 0.9993],
        ...,
        [1.0000, 1.0000, 0.9990,  ..., 0.9999, 1.0000, 0.9999],
        [0.9999, 1.0000, 0.9992,  ..., 1.0000, 1.0000, 1.0000],
        [0.9999, 1.0000, 0.9993,  ..., 0.9999, 1.0000, 1.0000]],
       device='cuda:0')
Uncertainty Matrix: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0

tensor([[1.0000, 1.0000, 0.9988,  ..., 1.0000, 0.9999, 0.9999],
        [1.0000, 1.0000, 0.9990,  ..., 1.0000, 1.0000, 1.0000],
        [0.9988, 0.9990, 1.0000,  ..., 0.9990, 0.9992, 0.9993],
        ...,
        [1.0000, 1.0000, 0.9990,  ..., 0.9999, 1.0000, 0.9999],
        [0.9999, 1.0000, 0.9992,  ..., 1.0000, 1.0000, 1.0000],
        [0.9999, 1.0000, 0.9993,  ..., 0.9999, 1.0000, 1.0000]],
       device='cuda:0')

In [21]:
adj_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')