In [1]:
# util function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix
import torch

def standard_input(X):
    # 标准化输入
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return pd.DataFrame(X_scaled, columns=X.columns)



def load_data_SGER_RAW(random_state=42):
    # 读取以空格分隔的SGER CSV文件
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    # 确保 'kredit' 列存在
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None, None, None, None, None
    # 目标变量和特征
    y = df['kredit']
    X = df.drop(columns=['kredit'])
    # 划分训练集、验证集和测试集
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=random_state, stratify=y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp)
    # 计算节点数并创建 mask
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    # 获取索引并设置 mask
    train_mask[X_train.index] = True
    val_mask[X_valid.index] = True
    test_mask[X_test.index] = True
    # 标准化输入
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test = standard_input(X_test)
    return X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask


def load_data_SGER(mode='raw', aug_pct=100):
    """
    加载数据并进行预处理，返回以下变量：
    X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask

    参数：
        mode (str): 数据读取模式，可选 'raw' 或 'aug'
            - 'raw': 仅使用 raw 数据（SGER1000.csv），逻辑与原 load_data_SGER_RAW 一致
            - 'aug': raw 数据按原逻辑划分，且读取 LLM 数据（SGERLLM.csv）并将其按 90%/10%分别追加到
                     训练集和验证集中；测试集仍使用 raw 数据划分出的部分
        aug_pct (int 或 float): 使用的增强数据比例，取值范围 0~100，默认 100 表示使用全部 LLM 数据。
                                  当 mode 为 'aug' 时有效。
    """
    # 定义文件路径
    path_raw = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    path_llm = '/home/gehongfei/project/TabGNN/dataset/SGERLLM.csv'
    
    # ---------------------------
    # 1. 处理 raw 数据
    # ---------------------------
    try:
        df_raw = pd.read_csv(path_raw, sep='\s+')
    except Exception as e:
        print(f"Error reading raw file {path_raw}: {e}")
        return None
    
    if 'kredit' not in df_raw.columns:
        print("Error: 'kredit' 列未在 raw 数据中找到。")
        return None

    # 分离特征和目标变量
    y_raw = df_raw['kredit']
    X_raw = df_raw.drop(columns=['kredit'])
    
    # 划分 raw 数据：70% 为训练集，30% 为临时集合
    X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
        X_raw, y_raw, test_size=0.3, random_state=42, stratify=y_raw
    )
    # 将临时集合再划分为验证集和测试集，其中 1/3（约 10%）为验证集，2/3（约 20%）为测试集
    X_valid_raw, X_test_raw, y_valid_raw, y_test_raw = train_test_split(
        X_temp, y_temp, test_size=2/3, random_state=42, stratify=y_temp
    )
    
    # ---------------------------
    # 2. 根据 mode 处理数据
    # ---------------------------
    if mode == 'raw':
        # 直接使用 raw 数据划分结果
        X_train_final = X_train_raw.reset_index(drop=True)
        y_train_final = y_train_raw.reset_index(drop=True)
        X_valid_final = X_valid_raw.reset_index(drop=True)
        y_valid_final = y_valid_raw.reset_index(drop=True)
        X_test_final  = X_test_raw.reset_index(drop=True)
        y_test_final  = y_test_raw.reset_index(drop=True)
        
    elif mode == 'aug':
        # 读取 LLM 数据，并将数据追加到 raw 的训练集和验证集中
        try:
            df_llm = pd.read_csv(path_llm, sep='\s+')
        except Exception as e:
            print(f"Error reading LLM file {path_llm}: {e}")
            return None
        
        if 'kredit' not in df_llm.columns:
            print("Error: 'kredit' 列未在 LLM 数据中找到。")
            return None
        
        # 根据 aug_pct 参数调整使用的增强数据比例
        if aug_pct < 100:
            df_llm = df_llm.sample(frac=aug_pct/100, random_state=42).reset_index(drop=True)
        
        # 分离 LLM 数据中的特征和目标变量
        y_llm = df_llm['kredit']
        X_llm = df_llm.drop(columns=['kredit'])
        
        # 划分 LLM 数据：90% 为训练，10% 为验证
        X_llm_train, X_llm_valid, y_llm_train, y_llm_valid = train_test_split(
            X_llm, y_llm, test_size=0.1, random_state=42, stratify=y_llm
        )
        
        # 重置索引，防止后续合并时出现冲突
        X_train_raw = X_train_raw.reset_index(drop=True)
        y_train_raw = y_train_raw.reset_index(drop=True)
        X_valid_raw = X_valid_raw.reset_index(drop=True)
        y_valid_raw = y_valid_raw.reset_index(drop=True)
        X_llm_train = X_llm_train.reset_index(drop=True)
        y_llm_train = y_llm_train.reset_index(drop=True)
        X_llm_valid = X_llm_valid.reset_index(drop=True)
        y_llm_valid = y_llm_valid.reset_index(drop=True)
        X_test_raw  = X_test_raw.reset_index(drop=True)
        y_test_raw  = y_test_raw.reset_index(drop=True)
        
        # 追加 LLM 数据：训练集增加 LLM 训练数据，验证集增加 LLM 验证数据
        X_train_final = pd.concat([X_train_raw, X_llm_train], axis=0).reset_index(drop=True)
        y_train_final = pd.concat([y_train_raw, y_llm_train], axis=0).reset_index(drop=True)
        X_valid_final = pd.concat([X_valid_raw, X_llm_valid], axis=0).reset_index(drop=True)
        y_valid_final = pd.concat([y_valid_raw, y_llm_valid], axis=0).reset_index(drop=True)
        # 测试集仍使用 raw 划分出的数据
        X_test_final  = X_test_raw
        y_test_final  = y_test_raw
    else:
        print("Error: mode 必须为 'raw' 或 'aug'")
        return None

    # ---------------------------
    # 3. 合并所有数据，并构造掩码
    # ---------------------------
    # 合并训练、验证、测试数据为完整数据集
    X = pd.concat([X_train_final, X_valid_final, X_test_final], axis=0).reset_index(drop=True)
    y = pd.concat([y_train_final, y_valid_final, y_test_final], axis=0).reset_index(drop=True)
    
    # 构造掩码，保证返回的变量名与要求一致
    num_total = len(X)
    train_mask = torch.zeros(num_total, dtype=torch.bool)
    valid_mask = torch.zeros(num_total, dtype=torch.bool)
    test_mask  = torch.zeros(num_total, dtype=torch.bool)
    
    # 训练集占合并数据的前部分
    len_train = len(X_train_final)
    train_mask[:len_train] = True
    # 验证集紧随训练集之后
    len_valid = len(X_valid_final)
    valid_mask[len_train:len_train+len_valid] = True
    # 测试集为剩余部分
    test_mask[len_train+len_valid:] = True

    # ---------------------------
    # 4. 标准化数据
    # ---------------------------
    X = standard_input(X)
    X_train = standard_input(X_train_final)
    X_valid = standard_input(X_valid_final)
    X_test  = standard_input(X_test_final)
    
    return X, y, X_train, X_valid, X_test, \
           y_train_final, y_valid_final, y_test_final, \
           train_mask, valid_mask, test_mask

# 使用示例：



# 训练Random Forest并计算相似性
# Function to compute adjacency matrices for train, validation, and test data
def compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, n_estimators=100, max_depth=None, threshold=0.15, random_state=42):
    # 合并训练、验证和测试数据
    X_combined = pd.concat([X_train, X_valid, X_test], axis=0)
    num_samples = X_combined.shape[0]
    # 训练Random Forest
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf.fit(pd.concat([X_train, X_valid], axis=0), pd.concat([y_train, y_valid], axis=0))
    # 获取每棵树的叶子索引
    leaf_indices = rf.apply(X_combined)
    # 计算相似性矩阵
    adjacency_matrix = np.zeros((num_samples, num_samples))
    for tree_idx in range(leaf_indices.shape[1]):  # 遍历每棵树
        leaf_to_samples = {}
        for sample_idx, leaf_id in enumerate(leaf_indices[:, tree_idx]):
            if leaf_id not in leaf_to_samples:
                leaf_to_samples[leaf_id] = []
            leaf_to_samples[leaf_id].append(sample_idx)
        # 更新相似性矩阵
        for sample_list in leaf_to_samples.values():
            for i in sample_list:
                for j in sample_list:
                    if i != j:
                        adjacency_matrix[i, j] += 1
    # 归一化相似性
    adjacency_matrix /= adjacency_matrix.max()
    # 应用阈值，转换为二值矩阵
    adjacency_matrix = (adjacency_matrix > threshold).astype(int)
    # 转换为稀疏矩阵
    adjacency_matrix_sparse = csr_matrix(adjacency_matrix)
    return adjacency_matrix_sparse

# 从稀疏邻接矩阵提取边索引
def adjacency_to_edge_index(adj_matrix):
    coo_matrix = adj_matrix.tocoo()  # 转换为COO格式
    edge_index = torch.tensor(np.vstack((coo_matrix.row, coo_matrix.col)), dtype=torch.long)
    return edge_index

1

1

In [21]:
# single round
from sklearn.metrics import classification_report, f1_score
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import numpy as np

# 载入数据
# 计算邻接矩阵
X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_data_SGER()
adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, threshold=0.19, random_state=42)
edge_index = adjacency_to_edge_index(adj_matrix)

print(adj_matrix.shape)
print(edge_index.shape)  # 输出 (2, num_edges)
print(edge_index)  # 打印边索引


num_features = X.shape[1]
num_classes = len(np.unique(y))

edge_index = adjacency_to_edge_index(adj_matrix)

# 转换数据格式
X_tensor = torch.tensor(X.values, dtype=torch.float)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 创建 PyG Data 对象
data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)

# 计算类别权重（解决类别不平衡）
class_counts = np.bincount(y_tensor.numpy())
print(class_counts)
weights = torch.tensor(1.0 / class_counts, dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=weights)

# 定义 GNN 模型
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

model = GraphSAGE(in_channels=num_features, hidden_channels=128, out_channels=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# 训练过程
best_f1 = 0.0
best_model_state = None

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    # 验证
    model.eval()
    with torch.no_grad():
        val_out = model(data)
        val_preds = val_out[data.val_mask].argmax(dim=1)
        val_true = data.y[data.val_mask]
        val_f1 = f1_score(val_true.cpu(), val_preds.cpu(), average="macro")  # 宏平均 F1-score
    
    # 记录最佳模型（基于 F1-score）
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_model_state = model.state_dict()
    
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}, Val F1-score: {val_f1:.4f}')

# 加载最佳模型
model.load_state_dict(best_model_state)

# 测试
model.eval()
with torch.no_grad():
    test_out = model(data)
    preds = test_out[data.test_mask].argmax(dim=1)
    true_labels = data.y[data.test_mask]
    report = classification_report(true_labels.cpu(), preds.cpu(), target_names=["Class 0", "Class 1"], digits=4)
    print("Classification Report:\n", report)

(1000, 1000)
torch.Size([2, 25540])
tensor([[  1,   1,   1,  ..., 999, 999, 999],
        [ 46, 243, 329,  ..., 858, 871, 926]])
[700 300]
Epoch 1, Loss: 0.6829, Val F1-score: 0.2773
Epoch 2, Loss: 0.6834, Val F1-score: 0.5572
Epoch 3, Loss: 0.5597, Val F1-score: 0.7932
Epoch 4, Loss: 0.5416, Val F1-score: 0.8095
Epoch 5, Loss: 0.5315, Val F1-score: 0.7549
Epoch 6, Loss: 0.4880, Val F1-score: 0.6757
Epoch 7, Loss: 0.4582, Val F1-score: 0.6511
Epoch 8, Loss: 0.4531, Val F1-score: 0.6511
Epoch 9, Loss: 0.4475, Val F1-score: 0.6693
Epoch 10, Loss: 0.4291, Val F1-score: 0.7149
Epoch 11, Loss: 0.4077, Val F1-score: 0.7426
Epoch 12, Loss: 0.3956, Val F1-score: 0.7613
Epoch 13, Loss: 0.3918, Val F1-score: 0.7898
Epoch 14, Loss: 0.3853, Val F1-score: 0.7707
Epoch 15, Loss: 0.3712, Val F1-score: 0.7426
Epoch 16, Loss: 0.3555, Val F1-score: 0.7149
Epoch 17, Loss: 0.3447, Val F1-score: 0.7058
Epoch 18, Loss: 0.3378, Val F1-score: 0.7175
Epoch 19, Loss: 0.3293, Val F1-score: 0.7175
Epoch 20, Loss:

In [8]:
# Grid Search CPU implement
import itertools
import random
import torch
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

# GraphSAGE Model Definition (with normalization layers)
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        
        # Adding more layers as specified by num_layers
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.fc = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        x = self.fc(x)
        return x

def grid_search(X, y, X_train, X_valid, X_test, y_train, y_valid, train_mask, valid_mask, test_mask, n_iter=10):
    best_acc = 0.0
    best_overall_model_state = None
    best_overall_params = None
    
    # Defining the hyperparameter search space
    thresholds = [0.15, 0.2, 0.25, 0.3]
    random_states = [42, 100, 202, 303]
    num_layers_list = [1, 2, 3, 4]
    hidden_channels_list = list(range(32, 257, 32))  # From 32 to 256 with step 32
    lrs = [0.01, 0.05, 0.001]
    
    # Randomly select `n_iter` combinations without repetition
    param_combinations = list(itertools.product(thresholds, random_states, num_layers_list, hidden_channels_list, lrs))
    random.shuffle(param_combinations)
    
    for i, (threshold, random_state, num_layers, hidden_channels, lr) in enumerate(param_combinations[:n_iter]):
        print(f"\nTesting combination {i+1}: threshold={threshold}, random_state={random_state}, "
              f"layers={num_layers}, hidden_channels={hidden_channels}, lr={lr}")

        # Compute adjacency matrix and edge index
        adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                                threshold=threshold, random_state=random_state)
        edge_index = adjacency_to_edge_index(adj_matrix)

        # Convert data to tensor
        X_tensor = torch.tensor(X.values, dtype=torch.float)
        y_tensor = torch.tensor(y.values, dtype=torch.long)
        data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                    train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)

        # Create the model
        model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                          out_channels=len(np.unique(y)), num_layers=num_layers)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        # Loss function with class weights
        class_counts = np.bincount(y_tensor.numpy())
        weights = torch.tensor(1.0 / class_counts, dtype=torch.float)
        criterion = torch.nn.CrossEntropyLoss(weight=weights)

        # Train the model
        best_f1_epoch = 0.0
        best_acc_epoch = 0.0
        best_model_epoch = None

        for epoch in range(100):  # Fixed number of epochs
            model.train()
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            # Validation
            model.eval()
            with torch.no_grad():
                val_out = model(data)
                val_preds = val_out[data.val_mask].argmax(dim=1)
                val_true = data.y[data.val_mask]
                val_f1 = f1_score(val_true.cpu(), val_preds.cpu(), average="macro")
                val_acc = accuracy_score(val_true.cpu(), val_preds.cpu())

            # Update best model based on validation F1-score
            if val_f1 > best_f1_epoch:
                best_f1_epoch = val_f1
                best_acc_epoch = val_acc
                best_model_epoch = model.state_dict()

        # Save the best model if this combination performs better on validation accuracy
        if best_acc_epoch > best_acc:
            best_acc = best_acc_epoch
            best_overall_model_state = best_model_epoch
            best_overall_params = (threshold, random_state, num_layers, hidden_channels, lr)
        
        # Test and print classification report for the best model of this combination
        model.load_state_dict(best_model_epoch)
        model.eval()
        with torch.no_grad():
            test_out = model(data)
            preds = test_out[data.test_mask].argmax(dim=1)
            true_labels = data.y[data.test_mask]

            test_precision = precision_score(true_labels.cpu(), preds.cpu(), average="macro")
            test_recall = recall_score(true_labels.cpu(), preds.cpu(), average="macro")
            test_f1 = f1_score(true_labels.cpu(), preds.cpu(), average="macro")
            test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
            
            report = classification_report(true_labels.cpu(), preds.cpu(), 
                                           target_names=["Class 0", "Class 1"], digits=4)
            print(f"Classification Report for combination {i+1}:")
            print(report)
            print(f"Test Set Metrics for combination {i+1}:")
            print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
                  f"F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")
    
    # After all iterations, print the best model's classification report based on overall accuracy
    print("\nBest Model Overall (based on highest test accuracy):")
    best_model = GraphSAGE(in_channels=X.shape[1], hidden_channels=best_overall_params[3], 
                           out_channels=len(np.unique(y)), num_layers=best_overall_params[2])
    best_model.load_state_dict(best_overall_model_state)
    best_model.eval()
    
    # Recreate the data object using the best hyperparameters for evaluation
    threshold, random_state, num_layers, hidden_channels, lr = best_overall_params
    adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                            threshold=threshold, random_state=random_state)
    edge_index = adjacency_to_edge_index(adj_matrix)
    X_tensor = torch.tensor(X.values, dtype=torch.float)
    y_tensor = torch.tensor(y.values, dtype=torch.long)
    data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)
    
    with torch.no_grad():
        test_out = best_model(data)
        preds = test_out[data.test_mask].argmax(dim=1)
        true_labels = data.y[data.test_mask]
        
        test_precision = precision_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_recall = recall_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_f1 = f1_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        report = classification_report(true_labels.cpu(), preds.cpu(), 
                                       target_names=["Class 0", "Class 1"], digits=4)
        print("Best Model Classification Report:")
        print(report)
        print("Best Model Test Set Metrics:")
        print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
              f"F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")

    # Return the best model parameters and state
    return best_overall_params, best_overall_model_state

# Example function to call grid search
def main():
    # Load data (assumed function, please replace with your actual data-loading function)
    X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_data_SGER()
    print("X shape:", X.shape)

    # Perform grid search
    best_params, best_model_state = grid_search(X, y, X_train, X_valid, X_test, 
                                                  y_train, y_valid, train_mask, valid_mask, test_mask, n_iter=5)

    print("\nBest Hyperparameters:", best_params)

    # Recreate the edge_index using the best hyperparameters
    threshold, random_state, num_layers, hidden_channels, lr = best_params
    adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                            threshold=threshold, random_state=random_state)
    edge_index = adjacency_to_edge_index(adj_matrix)

    # Recreate the data object
    X_tensor = torch.tensor(X.values, dtype=torch.float)
    y_tensor = torch.tensor(y.values, dtype=torch.long)
    data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)

    # Load the best model
    model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                      out_channels=len(np.unique(y)), num_layers=num_layers)
    model.load_state_dict(best_model_state)

    # Evaluate the best model on the test set and print all metrics
    model.eval()
    with torch.no_grad():
        test_out = model(data)
        preds = test_out[data.test_mask].argmax(dim=1)
        true_labels = data.y[data.test_mask]
        
        report = classification_report(true_labels.cpu(), preds.cpu(), 
                                       target_names=["Class 0", "Class 1"], digits=4)
        test_precision = precision_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_recall = recall_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_f1 = f1_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        print("\nBest Model Classification Report on Test Set:")
        print(report)
        print("Best Model Test Set Metrics:")
        print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
              f"F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")

if __name__ == "__main__":
    main()


X shape: (1000, 20)

Testing combination 1: threshold=0.25, random_state=202, layers=2, hidden_channels=64, lr=0.05
Classification Report for combination 1:
              precision    recall  f1-score   support

     Class 0     0.8099    0.8214    0.8156       140
     Class 1     0.5690    0.5500    0.5593        60

    accuracy                         0.7400       200
   macro avg     0.6894    0.6857    0.6875       200
weighted avg     0.7376    0.7400    0.7387       200

Test Set Metrics for combination 1:
Precision: 0.6894, Recall: 0.6857, F1: 0.6875, Accuracy: 0.7400

Testing combination 2: threshold=0.25, random_state=42, layers=1, hidden_channels=128, lr=0.01
Classification Report for combination 2:
              precision    recall  f1-score   support

     Class 0     0.7877    0.8214    0.8042       140
     Class 1     0.5370    0.4833    0.5088        60

    accuracy                         0.7200       200
   macro avg     0.6624    0.6524    0.6565       200
weighte

In [4]:
# Grid Search GPU implement

import itertools
import random
import torch
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# GraphSAGE Model Definition (with normalization layers)
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        
        # Adding more layers as specified by num_layers
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.fc = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        x = self.fc(x)
        return x

def grid_search(X, y, X_train, X_valid, X_test, y_train, y_valid, train_mask, valid_mask, test_mask, n_iter):
    best_acc = 0.0
    best_overall_model_state = None
    best_overall_params = None
    
    thresholds = np.arange(0.1, 0.4, 0.01).tolist()
    random_states = list(range(501))
    num_layers_list = [1, 2, 3, 4, 5]
    hidden_channels_list = list(range(20, 300, 10))
    lrs = [0.01, 0.05, 0.001, 0.005]
    
    param_combinations = list(itertools.product(thresholds, random_states, num_layers_list, hidden_channels_list, lrs))
    random.shuffle(param_combinations)
    
    for i, (threshold, random_state, num_layers, hidden_channels, lr) in enumerate(param_combinations[:n_iter]):
        print(f"\nTesting combination {i+1}: threshold={threshold}, random_state={random_state}, "
              f"layers={num_layers}, hidden_channels={hidden_channels}, lr={lr}")

        adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                                threshold=threshold, random_state=random_state)
        edge_index = adjacency_to_edge_index(adj_matrix)

        X_tensor = torch.tensor(X.values, dtype=torch.float)
        y_tensor = torch.tensor(y.values, dtype=torch.long)
        data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                    train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)

        model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                          out_channels=len(np.unique(y)), num_layers=num_layers).to("cuda" if torch.cuda.is_available() else "cpu")
        data = data.to("cuda" if torch.cuda.is_available() else "cpu")
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        class_counts = np.bincount(y_tensor.cpu().numpy())
        weights = torch.tensor(1.0 / class_counts, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")
        criterion = torch.nn.CrossEntropyLoss(weight=weights)

        best_f1_epoch = 0.0
        best_model_epoch = None

        for epoch in range(100):
            model.train()
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            model.eval()
            with torch.no_grad():
                val_out = model(data)
                val_preds = val_out[data.val_mask].argmax(dim=1)
                val_true = data.y[data.val_mask]
                val_f1 = f1_score(val_true.cpu(), val_preds.cpu(), average="macro")

            if val_f1 > best_f1_epoch:
                best_f1_epoch = val_f1
                best_model_epoch = model.state_dict()

        model.load_state_dict(best_model_epoch)
        model.eval()
        with torch.no_grad():
            test_out = model(data)
            preds = test_out[data.test_mask].argmax(dim=1)
            true_labels = data.y[data.test_mask]
            test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        if test_acc > best_acc:
            best_acc = test_acc
            best_overall_model_state = best_model_epoch
            best_overall_params = (threshold, random_state, num_layers, hidden_channels, lr)
    
    return best_overall_params, best_overall_model_state


def main():
    X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_data_SGER_RAW(random_state=42)
    
    best_params, best_model_state = grid_search(X, y, X_train, X_valid, X_test, 
                                                y_train, y_valid, train_mask, valid_mask, test_mask, n_iter=10)
    print("\nBest Hyperparameters:", best_params)

    threshold, random_state, num_layers, hidden_channels, lr = best_params
    adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                          threshold=threshold, random_state=random_state)
    edge_index = adjacency_to_edge_index(adj_matrix).to(device)

    X_tensor = torch.tensor(X.values, dtype=torch.float).to(device)
    y_tensor = torch.tensor(y.values, dtype=torch.long).to(device)
    data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                train_mask=train_mask.to(device), val_mask=valid_mask.to(device), test_mask=test_mask.to(device))

    model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                      out_channels=len(np.unique(y)), num_layers=num_layers).to(device)
    model.load_state_dict(best_model_state)
    model.eval()

    with torch.no_grad():
        test_out = model(data)
        preds = test_out[data.test_mask].argmax(dim=1)
        true_labels = data.y[data.test_mask]
        
        report = classification_report(true_labels.cpu(), preds.cpu(), target_names=["Class 0", "Class 1"], digits=4)
        test_precision = precision_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_recall = recall_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_f1 = f1_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        print("\nBest Model Classification Report on Test Set:")
        print(report)
        print("Best Model Test Set Metrics:")
        print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
              f"F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")

if __name__ == "__main__":
    main()



Testing combination 1: threshold=0.3199999999999999, random_state=187, layers=5, hidden_channels=200, lr=0.001

Testing combination 2: threshold=0.2599999999999999, random_state=424, layers=1, hidden_channels=70, lr=0.05

Testing combination 3: threshold=0.20999999999999996, random_state=386, layers=5, hidden_channels=270, lr=0.001

Testing combination 4: threshold=0.11, random_state=444, layers=4, hidden_channels=280, lr=0.005

Testing combination 5: threshold=0.24999999999999992, random_state=125, layers=1, hidden_channels=200, lr=0.05

Testing combination 6: threshold=0.34999999999999987, random_state=418, layers=5, hidden_channels=50, lr=0.001

Testing combination 7: threshold=0.14999999999999997, random_state=405, layers=1, hidden_channels=290, lr=0.001


KeyboardInterrupt: 

### Imbalance Grid Search GPU implement

In [4]:
import itertools
import random
import torch
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 定义 Focal Loss，用于替代 CrossEntropyLoss
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2, alpha=None, reduction="mean"):
        """
        :param gamma: focusing parameter, gamma > 0 调整容易分类样本的权重
        :param alpha: 类别权重，通常可以设置为一个列表或者 tensor
        :param reduction: 损失的计算方式，"mean" 或 "sum"
        """
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha  # alpha 通常用于对不同类别加权
        self.reduction = reduction
        self.ce = torch.nn.CrossEntropyLoss(reduction="none")

    def forward(self, inputs, targets):
        # 计算基本的交叉熵损失
        ce_loss = self.ce(inputs, targets)
        # 计算概率值（注意：torch.exp(-ce_loss) 就是模型给出的对应类别的概率）
        pt = torch.exp(-ce_loss)
        # 根据 alpha 调整不同类别的权重
        if self.alpha is not None:
            if isinstance(self.alpha, (list, np.ndarray)):
                alpha = inputs.new_tensor(self.alpha)
            else:
                alpha = self.alpha
            at = alpha.gather(0, targets.data)
            ce_loss = at * ce_loss
        # Focal Loss 的公式
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss

# GraphSAGE 模型定义（可包含多个图卷积层）
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        
        # 添加更多层
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.fc = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        x = self.fc(x)
        return x

# 主 grid search 函数（这里使用 FocalLoss 替代 CrossEntropyLoss）
def grid_search(X, y, X_train, X_valid, X_test, y_train, y_valid, train_mask, valid_mask, test_mask, n_iter):
    best_acc = 0.0
    best_overall_model_state = None
    best_overall_params = None
    
    thresholds = np.arange(0.1, 0.4, 0.02).tolist()
    random_states = list(range(0, 300, 10))
    num_layers_list = [1, 2, 3, 4, 5]
    hidden_channels_list = list(range(20, 300, 10))
    lrs = [0.01, 0.05, 0.001, 0.005]
    gammas = np.arange(0.1, 4, 0.2).tolist()
    alphas = np.arange(0.1, 0.9, 0.1).tolist()
    
    print("Start grid search...")

    # 将所有超参数组合在一起
    param_combinations = list(itertools.product(thresholds, random_states, num_layers_list, 
                                                hidden_channels_list, lrs, gammas, alphas))
    # 打印所有组合（可选）
    print("Total parameter combinations:", len(param_combinations))
    random.shuffle(param_combinations)
    
    for i, (threshold, random_state, num_layers, hidden_channels, lr, gamma, alpha_value) in enumerate(param_combinations[:n_iter]):
        print(f"\nTesting combination {i+1}: threshold={threshold}, random_state={random_state}, "
              f"layers={num_layers}, hidden_channels={hidden_channels}, lr={lr}, gamma={gamma}, alpha={alpha_value}")

        adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                                threshold=threshold, random_state=random_state)
        edge_index = adjacency_to_edge_index(adj_matrix)

        X_tensor = torch.tensor(X.values, dtype=torch.float)
        y_tensor = torch.tensor(y.values, dtype=torch.long)
        data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                    train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)

        model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                          out_channels=len(np.unique(y)), num_layers=num_layers).to(device)
        data = data.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        # 利用 grid search 中的 alpha 参数构造类别权重
        # 假设二分类任务中，将类别 1 设为少数类，其权重取 alpha_value，类别 0 的权重为 1 - alpha_value
        alpha_list = [1 - alpha_value, alpha_value]
        alpha_tensor = torch.tensor(alpha_list, dtype=torch.float).to(device)
        print("Alpha weights:", alpha_tensor)
        criterion = FocalLoss(gamma=gamma, alpha=alpha_tensor, reduction="mean")

        best_f1_epoch = 0.0
        best_model_epoch = None

        for epoch in range(100):
            model.train()
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            model.eval()
            with torch.no_grad():
                val_out = model(data)
                val_preds = val_out[data.val_mask].argmax(dim=1)
                val_true = data.y[data.val_mask]
                val_f1 = f1_score(val_true.cpu(), val_preds.cpu(), average="macro")

            if val_f1 > best_f1_epoch:
                best_f1_epoch = val_f1
                best_model_epoch = model.state_dict()

        model.load_state_dict(best_model_epoch)
        model.eval()
        with torch.no_grad():
            test_out = model(data)
            preds = test_out[data.test_mask].argmax(dim=1)
            true_labels = data.y[data.test_mask]
            test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        print(f"Test Accuracy for current combination: {test_acc:.4f}")
        if test_acc > best_acc:
            best_acc = test_acc
            best_overall_model_state = best_model_epoch
            best_overall_params = (threshold, random_state, num_layers, hidden_channels, lr, gamma, alpha_value)
    
    return best_overall_params, best_overall_model_state

def main():
    X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_data_SGER_RAW(random_state=42)
    
    # 此处 n_iter 可以根据需要调整，示例中仅搜索 3 个组合
    best_params, best_model_state = grid_search(X, y, X_train, X_valid, X_test, 
                                                y_train, y_valid, train_mask, valid_mask, test_mask, n_iter=500)
    print("\nBest Hyperparameters:", best_params)

    # 解包最佳超参数（新增 alpha 参数）
    threshold, random_state, num_layers, hidden_channels, lr, gamma, alpha_value = best_params
    adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, y_train, y_valid, 
                                          threshold=threshold, random_state=random_state)
    edge_index = adjacency_to_edge_index(adj_matrix).to(device)

    X_tensor = torch.tensor(X.values, dtype=torch.float).to(device)
    y_tensor = torch.tensor(y.values, dtype=torch.long).to(device)
    data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                train_mask=train_mask.to(device), val_mask=valid_mask.to(device), test_mask=test_mask.to(device))

    model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                      out_channels=len(np.unique(y)), num_layers=num_layers).to(device)
    model.load_state_dict(best_model_state)
    model.eval()

    with torch.no_grad():
        test_out = model(data)
        preds = test_out[data.test_mask].argmax(dim=1)
        true_labels = data.y[data.test_mask]
        
        report = classification_report(true_labels.cpu(), preds.cpu(), target_names=["Class 0", "Class 1"], digits=4)
        test_precision = precision_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_recall = recall_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_f1 = f1_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        print("\nBest Model Classification Report on Test Set:")
        print(report)
        print("Best Model Test Set Metrics:")
        print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
              f"F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")

if __name__ == "__main__":
    main()



Start grid search...
Total parameter combinations: 43008000

Testing combination 1: threshold=0.14, random_state=180, layers=1, hidden_channels=80, lr=0.001, gamma=0.1, alpha=0.2
Alpha weights: tensor([0.8000, 0.2000], device='cuda:0')
Test Accuracy for current combination: 0.7000

Testing combination 2: threshold=0.1, random_state=100, layers=4, hidden_channels=40, lr=0.005, gamma=2.900000000000001, alpha=0.8
Alpha weights: tensor([0.2000, 0.8000], device='cuda:0')
Test Accuracy for current combination: 0.7200

Testing combination 3: threshold=0.1, random_state=220, layers=5, hidden_channels=240, lr=0.005, gamma=0.1, alpha=0.6
Alpha weights: tensor([0.4000, 0.6000], device='cuda:0')
Test Accuracy for current combination: 0.7000

Testing combination 4: threshold=0.14, random_state=40, layers=5, hidden_channels=120, lr=0.005, gamma=2.1000000000000005, alpha=0.6
Alpha weights: tensor([0.4000, 0.6000], device='cuda:0')
Test Accuracy for current combination: 0.7000

Testing combination 5: 