# RF GNN re-implement

### util function

In [1]:
# util function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import torch

def standard_input(X):
    # 标准化输入
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return pd.DataFrame(X_scaled, columns=X.columns)

def load_data_SGER_RAW(random_state=42):
    # 读取以空格分隔的SGER CSV文件
    path = '/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'
    df = pd.read_csv(path, sep='\s+')
    # 确保 'kredit' 列存在
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return None, None, None, None, None, None
    # 目标变量和特征
    y = df['kredit']
    X = df.drop(columns=['kredit'])
    # 划分训练集、验证集和测试集
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=random_state, stratify=y)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp)
    # 计算节点数并创建 mask
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    # 获取索引并设置 mask
    train_mask[X_train.index] = True
    val_mask[X_valid.index] = True
    test_mask[X_test.index] = True
    # 标准化输入
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test = standard_input(X_test)
    return X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask


def compute_adjacency_matrix(X_train, X_valid, X_test, k=5):
    """
    使用 k 近邻（kNN）方法构建邻接矩阵
    :param X_train: 训练集特征
    :param X_valid: 验证集特征
    :param X_test: 测试集特征
    :param k: 近邻个数
    :return: 稀疏邻接矩阵 (csr_matrix)
    """
    # 合并所有数据
    X_combined = pd.concat([X_train, X_valid, X_test], axis=0).values
    num_samples = X_combined.shape[0]

    # 使用 kNN 计算最近邻
    knn = NearestNeighbors(n_neighbors=k + 1, metric='euclidean')  # k+1 因为包含自身
    knn.fit(X_combined)
    distances, indices = knn.kneighbors(X_combined)

    # 创建邻接矩阵
    adjacency_matrix = np.zeros((num_samples, num_samples), dtype=int)

    # 设定邻接关系
    for i in range(num_samples):
        for j in indices[i][1:]:  # 跳过自身（indices[i][0]）
            adjacency_matrix[i, j] = 1
            adjacency_matrix[j, i] = 1  # 无向图

    # 转换为稀疏矩阵
    adjacency_matrix_sparse = csr_matrix(adjacency_matrix)
    return adjacency_matrix_sparse


# 从稀疏邻接矩阵提取边索引
def adjacency_to_edge_index(adj_matrix):
    coo_matrix = adj_matrix.tocoo()  # 转换为COO格式
    edge_index = torch.tensor(np.vstack((coo_matrix.row, coo_matrix.col)), dtype=torch.long)
    return edge_index

# 加载数据
X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask = load_data_SGER_RAW()
# 计算邻接矩阵
adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, k=5)
# 打印邻接矩阵的形状
print("Adjacency Matrix Shape:", adj_matrix.shape)
# 转换为边索引
edge_index = adjacency_to_edge_index(adj_matrix)
# 打印边索引的形状
print("Edge Index Shape:", edge_index.shape)


Adjacency Matrix Shape: (1000, 1000)
Edge Index Shape: torch.Size([2, 7570])


In [2]:
import itertools
import random
import torch
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 定义 Focal Loss，用于替代 CrossEntropyLoss
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2, alpha=None, reduction="mean"):
        """
        :param gamma: focusing parameter, gamma > 0 调整容易分类样本的权重
        :param alpha: 类别权重，通常可以设置为一个列表或者 tensor
        :param reduction: 损失的计算方式，"mean" 或 "sum"
        """
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha  # alpha 通常用于对不同类别加权
        self.reduction = reduction
        self.ce = torch.nn.CrossEntropyLoss(reduction="none")

    def forward(self, inputs, targets):
        # 计算基本的交叉熵损失
        ce_loss = self.ce(inputs, targets)
        # 计算概率值（注意：torch.exp(-ce_loss) 就是模型给出的对应类别的概率）
        pt = torch.exp(-ce_loss)
        # 根据 alpha 调整不同类别的权重
        if self.alpha is not None:
            if isinstance(self.alpha, (list, np.ndarray)):
                alpha = inputs.new_tensor(self.alpha)
            else:
                alpha = self.alpha
            at = alpha.gather(0, targets.data)
            ce_loss = at * ce_loss
        # Focal Loss 的公式
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss

# GraphSAGE 模型定义（可包含多个图卷积层）
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2):
        super(GraphSAGE, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        
        # 添加更多层
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.fc = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        x = self.fc(x)
        return x

# 主 grid search 函数（这里使用 FocalLoss 替代 CrossEntropyLoss）
def grid_search(X, y, X_train, X_valid, X_test, y_train, y_valid, train_mask, valid_mask, test_mask, n_iter):
    best_acc = 0.0
    best_overall_model_state = None
    best_overall_params = None
    
    # 生成随机的 k 值，从 1 到 10 之间随机选择 10 个不同的整数
    thresholds = random.sample(range(1, 11), 10)
    random_states = list(range(0, 300, 10))
    num_layers_list = [1, 2, 3, 4, 5]
    hidden_channels_list = list(range(20, 300, 10))
    lrs = [0.01, 0.05, 0.001, 0.005]
    gammas = np.arange(0.1, 4, 0.2).tolist()
    alphas = np.arange(0.1, 0.9, 0.1).tolist()
    
    print("Start grid search...")

    # 将所有超参数组合在一起
    param_combinations = list(itertools.product(thresholds, random_states, num_layers_list, 
                                                hidden_channels_list, lrs, gammas, alphas))
    # 打印所有组合（可选）
    print("Total parameter combinations:", len(param_combinations))
    random.shuffle(param_combinations)
    
    for i, (threshold, random_state, num_layers, hidden_channels, lr, gamma, alpha_value) in enumerate(param_combinations[:n_iter]):
        print(f"\nTesting combination {i+1}: threshold={threshold}, random_state={random_state}, "
              f"layers={num_layers}, hidden_channels={hidden_channels}, lr={lr}, gamma={gamma}, alpha={alpha_value}")

        adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, k=threshold)
        edge_index = adjacency_to_edge_index(adj_matrix)

        X_tensor = torch.tensor(X.values, dtype=torch.float)
        y_tensor = torch.tensor(y.values, dtype=torch.long)
        data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                    train_mask=train_mask, val_mask=valid_mask, test_mask=test_mask)

        model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                          out_channels=len(np.unique(y)), num_layers=num_layers).to(device)
        data = data.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        # 利用 grid search 中的 alpha 参数构造类别权重
        # 假设二分类任务中，将类别 1 设为少数类，其权重取 alpha_value，类别 0 的权重为 1 - alpha_value
        alpha_list = [1 - alpha_value, alpha_value]
        alpha_tensor = torch.tensor(alpha_list, dtype=torch.float).to(device)
        print("Alpha weights:", alpha_tensor)
        criterion = FocalLoss(gamma=gamma, alpha=alpha_tensor, reduction="mean")

        best_f1_epoch = 0.0
        best_model_epoch = None

        for epoch in range(100):
            model.train()
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            model.eval()
            with torch.no_grad():
                val_out = model(data)
                val_preds = val_out[data.val_mask].argmax(dim=1)
                val_true = data.y[data.val_mask]
                val_f1 = f1_score(val_true.cpu(), val_preds.cpu(), average="macro")

            if val_f1 > best_f1_epoch:
                best_f1_epoch = val_f1
                best_model_epoch = model.state_dict()

        model.load_state_dict(best_model_epoch)
        model.eval()
        with torch.no_grad():
            test_out = model(data)
            preds = test_out[data.test_mask].argmax(dim=1)
            true_labels = data.y[data.test_mask]
            test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        print(f"Test Accuracy for current combination: {test_acc:.4f}")
        if test_acc > best_acc:
            best_acc = test_acc
            best_overall_model_state = best_model_epoch
            best_overall_params = (threshold, random_state, num_layers, hidden_channels, lr, gamma, alpha_value)
    
    return best_overall_params, best_overall_model_state

def main():
    X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, valid_mask, test_mask = load_data_SGER_RAW(random_state=42)
    
    # 此处 n_iter 可以根据需要调整，示例中仅搜索 3 个组合
    best_params, best_model_state = grid_search(X, y, X_train, X_valid, X_test, 
                                                y_train, y_valid, train_mask, valid_mask, test_mask, n_iter=100)
    print("\nBest Hyperparameters:", best_params)

    # 解包最佳超参数（新增 alpha 参数）
    threshold, random_state, num_layers, hidden_channels, lr, gamma, alpha_value = best_params
    adj_matrix = compute_adjacency_matrix(X_train, X_valid, X_test, k=threshold)
    edge_index = adjacency_to_edge_index(adj_matrix).to(device)

    X_tensor = torch.tensor(X.values, dtype=torch.float).to(device)
    y_tensor = torch.tensor(y.values, dtype=torch.long).to(device)
    data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, 
                train_mask=train_mask.to(device), val_mask=valid_mask.to(device), test_mask=test_mask.to(device))

    model = GraphSAGE(in_channels=X.shape[1], hidden_channels=hidden_channels, 
                      out_channels=len(np.unique(y)), num_layers=num_layers).to(device)
    model.load_state_dict(best_model_state)
    model.eval()

    with torch.no_grad():
        test_out = model(data)
        preds = test_out[data.test_mask].argmax(dim=1)
        true_labels = data.y[data.test_mask]
        
        report = classification_report(true_labels.cpu(), preds.cpu(), target_names=["Class 0", "Class 1"], digits=4)
        test_precision = precision_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_recall = recall_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_f1 = f1_score(true_labels.cpu(), preds.cpu(), average="macro")
        test_acc = accuracy_score(true_labels.cpu(), preds.cpu())
        
        print("\nBest Model Classification Report on Test Set:")
        print(report)
        print("Best Model Test Set Metrics:")
        print(f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
              f"F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")

if __name__ == "__main__":
    main()



Start grid search...
Total parameter combinations: 26880000

Testing combination 1: threshold=10, random_state=50, layers=3, hidden_channels=250, lr=0.01, gamma=3.7000000000000006, alpha=0.8
Alpha weights: tensor([0.2000, 0.8000], device='cuda:0')
Test Accuracy for current combination: 0.6850

Testing combination 2: threshold=1, random_state=90, layers=3, hidden_channels=240, lr=0.01, gamma=2.7000000000000006, alpha=0.1
Alpha weights: tensor([0.9000, 0.1000], device='cuda:0')
Test Accuracy for current combination: 0.7500

Testing combination 3: threshold=7, random_state=180, layers=5, hidden_channels=260, lr=0.005, gamma=0.9000000000000001, alpha=0.5
Alpha weights: tensor([0.5000, 0.5000], device='cuda:0')
Test Accuracy for current combination: 0.7000

Testing combination 4: threshold=7, random_state=190, layers=5, hidden_channels=280, lr=0.05, gamma=2.7000000000000006, alpha=0.2
Alpha weights: tensor([0.8000, 0.2000], device='cuda:0')
Test Accuracy for current combination: 0.7000

Tes