In [139]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler

def standard_input(X):
    """Standardize every column of ``X`` with a freshly fitted ``StandardScaler``.

    Args:
        X: pandas DataFrame of numeric features.

    Returns:
        A new DataFrame with the same column names and the same row index as
        ``X`` holding the scaled values.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Keep the caller's row labels: dropping them silently de-aligns the frame
    # from any label Series that still carries the original index.
    return pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

def load_data_SGER1000(path='/home/gehongfei/project/TabGNN/dataset/SGER1000.csv'):
    """Load the whitespace-separated SGER1000 file and prepare train/test splits.

    Args:
        path: Location of the data file. Defaults to the path the notebook
            originally hard-coded.

    Returns:
        An 8-tuple ``(X, y, X_train, y_train, X_test, y_test, train_mask,
        test_mask)``:

        * ``X`` / ``y``: all features (standardized) and the ``'kredit'`` column,
          in file row order.
        * ``X_train`` / ``X_test``: standardized split features; the scaler is
          fitted on the training rows only and then applied to the test rows.
        * ``y_train`` / ``y_test``: matching label Series.
        * ``train_mask`` / ``test_mask``: boolean tensors of length ``len(df)``
          marking the row positions that belong to each split.

        If the ``'kredit'`` column is missing, an error is printed and a tuple
        of eight ``None`` values is returned so that callers unpacking eight
        names do not fail with an arity error.
    """
    # Raw string: '\s' is an invalid escape sequence in a normal string literal.
    df = pd.read_csv(path, sep=r'\s+')

    # Make sure the target column exists.
    if 'kredit' not in df.columns:
        print("Error: 'kredit' column not found.")
        return (None,) * 8

    # Target variable and features.
    y = df['kredit']
    X = df.drop(columns=['kredit'])

    # Train / test split (80 / 20, fixed seed).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, random_state=42)

    # One mask entry per row of the file.
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    # Translate row labels into row positions so the masks stay correct even
    # when the frame's index is not 0..n-1.
    train_pos = torch.as_tensor(df.index.get_indexer(X_train.index), dtype=torch.long)
    test_pos = torch.as_tensor(df.index.get_indexer(X_test.index), dtype=torch.long)
    train_mask[train_pos] = True
    test_mask[test_pos] = True

    # Standardize. The split frames share ONE scaler fitted on the training
    # rows; scaling the test rows with their own mean/std would put the two
    # splits into different feature spaces.
    split_scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(split_scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(split_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    # The full feature matrix is scaled on all rows, as before.
    X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)

    return X, y, X_train, y_train, X_test, y_test, train_mask, test_mask



In [140]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix

# Train a Random Forest and derive a sample-to-sample similarity graph from it.
def compute_adjacency_matrix(X_train, X_test, y_train, n_estimators=100, max_depth=None, threshold=0.20, random_state=42):
    """Build a binary adjacency matrix from Random-Forest leaf co-occurrence.

    The forest is fitted on the training data only. Every sample of
    ``concat([X_train, X_test])`` is then dropped through every tree; two
    distinct samples get one "vote" for each tree in which they land in the
    same leaf. Votes are divided by the largest vote count and thresholded.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame) with the same columns.
        y_train: Training labels.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth (``None`` = unlimited).
        threshold: An edge is kept when the normalized vote count is strictly
            greater than this value.
        random_state: Seed passed to the forest.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(n, n)`` with integer 0/1
        entries, where ``n = len(X_train) + len(X_test)``. Row/column ``i``
        refers to the i-th row of ``concat([X_train, X_test])`` -- i.e. training
        rows first, then test rows -- NOT to the original dataset order.
    """
    # Stack train and test rows; node ids are positions in this stacked frame.
    X_combined = pd.concat([X_train, X_test], axis=0)
    num_samples = X_combined.shape[0]

    # Fit on the training rows only.
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    rf.fit(X_train, y_train)

    # Leaf id of every sample in every tree (one column per tree).
    leaf_indices = rf.apply(X_combined)

    # Accumulate co-occurrence counts one tree at a time. The broadcasted
    # equality marks every pair sharing a leaf in that tree, which yields the
    # same counts as looping over the members of each leaf, without the
    # Python-level triple loop.
    adjacency_matrix = np.zeros((num_samples, num_samples))
    for tree_idx in range(leaf_indices.shape[1]):
        tree_leaves = leaf_indices[:, tree_idx]
        adjacency_matrix += tree_leaves[:, None] == tree_leaves[None, :]
    # A sample is never its own neighbour.
    np.fill_diagonal(adjacency_matrix, 0)

    # Normalize by the largest count; skip when there is no co-occurrence at
    # all so we do not divide 0 by 0.
    max_count = adjacency_matrix.max()
    if max_count > 0:
        adjacency_matrix /= max_count

    # Threshold into a 0/1 matrix.
    adjacency_matrix = (adjacency_matrix > threshold).astype(int)

    # Return in sparse form.
    adjacency_matrix_sparse = csr_matrix(adjacency_matrix)

    return adjacency_matrix_sparse



import torch
# Extract an edge index from a sparse adjacency matrix.
def adjacency_to_edge_index(adj_matrix):
    """Return the stored entries of ``adj_matrix`` as a ``2 x E`` long tensor.

    Row 0 holds the row coordinates and row 1 the column coordinates of the
    matrix' COO representation.
    """
    coo = adj_matrix.tocoo()
    endpoints = np.stack([coo.row, coo.col], axis=0)
    return torch.as_tensor(endpoints, dtype=torch.long)


In [141]:
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network that outputs per-node log-probabilities."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # in -> hidden -> out
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Run both convolutions on ``data.x`` / ``data.edge_index``."""
        features = data.x
        edges = data.edge_index
        hidden = self.conv1(features, edges)
        hidden = F.relu(hidden)
        scores = self.conv2(hidden, edges)
        return F.log_softmax(scores, dim=1)


import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class GAT(torch.nn.Module):
    """Two-layer graph attention network that outputs per-node log-probabilities.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output size of the first attention layer.
        out_channels: Output size of the second layer.
        heads: Number of attention heads in the first layer; the second layer
            always uses a single head.
        dropout: Dropout probability applied to the input features and to the
            hidden representation. Defaults to 0.6, the value that was
            previously hard-coded.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=1, dropout=0.6):
        super(GAT, self).__init__()
        self.dropout = dropout
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
        # The second layer takes hidden_channels * heads input features.
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1)

    def forward(self, data):
        """Apply dropout -> GATConv -> ELU -> dropout -> GATConv -> log_softmax."""
        x, edge_index = data.x, data.edge_index
        # Dropout is only active in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)



In [150]:
from sklearn.metrics import classification_report, f1_score

import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import numpy as np

# Load the data: features, labels, the two splits and the boolean node masks.
X, y, X_train, y_train, X_test, y_test, train_mask, test_mask = load_data_SGER1000()
adj_matrix = compute_adjacency_matrix(X_train, X_test, y_train)

num_features = X.shape[1]
num_classes = len(np.unique(y))

# compute_adjacency_matrix numbers its nodes by position in
# concat([X_train, X_test]) (shuffled training rows first, then test rows),
# whereas X, y and the masks are in original row order. y_train / y_test still
# carry the original row labels from the split, so use them to translate the
# edge endpoints back to original row positions. This relies on the same
# label == position convention that the masks use.
combined_to_original = torch.tensor(
    np.concatenate([y_train.index.to_numpy(), y_test.index.to_numpy()]),
    dtype=torch.long,
)
edge_index = combined_to_original[adjacency_to_edge_index(adj_matrix)]

# Convert to tensors.
X_tensor = torch.tensor(X.values, dtype=torch.float)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Build the PyG Data object.
data = Data(x=X_tensor, y=y_tensor, edge_index=edge_index, train_mask=train_mask, test_mask=test_mask)

# Inverse-frequency class weights to counter class imbalance. Only training
# labels are counted so the loss does not depend on test labels; minlength
# keeps one entry per class and the clamp avoids dividing by zero for a class
# that is absent from the training split.
class_counts = np.bincount(y_tensor[train_mask].numpy(), minlength=num_classes)
print(class_counts)
weights = torch.tensor(1.0 / np.maximum(class_counts, 1), dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=weights)

# Define the GNN model.
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network that returns raw per-node class scores."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # in -> hidden -> out
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Run both convolutions on ``data.x`` / ``data.edge_index``."""
        edges = data.edge_index
        hidden = torch.relu(self.conv1(data.x, edges))
        # No softmax here: the unnormalized scores are returned as-is.
        return self.conv2(hidden, edges)

# Instantiate the network (128 hidden units) and an Adam optimizer with L2 weight decay.
model = GraphSAGE(in_channels=num_features, hidden_channels=128, out_channels=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training and evaluation helpers.

def train(model, data, optimizer, criterion, epochs=200):
    """Optimize ``model`` on the nodes selected by ``data.train_mask``.

    Runs ``epochs`` full-batch gradient steps and prints the loss on every
    tenth epoch (starting with epoch 0). Returns nothing.
    """
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()

        predictions = model(data)
        selected = data.train_mask
        loss = criterion(predictions[selected], data.y[selected])

        loss.backward()
        optimizer.step()

        is_logging_epoch = (epoch % 10) == 0
        if is_logging_epoch:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

def test(model, data):
    """Print a classification report for the nodes selected by ``data.test_mask``.

    The model is switched to eval mode and run once without gradient tracking;
    the predicted class of a node is the argmax over its output row.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(data).argmax(dim=1)
        held_out = data.test_mask
        y_pred = predicted[held_out]
        y_true = data.y[held_out]
        print("Classification Report:")
        print(classification_report(y_true.cpu(), y_pred.cpu()))

# Train the model.
train(model, data, optimizer, criterion, epochs=80)

# Evaluate on the test split.
test(model, data)



Training Decision Tree...
Training Logistic Regression...
Training SVM...
Training Random Forest...
Training Naive Bayes...
Training MLP...
Training LDA...

Test Performance:
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       143
           1       0.54      0.44      0.49        57

    accuracy                           0.73       200
   macro avg       0.67      0.65      0.65       200
weighted avg       0.72      0.73      0.73       200


Best Model: MLP based on F1-score


In [149]:
import random

def train_and_evaluate(model, data, optimizer, criterion, epochs=200):
    """Train ``model`` silently, then report and score it on the test split.

    Performs ``epochs`` full-batch steps on the nodes in ``data.train_mask``,
    prints the classification report via ``test`` and returns the macro-averaged
    F1 score over the nodes in ``data.test_mask``.
    """
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        train_nodes = data.train_mask
        loss = criterion(model(data)[train_nodes], data.y[train_nodes])
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        predicted = model(data).argmax(dim=1)
        held_out = data.test_mask
        y_pred = predicted[held_out]
        y_true = data.y[held_out]
        # Print the full report for the test split.
        test(model, data)
        return f1_score(y_true.cpu(), y_pred.cpu(), average='macro')

class GraphSAGEImproved(torch.nn.Module):
    """GraphSAGE with a configurable depth, batch normalization and dropout.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of every hidden layer.
        out_channels: Size of the output per node.
        num_layers: Total number of SAGEConv layers (>= 1). With 1, a single
            ``in_channels -> out_channels`` convolution is built; previously
            ``num_layers=1`` silently produced the same two-layer network as
            ``num_layers=2``.
        dropout: Dropout probability applied after every hidden layer.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout):
        super(GraphSAGEImproved, self).__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()
        self.dropout = dropout

        if num_layers == 1:
            # Single layer: map the inputs straight to the outputs.
            self.convs.append(SAGEConv(in_channels, out_channels))
            return

        # First layer.
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        # Additional hidden layers.
        for _ in range(num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
            self.bns.append(torch.nn.BatchNorm1d(hidden_channels))

        # Output layer (no batch norm / activation after it).
        self.convs.append(SAGEConv(hidden_channels, out_channels))

    def forward(self, data):
        """Return raw per-node scores for ``data.x`` / ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        # Every layer except the last: conv -> batch norm -> ReLU -> dropout.
        for i in range(len(self.convs) - 1):
            x = self.convs[i](x, edge_index)
            x = self.bns[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, edge_index)
        return x

# Random-search hyperparameter optimization.
def random_search(data, num_trials=10, seed=None):
    """Sample hyperparameters at random and keep the best-scoring combination.

    Each trial builds a ``GraphSAGEImproved`` model, trains and scores it with
    ``train_and_evaluate`` and prints the result. The module-level names
    ``num_features``, ``num_classes`` and ``criterion`` must already exist.

    NOTE(review): the score returned by ``train_and_evaluate`` is computed on
    ``data.test_mask``, so the selection is made on the test split; the best F1
    printed here is therefore an optimistic estimate.

    Args:
        data: Graph data object passed on to the model and the training helper.
        num_trials: Number of random configurations to try.
        seed: Optional integer. When given, both the hyperparameter sampling
            and torch's global RNG are seeded so that a run can be repeated.
            When ``None`` (default) the global ``random`` state is used, as
            before.

    Returns:
        Tuple ``(hidden_channels, num_layers, learning_rate, epochs, dropout)``
        of the best trial, or ``None`` when ``num_trials`` is 0.
    """
    param_grid = {
        'hidden_channels': [20, 40, 60, 80, 100, 120, 140, 160, 180, 200],
        'num_layers': [1, 2, 3, 4],
        'learning_rate': [0.01, 0.005, 0.001],
        'epochs': [30, 50, 100, 150],
        'dropout': [0.3, 0.4, 0.5, 0.6]
    }

    # A private generator when seeded; otherwise the shared module-level one.
    rng = random if seed is None else random.Random(seed)
    if seed is not None:
        torch.manual_seed(seed)

    best_f1 = 0
    best_params = None

    for _ in range(num_trials):
        # Draw order is fixed so that a given RNG state yields the same trials.
        hidden_channels = rng.choice(param_grid['hidden_channels'])
        num_layers = rng.choice(param_grid['num_layers'])
        lr = rng.choice(param_grid['learning_rate'])
        epochs = rng.choice(param_grid['epochs'])
        dropout = rng.choice(param_grid['dropout'])
        params = (hidden_channels, num_layers, lr, epochs, dropout)

        model = GraphSAGEImproved(num_features, hidden_channels, num_classes, num_layers, dropout)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

        f1 = train_and_evaluate(model, data, optimizer, criterion, epochs)
        print(f"Params: {params}, F1 Score: {f1:.4f}")

        # Always accept the first trial so a run in which every F1 is 0 still
        # returns a configuration instead of None.
        if best_params is None or f1 > best_f1:
            best_f1 = f1
            best_params = params

    print(f"Best Params: {best_params}, Best F1 Score: {best_f1:.4f}")
    return best_params

# Run the random search.
best_hyperparams = random_search(data)


Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       138
           1       0.55      0.52      0.53        62

    accuracy                           0.72       200
   macro avg       0.67      0.66      0.67       200
weighted avg       0.72      0.72      0.72       200

Params: (128, 3, 0.01, 100, 0.3), F1 Score: 0.6667
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       138
           1       0.54      0.63      0.58        62

    accuracy                           0.72       200
   macro avg       0.68      0.69      0.69       200
weighted avg       0.73      0.72      0.73       200

Params: (64, 3, 0.005, 100, 0.5), F1 Score: 0.6858
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.76      0.77       138
           1       0.51      0.55      0.53        62

    accurac

In [63]:
# Ad-hoc inspection cell: shows the type of X.
type(X)

pandas.core.frame.DataFrame