In [13]:
### util function
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import torch
from sklearn.semi_supervised.tests.test_self_training import X_test
from tqdm import tqdm


# Standardize the input data
def standard_input(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # 保留原有索引，便于后续处理
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    return X_scaled_df


def load_data_DEF(random_state=42):
    """
    从 CSV 文件中加载数据，并划分训练、验证、测试集，同时构造节点 mask
    """
    # CSV 文件路径（请根据实际情况修改）
    path = '/home/gehongfei/project/TabGNN/dataset/DEF.csv'
    df = pd.read_csv(path, sep=',')
    
    target_col = 'label'
    if target_col not in df.columns:
        print(f"Error: '{target_col}' column not found in the dataset.")
        return None, None, None, None, None, None, None, None, None, None, None
    
    y = df[target_col]
    if "ID" in df.columns:
        X = df.drop(columns=["ID", target_col])
    else:
        X = df.drop(columns=[target_col])
    
    # 划分训练、验证和测试集（采用 stratify 保证标签分布均衡）
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=2/3, random_state=random_state, stratify=y_temp
    )
    
    # 创建节点 mask（假设每一行数据代表图中的一个节点）
    num_nodes = len(df)
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask   = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask  = torch.zeros(num_nodes, dtype=torch.bool)
    
    train_mask[X_train.index] = True
    val_mask[X_valid.index]   = True
    test_mask[X_test.index]   = True
    
    # 标准化数据
    X = standard_input(X)
    X_train = standard_input(X_train)
    X_valid = standard_input(X_valid)
    X_test  = standard_input(X_test)
    
    return X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, train_mask, val_mask, test_mask

In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import itertools, random
from sklearn.metrics import classification_report, f1_score

# -------------------- Focal Loss 实现 --------------------
class FocalLoss(nn.Module):
    """
    Focal Loss 实现：
      focal_loss = alpha * (1 - p_t)^gamma * CE_loss
    """
    def __init__(self, alpha=1.0, gamma=2.0, reduction="mean"):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # 计算每个样本的 CrossEntropyLoss，不做 reduction
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce_loss)  # p_t
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss

# -------------------- 灵活的 MLP 模型 --------------------
class FlexibleMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, activation, dropout_rate, num_classes):
        """
        构造一个多层 MLP：
          - 第一层：输入 -> hidden_dim
          - 后续 (num_layers - 1) 层：hidden_dim -> hidden_dim
          - 输出层：hidden_dim -> num_classes
        每层均采用 BatchNorm、指定的激活函数和 Dropout。
        """
        super(FlexibleMLP, self).__init__()
        layers = []
        # 第一层
        layers.append(nn.Linear(input_dim, hidden_dim))
        layers.append(nn.BatchNorm1d(hidden_dim))
        layers.append(activation())
        layers.append(nn.Dropout(dropout_rate))
        # 如果要求多层隐藏层，则叠加若干层
        for i in range(num_layers - 1):
            layers.append(nn.Linear(hidden_dim, hidden_dim))
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(activation())
            layers.append(nn.Dropout(dropout_rate))
        # 输出层
        layers.append(nn.Linear(hidden_dim, num_classes))
        self.net = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.net(x)

# -------------------- 网格搜索函数 --------------------
def grid_search_mlp():
    # -------------------- 数据加载与转换 --------------------
    X, y, X_train, X_valid, X_test, y_train, y_valid, y_test, _, _, _ = load_data_DEF(random_state=42)
    
    # 转换为 torch.tensor
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32)
    y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long)
    X_test_tensor  = torch.tensor(X_test.values,  dtype=torch.float32)
    y_test_tensor  = torch.tensor(y_test.values,  dtype=torch.long)
    
    # 设备设置
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    X_train_tensor, y_train_tensor = X_train_tensor.to(device), y_train_tensor.to(device)
    X_valid_tensor, y_valid_tensor = X_valid_tensor.to(device), y_valid_tensor.to(device)
    X_test_tensor, y_test_tensor   = X_test_tensor.to(device), y_test_tensor.to(device)
    
    input_dim = X_train_tensor.shape[1]
    num_classes = len(np.unique(y_train))
    
    # -------------------- 定义超参数搜索空间 --------------------
    # 注意：对于 focal_alpha 和 focal_gamma，只有当 loss 选择 "focal" 时才会生效
    param_grid = {
        "num_layers": [1, 2],
        "hidden_dim": [128, 256],
        "learning_rate": [0.001, 0.0005],
        "activation": [nn.ReLU, nn.LeakyReLU],
        "loss": ["crossentropy", "focal"],
        "focal_alpha": [0.25, 0.5],
        "focal_gamma": [1, 2],
        "dropout_rate": [0.3, 0.5]
    }
    
    # 生成所有组合
    all_combinations = list(itertools.product(*param_grid.values()))
    # 为节约时间，这里随机选取部分组合进行试验（也可遍历所有组合）
    n_trials = min(5, len(all_combinations))
    sampled_combinations = random.sample(all_combinations, n_trials)
    
    best_val_f1 = 0.0
    best_params = None
    best_model_state = None
    num_epochs = 100  # 可根据需要调整训练轮数

    # -------------------- 开始搜索 --------------------
    for trial_idx, comb in enumerate(sampled_combinations):
        # 解包超参数组合
        num_layers, hidden_dim, learning_rate, activation_func, loss_type, focal_alpha, focal_gamma, dropout_rate = comb
        print(f"================ Trial {trial_idx+1}/{n_trials} ================")
        print(f"Params: num_layers={num_layers}, hidden_dim={hidden_dim}, "
              f"learning_rate={learning_rate}, activation={activation_func.__name__}, "
              f"loss={loss_type}, focal_alpha={focal_alpha}, focal_gamma={focal_gamma}, "
              f"dropout_rate={dropout_rate}")
        
        # 构建模型（每个 trial 都重新初始化模型）
        model = FlexibleMLP(input_dim, hidden_dim, num_layers, activation_func, dropout_rate, num_classes)
        model.to(device)
        
        # 选择损失函数
        if loss_type == "focal":
            criterion = FocalLoss(alpha=focal_alpha, gamma=focal_gamma)
        else:
            criterion = nn.CrossEntropyLoss()
        
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        
        best_trial_f1 = 0.0
        best_trial_state = None
        
        # 训练过程
        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()
            
            # 验证评估
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_valid_tensor)
                val_loss = criterion(val_outputs, y_valid_tensor)
                val_pred = torch.argmax(val_outputs, dim=1)
                val_f1 = f1_score(y_valid_tensor.cpu().numpy(), val_pred.cpu().numpy(), average="macro")
            
            # 记录该 trial 中最优的模型状态
            if val_f1 > best_trial_f1:
                best_trial_f1 = val_f1
                best_trial_state = model.state_dict()
            
            # 每 10 个 epoch 打印一次
            if (epoch+1) % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}]  Val Loss: {val_loss.item():.4f}, Val F1: {val_f1:.4f}")
        
        print(f"--> Best Val F1 for Trial {trial_idx+1}: {best_trial_f1:.4f}\n")
        
        # 若当前 trial 的最优效果优于之前的最优效果，则更新全局最优
        if best_trial_f1 > best_val_f1:
            best_val_f1 = best_trial_f1
            best_params = {
                "num_layers": num_layers,
                "hidden_dim": hidden_dim,
                "learning_rate": learning_rate,
                "activation": activation_func.__name__,
                "loss": loss_type,
                "focal_alpha": focal_alpha,
                "focal_gamma": focal_gamma,
                "dropout_rate": dropout_rate
            }
            best_model_state = best_trial_state
    
    print("#######################")
    print("Best Hyperparameters Found:")
    print(best_params)
    print(f"Best Validation F1: {best_val_f1:.4f}")
    
    # -------------------- 用最优模型在测试集上评估 --------------------
    # 重新构建模型（注意激活函数名称需要转回类，可以用 getattr(nn, ...)）
    best_activation_class = getattr(nn, best_params["activation"])
    model = FlexibleMLP(input_dim, best_params["hidden_dim"], best_params["num_layers"],
                        best_activation_class, best_params["dropout_rate"], num_classes)
    model.to(device)
    model.load_state_dict(best_model_state)
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_pred = torch.argmax(test_outputs, dim=1)
    test_pred_np = test_pred.cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()
    
    print("\nTest Classification Report (Best Model):")
    print(classification_report(y_test_np, test_pred_np))

if __name__ == "__main__":
    grid_search_mlp()


Params: num_layers=2, hidden_dim=256, learning_rate=0.0005, activation=LeakyReLU, loss=crossentropy, focal_alpha=0.5, focal_gamma=2, dropout_rate=0.3
Epoch [10/100]  Val Loss: 0.6267, Val F1: 0.6507
Epoch [20/100]  Val Loss: 0.5176, Val F1: 0.6855
Epoch [30/100]  Val Loss: 0.4676, Val F1: 0.6809
Epoch [40/100]  Val Loss: 0.4545, Val F1: 0.6763
Epoch [50/100]  Val Loss: 0.4481, Val F1: 0.6749
Epoch [60/100]  Val Loss: 0.4454, Val F1: 0.6762
Epoch [70/100]  Val Loss: 0.4450, Val F1: 0.6819
Epoch [80/100]  Val Loss: 0.4437, Val F1: 0.6825
Epoch [90/100]  Val Loss: 0.4437, Val F1: 0.6861
Epoch [100/100]  Val Loss: 0.4431, Val F1: 0.6856
--> Best Val F1 for Trial 1: 0.6864

Params: num_layers=1, hidden_dim=128, learning_rate=0.001, activation=LeakyReLU, loss=crossentropy, focal_alpha=0.25, focal_gamma=2, dropout_rate=0.5
Epoch [10/100]  Val Loss: 0.6039, Val F1: 0.6093
Epoch [20/100]  Val Loss: 0.5201, Val F1: 0.5541
Epoch [30/100]  Val Loss: 0.5080, Val F1: 0.5874
Epoch [40/100]  Val Loss:

In [None]:
### 可能的采样优化 交叉

In [10]:
X_test.shape

(6000, 23)