In [1]:
import warnings

warnings.simplefilter("ignore")  # 忽略警告信息
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn.functional as F
import torch.optim
from torch import Tensor
import torch.nn.functional as F
import copy
from torch.utils.data import Dataset, DataLoader

# 导入 rtdl_revisiting_models 包中的模型类
from rtdl_revisiting_models import FTTransformer

warnings.resetwarnings()

# 设置运行设备为GPU（如可用）或CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 设置随机种子确保可复现性
# （delu.random.seed 会同时为 numpy、random、torch 等设置种子）
import delu
delu.random.seed(999)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from dataloader import load_data

Data directory: d:\DSS5104\data


In [2]:
# def prepare_data(dataset: str):
#     """
#     Load and preprocess dataset.
#     Supports 'adult' (Adult Income dataset) and 'california' (California Housing dataset).
#     Returns a tuple: (data_dict, n_cont_features, cat_cardinalities, d_out, task_type)
#     """
#     dataset = dataset.lower()
#     # Load dataset
#     if dataset.startswith("adult"):
#         # Adult dataset (binary classification)
#         data_train,data_test = load_data("adult")
        
#         X_train = data_train.drop(columns=['income'])
#         y_train = data_train['income']
#         X_val = data_test.drop(columns=['income'])
#         y_val = data_test['income']
#         # Convert target to binary 0/1
#         y_train = (y_train == '>50K').astype(int)  # 1 for >50K, 0 for <=50K
#         y_val = (y_val == '>50K').astype(int)  # 1 for >50K, 0 for <=50K
#         task_type = "classification"
        
#     elif dataset.startswith("california"):
#         X_train, X_val, y_train, y_val = load_data("california")
#         # California housing dataset (regression)
#         task_type = "regression"
        
#     elif dataset.startswith("higgs"):
#         # Higgs dataset (binary classification)
#         X_train, X_val, y_train, y_val = load_data("higgs")
#         # Convert target to binary 0/1
#         y_train = (y_train == 1).astype(int)  # 1 for signal, 0 for background
#         y_val = (y_val == 1).astype(int)  # 1 for signal, 0 for background
#         task_type = "classification"
        
#     elif dataset.startswith("churn"):
#         # Churn dataset (binary classification)
#         X_train, X_val, y_train, y_val = load_data("churn")
#         # Convert target to binary 0/1
#         y_train = (y_train == 'Yes').astype(int)
#         y_val = (y_val == 'Yes').astype(int)  # 1 for Yes, 0 for No
#         task_type = "classification"
        
#     elif dataset.startswith("creditcard"):
#         # Credit Card Fraud Detection dataset (binary classification)
#         X_train, X_val, y_train, y_val = load_data("credit")
#         # Convert target to binary 0/1
#         y_train = (y_train == 1).astype(int)
#         y_val = (y_val == 1).astype(int)  # 1 for fraud, 0 for non-fraud
#         task_type = "classification"
        
#     else:
#         raise ValueError(f"Unsupported dataset: {dataset}")
    
#     # Identify categorical and continuous columns
#     cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
#     cont_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    
#     # Label-encode categorical columns
#     cat_cardinalities = []
#     if cat_cols:
#         for col in cat_cols:
#             # Convert to categorical dtype and then to codes
#             X_train[col] = X_train[col].astype('category')
#             X_val[col] = X_val[col].astype('category')
#             # Save cardinality (number of unique categories)
#             cat_cardinalities.append(X_train[col].nunique())
#             # Replace column with codes (0 ... n-1)
#             X_train[col] = X_train[col].cat.codes
#             X_val[col] = X_val[col].cat.codes
#             # # If any -1 codes (from NaN), replace with a new category code at end
#             # if (X[col] < 0).any():
#             #     X[col] = X[col].replace(-1, X[col].nunique())  # treat missing as new category
#             #     cat_cardinalities[-1] += 1
    
#     # All continuous features to float32
#     X_train[cont_cols] = X_train[cont_cols].astype('float32')
#     X_val[cont_cols] = X_val[cont_cols].astype('float32')
#     # Convert target to appropriate type
#     if task_type == "classification":
#         # Classification (target as int64 for PyTorch)
#         y_train = y_train.astype('int64')
#         y_val = y_val.astype('int64')
#         # Determine number of classes for output
#         n_classes = y_train.nunique()
#         d_out = 1 if n_classes == 2 else n_classes
#     else:
#         # Regression (target as float32)
#         y_train = y_train.astype('float32')
#         y_val = y_val.astype('float32')
#         d_out = 1  # one output for regression
    
    
#     # Feature scaling (fit on train, apply to all splits)
#     scaler = StandardScaler().fit(X_train[cont_cols])
#     X_train[cont_cols] = scaler.transform(X_train[cont_cols])
#     X_val[cont_cols]   = scaler.transform(X_val[cont_cols])

    
#     # Convert features and targets to tensors on the chosen device
#     def to_tensor(dataframe):
#         # Separate cont and cat features and convert to torch tensor
#         x_cont_tensor = torch.tensor(dataframe[cont_cols].values, dtype=torch.float32, device=device)
#         if cat_cols:
#             x_cat_tensor = torch.tensor(dataframe[cat_cols].values, dtype=torch.int64, device=device)
#         else:
#             x_cat_tensor = None
#         return x_cont_tensor, x_cat_tensor
    
#     x_cont_train, x_cat_train = to_tensor(X_train)
#     x_cont_val,   x_cat_val   = to_tensor(X_val)

#     y_train_tensor = torch.tensor(y_train.values, device=device)
#     y_val_tensor   = torch.tensor(y_val.values, device=device)

#     # For binary classification, use float targets for BCE loss
#     if task_type == "classification" and d_out == 1:
#         y_train_tensor = y_train_tensor.float()
#         y_val_tensor   = y_val_tensor.float()

    
#     # Package data into dictionaries for convenience
#     data = {
#         "train": {"x_cont": x_cont_train, "x_cat": x_cat_train, "y": y_train_tensor},
#         "val":   {"x_cont": x_cont_val,   "x_cat": x_cat_val,   "y": y_val_tensor},
#     }
#     n_cont_features = len(cont_cols)
#     return data, n_cont_features, cat_cardinalities, d_out, task_type

In [2]:
def build_model(n_cont_features: int, cat_cardinalities: list, d_out: int):
    """
    Build an FT-Transformer model.
    n_cont_features: number of continuous (numeric) features
    cat_cardinalities: list of cardinalities for each categorical feature (empty if none)
    d_out: dimension of model output (e.g. number of classes or 1)
    """
    model = FTTransformer(
        n_cont_features=n_cont_features,
        cat_cardinalities=cat_cardinalities,
        d_out=d_out,
        **FTTransformer.get_default_kwargs()  # use default recommended hyperparameters
    ).to(device)
    return model

In [4]:
# def train_model(model, data: dict, task_type: str, d_out: int,
#                 n_epochs: int = 100, batch_size: int = 256, patience: int = 10, 
#                 lr: float = 3e-4, weight_decay: float = 1e-5):
#     """
#     Train the model using the provided data.
#     Returns a dict with best validation (and corresponding test) metrics and epoch.
#     """
#     # Select appropriate loss function
#     if task_type == "classification":
#         # Binary classification vs multiclass
#         if d_out == 1:
#             loss_fn = F.binary_cross_entropy_with_logits  # expects logits and float targets
#         else:
#             loss_fn = F.cross_entropy                    # expects logits and class indices
#     else:
#         loss_fn = F.mse_loss  # regression
    
#     optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
#     best_val_score = None  # best validation metric (accuracy or negative loss)
#     best_epoch = -1
#     best_state = None  # store best model parameters
    
#     # Determine if we maximize or minimize the val metric
#     maximize_metric = True if task_type == "classification" else False
    
#     patience_counter = 0
#     n_train = data["train"]["y"].shape[0]
    
#     for epoch in range(1, n_epochs+1):
#         model.train()
#         # Shuffle training indices for each epoch
#         indices = torch.randperm(n_train, device=device)
#         total_loss = 0.0
#         total_correct = 0
#         total_samples = 0
        
#         # Mini-batch training
#         for start in range(0, n_train, batch_size):
#             end = start + batch_size
#             batch_idx = indices[start:end]
#             x_cont_batch = data["train"]["x_cont"][batch_idx]
#             y_batch = data["train"]["y"][batch_idx]
#             if data["train"]["x_cat"] is not None:
#                 x_cat_batch = data["train"]["x_cat"][batch_idx]
#                 logits = model(x_cont_batch, x_cat_batch)
#             else:
#                 logits = model(x_cont_batch, None)
#             # For binary/regression, squeeze output to 1D tensor
#             if task_type == "classification":
#                 if d_out == 1:
#                     logits = logits.squeeze(-1)  # shape (batch,)
#             else:
#                 logits = logits.squeeze(-1)      # regression output shape (batch,)
#             loss = loss_fn(logits, y_batch)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
            
#             # Accumulate training loss
#             # Multiply by batch size to aggregate (loss is mean per batch by default for BCE/MSE/XEnt)
#             total_loss += loss.item() * len(batch_idx)
#             total_samples += len(batch_idx)
#             # Accumulate correct predictions for classification
#             if task_type == "classification":
#                 if d_out == 1:
#                     # Binary classification: predict label 0/1 by thresholding logits
#                     preds = (logits > 0).long()
#                     targets = y_batch.long()  # y_batch is float for BCE, convert to long for comparison
#                 else:
#                     # Multiclass: pick class with highest logit
#                     preds = torch.argmax(logits, dim=1)
#                     targets = y_batch
#                 total_correct += (preds == targets).sum().item()
        
#         # Compute average training loss and accuracy
#         avg_train_loss = total_loss / total_samples
#         if task_type == "classification":
#             train_acc = total_correct / total_samples
#         else:
#             train_acc = None  # not applicable for regression
        
#         # Evaluate on validation and test sets
#         model.eval()
#         with torch.no_grad():
#             # Validation set
#             x_cont_val = data["val"]["x_cont"]
#             y_val = data["val"]["y"]
#             if data["val"]["x_cat"] is not None:
#                 x_cat_val = data["val"]["x_cat"]
#                 val_logits = model(x_cont_val, x_cat_val)
#             else:
#                 val_logits = model(x_cont_val, None)
#             if task_type == "classification":
#                 if d_out == 1:
#                     val_logits = val_logits.squeeze(-1)
#                     val_loss = F.binary_cross_entropy_with_logits(val_logits, y_val)
#                     # Convert logits to predicted labels for accuracy
#                     val_preds = (val_logits > 0).long()
#                     val_targets = y_val.long()
#                     val_correct = (val_preds == val_targets).sum().item()
#                     val_acc = val_correct / len(y_val)
#                 else:
#                     # Multiclass
#                     val_loss = F.cross_entropy(val_logits, y_val)
#                     val_preds = torch.argmax(val_logits, dim=1)
#                     val_correct = (val_preds == y_val).sum().item()
#                     val_acc = val_correct / len(y_val)
#             else:
#                 # Regression
#                 val_logits = val_logits.squeeze(-1)
#                 val_loss = F.mse_loss(val_logits, y_val)
#                 val_acc = None  # no accuracy for regression (we will use loss for early stopping)
        
#         # Determine the metric to use for early stopping and best model tracking
#         if task_type == "classification":
#             current_val_score = val_acc
#         else:
#             current_val_score = -val_loss.item()  # use negative loss so that "higher is better" (to simplify logic)
        
#         # Check improvement for early stopping
#         if best_val_score is None or current_val_score > best_val_score:
#             best_val_score = current_val_score
#             best_epoch = epoch
#             # Save model state at best epoch
#             best_state = copy.deepcopy(model.state_dict())
#             patience_counter = 0
#         else:
#             patience_counter += 1
        
#         # Logging epoch results
#         if task_type == "classification":
#             print(f"Epoch {epoch:03d}: "
#                   f"Train Loss = {avg_train_loss:.4f}, Train Acc = {train_acc:.4f}, "
#                   f"Val Loss = {val_loss.item():.4f}, Val Acc = {val_acc:.4f}, ")
#         else:
#             # For regression, report RMSE for interpretability
#             train_rmse = np.sqrt(avg_train_loss)
#             val_rmse = np.sqrt(val_loss.item())
#             print(f"Epoch {epoch:03d}: "
#                   f"Train MSE = {avg_train_loss:.4f} (RMSE={train_rmse:.4f}), "
#                   f"Val MSE = {val_loss.item():.4f} (RMSE={val_rmse:.4f})")
        
#         # Early stopping check
#         if patience_counter >= patience:
#             print(f"Early stopping triggered after epoch {epoch}.")
#             break
        
#         torch.cuda.empty_cache()  # 在每个Epoch结束后调用，释放未使用的显存
    
#     # Restore best model weights (early stopping uses the model at best epoch)
#     if best_state is not None:
#         model.load_state_dict(best_state)
#         torch.save(model.state_dict(), "FT_best_model.pth")
    
#     # Prepare results
#     if task_type == "classification":
#         best_val_acc = best_val_score  # since best_val_score holds actual accuracy
#         result = {
#             "best_epoch": best_epoch,
#             "best_val_acc": float(best_val_acc),
#         }
#         print(f"Training completed. Best epoch = {best_epoch}, Best Val Accuracy = {best_val_acc:.4f}")
#     else:
#         # Convert stored negative loss back to positive MSE
#         best_val_mse = -best_val_score if best_val_score is not None else None
#         result = {
#             "best_epoch": best_epoch,
#             "best_val_mse": float(best_val_mse) if best_val_mse is not None else None,
#         }
#         if best_val_mse is not None:
#             print(f"Training completed. Best epoch = {best_epoch}, Best Val RMSE = {np.sqrt(best_val_mse):.4f}")
#     return result


In [5]:
# # Example usage for Adult dataset (binary classification)
# data, n_cont, cat_cardinalities, d_out, task_type = prepare_data("adult")
# model = build_model(n_cont, cat_cardinalities, d_out)
# results = train_model(model, data, task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
# print("Adult Dataset Results:", results)

In [6]:
# # Example usage for California Housing dataset (regression)
# data, n_cont, cat_cardinalities, d_out, task_type = prepare_data("california")
# model = build_model(n_cont, cat_cardinalities, d_out)
# results = train_model(model, data, task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
# print("California Housing Results:", results)

In [7]:
# # Example usage for Higgs dataset (binary classification)
# data, n_cont, cat_cardinalities, d_out, task_type = prepare_data("higgs")
# model = build_model(n_cont, cat_cardinalities, d_out)
# results = train_model(model, data, task_type, d_out, n_epochs=100, batch_size=32, patience=10, lr=3e-4)
# print("Higgs Results:", results)

In [8]:
# # Example usage for Credit Card Fraud Detection dataset (binary classification)
# data, n_cont, cat_cardinalities, d_out, task_type = prepare_data("creditcard")
# model = build_model(n_cont, cat_cardinalities, d_out)
# results = train_model(model, data, task_type, d_out, n_epochs=100, batch_size=64, patience=10, lr=3e-4)
# print("Credit Card Fraud Detection Results:", results)

In [3]:
class TabularDataset(Dataset):
    def __init__(self, X_df, y, cat_cols, cont_cols, task_type, is_train=False, scaler=None, cat_categories=None):
        """
        初始化Dataset，执行数据预处理（类别编码、数值缩放等）。
        X_df: pandas DataFrame，特征数据
        y: pandas Series或numpy数组，目标数据
        cat_cols: 类别特征列名列表
        cont_cols: 连续特征列名列表
        task_type: "classification" 或 "regression"，任务类型
        is_train: 是否为训练集（训练集会拟合Scaler等）
        scaler: 训练集拟合的StandardScaler（验证集/测试集传入同一个Scaler以保证一致性）
        cat_categories: 可选，训练集每个类别特征的类别列表（用于在验证集上保持类别编码一致）
        """
        self.cat_cols = cat_cols
        self.cont_cols = cont_cols
        self.task_type = task_type
        # 复制一份数据，避免修改原始DataFrame
        X = X_df.copy()
        
        # 类别型特征处理：转换为categorical类型并编码
        self.cat_cardinalities = []      # 保存每个类别特征的基数（unique个数）
        self.cat_categories = {}         # 保存训练集中每个类别特征的类别值列表
        if self.cat_cols:
            for col in self.cat_cols:
                if is_train:
                    # 训练集：将特征转换为categorical并获取类别列表
                    X[col] = X[col].astype('category')
                    self.cat_categories[col] = X[col].cat.categories  # 保存类别值
                    self.cat_cardinalities.append(X[col].nunique())   # 唯一值数量作为类别基数
                else:
                    # 验证/测试集：若提供了训练集的类别列表，则使用它保证编码一致
                    if cat_categories is not None and col in cat_categories:
                        X[col] = pd.Categorical(X[col], categories=cat_categories[col])
                    else:
                        X[col] = X[col].astype('category')
                # 将类别值映射为编码 (0,...,n-1)，缺失或未知类别将被编码为 -1
                X[col] = X[col].cat.codes

        
        # 连续型特征处理：转换类型并标准化
        self.scaler = None
        if self.cont_cols:
            # 确保连续特征为float32类型
            X[self.cont_cols] = X[self.cont_cols].astype('float32')
            if is_train:
                # 拟合StandardScaler并应用于训练数据
                self.scaler = StandardScaler()
                X[self.cont_cols] = self.scaler.fit_transform(X[self.cont_cols])
            else:
                # 使用训练集的Scaler对验证/测试集进行变换
                X[self.cont_cols] = scaler.transform(X[self.cont_cols])
        
        # 保存处理后的特征为Tensor
        if self.cont_cols:
            # 连续特征转换为浮点Tensor
            self.X_cont = torch.tensor(X[self.cont_cols].values, dtype=torch.float32)
        else:
            # 若没有连续特征，则用None占位
            self.X_cont = None
        if self.cat_cols:
            # 类别特征转换为长整型Tensor
            self.X_cat = torch.tensor(X[self.cat_cols].values, dtype=torch.long)
        else:
            self.X_cat = None
        
        # 目标变量处理：根据任务类型选择dtype
        # 对于分类任务，默认使用long张量存储类别（若二分类且使用BCELoss，后续会转换为float）
        # 对于回归任务，使用float张量
        y_array = np.array(y)  # 将Series转换为numpy数组
        if task_type == "classification":
            # 检查y的数据类型，若已经是浮点（表示二分类），则用float32，否则用long
            target_dtype = torch.float32 if str(y_array.dtype).startswith('float') else torch.long
            self.y = torch.tensor(y_array, dtype=target_dtype)
        else:
            self.y = torch.tensor(y_array, dtype=torch.float32)
    
    def __len__(self):
        # 返回数据集样本数量
        return len(self.y)
    
    def __getitem__(self, idx):
        # 根据索引idx返回一个样本的特征和标签
        # 提取连续特征，如果没有连续特征则返回空Tensor
        x_cont = self.X_cont[idx] if self.X_cont is not None else torch.tensor([], dtype=torch.float32)
        # 提取类别特征，如果没有类别特征则返回None
        x_cat = self.X_cat[idx] if self.X_cat is not None else None
        y = self.y[idx]
        return x_cont, x_cat, y

# 定义自定义的collate_fn函数，处理批次数据的合并（特别是处理x_cat可能为None的情况）
def collate_fn(batch):
    """
    将一批(batch)的列表样本合并成一个批次输出。
    batch参数是一个列表，内含若干来自Dataset的__getitem__返回的元组 (x_cont, x_cat, y)。
    该函数将这些样本堆叠成批次张量，并处理None的情况。
    """
    x_cont_list, x_cat_list, y_list = [], [], []
    for (x_cont, x_cat, y) in batch:
        # 将连续特征和标签加入列表
        x_cont_list.append(x_cont)
        y_list.append(y)
        # 将类别特征加入列表（如果存在）
        if x_cat is not None:
            x_cat_list.append(x_cat)
    # 将列表堆叠为tensor
    x_cont_batch = torch.stack(x_cont_list) if len(x_cont_list) > 0 else None
    y_batch = torch.stack(y_list)
    # 类别特征列表可能为空（表示没有类别特征），注意区分处理
    if len(x_cat_list) > 0:
        x_cat_batch = torch.stack(x_cat_list)
    else:
        x_cat_batch = None
    return x_cont_batch, x_cat_batch, y_batch

# 使用Dataset和DataLoader改写数据加载流程
def prepare_data(dataset: str,batch_size: int = 256):   
    dataset = dataset.lower()
    # 根据数据集名称加载数据并进行初步处理（划分训练/验证集，编码标签等）
    if dataset.startswith("adult"):
        # Adult数据集（二分类）
        data_train, data_test = load_data("adult")
        X_train = data_train.drop(columns=['income'])
        y_train = data_train['income']
        X_val = data_test.drop(columns=['income'])
        y_val = data_test['income']
        # 将收入标签转换为0/1（二分类）
        y_train = (y_train == '>50K').astype(int)
        y_val = (y_val == '>50K').astype(int)
        task_type = "classification"
    elif dataset.startswith("california"):
        # 加州房价数据集（回归）
        X_train, X_val, y_train, y_val = load_data("california")
        task_type = "regression"
    elif dataset.startswith("higgs"):
        # Higgs数据集（二分类）
        X_train, X_val, y_train, y_val = load_data("higgs")
        # 将标签转换为0/1（二分类：1表示signal，0表示background）
        y_train = (y_train == 1).astype(int)
        y_val = (y_val == 1).astype(int)
        task_type = "classification"
    elif dataset.startswith("churn"):
        # 用户流失(Churn)数据集（二分类）
        X_train, X_val, y_train, y_val = load_data("churn")
        # 将标签转换为0/1（'Yes'->1表示流失，'No'->0表示未流失）
        y_train = (y_train == 'Yes').astype(int)
        y_val = (y_val == 'Yes').astype(int)
        task_type = "classification"
    elif dataset.startswith("creditcard"):
        # 信用卡欺诈检测数据集（二分类）
        X_train, X_val, y_train, y_val = load_data("credit")
        # 将标签转换为0/1（1表示欺诈，0表示正常）
        y_train = (y_train == 1).astype(int)
        y_val = (y_val == 1).astype(int)
        task_type = "classification"
    else:
        raise ValueError(f"Unsupported dataset: {dataset}")
    
    # 确定类别和连续特征列
    cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
    cont_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    
    # 根据任务类型转换目标数据类型（classification用int64，regression用float32）
    if task_type == "classification":
        y_train = y_train.astype('int64')
        y_val = y_val.astype('int64')
        # 判断是否为二分类任务，以决定输出维度和目标张量类型
        n_classes = pd.Series(y_train).nunique()  # 类别数量
        d_out = 1 if n_classes == 2 else n_classes
        if d_out == 1:
            # 二分类任务将标签转为float32，以便使用BCE损失
            y_train = y_train.astype('float32')
            y_val = y_val.astype('float32')
    else:
        # 回归任务，将标签转为float32
        y_train = y_train.astype('float32')
        y_val = y_val.astype('float32')
        d_out = 1
    
    # 创建训练Dataset（在初始化时会执行特征编码和标准化）
    train_dataset = TabularDataset(X_train, y_train, cat_cols, cont_cols, task_type, is_train=True)
    # 创建验证Dataset，使用训练集的Scaler以保持相同的特征缩放
    val_dataset = TabularDataset(X_val, y_val, cat_cols, cont_cols, task_type, is_train=False,
                                 scaler=train_dataset.scaler, cat_categories=train_dataset.cat_categories)
    
    # 使用DataLoader创建训练和验证集的迭代器
    # 可以根据需要调整batch_size，下面设置一个默认值
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
    # 提取特征维度和类别基数等信息供模型构建使用
    n_cont_features = len(cont_cols)                          # 连续特征数量
    cat_cardinalities = train_dataset.cat_cardinalities       # 每个类别特征的基数列表
    task_type = task_type
    # d_out 已根据任务类型计算
    
    # 返回包含DataLoader的字典，以及特征和任务信息
    data_loaders = {"train": train_loader, "val": val_loader}
    return data_loaders, n_cont_features, cat_cardinalities, d_out, task_type


In [4]:
def train_model(model, train_loader, val_loader, task_type, d_out, 
                n_epochs=100, batch_size=256, patience=10, lr=3e-4, weight_decay=1e-5):
    """
    训练模型，使用提供的训练和验证DataLoader按批次加载数据。
    返回包含最佳验证结果的字典和对应的epoch。
    """
    # 根据任务类型选择合适的损失函数
    if task_type == "classification":
        loss_fn = F.binary_cross_entropy_with_logits if d_out == 1 else F.cross_entropy
    else:
        loss_fn = F.mse_loss  # 回归任务使用均方误差损失
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_score = None
    best_epoch = -1
    best_state = None
    patience_counter = 0
    # 判断验证指标是最大化（分类准确率）还是最小化（回归损失）
    maximize_metric = True if task_type == "classification" else False
    
    for epoch in range(1, n_epochs+1):
        model.train()
        total_loss = 0.0
        total_correct = 0
        total_samples = 0
        # 遍历训练集DataLoader，每次获取一个批次的数据
        for x_cont_batch, x_cat_batch, y_batch in train_loader:
            # 将批次数据移动到计算设备(device)上
            x_cont_batch = x_cont_batch.to(device) if x_cont_batch is not None else None
            x_cat_batch = x_cat_batch.to(device) if x_cat_batch is not None else None
            y_batch = y_batch.to(device)
            # 前向传播：根据是否存在类别特征选择模型输入
            logits = model(x_cont_batch, x_cat_batch) if x_cat_batch is not None else model(x_cont_batch, None)
            # 对于二分类或回归，需要将输出logits变形为一维
            if task_type == "classification" and d_out == 1:
                logits = logits.squeeze(-1)  # 二分类，形状(batch,)
            elif task_type == "regression":
                logits = logits.squeeze(-1)  # 回归，形状(batch,)
            # 计算损失
            # 注：若为二分类，loss_fn=BCELoss，此时y_batch应为float类型
            loss = loss_fn(logits, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # 累积训练损失和样本数（loss默认是batch均值，因此乘以批大小）
            batch_size_actual = y_batch.size(0)
            total_loss += loss.item() * batch_size_actual
            total_samples += batch_size_actual
            # 累积正确预测数量（仅分类任务需要）
            if task_type == "classification":
                if d_out == 1:
                    # 二分类：logits大于0视为预测正类(1)
                    preds = (logits > 0).long()
                    targets = y_batch.long()  # 将目标转换为long以便比较
                else:
                    # 多分类：取最大logit对应的类别
                    preds = torch.argmax(logits, dim=1)
                    targets = y_batch
                total_correct += (preds == targets).sum().item()
        # 计算平均训练损失
        avg_train_loss = total_loss / total_samples
        # 计算训练准确率（分类任务）
        train_acc = total_correct / total_samples if task_type == "classification" else None
        
        # 在验证集上评估
        model.eval()
        val_loss_total = 0.0
        val_correct = 0
        total_val_samples = 0
        with torch.no_grad():
            for x_cont_val, x_cat_val, y_val in val_loader:
                x_cont_val = x_cont_val.to(device) if x_cont_val is not None else None
                x_cat_val = x_cat_val.to(device) if x_cat_val is not None else None
                y_val = y_val.to(device)
                # 前向传播得到验证集预测
                val_logits = model(x_cont_val, x_cat_val) if x_cat_val is not None else model(x_cont_val, None)
                if task_type == "classification" and d_out == 1:
                    val_logits = val_logits.squeeze(-1)
                elif task_type == "regression":
                    val_logits = val_logits.squeeze(-1)
                # 计算验证损失
                val_loss = loss_fn(val_logits, y_val)
                # 累积验证损失和样本数
                batch_val_size = y_val.size(0)
                val_loss_total += val_loss.item() * batch_val_size
                total_val_samples += batch_val_size
                # 累积验证集上的正确预测数（分类任务）
                if task_type == "classification":
                    if d_out == 1:
                        # 二分类准确率计算
                        val_preds = (val_logits > 0).long()
                        val_targets = y_val.long()
                    else:
                        # 多分类准确率计算
                        val_preds = torch.argmax(val_logits, dim=1)
                        val_targets = y_val
                    val_correct += (val_preds == val_targets).sum().item()
        # 计算平均验证损失和准确率
        avg_val_loss = val_loss_total / total_val_samples
        if task_type == "classification":
            val_acc = val_correct / total_val_samples
        else:
            val_acc = None  # 回归任务无需准确率
        
        # 根据任务类型确定当前验证指标（分类使用准确率，回归使用损失的负值）
        current_val_score = val_acc if task_type == "classification" else -avg_val_loss
        # Early Stopping: 检查验证指标是否改进
        if best_val_score is None or (maximize_metric and current_val_score > best_val_score) or (not maximize_metric and current_val_score < best_val_score):
            best_val_score = current_val_score
            best_epoch = epoch
            best_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
        
        # 打印当前轮次的训练和验证结果
        if task_type == "classification":
            print(f"Epoch {epoch:03d}: Train Loss = {avg_train_loss:.4f}, Train Acc = {train_acc:.4f}, "
                  f"Val Loss = {avg_val_loss:.4f}, Val Acc = {val_acc:.4f}")
        else:
            # 对于回归问题，计算RMSE便于解释
            train_rmse = np.sqrt(avg_train_loss)
            val_rmse = np.sqrt(avg_val_loss)
            print(f"Epoch {epoch:03d}: Train MSE = {avg_train_loss:.4f} (RMSE={train_rmse:.4f}), "
                  f"Val MSE = {avg_val_loss:.4f} (RMSE={val_rmse:.4f})")
        
        # 若超过若干轮次未提升，则提前停止训练
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break
    
    # 恢复模型在验证集上最佳的状态
    if best_state is not None:
        model.load_state_dict(best_state)
    # 返回最佳验证指标（以及对应测试集指标）的结果和最佳轮次
    results = {"best_epoch": best_epoch}
    if task_type == "classification":
        results["best_val_acc"] = val_acc if best_val_score == val_acc else (best_val_score if maximize_metric else None)
    else:
        results["best_val_loss"] = avg_val_loss if best_val_score == -avg_val_loss else (-best_val_score if not maximize_metric else None)
    return results

In [11]:
# # churn dataset
# data_loaders, n_cont, cat_cardinalities, d_out, task_type = prepare_data("churn", batch_size=256)
# model = build_model(n_cont, cat_cardinalities, d_out)
# results = train_model(model, data_loaders["train"], data_loaders["val"], task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
# print("Churn Results:", results)

In [12]:
# adult dataset
data_loaders, n_cont, cat_cardinalities, d_out, task_type = prepare_data("adult", batch_size=256)
model = build_model(n_cont, cat_cardinalities, d_out)
results = train_model(model, data_loaders["train"], data_loaders["val"], task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
print("Churn Results:", results)

binary classification
(30162, 15) (30162,)
(15060, 15) (15060,)
Epoch 001: Train Loss = 0.3453, Train Acc = 0.8414, Val Loss = 0.3176, Val Acc = 0.8509
Epoch 002: Train Loss = 0.3178, Train Acc = 0.8525, Val Loss = 0.3184, Val Acc = 0.8563
Epoch 003: Train Loss = 0.3131, Train Acc = 0.8549, Val Loss = 0.3157, Val Acc = 0.8520
Epoch 004: Train Loss = 0.3112, Train Acc = 0.8558, Val Loss = 0.3108, Val Acc = 0.8550
Epoch 005: Train Loss = 0.3079, Train Acc = 0.8567, Val Loss = 0.3131, Val Acc = 0.8539
Epoch 006: Train Loss = 0.3091, Train Acc = 0.8560, Val Loss = 0.3118, Val Acc = 0.8534
Epoch 007: Train Loss = 0.3074, Train Acc = 0.8555, Val Loss = 0.3145, Val Acc = 0.8562
Epoch 008: Train Loss = 0.3040, Train Acc = 0.8583, Val Loss = 0.3140, Val Acc = 0.8552
Epoch 009: Train Loss = 0.3029, Train Acc = 0.8602, Val Loss = 0.3213, Val Acc = 0.8492
Epoch 010: Train Loss = 0.3024, Train Acc = 0.8594, Val Loss = 0.3092, Val Acc = 0.8560
Epoch 011: Train Loss = 0.3006, Train Acc = 0.8613, Val 

In [13]:
# california dataset
data_loaders, n_cont, cat_cardinalities, d_out, task_type = prepare_data("california", batch_size=256)
model = build_model(n_cont, cat_cardinalities, d_out)
results = train_model(model, data_loaders["train"], data_loaders["val"], task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
print("California Results:", results)

regression
(16512, 8) (16512,)
(4128, 8) (4128,)
Epoch 001: Train MSE = 0.7261 (RMSE=0.8521), Val MSE = 0.5238 (RMSE=0.7238)
Epoch 002: Train MSE = 0.3882 (RMSE=0.6231), Val MSE = 0.3421 (RMSE=0.5849)
Epoch 003: Train MSE = 0.3466 (RMSE=0.5888), Val MSE = 0.3519 (RMSE=0.5932)
Epoch 004: Train MSE = 0.3350 (RMSE=0.5788), Val MSE = 0.3578 (RMSE=0.5982)
Epoch 005: Train MSE = 0.3242 (RMSE=0.5694), Val MSE = 0.3163 (RMSE=0.5624)
Epoch 006: Train MSE = 0.3130 (RMSE=0.5595), Val MSE = 0.3107 (RMSE=0.5574)
Epoch 007: Train MSE = 0.3126 (RMSE=0.5591), Val MSE = 0.3279 (RMSE=0.5726)
Epoch 008: Train MSE = 0.3049 (RMSE=0.5522), Val MSE = 0.2981 (RMSE=0.5460)
Epoch 009: Train MSE = 0.2929 (RMSE=0.5412), Val MSE = 0.3057 (RMSE=0.5529)
Epoch 010: Train MSE = 0.2960 (RMSE=0.5441), Val MSE = 0.3004 (RMSE=0.5481)
Epoch 011: Train MSE = 0.2909 (RMSE=0.5394), Val MSE = 0.3110 (RMSE=0.5577)
Early stopping triggered.
California Results: {'best_epoch': 1, 'best_val_loss': 0.5238264870736026}


In [5]:
# higgs dataset
data_loaders, n_cont, cat_cardinalities, d_out, task_type = prepare_data("higgs", batch_size=256)
model = build_model(n_cont, cat_cardinalities, d_out)
results = train_model(model, data_loaders["train"], data_loaders["val"], task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
print("Higgs Results:", results)

binary classification
(880000, 28) (880000,)
(220000, 28) (220000,)
Epoch 001: Train Loss = 0.5427, Train Acc = 0.7179, Val Loss = 0.5266, Val Acc = 0.7300
Epoch 002: Train Loss = 0.5225, Train Acc = 0.7334, Val Loss = 0.5152, Val Acc = 0.7401
Epoch 003: Train Loss = 0.5157, Train Acc = 0.7384, Val Loss = 0.5085, Val Acc = 0.7427
Epoch 004: Train Loss = 0.5094, Train Acc = 0.7429, Val Loss = 0.5032, Val Acc = 0.7480
Epoch 005: Train Loss = 0.5045, Train Acc = 0.7460, Val Loss = 0.4999, Val Acc = 0.7518
Epoch 006: Train Loss = 0.5008, Train Acc = 0.7486, Val Loss = 0.4978, Val Acc = 0.7517
Epoch 007: Train Loss = 0.4980, Train Acc = 0.7504, Val Loss = 0.4948, Val Acc = 0.7524
Epoch 008: Train Loss = 0.4952, Train Acc = 0.7528, Val Loss = 0.4953, Val Acc = 0.7533
Epoch 009: Train Loss = 0.4928, Train Acc = 0.7542, Val Loss = 0.4909, Val Acc = 0.7557
Epoch 010: Train Loss = 0.4910, Train Acc = 0.7554, Val Loss = 0.4915, Val Acc = 0.7550
Epoch 011: Train Loss = 0.4889, Train Acc = 0.7566, 

In [14]:
# creditcard dataset
data_loaders, n_cont, cat_cardinalities, d_out, task_type = prepare_data("creditcard", batch_size=256)
model = build_model(n_cont, cat_cardinalities, d_out)
results = train_model(model, data_loaders["train"], data_loaders["val"], task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
print("Credit Card Results:", results)

binary classification
(227845, 30) (227845,)
(56962, 30) (56962,)
Epoch 001: Train Loss = 0.0066, Train Acc = 0.9992, Val Loss = 0.0030, Val Acc = 0.9995
Epoch 002: Train Loss = 0.0035, Train Acc = 0.9994, Val Loss = 0.0031, Val Acc = 0.9994
Epoch 003: Train Loss = 0.0034, Train Acc = 0.9993, Val Loss = 0.0030, Val Acc = 0.9994
Epoch 004: Train Loss = 0.0033, Train Acc = 0.9994, Val Loss = 0.0043, Val Acc = 0.9990
Epoch 005: Train Loss = 0.0032, Train Acc = 0.9994, Val Loss = 0.0026, Val Acc = 0.9995
Epoch 006: Train Loss = 0.0031, Train Acc = 0.9993, Val Loss = 0.0030, Val Acc = 0.9992
Epoch 007: Train Loss = 0.0031, Train Acc = 0.9994, Val Loss = 0.0028, Val Acc = 0.9994
Epoch 008: Train Loss = 0.0031, Train Acc = 0.9994, Val Loss = 0.0025, Val Acc = 0.9995
Epoch 009: Train Loss = 0.0030, Train Acc = 0.9994, Val Loss = 0.0026, Val Acc = 0.9995
Epoch 010: Train Loss = 0.0029, Train Acc = 0.9994, Val Loss = 0.0024, Val Acc = 0.9995
Epoch 011: Train Loss = 0.0030, Train Acc = 0.9994, Va

In [None]:
# poker dataset
data_loaders, n_cont, cat_cardinalities, d_out, task_type = prepare_data("poker", batch_size=256)
model = build_model(n_cont, cat_cardinalities, d_out)
results = train_model(model, data_loaders["train"], data_loaders["val"], task_type, d_out, n_epochs=100, batch_size=256, patience=10, lr=3e-4)
print("Poker Results:", results)