In [2]:
# 导入数据
# 对比五个简单的分类器
# import mrmr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, confusion_matrix, classification_report

In [3]:
tongji_data = pd.read_csv('./feature/tongji/dl_feature_tongji.csv', index_col=0).iloc[:, 3:]
xiangyang_data = pd.read_csv('./feature/xiangyang/dl_feature_xiangyang.csv', index_col=0).iloc[:, 3:]
kits_data = pd.read_csv('./feature/kits/dl_feature_kits.csv', index_col=0).iloc[:, 3:]
henan_data = pd.read_csv('./feature/henan/dl_feature_henan.csv', index_col=0).iloc[:, 3:]
# index使用最后一个作为索引
tongji_data.index = tongji_data.index.str.split('/').str[-1]
tongji_data.index = tongji_data.index.str.split('.').str[0]
xiangyang_data.index = xiangyang_data.index.str.split('/').str[-1]
xiangyang_data.index = xiangyang_data.index.str.split('.').str[0]
kits_data.index = kits_data.index.str.split('/').str[-1]
kits_data.index = kits_data.index.str.split('.').str[0]
henan_data.index = henan_data.index.str.split('/').str[-1]
henan_data.index = henan_data.index.str.split('.').str[0]

tongji_label = pd.read_csv('./feature/tongji/tongji.csv', index_col=0)
xiangyang_label = pd.read_csv('./feature/xiangyang/xiangyang.csv', index_col=0)
kits_label = pd.read_csv('./feature/kits/kits.csv', index_col=0)
henan_label = pd.read_csv('./feature/henan/henan.csv', index_col=0)

feature_data = pd.concat([tongji_data, xiangyang_data], axis=0)
feature_label = pd.concat([tongji_label, xiangyang_label], axis=0)

# 数据分组
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_data, feature_label, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(478, 4096) (206, 4096) (478, 1) (206, 1)


In [8]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import matplotlib.pyplot as plt

def set_seed(seed=42):
    '''Set all random seeds for reproducibility'''
    random.seed(seed)  # Python的random模块
    np.random.seed(seed)  # Numpy模块
    torch.manual_seed(seed)  # PyTorch CPU
    torch.cuda.manual_seed(seed)  # PyTorch GPU
    torch.cuda.manual_seed_all(seed)  # PyTorch GPU (多GPU)
    os.environ['PYTHONHASHSEED'] = str(seed)  # Python hash种子
    torch.backends.cudnn.deterministic = True  # 确保每次返回的卷积算法是确定的
    torch.backends.cudnn.benchmark = False  # True的话会自动寻找最适合当前配置的高效算法，来达到优化运行效率的问题。False可以保证实验结果的可重现性。

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.75, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        
    def forward(self, inputs, targets):
        # 确保输入形状正确
        # inputs = inputs.squeeze()
        targets = targets.float()
        # print(inputs.shape, targets.shape)
        
        # 使用 BCEWithLogitsLoss 代替 CrossEntropy
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * bce_loss
        return focal_loss.mean()

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class DeepClassifier(nn.Module):
    def __init__(self, input_dim=4096, dropout_rate=0.5):
        super().__init__()
        self.network = nn.Sequential(
            # 第一层：4096 -> 2048
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.BatchNorm1d(2048),
            nn.Dropout(dropout_rate),
            
            # 第二层：2048 -> 1024
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(dropout_rate),
            
            # 第三层：1024 -> 256
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(dropout_rate),
            
            # 第四层：256 -> 64
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(dropout_rate),
            
            # 输出层：64 -> 1
            nn.Linear(64, 1)  # 二分类问题，使用一个输出节点
        )
    
    def forward(self, x):
        return self.network(x)

def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        predictions.extend((torch.sigmoid(outputs) > 0.5).float().cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
    
    epoch_loss = total_loss / len(train_loader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary', zero_division=0)
    
    return epoch_loss, accuracy, precision, recall, f1

@torch.no_grad()
def validate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    probabilities = []
    
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        total_loss += loss.item()
        probs = torch.sigmoid(outputs)
        predictions.extend((probs > 0.5).float().cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        probabilities.extend(probs.cpu().numpy())
    
    val_loss = total_loss / len(val_loader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary', zero_division=0)
    try:
        auc = roc_auc_score(true_labels, probabilities)
    except:
        auc = 0.0
    
    return val_loss, accuracy, precision, recall, f1, auc

def train_model(X_train, X_test, y_train, y_test, config):
    # 数据预处理
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # 创建数据加载器
    train_dataset = CustomDataset(X_train_scaled, y_train.values)
    test_dataset = CustomDataset(X_test_scaled, y_test.values)
    
    train_loader = DataLoader(
        train_dataset, 
        batch_size=config['batch_size'], 
        shuffle=True
    )
    test_loader = DataLoader(
        test_dataset, 
        batch_size=config['batch_size']
    )
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # 初始化模型（添加input_dim参数）
    model = DeepClassifier(
        input_dim=X_train.shape[1],
        dropout_rate=config['dropout_rate']
    ).to(device)
    
    criterion = FocalLoss(alpha=config['focal_alpha'], gamma=config['focal_gamma'])
    optimizer = optim.Adam(
        model.parameters(), 
        lr=config['learning_rate'], 
        weight_decay=config['weight_decay']
    )
    
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    #     optimizer, 
    #     mode='min', 
    #     factor=0.5, 
    #     patience=20, 
    #     verbose=True
    # )
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, 
        T_max=config['epochs'], 
        eta_min=0.0001, 
        verbose=True
    )
    
    history = {
        'train_loss': [], 'train_acc': [], 
        'val_loss': [], 'val_acc': [],
        'val_auc': []
    }
    
    best_val_auc = 0
    best_model_state = None
    
    for epoch in range(config['epochs']):
        train_loss, train_acc, train_prec, train_recall, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        
        val_loss, val_acc, val_prec, val_recall, val_f1, val_auc = validate(
            model, test_loader, criterion, device
        )
        
        # scheduler.step(val_loss)
        scheduler.step()
        
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_model_state = model.state_dict().copy()
            
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_auc'].append(val_auc)
        
        print(f"Epoch {epoch+1}/{config['epochs']}")
        print(f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, AUC: {val_auc:.4f}")
        print(f"Val Precision: {val_prec:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")
        print("-"*50)
    
    model.load_state_dict(best_model_state)
    return model, history

# plot_training_history 函数保持不变

if __name__ == "__main__":
    set_seed(42)
    config = {
        'batch_size': 32,
        'learning_rate': 0.005,
        'weight_decay': 1e-4,
        'dropout_rate': 0.4,
        'epochs': 100,
        'focal_alpha': 0.7,
        'focal_gamma': 2
    }
    model, history = train_model(X_train, X_test, y_train, y_test, config)
    print(max(history['val_auc']))

Adjusting learning rate of group 0 to 5.0000e-03.
Adjusting learning rate of group 0 to 4.9988e-03.
Epoch 1/100
Train Loss: 0.1574, Acc: 0.5607
Val Loss: 0.1057, Acc: 0.7233, AUC: 0.3867
Val Precision: 0.0250, Recall: 0.0526, F1: 0.0339
--------------------------------------------------
Adjusting learning rate of group 0 to 4.9952e-03.
Epoch 2/100
Train Loss: 0.1051, Acc: 0.6862
Val Loss: 0.0877, Acc: 0.8252, AUC: 0.5356
Val Precision: 0.1304, Recall: 0.1579, F1: 0.1429
--------------------------------------------------
Adjusting learning rate of group 0 to 4.9891e-03.
Epoch 3/100
Train Loss: 0.0859, Acc: 0.8013
Val Loss: 0.0709, Acc: 0.9078, AUC: 0.4444
Val Precision: 0.0000, Recall: 0.0000, F1: 0.0000
--------------------------------------------------
Adjusting learning rate of group 0 to 4.9807e-03.
Epoch 4/100
Train Loss: 0.0715, Acc: 0.8598
Val Loss: 0.0644, Acc: 0.9078, AUC: 0.5916
Val Precision: 0.0000, Recall: 0.0000, F1: 0.0000
-------------------------------------------------