## 1. Data cleaning

- Switch to the new data file, the updated and cleaned albumin dataset (categorical codes have been mapped to descriptive labels)

In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# ----------------------------------------------------------------------
# 1. Load the cleaned, wide-format poisoning dataset
# ----------------------------------------------------------------------
df1 = pd.read_csv(
    '/home/mailiyi/Poisoning_Prediction/all_poisoning_data_wide_clean_albumin_20251106.csv'
)

# ----------------------------------------------------------------------
# 2. Feature definitions
# ----------------------------------------------------------------------
# Continuous predictors (features containing "increasing" were removed).
x_features_continuous = [
    'Age',
    'Length of Stay',
    'Weight',
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Respiratory Rate',
    'Heart Rate',
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hemoglobin Concentration',
    'Mean Corpuscular Volume',
    'Mean Corpuscular Hemoglobin',
    'Mean Corpuscular Hemoglobin Concentration',
    'Platelet Count',
    'Mean Platelet Volume',
    'Alanine Aminotransferase (ALT)',
    'Total Bilirubin',
    'Direct Bilirubin',
    'Lactate Dehydrogenase (LDH)',
    'Urea',
    'Serum Creatinine',
    'Uric Acid',
    'Creatine Kinase (CK)',
    'Creatine Kinase-MB Isoenzyme',
    'Troponin I',
    'High-Sensitivity C-Reactive Protein (hs-CRP)',
    'Homocysteine',
    'Potassium',
    'Sodium',
    'Chloride',
    'Carbon Dioxide',
    'Prothrombin Time',
    'D-Dimer',
    'Lactate',
    'Blood Cholinesterase Test Results',
    'Albumin (First Measurement)',
    'Albumin (Last Measurement)',
    'Number of Hemoperfusion Sessions',
    'Number of Blood Purification Sessions',
    'Hyperbaric Oxygen Therapy Duration and Frequency',
    'Atropine Dosage',
    'Long-acting Nitroglycerin Dosage',
    'Pralidoxime Dosage',
]

# Categorical predictors (one-hot encoded later in the notebook).
x_features_categorical = [
    'Gender',
    'Education Level',
    'Type of Poisoning',
    'Hypertension',
    'Hyperlipidemia',
    'Diabetes Mellitus',
    'Cerebrovascular Disease',
    'Heart Disease',
    'Allergy History',
    'Cancer',
    'Poisoning',
    'degree of poisoning',
    'Smoking Status',
    'Alcohol Consumption Status',
    'Shortness of Breath',
    'Chest Pain',
    'Cough',
    'Pre-syncope',
    'Altered Consciousness or Syncope',
    'Sore Throat',
    'Fever',
    'Fatigue',
    'Lower Limb Edema',
    'Palpitations',
    'Vomiting',
    'Nausea',
    'Weakness',
    'Headache',
    'Residence',
]

# Prediction target y: whether the patient died.
y_column = 'Outcome_other'

# ----------------------------------------------------------------------
# 3. Shuffle the rows once (fixed seed) and renumber the index from 0
# ----------------------------------------------------------------------
df2 = (
    df1
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Class counts (NaN included) for the two outcome columns
for heading, outcome_col in (
    ("\n Outcome_other distribution (death or not)", "Outcome_other"),
    ("\n Outcome distribution (not cured):", "Outcome"),
):
    print(heading)
    print(df2[outcome_col].value_counts(dropna=False))

- Remove variables with a missing rate > 90%

In [3]:
list(df2.columns)

['Hospital ID',
 'Gender',
 'Education Level',
 'Type of Poisoning',
 'Hypertension',
 'Hyperlipidemia',
 'Diabetes Mellitus',
 'Cerebrovascular Disease',
 'Heart Disease',
 'Allergy History',
 'Cancer',
 'Poisoning',
 'degree of poisoning',
 'Smoking Status',
 'Alcohol Consumption Status',
 'Shortness of Breath',
 'Chest Pain',
 'Cough',
 'Pre-syncope',
 'Altered Consciousness or Syncope',
 'Sore Throat',
 'Fever',
 'Fatigue',
 'Lower Limb Edema',
 'Palpitations',
 'Vomiting',
 'Nausea',
 'Weakness',
 'Headache',
 'Residence',
 'Age',
 'Length of Stay',
 'Weight',
 'Systolic Blood Pressure',
 'Diastolic Blood Pressure',
 'Respiratory Rate',
 'Heart Rate',
 'White Blood Cell Count',
 'Red Blood Cell Count',
 'Hemoglobin Concentration',
 'Mean Corpuscular Volume',
 'Mean Corpuscular Hemoglobin',
 'Mean Corpuscular Hemoglobin Concentration',
 'Platelet Count',
 'Mean Platelet Volume',
 'Alanine Aminotransferase (ALT)',
 'Total Bilirubin',
 'Direct Bilirubin',
 'Lactate Dehydrogenase (LDH

In [None]:
## Missing ratio of every candidate feature (continuous AND categorical)

# Fraction of missing rows per column
candidate_features = x_features_continuous + x_features_categorical
missing_ratios = df2[candidate_features].isna().mean()

# Same numbers as percentages, rounded to 2 decimals, highest first
missing_pct = missing_ratios * 100
missing_summary = missing_pct.round(2).sort_values(ascending=False)

# Report
print("Variable missing ratio (%):")
print(missing_summary)

In [None]:
# Select the features whose missing rate exceeds 90%.
# `missing_ratios` covers both the continuous and the categorical feature
# lists, so the result may contain either kind of variable.
high_missing_features = missing_ratios[missing_ratios > 0.90].index.tolist()

# Report the selected features. The label no longer claims they are all
# continuous, because the selection above is not restricted to that list.
print("Variables with missing rate > 90%:")
for feat in high_missing_features:
    print(f"{feat}: {missing_ratios[feat]*100:.2f}%")

In [None]:
# Remove the high-missing features from the feature lists and from df2.
print(len(x_features_continuous))
print(len(high_missing_features))

high_missing_lookup = set(high_missing_features)
x_features_continuous = [feat for feat in x_features_continuous if feat not in high_missing_lookup]
# Prune the categorical list as well: `high_missing_features` is selected over
# both lists, and the drop below removes every selected column from df2. A
# categorical name left in the list would raise KeyError when df2 is later
# indexed with the full feature list.
x_features_categorical = [feat for feat in x_features_categorical if feat not in high_missing_lookup]
print(len(x_features_continuous))

# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone, and a plain drop would raise KeyError.
df2 = df2.drop(columns=high_missing_features, errors="ignore")
print(df2.shape)

43
11
32
(971, 95)


In [None]:
# ======================
# 4. Median imputation of the continuous features
# ======================
# NOTE(review): the medians here and the scaler statistics below are computed
# over ALL rows of df2, i.e. before the cross-validation splits made in the
# modelling cells. Held-out folds therefore influence the preprocessing.
continuous_values = df2[x_features_continuous]
median_values = continuous_values.median()
df2[x_features_continuous] = continuous_values.fillna(median_values)

# Sanity check: list any continuous feature that still has missing values
missing_summary = df2[x_features_continuous].isna().sum()
print("Continuous features still containing missing values:")
print(missing_summary[missing_summary > 0])

# ======================
# 5. Standardize each continuous column (zero mean, unit variance)
# ======================
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df2[x_features_continuous])
df2[x_features_continuous] = scaled_values

# Confirmation printout for the first 10 features. pandas' .std() uses the
# sample estimator (ddof=1) while StandardScaler divides by the population
# standard deviation, so the printed std can sit slightly above 1.
standardized = df2[x_features_continuous]
check_means = standardized.mean().round(3)
check_stds = standardized.std().round(3)
print("\nMeans and standard deviations of some continuous features after standardization (should be close to 0 and 1):")
print(pd.DataFrame({'mean': check_means.head(10), 'std': check_stds.head(10)}))

- One-hot encoding: missing values in the categorical variables are filled first (with "Unknown")

In [None]:
## Give missing categorical values their own explicit level, "Unknown".
## Names that are no longer columns of df2 are skipped.
categorical_present = [name for name in x_features_categorical if name in df2.columns]
for name in categorical_present:
    df2[name] = df2[name].fillna('Unknown')

# ======================
# 6. One-Hot Encoding
# ======================
x_columns = x_features_categorical + x_features_continuous
datax = df2[x_columns]
datay = df2[y_column]

# Every level of every categorical feature becomes its own column
# (drop_first=False); the boolean indicators are then cast to float
# (True -> 1.0, False -> 0.0).
datax_encoded = pd.get_dummies(
    datax,
    columns=x_features_categorical,
    drop_first=False,
).astype(float)

n_samples, n_encoded_features = datax_encoded.shape
print(f"\nOriginal number of features: {len(x_columns)}")
print(f"Number of features after One-Hot Encoding: {n_encoded_features}")
print(f"Number of samples: {n_samples}")

In [None]:
# ======================
# 7. Convert features and labels to tensors
# ======================
# Features: float32 matrix with one row per sample
X_tensor = torch.tensor(datax_encoded.to_numpy(), dtype=torch.float32)

# Labels: float32 column vector (a trailing dimension of size 1 is added)
y_flat = torch.tensor(datay.to_numpy(), dtype=torch.float32)
y_tensor = torch.unsqueeze(y_flat, 1)

print(f"\nTensor form：X={X_tensor.shape}, y={y_tensor.shape}")


Tensor 形状：X=torch.Size([971, 107]), y=torch.Size([971, 1])


## 2.1 Model building (5-fold cross-validation: 80% training, 20% test per fold)

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random

# =============== 0. Reproducibility: seed every random number generator ===============
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every generator.

    Prints a one-line confirmation containing the seed.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, all CUDA devices
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic algorithms on, autotuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f"Random seed fixed as {seed}")

set_seed(42)

# =============== 1. Output directory (created if missing) ===============
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death/"
os.makedirs(save_path, exist_ok=True)

# =============== 2. Device: CUDA first, then Apple MPS, else CPU ===============
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# =============== 3. Features / labels as float32 tensors ===============
X_tensor = torch.tensor(datax_encoded.to_numpy(), dtype=torch.float32)
y_tensor = torch.unsqueeze(torch.tensor(datay.to_numpy(), dtype=torch.float32), 1)
# =============== 4. DNN model ===============
class DNN(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Layer stack: input -> 256 -> 128 -> 64 -> 1. The first two hidden layers
    use BatchNorm, ReLU and Dropout(0.4); the third uses ReLU only. There is
    deliberately no final Sigmoid: the notebook trains with BCEWithLogitsLoss,
    which applies the sigmoid internally, and calls torch.sigmoid explicitly
    when probabilities are needed.
    """

    def __init__(self, input_dim):
        """Build the layer stack for `input_dim` input features."""
        super().__init__()
        width_1, width_2, width_3 = 256, 128, 64
        drop_prob = 0.4
        stack = [
            nn.Linear(input_dim, width_1),
            nn.BatchNorm1d(width_1),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width_1, width_2),
            nn.BatchNorm1d(width_2),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width_2, width_3),
            nn.ReLU(),
            nn.Linear(width_3, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw logits for the batch `x`."""
        logits = self.net(x)
        return logits

# =============== 5. Five-fold cross-validation ===============
# Shuffled, non-stratified K-fold over the row positions of X_tensor.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1  # 1-based fold counter used in log lines and output file names
auroc_list, auprc_list = [], []  # one final AUROC / AUPRC value per fold
all_results = []  # per-fold prediction frames, concatenated after the loop

for train_index, test_index in kf.split(X_tensor):
    print(f"\n===== Fold {fold} =====")

    # Re-seed with a fold-specific value before the model and loader are created
    set_seed(42 + fold)

    # Slice this fold's training and held-out tensors
    X_train, X_test = X_tensor[train_index], X_tensor[test_index]
    y_train, y_test = y_tensor[train_index], y_tensor[test_index]

    # Positive-class weight = (#negatives / #positives) in the training split.
    # Raises ZeroDivisionError if the training split contains no positives.
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / num_pos, dtype=torch.float32).to(device)
    print(f"Fold {fold}: pos_weight = {pos_weight:.2f}")

    def worker_init_fn(worker_id):
        """Seed NumPy and `random` inside a DataLoader worker process."""
        np.random.seed(42 + worker_id)
        random.seed(42 + worker_id)

    # Mini-batches of 16, reshuffled every epoch. With num_workers=0 no worker
    # processes are started, so worker_init_fn is not invoked.
    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=16,
        shuffle=True,
        num_workers=0,  # load data in the main process
        worker_init_fn=worker_init_fn
    )

    # Fresh model per fold; class-weighted BCE computed on raw logits
    model = DNN(input_dim=X_tensor.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    # Adam with L2 regularization (weight decay)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    # =============== 6. Early Stopping settings ===============
    patience = 8  # epochs without AUROC improvement tolerated before stopping
    best_auroc = 0  # best AUROC seen so far in this fold
    wait = 0  # consecutive epochs without improvement
    best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

    # =============== 7. Model training ===============
    max_epochs = 100
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)  # mean of the per-batch losses

        # === AUROC on the held-out fold at the end of each epoch ===
        # NOTE(review): X_test / y_test drive early stopping and checkpoint
        # selection here, and the same fold is scored again in section 8, so
        # the reported metrics come from data that was used for model selection.
        model.eval()
        with torch.no_grad():
            logits = model(X_test.to(device)).squeeze()
            y_pred_prob = torch.sigmoid(logits).cpu().numpy()  # logits -> probabilities
            y_true = y_test.squeeze().cpu().numpy()
            auroc = roc_auc_score(y_true, y_pred_prob)

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Test AUROC: {auroc:.4f}")

        # === Early Stopping check ===
        # A checkpoint is written only on a strict improvement; after `patience`
        # consecutive epochs without one, training for this fold stops.
        if auroc > best_auroc:
            best_auroc = auroc
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best AUROC={best_auroc:.4f})")
                break

    # =============== 8. Load the best model for final validation ===============
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test.to(device)).squeeze()
        y_pred_prob = torch.sigmoid(logits).cpu().numpy()  # logits -> probabilities
        y_true = y_test.squeeze().cpu().numpy()

        # AUROC and AUPRC of the restored checkpoint on the held-out fold
        auroc = roc_auc_score(y_true, y_pred_prob)
        auprc = average_precision_score(y_true, y_pred_prob)
        auroc_list.append(auroc)
        auprc_list.append(auprc)

        print(f"[Fold {fold}] Final AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        # Save this fold's labels and predicted probabilities
        result_df = pd.DataFrame({
            "y_test": y_true,
            "y_pred": y_pred_prob
        })
        result_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)

        # Keep the frame for the merged all-folds file written after the loop
        all_results.append(result_df)

    fold += 1

# =============== 9. Mean and confidence interval ===============
def mean_ci(data, confidence=0.95):
    """Return the mean of `data` and a normal-approximation confidence interval.

    The interval is ``mean ± z * s / sqrt(n)``, where ``s`` is the sample
    standard deviation (ddof=1) and ``z`` is the two-sided standard-normal
    quantile for `confidence` (about 1.96 at the default 0.95).

    Parameters
    ----------
    data : sequence of float
        Values to summarise, e.g. one metric value per fold.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(mean, (lower, upper))``. With fewer than two values the spread is
        undefined and both bounds are NaN (the mean is NaN for empty input).

    Raises
    ------
    ValueError
        If `confidence` is not strictly between 0 and 1.
    """
    # Function-scope import keeps the definition self-contained.
    from statistics import NormalDist

    if not 0.0 < confidence < 1.0:
        raise ValueError("confidence must be strictly between 0 and 1")

    arr = np.asarray(data, dtype=float)
    if arr.size < 2:
        # A sample standard deviation needs at least two observations.
        centre = np.mean(arr) if arr.size else np.nan
        return centre, (np.nan, np.nan)

    mean = np.mean(arr)
    se = np.std(arr, ddof=1) / np.sqrt(arr.size)
    # NOTE(review): with only a handful of folds a Student-t quantile would give
    # a wider, more conservative interval; the normal quantile is kept so the
    # default output stays in line with the previous fixed 1.96.
    z = NormalDist().inv_cdf(0.5 + confidence / 2.0)
    h = z * se
    return mean, (mean - h, mean + h)

# Summarise the per-fold metrics
mean_auroc, ci_auroc = mean_ci(auroc_list)
mean_auprc, ci_auprc = mean_ci(auprc_list)

print("\n===== Five-fold cross-validation results =====")
for metric_name, metric_mean, (ci_low, ci_high) in (
    ("AUROC", mean_auroc, ci_auroc),
    ("AUPRC", mean_auprc, ci_auprc),
):
    print(f"{metric_name}: Mean = {metric_mean:.4f}, 95% CI = ({ci_low:.4f}, {ci_high:.4f})")

# -----------------------------
# Stack the per-fold prediction frames and write them to one CSV
# -----------------------------
all_results_df = pd.concat(all_results, axis=0, ignore_index=True)
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)

print(f"\n✅ Prediction results for all folds have been merged and saved as: {all_results_path}")


Random seed fixed as 42
Using device: cuda

===== Fold 1 =====
Random seed fixed as 43
Fold 1: pos_weight = 11.12
Epoch 001 | Loss: 1.2873 | Test AUROC: 0.7797
Epoch 002 | Loss: 0.9911 | Test AUROC: 0.7994
Epoch 003 | Loss: 0.7455 | Test AUROC: 0.7847
Epoch 004 | Loss: 0.6295 | Test AUROC: 0.7652
Epoch 005 | Loss: 0.5492 | Test AUROC: 0.7370
Epoch 006 | Loss: 0.4775 | Test AUROC: 0.7335
Epoch 007 | Loss: 0.4169 | Test AUROC: 0.7279
Epoch 008 | Loss: 0.4615 | Test AUROC: 0.7542
Epoch 009 | Loss: 0.4563 | Test AUROC: 0.7483
Epoch 010 | Loss: 0.4114 | Test AUROC: 0.7357
Early stopping at epoch 10 (best AUROC=0.7994)
[Fold 1] Final AUROC: 0.7994, AUPRC: 0.3490

===== Fold 2 =====
Random seed fixed as 44
Fold 2: pos_weight = 10.60
Epoch 001 | Loss: 1.2649 | Test AUROC: 0.9181
Epoch 002 | Loss: 1.0273 | Test AUROC: 0.9263
Epoch 003 | Loss: 0.8260 | Test AUROC: 0.9240
Epoch 004 | Loss: 0.7939 | Test AUROC: 0.9151
Epoch 005 | Loss: 0.6931 | Test AUROC: 0.9203
Epoch 006 | Loss: 0.4978 | Test AU

## 2.2 Model building: 10-fold cross-validation (80% training, 10% validation, 10% test per fold)

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random

# =============== 0. Reproducibility: seed every random number generator ===============
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every generator.

    Prints a one-line confirmation containing the seed.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, all CUDA devices
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic algorithms on, autotuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f"Random seed fixed as {seed}")

set_seed(42)

# =============== 1. Output directory (created if missing) ===============
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death_valid_test/"
os.makedirs(save_path, exist_ok=True)

# =============== 2. Device: CUDA first, then Apple MPS, else CPU ===============
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# =============== 3. Features / labels as float32 tensors ===============
X_tensor = torch.tensor(datax_encoded.to_numpy(), dtype=torch.float32)
y_tensor = torch.unsqueeze(torch.tensor(datay.to_numpy(), dtype=torch.float32), 1)

# =============== 4. DNN model ===============
class DNN(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Layer stack: input -> 256 -> 128 -> 64 -> 1. The first two hidden layers
    use BatchNorm, ReLU and Dropout(0.4); the third uses ReLU only. No Sigmoid
    is applied, so the output is a logit.
    """

    def __init__(self, input_dim):
        """Build the layer stack for `input_dim` input features."""
        super().__init__()
        width_1, width_2, width_3 = 256, 128, 64
        drop_prob = 0.4
        stack = [
            nn.Linear(input_dim, width_1),
            nn.BatchNorm1d(width_1),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width_1, width_2),
            nn.BatchNorm1d(width_2),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width_2, width_3),
            nn.ReLU(),
            nn.Linear(width_3, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw logits for the batch `x`."""
        logits = self.net(x)
        return logits

# =============== 5. 10-fold cross-validation ===============
# Shuffled, non-stratified K-fold over the row positions of X_tensor.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold = 1  # 1-based fold counter used in log lines and output file names
auroc_list, auprc_list = [], []  # one test AUROC / AUPRC value per fold
all_results = []  # per-fold prediction frames, concatenated after the loop

for train_val_index, test_index in kf.split(X_tensor):
    print(f"\n===== Fold {fold} =====")

    # Re-seed with a fold-specific value before the model and loader are created
    set_seed(42 + fold)

    # Initial split: 90% train_val, 10% test
    X_train_val, X_test = X_tensor[train_val_index], X_tensor[test_index]
    y_train_val, y_test = y_tensor[train_val_index], y_tensor[test_index]

    # Carve a validation set (1/9 of train_val, stratified on the label) out of
    # the non-test rows, giving roughly 80/10/10 train/validation/test overall.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=1/9, random_state=42, stratify=y_train_val
    )

    # Positive-class weight = (#negatives / #positives) in the training split.
    # Raises ZeroDivisionError if the training split contains no positives.
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / num_pos, dtype=torch.float32).to(device)
    print(f"Fold {fold}: pos_weight = {pos_weight:.2f}")

    def worker_init_fn(worker_id):
        """Seed NumPy and `random` inside a DataLoader worker process."""
        np.random.seed(42 + worker_id)
        random.seed(42 + worker_id)

    # Mini-batches of 16, reshuffled every epoch. With num_workers=0 no worker
    # processes are started, so worker_init_fn is not invoked.
    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=16,
        shuffle=True,
        num_workers=0,
        worker_init_fn=worker_init_fn
    )

    # Fresh model per fold; class-weighted BCE on raw logits; Adam with weight decay
    model = DNN(input_dim=X_tensor.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    # =============== 6. Early Stopping ===============
    patience = 8  # epochs without validation-AUROC improvement tolerated
    best_auroc = 0  # best validation AUROC seen so far in this fold
    wait = 0  # consecutive epochs without improvement
    best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

    # =============== 7. Model training ===============
    max_epochs = 100
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)  # mean of the per-batch losses

        # === Validation AUROC after each epoch (test fold is not touched here) ===
        model.eval()
        with torch.no_grad():
            logits = model(X_val.to(device)).squeeze()
            y_pred_prob = torch.sigmoid(logits).cpu().numpy()  # logits -> probabilities
            y_true = y_val.squeeze().cpu().numpy()
            auroc_val = roc_auc_score(y_true, y_pred_prob)

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f}")

        # A checkpoint is written only on a strict improvement; after `patience`
        # consecutive epochs without one, training for this fold stops.
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # =============== 8. Load best model and evaluate on test set ===============
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test.to(device)).squeeze()
        y_pred_prob = torch.sigmoid(logits).cpu().numpy()  # logits -> probabilities
        y_true = y_test.squeeze().cpu().numpy()
        auroc = roc_auc_score(y_true, y_pred_prob)
        auprc = average_precision_score(y_true, y_pred_prob)
        auroc_list.append(auroc)
        auprc_list.append(auprc)

        print(f"[Fold {fold}] Test AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        # Save this fold's labels and predicted probabilities, and keep the
        # frame for the merged all-folds file written after the loop
        result_df = pd.DataFrame({
            "y_test": y_true,
            "y_pred": y_pred_prob
        })
        result_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)
        all_results.append(result_df)

    fold += 1

# =============== 9. Calculate mean and confidence interval ===============
def mean_ci(data, confidence=0.95):
    """Return the mean of `data` and a normal-approximation confidence interval.

    The interval is ``mean ± z * s / sqrt(n)``, where ``s`` is the sample
    standard deviation (ddof=1) and ``z`` is the two-sided standard-normal
    quantile for `confidence` (about 1.96 at the default 0.95).

    Parameters
    ----------
    data : sequence of float
        Values to summarise, e.g. one metric value per fold.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(mean, (lower, upper))``. With fewer than two values the spread is
        undefined and both bounds are NaN (the mean is NaN for empty input).

    Raises
    ------
    ValueError
        If `confidence` is not strictly between 0 and 1.
    """
    # Function-scope import keeps the definition self-contained.
    from statistics import NormalDist

    if not 0.0 < confidence < 1.0:
        raise ValueError("confidence must be strictly between 0 and 1")

    arr = np.asarray(data, dtype=float)
    if arr.size < 2:
        # A sample standard deviation needs at least two observations.
        centre = np.mean(arr) if arr.size else np.nan
        return centre, (np.nan, np.nan)

    mean = np.mean(arr)
    se = np.std(arr, ddof=1) / np.sqrt(arr.size)
    # NOTE(review): with only a handful of folds a Student-t quantile would give
    # a wider, more conservative interval; the normal quantile is kept so the
    # default output stays in line with the previous fixed 1.96.
    z = NormalDist().inv_cdf(0.5 + confidence / 2.0)
    h = z * se
    return mean, (mean - h, mean + h)

# Summarise the per-fold test metrics
mean_auroc, ci_auroc = mean_ci(auroc_list)
mean_auprc, ci_auprc = mean_ci(auprc_list)

print("\n===== 10-Fold Cross-Validation Results =====")
for metric_name, metric_mean, (ci_low, ci_high) in (
    ("AUROC", mean_auroc, ci_auroc),
    ("AUPRC", mean_auprc, ci_auprc),
):
    print(f"{metric_name}: Mean = {metric_mean:.4f}, 95% CI = ({ci_low:.4f}, {ci_high:.4f})")

# -----------------------------
# Stack the per-fold prediction frames and write them to one CSV
# -----------------------------
all_results_df = pd.concat(all_results, axis=0, ignore_index=True)
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)

print(f"\n✅ All fold prediction results have been combined and saved to: {all_results_path}")


## 2.3. 5-fold cross-validation: hold out 1/8 of each fold's training portion as a validation set (i.e. 70% training, 10% validation, 20% test)

- 2.3.1. Compute the mean AUC and its 95% CI from the per-fold AUCs of the 5-fold cross-validation

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random

# =============== 0. Reproducibility: seed every random number generator ===============
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every generator.

    Prints a one-line confirmation containing the seed.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, all CUDA devices
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic algorithms on, autotuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f"Random seed fixed as {seed}")

set_seed(42)

# =============== 1. Output directory (created if missing) ===============
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death_valid_test_5cv/"
os.makedirs(save_path, exist_ok=True)

# =============== 2. Device: CUDA first, then Apple MPS, else CPU ===============
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# =============== 3. Features / labels as float32 tensors ===============
X_tensor = torch.tensor(datax_encoded.to_numpy(), dtype=torch.float32)
y_tensor = torch.unsqueeze(torch.tensor(datay.to_numpy(), dtype=torch.float32), 1)

# =============== 4. DNN model ===============
class DNN(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Layer stack: input -> 256 -> 128 -> 64 -> 1. The first two hidden layers
    use BatchNorm, ReLU and Dropout(0.4); the third uses ReLU only. No Sigmoid
    is applied, so the output is a logit.
    """

    def __init__(self, input_dim):
        """Build the layer stack for `input_dim` input features."""
        super().__init__()
        width_1, width_2, width_3 = 256, 128, 64
        drop_prob = 0.4
        stack = [
            nn.Linear(input_dim, width_1),
            nn.BatchNorm1d(width_1),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width_1, width_2),
            nn.BatchNorm1d(width_2),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width_2, width_3),
            nn.ReLU(),
            nn.Linear(width_3, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw logits for the batch `x`."""
        logits = self.net(x)
        return logits
    
# =============== 5. 5-fold cross-validation ===============
# Shuffled, non-stratified K-fold over the row positions of X_tensor.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1  # 1-based fold counter used in log lines and output file names
auroc_list, auprc_list = [], []  # one test AUROC / AUPRC value per fold
all_results = []  # per-fold prediction frames, concatenated after the loop

for train_val_index, test_index in kf.split(X_tensor):
    print(f"\n===== Fold {fold} =====")

    # Re-seed with a fold-specific value before the model and loader are created
    set_seed(42 + fold)

    # Initial split: 80% train_val, 20% test
    X_train_val, X_test = X_tensor[train_val_index], X_tensor[test_index]
    y_train_val, y_test = y_tensor[train_val_index], y_tensor[test_index]

    # Carve a validation set (1/8 of train_val, stratified on the label) out of
    # the non-test rows, giving roughly 70/10/20 train/validation/test overall.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=1/8, random_state=42, stratify=y_train_val
    )

    # Positive-class weight = (#negatives / #positives) in the training split.
    # Raises ZeroDivisionError if the training split contains no positives.
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / num_pos, dtype=torch.float32).to(device)
    print(f"Fold {fold}: pos_weight = {pos_weight:.2f}")

    def worker_init_fn(worker_id):
        """Seed NumPy and `random` inside a DataLoader worker process."""
        np.random.seed(42 + worker_id)
        random.seed(42 + worker_id)

    # Mini-batches of 16, reshuffled every epoch. With num_workers=0 no worker
    # processes are started, so worker_init_fn is not invoked.
    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=16,
        shuffle=True,
        num_workers=0,
        worker_init_fn=worker_init_fn
    )

    # Fresh model per fold; class-weighted BCE on raw logits; Adam with weight decay
    model = DNN(input_dim=X_tensor.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    # =============== 6. Early Stopping ===============
    patience = 8  # epochs without validation-AUROC improvement tolerated
    best_auroc = 0  # best validation AUROC seen so far in this fold
    wait = 0  # consecutive epochs without improvement
    best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

    # =============== 7. Model training ===============
    max_epochs = 100
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)  # mean of the per-batch losses

        # === Validation AUROC after each epoch (test fold is not touched here) ===
        model.eval()
        with torch.no_grad():
            logits = model(X_val.to(device)).squeeze()
            y_pred_prob = torch.sigmoid(logits).cpu().numpy()  # logits -> probabilities
            y_true = y_val.squeeze().cpu().numpy()
            auroc_val = roc_auc_score(y_true, y_pred_prob)

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f}")

        # A checkpoint is written only on a strict improvement; after `patience`
        # consecutive epochs without one, training for this fold stops.
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # =============== 8. Load best model and evaluate on test set ===============
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test.to(device)).squeeze()
        y_pred_prob = torch.sigmoid(logits).cpu().numpy()  # logits -> probabilities
        y_true = y_test.squeeze().cpu().numpy()
        auroc = roc_auc_score(y_true, y_pred_prob)
        auprc = average_precision_score(y_true, y_pred_prob)
        auroc_list.append(auroc)
        auprc_list.append(auprc)

        print(f"[Fold {fold}] Test AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        # Save this fold's labels and predicted probabilities, and keep the
        # frame for the merged all-folds file written after the loop
        result_df = pd.DataFrame({
            "y_test": y_true,
            "y_pred": y_pred_prob
        })
        result_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)
        all_results.append(result_df)

    fold += 1

# =============== 9. Calculate mean and confidence interval ===============
def mean_ci(data, confidence=0.95):
    """Return the mean of `data` and a normal-approximation confidence interval.

    The interval is ``mean ± z * s / sqrt(n)``, where ``s`` is the sample
    standard deviation (ddof=1) and ``z`` is the two-sided standard-normal
    quantile for `confidence` (about 1.96 at the default 0.95).

    Parameters
    ----------
    data : sequence of float
        Values to summarise, e.g. one metric value per fold.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(mean, (lower, upper))``. With fewer than two values the spread is
        undefined and both bounds are NaN (the mean is NaN for empty input).

    Raises
    ------
    ValueError
        If `confidence` is not strictly between 0 and 1.
    """
    # Function-scope import keeps the definition self-contained.
    from statistics import NormalDist

    if not 0.0 < confidence < 1.0:
        raise ValueError("confidence must be strictly between 0 and 1")

    arr = np.asarray(data, dtype=float)
    if arr.size < 2:
        # A sample standard deviation needs at least two observations.
        centre = np.mean(arr) if arr.size else np.nan
        return centre, (np.nan, np.nan)

    mean = np.mean(arr)
    se = np.std(arr, ddof=1) / np.sqrt(arr.size)
    # NOTE(review): with only a handful of folds a Student-t quantile would give
    # a wider, more conservative interval; the normal quantile is kept so the
    # default output stays in line with the previous fixed 1.96.
    z = NormalDist().inv_cdf(0.5 + confidence / 2.0)
    h = z * se
    return mean, (mean - h, mean + h)

# Summarise the per-fold test metrics
mean_auroc, ci_auroc = mean_ci(auroc_list)
mean_auprc, ci_auprc = mean_ci(auprc_list)

print("\n===== 5-Fold Cross-Validation Results =====")
for metric_name, metric_mean, (ci_low, ci_high) in (
    ("AUROC", mean_auroc, ci_auroc),
    ("AUPRC", mean_auprc, ci_auprc),
):
    print(f"{metric_name}: Mean = {metric_mean:.4f}, 95% CI = ({ci_low:.4f}, {ci_high:.4f})")

# -----------------------------
# Stack the per-fold prediction frames and write them to one CSV
# -----------------------------
all_results_df = pd.concat(all_results, axis=0, ignore_index=True)
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)

print(f"\n✅ All fold prediction results have been combined and saved to: {all_results_path}")

- 2.3.2. bootstrap approach to calculate 95% confidence intervals for AUROC and AUPRC

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random

# =============== 0. Reproducibility: seed every random number generator ===============
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 42
        Value handed to every generator.

    Prints a one-line confirmation containing the seed.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, all CUDA devices
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic algorithms on, autotuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f"Random seed fixed as {seed}")

set_seed(42)

# =============== 1. Output directory (created if missing) ===============
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death_valid_test_5cv/"
os.makedirs(save_path, exist_ok=True)

# =============== 2. Device: CUDA first, then Apple MPS, else CPU ===============
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# =============== 3. Features / labels as float32 tensors ===============
X_tensor = torch.tensor(datax_encoded.to_numpy(), dtype=torch.float32)
y_tensor = torch.unsqueeze(torch.tensor(datay.to_numpy(), dtype=torch.float32), 1)

# =============== 4. Define DNN model ===============
class DNN(nn.Module):
    """Fully connected network that returns one raw logit per input row.

    Stack: Linear(input_dim, 256) -> BatchNorm1d -> ReLU -> Dropout(0.4)
    -> Linear(256, 128) -> BatchNorm1d -> ReLU -> Dropout(0.4)
    -> Linear(128, 64) -> ReLU -> Linear(64, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are built in forward order and passed positionally, so the
        # submodule indices (net.0 ... net.10) used in state_dict keys are unchanged.
        layers = [
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the logits."""
        logits = self.net(x)
        return logits
    
# =============== 5. 5-fold cross-validation ===============
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
auroc_list, auprc_list = [], []
all_results = []


def worker_init_fn(worker_id):
    """Seed NumPy and ``random`` in a DataLoader worker from its worker id.

    DataLoader only calls this hook when ``num_workers > 0``. It is defined once
    here rather than being redefined on every fold.
    """
    np.random.seed(42 + worker_id)
    random.seed(42 + worker_id)


for train_val_index, test_index in kf.split(X_tensor):
    print(f"\n===== Fold {fold} =====")

    # Re-seed at the start of each fold (a different seed per fold).
    set_seed(42 + fold)

    # Outer KFold split: 80% train+validation, 20% test.
    X_train_val, X_test = X_tensor[train_val_index], X_tensor[test_index]
    y_train_val, y_test = y_tensor[train_val_index], y_tensor[test_index]

    # Inner stratified split: 7/8 training, 1/8 validation (overall ~70/10/20).
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=1/8, random_state=42, stratify=y_train_val
    )

    # Weight the positive class by the negative/positive ratio of the training split.
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / num_pos, dtype=torch.float32).to(device)
    print(f"Fold {fold}: pos_weight = {pos_weight:.2f}")

    batch_size = 16
    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        worker_init_fn=worker_init_fn,
        # BatchNorm1d cannot compute batch statistics from a single row in training
        # mode, so drop the trailing batch only when it would hold exactly one sample.
        drop_last=(len(X_train) % batch_size == 1),
    )

    # Initialize model and optimizer
    model = DNN(input_dim=X_tensor.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    # =============== 6. Early Stopping ===============
    patience = 8
    # Start below any attainable AUROC so the first epoch always writes a checkpoint
    # for this run; starting at 0 could leave a stale file from an earlier run on
    # disk, which would then be loaded for the test evaluation below.
    best_auroc = float("-inf")
    wait = 0
    best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

    # =============== 7. Model training ===============
    max_epochs = 100
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_loss = total_loss / len(train_loader)

        # === Validate AUROC after each epoch ===
        model.eval()
        with torch.no_grad():
            logits = model(X_val.to(device)).squeeze()
            y_pred_prob = torch.sigmoid(logits).cpu().numpy()
            y_true = y_val.squeeze().cpu().numpy()
            auroc_val = roc_auc_score(y_true, y_pred_prob)

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f}")

        # Checkpoint on a strict improvement; otherwise count toward patience.
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # =============== 8. Load the best model and evaluate on the test set ===============
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test.to(device)).squeeze()
        y_pred_prob = torch.sigmoid(logits).cpu().numpy()
        y_true = y_test.squeeze().cpu().numpy()
        auroc = roc_auc_score(y_true, y_pred_prob)
        auprc = average_precision_score(y_true, y_pred_prob)
        auroc_list.append(auroc)
        auprc_list.append(auprc)

        print(f"[Fold {fold}] Test AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        # Keep per-fold test labels and predicted probabilities for the pooled bootstrap.
        result_df = pd.DataFrame({
            "y_test": y_true,
            "y_pred": y_pred_prob
        })
        result_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)
        all_results.append(result_df)

    fold += 1

# =============== 9. Bootstrap to calculate overall AUROC and AUPRC with 95% CI ===============
from sklearn import metrics

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times. Resamples whose
    labels contain fewer than two distinct values are skipped, so fewer than
    ``n_bootstrap`` scores may contribute to the result.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted scores, same length as ``y_true``.
        metric_fn: Callable ``metric_fn(y_true, y_pred)`` returning a number.
        n_bootstrap: Number of resamples to draw.
        seed: Seed for the private ``np.random.RandomState`` used for resampling.

    Returns:
        Tuple ``(mean_score, lower, upper)``: the mean of the bootstrap scores and
        their 2.5th and 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            contained at least two distinct labels.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_metric_ci needs at least one sample.")
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}."
        )

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        indices = rng.randint(0, n_samples, n_samples)
        resampled_true = y_true[indices]
        # Skip resamples with a single label value.
        if len(np.unique(resampled_true)) < 2:
            continue
        scores.append(metric_fn(resampled_true, y_pred[indices]))

    # Without this guard an empty score list gives NaN or an obscure NumPy error.
    if not scores:
        raise ValueError(
            "No bootstrap resample contained at least two distinct labels; "
            "cannot compute a confidence interval."
        )

    mean_score = np.mean(scores)
    lower = np.percentile(scores, 2.5)
    upper = np.percentile(scores, 97.5)
    return mean_score, lower, upper

# Stack the per-fold prediction frames into one table with a fresh 0..n-1 index.
all_results_df = pd.concat(all_results, ignore_index=True)
y_all_true = all_results_df["y_test"].to_numpy()
y_all_pred = all_results_df["y_pred"].to_numpy()

# Pooled bootstrap (2000 resamples, seed 42) for AUROC and AUPRC.
mean_auroc, auc_lower, auc_upper = bootstrap_metric_ci(
    y_all_true,
    y_all_pred,
    metric_fn=metrics.roc_auc_score,
    n_bootstrap=2000,
    seed=42,
)
mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
    y_all_true,
    y_all_pred,
    metric_fn=metrics.average_precision_score,
    n_bootstrap=2000,
    seed=42,
)

print("\n===== 5-Fold Cross-Validation Results (Bootstrap) =====")
print(f"AUROC: Mean = {mean_auroc:.4f}, 95% CI = ({auc_lower:.4f}, {auc_upper:.4f})")
print(f"AUPRC: Mean = {mean_auprc:.4f}, 95% CI = ({auprc_lower:.4f}, {auprc_upper:.4f})")

# Write the pooled per-patient predictions next to the per-fold files.
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death_valid_test_5cv/"
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)
print(f"\n✅ All fold prediction results have been merged and saved to: {all_results_path}")

- 2.3.3. Reduce overfitting

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random
from sklearn import metrics

# =============== 0. Fix random seed ===============
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to PyTorch (CPU and all CUDA devices), NumPy's global RNG
    and Python's ``random`` module, then puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed handed to each generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels on, cuDNN benchmark mode off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    print(f"Random seed fixed as {seed}")

set_seed(seed=42)

# --- 1. Output directory (checkpoints and per-fold prediction CSVs go here) ---
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death_valid_test_5cv/"
os.makedirs(save_path, exist_ok=True)

# --- 2. Device: CUDA first, then Apple MPS, otherwise CPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- 3. Features and labels as float32 tensors ---
X_tensor = torch.tensor(datax_encoded.values, dtype=torch.float32)
# Labels get an extra axis at position 1 (presumably (n,) -> (n, 1); confirm datay is 1-D).
y_tensor = torch.unsqueeze(torch.tensor(datay.values, dtype=torch.float32), 1)

# =============== 4. Improved DNN model ===============
class DNN(nn.Module):
    """Smaller fully connected network that returns one raw logit per input row.

    Stack: Linear(input_dim, 128) -> BatchNorm1d -> ReLU -> Dropout(0.5)
    -> Linear(128, 64) -> BatchNorm1d -> ReLU -> Dropout(0.5)
    -> Linear(64, 32) -> ReLU -> Linear(32, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are built in forward order and passed positionally, so the
        # submodule indices (net.0 ... net.10) used in state_dict keys are unchanged.
        layers = [
            # Block 1
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Block 2
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Block 3 (no normalization or dropout)
            nn.Linear(64, 32),
            nn.ReLU(),
            # Output layer
            nn.Linear(32, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the logits."""
        logits = self.net(x)
        return logits

# =============== 5. 5-fold cross-validation ===============
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
auroc_list, auprc_list = [], []
all_results = []


def worker_init_fn(worker_id):
    """Seed NumPy and ``random`` in a DataLoader worker from its worker id.

    DataLoader only calls this hook when ``num_workers > 0``. It is defined once
    here rather than being redefined on every fold.
    """
    np.random.seed(42 + worker_id)
    random.seed(42 + worker_id)


for train_val_index, test_index in kf.split(X_tensor):
    print(f"\n===== Fold {fold} =====")
    # Re-seed at the start of each fold (a different seed per fold).
    set_seed(42 + fold)

    # Outer KFold split (80% train+validation / 20% test), then a stratified
    # inner split that holds out 1/8 of train+validation for validation.
    X_train_val, X_test = X_tensor[train_val_index], X_tensor[test_index]
    y_train_val, y_test = y_tensor[train_val_index], y_tensor[test_index]
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=1/8, random_state=42 + fold, stratify=y_train_val
    )

    # Class weights: negative/positive ratio of the training split.
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / num_pos, dtype=torch.float32).to(device)
    print(f"Fold {fold}: pos_weight = {pos_weight:.2f}")

    # DataLoader
    batch_size = 32
    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        worker_init_fn=worker_init_fn,
        # BatchNorm1d cannot compute batch statistics from a single row in training
        # mode, so drop the trailing batch only when it would hold exactly one sample.
        drop_last=(len(X_train) % batch_size == 1),
    )

    # Model, optimizer, scheduler, loss function
    model = DNN(input_dim=X_tensor.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=5e-4)
    # Halve the learning rate after 3 epochs without validation-AUROC improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3
    )

    # Early stopping
    patience = 12
    # Start below any attainable AUROC so the first epoch always writes a checkpoint
    # for this run; starting at 0 could leave a stale file from an earlier run on
    # disk, which would then be loaded for the test evaluation below.
    best_auroc = float("-inf")
    wait = 0
    best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

    # =============== 6. Training ===============
    max_epochs = 100
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Mean of the per-batch losses for this epoch.
        avg_loss = total_loss / len(train_loader)

        # Validation set
        model.eval()
        with torch.no_grad():
            logits = model(X_val.to(device)).squeeze()
            y_pred_prob = torch.sigmoid(logits).cpu().numpy()
            y_true = y_val.squeeze().cpu().numpy()
            auroc_val = roc_auc_score(y_true, y_pred_prob)
        scheduler.step(auroc_val)
        current_lr = optimizer.param_groups[0]['lr']

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f} | LR={current_lr:.6f}")

        # Checkpoint on a strict improvement; otherwise count toward patience.
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # =============== 7. Test set evaluation ===============
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test.to(device)).squeeze()
        y_pred_prob = torch.sigmoid(logits).cpu().numpy()
        y_true = y_test.squeeze().cpu().numpy()
        auroc = roc_auc_score(y_true, y_pred_prob)
        auprc = average_precision_score(y_true, y_pred_prob)
        auroc_list.append(auroc)
        auprc_list.append(auprc)

        print(f"[Fold {fold}] Test AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        # Keep per-fold test labels and predicted probabilities for the pooled bootstrap.
        result_df = pd.DataFrame({"y_test": y_true, "y_pred": y_pred_prob})
        result_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)
        all_results.append(result_df)

    fold += 1

# =============== 8. Bootstrap to calculate overall metrics ===============
def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times. Resamples whose
    labels contain fewer than two distinct values are skipped, so fewer than
    ``n_bootstrap`` scores may contribute to the result.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted scores, same length as ``y_true``.
        metric_fn: Callable ``metric_fn(y_true, y_pred)`` returning a number.
        n_bootstrap: Number of resamples to draw.
        seed: Seed for the private ``np.random.RandomState`` used for resampling.

    Returns:
        Tuple ``(mean, lower, upper)``: the mean of the bootstrap scores and
        their 2.5th and 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            contained at least two distinct labels.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_metric_ci needs at least one sample.")
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}."
        )

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        resampled_true = y_true[idx]
        # Skip resamples with a single label value.
        if len(np.unique(resampled_true)) < 2:
            continue
        scores.append(metric_fn(resampled_true, y_pred[idx]))

    # Without this guard an empty score list gives NaN or an obscure NumPy error.
    if not scores:
        raise ValueError(
            "No bootstrap resample contained at least two distinct labels; "
            "cannot compute a confidence interval."
        )
    return np.mean(scores), np.percentile(scores, 2.5), np.percentile(scores, 97.5)

# Stack the per-fold prediction frames into one table with a fresh 0..n-1 index.
all_results_df = pd.concat(all_results, ignore_index=True)
y_all_true = all_results_df["y_test"].to_numpy()
y_all_pred = all_results_df["y_pred"].to_numpy()

# Pooled bootstrap with the function's default resample count and seed.
mean_auroc, auc_lower, auc_upper = bootstrap_metric_ci(
    y_all_true, y_all_pred, metric_fn=metrics.roc_auc_score
)
mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
    y_all_true, y_all_pred, metric_fn=metrics.average_precision_score
)

print("\n===== 5-fold cross-validation results (Bootstrap) =====")
print(f"AUROC: Mean = {mean_auroc:.4f}, 95% CI = ({auc_lower:.4f}, {auc_upper:.4f})")
print(f"AUPRC: Mean = {mean_auprc:.4f}, 95% CI = ({auprc_lower:.4f}, {auprc_upper:.4f})")

# --- 9. Write the pooled per-patient predictions next to the per-fold files ---
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)
print(f"\n✅ All fold prediction results have been merged and saved to: {all_results_path}")


Random seed fixed as 42
Using device: cuda

===== Fold 1 =====
Random seed fixed as 43
Fold 1: pos_weight = 11.12
Epoch 001 | Loss: 1.2887 | Val AUROC: 0.4115 | LR=0.000500
Epoch 002 | Loss: 1.2559 | Val AUROC: 0.6419 | LR=0.000500
Epoch 003 | Loss: 1.1981 | Val AUROC: 0.7402 | LR=0.000500
Epoch 004 | Loss: 1.1700 | Val AUROC: 0.7711 | LR=0.000500
Epoch 005 | Loss: 1.1492 | Val AUROC: 0.8287 | LR=0.000500
Epoch 006 | Loss: 1.0729 | Val AUROC: 0.8469 | LR=0.000500
Epoch 007 | Loss: 1.0249 | Val AUROC: 0.8539 | LR=0.000500
Epoch 008 | Loss: 0.9427 | Val AUROC: 0.8539 | LR=0.000500
Epoch 009 | Loss: 0.8930 | Val AUROC: 0.8638 | LR=0.000500
Epoch 010 | Loss: 0.8408 | Val AUROC: 0.8652 | LR=0.000500
Epoch 011 | Loss: 0.7568 | Val AUROC: 0.8652 | LR=0.000500
Epoch 012 | Loss: 0.7245 | Val AUROC: 0.8750 | LR=0.000500
Epoch 013 | Loss: 0.6680 | Val AUROC: 0.8750 | LR=0.000500
Epoch 014 | Loss: 0.6280 | Val AUROC: 0.8750 | LR=0.000500
Epoch 015 | Loss: 0.5854 | Val AUROC: 0.8581 | LR=0.000500
E

### Final approach: 5-fold CV (70% train, 10% validation, 20% test)

In [None]:
### Add a calibration model (probability calibration) ###

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import TensorDataset, DataLoader
# from sklearn.model_selection import KFold, train_test_split
# from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
# import pandas as pd
# import numpy as np
# import os
# import random
# from scipy.optimize import minimize

# # ===================== Fix random seed =====================
# def set_seed(seed=42):
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.cuda.manual_seed_all(seed)
#     np.random.seed(seed)
#     random.seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False

# set_seed(42)

# # ===================== DNN model =====================
# class DNN(nn.Module):
#     def __init__(self, input_dim):
#         super(DNN, self).__init__()
#         self.net = nn.Sequential(
#             nn.Linear(input_dim, 128),
#             nn.BatchNorm1d(128),
#             nn.ReLU(),
#             nn.Dropout(0.5),

#             nn.Linear(128, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
#             nn.Dropout(0.5),

#             nn.Linear(64, 32),
#             nn.ReLU(),

#             nn.Linear(32, 1)
#         )

#     def forward(self, x):
#         return self.net(x)

# # ===================== Platt scaling (sigmoid calibration) =====================
# def platt_calibration(y_val, logits_val):
#     """
#     Fit Platt scaling (sigmoid calibration) on the logits and return the calibration function
#     """
#     def sigmoid_loss(params, y_true, logits):
#         a, b = params
#         p = 1 / (1 + np.exp(-(a * logits + b)))
#         eps = 1e-15
#         loss = -np.mean(y_true * np.log(p + eps) + (1 - y_true) * np.log(1 - p + eps))
#         return loss

#     res = minimize(sigmoid_loss, x0=[1.0, 0.0], args=(y_val, logits_val), method='BFGS')
#     a, b = res.x
#     return lambda logits: 1 / (1 + np.exp(-(a * logits + b)))

# # ===================== Bootstrap AUROC =====================
# def bootstrap_metric_ci(y_true, y_pred, metric_fn=roc_auc_score, n_bootstrap=2000, seed=42):
#     rng = np.random.RandomState(seed)
#     y_true, y_pred = np.array(y_true), np.array(y_pred)
#     scores = []
#     for _ in range(n_bootstrap):
#         idx = rng.randint(0, len(y_true), len(y_true))
#         if len(np.unique(y_true[idx])) < 2:
#             continue
#         scores.append(metric_fn(y_true[idx], y_pred[idx]))
#     return np.mean(scores), np.percentile(scores, 2.5), np.percentile(scores, 97.5)

# # ===================== DNN 5-fold training + Platt calibration =====================
# def train_dnn_5fold_platt(dataX, dataY, save_path='/home/mailiyi/Poisoning_Prediction/DNN/predict_death_calibration/', seed=42):
#     os.makedirs(save_path, exist_ok=True)

#     X_tensor = torch.tensor(dataX.values, dtype=torch.float32)
#     y_tensor = torch.tensor(dataY.values, dtype=torch.float32).unsqueeze(1)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     input_dim = X_tensor.shape[1]

#     kf = KFold(n_splits=5, shuffle=True, random_state=seed)
#     all_results = []
#     auprc_list, brier_list = [], []

#     fold = 1
#     for train_val_idx, test_idx in kf.split(X_tensor):
#         set_seed(seed + fold)
#         X_train_val, X_test = X_tensor[train_val_idx], X_tensor[test_idx]
#         y_train_val, y_test = y_tensor[train_val_idx], y_tensor[test_idx]

#         # Split off the validation set
#         X_train, X_val, y_train, y_val = train_test_split(
#             X_train_val, y_train_val, test_size=1/8, stratify=y_train_val, random_state=seed+fold
#         )

#         # Class weights
#         num_pos = (y_train == 1).sum().item()
#         num_neg = (y_train == 0).sum().item()
#         pos_weight = torch.tensor(num_neg / max(num_pos,1), dtype=torch.float32).to(device)

#         # DataLoader
#         train_loader = DataLoader(
#             TensorDataset(X_train, y_train),
#             batch_size=32,
#             shuffle=True
#         )

#         # Model
#         model = DNN(input_dim=input_dim).to(device)
#         criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
#         optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=5e-4)
#         scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

#         # Early stopping
#         patience = 12
#         best_auroc = 0
#         wait = 0
#         best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

#         # ===================== Training =====================
#         for epoch in range(100):
#             model.train()
#             total_loss = 0
#             for batch_X, batch_y in train_loader:
#                 batch_X, batch_y = batch_X.to(device), batch_y.to(device)
#                 optimizer.zero_grad()
#                 outputs = model(batch_X)
#                 loss = criterion(outputs, batch_y)
#                 loss.backward()
#                 optimizer.step()
#                 total_loss += loss.item()
#             avg_loss = total_loss / len(train_loader)

#             # Validation-set AUROC
#             model.eval()
#             with torch.no_grad():
#                 logits_val = model(X_val.to(device)).squeeze()
#                 y_val_logits = logits_val.cpu().numpy()
#                 y_val_true = y_val.cpu().numpy()
#                 auroc_val = roc_auc_score(y_val_true, 1 / (1 + np.exp(-y_val_logits)))

#             scheduler.step(auroc_val)

#             # Early stopping
#             if auroc_val > best_auroc:
#                 best_auroc = auroc_val
#                 wait = 0
#                 torch.save(model.state_dict(), best_model_path)
#             else:
#                 wait += 1
#                 if wait >= patience:
#                     break

#         # Load the best model
#         model.load_state_dict(torch.load(best_model_path, map_location=device))
#         model.eval()

#         # ===================== Platt calibration =====================
#         with torch.no_grad():
#             logits_val = model(X_val.to(device)).squeeze().cpu().numpy()
#             y_val_true = y_val.cpu().numpy()
#         sigmoid_fn = platt_calibration(y_val_true, logits_val)

#         # ===================== Test-set prediction + calibration =====================
#         with torch.no_grad():
#             logits_test = model(X_test.to(device)).squeeze().cpu().numpy()
#             y_test_pred_calib = sigmoid_fn(logits_test)
#             y_test_true = y_test.cpu().numpy().squeeze()

#         # Metrics
#         auprc_list.append(average_precision_score(y_test_true, y_test_pred_calib))
#         brier_list.append(brier_score_loss(y_test_true, y_test_pred_calib))

#         # Save per-fold results
#         fold_df = pd.DataFrame({"y_test": y_test_true, "y_pred": y_test_pred_calib})
#         fold_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)
#         all_results.append(fold_df)

#         fold += 1

#     # ===================== Merge all folds =====================
#     all_results_df = pd.concat(all_results, ignore_index=True)
#     all_results_df.to_csv(os.path.join(save_path, "all_folds_results.csv"), index=False)

#     # ===================== Bootstrap AUROC =====================
#     mean_auroc, auroc_l, auroc_u = bootstrap_metric_ci(all_results_df["y_test"], all_results_df["y_pred"], roc_auc_score)

#     # AUPRC and Brier score
#     def mean_ci(x):
#         mean = np.mean(x)
#         ci = 1.96 * np.std(x, ddof=1) / np.sqrt(len(x))
#         return mean, mean - ci, mean + ci

#     mean_auprc, auprc_l, auprc_u = mean_ci(auprc_list)
#     mean_brier, brier_l, brier_u = mean_ci(brier_list)

#     print(f"AUROC (bootstrap): {mean_auroc:.4f} ({auroc_l:.4f}-{auroc_u:.4f})")
#     print(f"AUPRC: {mean_auprc:.4f} ({auprc_l:.4f}-{auprc_u:.4f})")
#     print(f"Brier score: {mean_brier:.4f} ({brier_l:.4f}-{brier_u:.4f})")

#     return {
#         "AUROC_mean": mean_auroc, "AUROC_CI": (auroc_l, auroc_u),
#         "AUPRC_mean": mean_auprc, "AUPRC_CI": (auprc_l, auprc_u),
#         "Brier_mean": mean_brier, "Brier_CI": (brier_l, brier_u),
#         "AllResults": all_results_df
#     }

# # ===================== Usage example =====================
# results = train_dnn_5fold_platt(datax_encoded, datay)


AUROC (bootstrap): 0.4777 (0.4019-0.5507)
AUPRC: 0.2562 (0.0924-0.4200)
Brier score: 0.0774 (0.0673-0.0874)


In [None]:
import torch   
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random
from sklearn import metrics

# ===================== 0. Fix random seed =====================
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to PyTorch (CPU and all CUDA devices), NumPy's global RNG
    and Python's ``random`` module, then puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed handed to each generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels on, cuDNN benchmark mode off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    print(f"Random seed fixed as {seed}")

set_seed(seed=42)

# --- 1. Output directory (checkpoints and per-fold prediction CSVs go here) ---
save_path = "/home/mailiyi/Poisoning_Prediction/DNN/predict_death_calibration/"
os.makedirs(save_path, exist_ok=True)

# --- 2. Device: CUDA first, then Apple MPS, otherwise CPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- 3. Features and labels as float32 tensors ---
X_tensor = torch.tensor(datax_encoded.values, dtype=torch.float32)
# Labels get an extra axis at position 1 (presumably (n,) -> (n, 1); confirm datay is 1-D).
y_tensor = torch.unsqueeze(torch.tensor(datay.values, dtype=torch.float32), 1)

# ===================== 4. DNN model =====================
class DNN(nn.Module):
    """Fully connected network that returns one raw logit per input row.

    Stack: Linear(input_dim, 128) -> BatchNorm1d -> ReLU -> Dropout(0.5)
    -> Linear(128, 64) -> BatchNorm1d -> ReLU -> Dropout(0.5)
    -> Linear(64, 32) -> ReLU -> Linear(32, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are built in forward order and passed positionally, so the
        # submodule indices (net.0 ... net.10) used in state_dict keys are unchanged.
        layers = [
            # Block 1
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Block 2
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.5),
            # Block 3 (no normalization or dropout)
            nn.Linear(64, 32),
            nn.ReLU(),
            # Output logits (no sigmoid inside the model)
            nn.Linear(32, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the logits."""
        logits = self.net(x)
        return logits

# ===================== 5. 5-fold cross-validation =====================
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
auroc_list, auprc_list = [], []
all_results = []


def worker_init_fn(worker_id):
    """Seed NumPy and ``random`` in a DataLoader worker from its worker id.

    DataLoader only calls this hook when ``num_workers > 0``. It is defined once
    here rather than being redefined on every fold.
    """
    np.random.seed(42 + worker_id)
    random.seed(42 + worker_id)


for train_val_index, test_index in kf.split(X_tensor):
    print(f"\n===== Fold {fold} =====")
    # Re-seed at the start of each fold (a different seed per fold).
    set_seed(42 + fold)

    # Outer KFold split (80% train+validation / 20% test), then a stratified
    # inner split that holds out 1/8 of train+validation for validation.
    X_train_val, X_test = X_tensor[train_val_index], X_tensor[test_index]
    y_train_val, y_test = y_tensor[train_val_index], y_tensor[test_index]
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=1/8, random_state=42 + fold, stratify=y_train_val
    )

    # DataLoader
    batch_size = 32
    train_loader = DataLoader(
        TensorDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        worker_init_fn=worker_init_fn,
        # BatchNorm1d cannot compute batch statistics from a single row in training
        # mode, so drop the trailing batch only when it would hold exactly one sample.
        drop_last=(len(X_train) % batch_size == 1),
    )

    # Model, optimizer, loss function
    model = DNN(input_dim=X_tensor.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss()  # Without using pos_weight
    optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=5e-4)
    # Halve the learning rate after 3 epochs without validation-AUROC improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3
    )

    # Early stopping parameters
    patience = 12
    # Start below any attainable AUROC so the first epoch always writes a checkpoint
    # for this run; starting at 0 could leave a stale file from an earlier run on
    # disk, which would then be loaded for the test evaluation below.
    best_auroc = float("-inf")
    wait = 0
    best_model_path = os.path.join(save_path, f"fold{fold}_best_model.pt")

    # ===================== 6. Training =====================
    max_epochs = 100
    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)           # logits
            loss = criterion(outputs, batch_y) # BCEWithLogitsLoss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Mean of the per-batch losses for this epoch.
        avg_loss = total_loss / len(train_loader)

        # Validation set
        model.eval()
        with torch.no_grad():
            logits = model(X_val.to(device)).squeeze()
            y_pred_prob = torch.sigmoid(logits).cpu().numpy()
            y_true = y_val.squeeze().cpu().numpy()
            auroc_val = roc_auc_score(y_true, y_pred_prob)
        scheduler.step(auroc_val)
        current_lr = optimizer.param_groups[0]['lr']

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f} | LR={current_lr:.6f}")

        # Checkpoint on a strict improvement; otherwise count toward patience.
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # ===================== 7. Test set evaluation =====================
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test.to(device)).squeeze()
        y_pred_prob = torch.sigmoid(logits).cpu().numpy()
        y_true = y_test.squeeze().cpu().numpy()
        auroc = roc_auc_score(y_true, y_pred_prob)
        auprc = average_precision_score(y_true, y_pred_prob)
        auroc_list.append(auroc)
        auprc_list.append(auprc)

        print(f"[Fold {fold}] Test AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")

        # Keep per-fold test labels and predicted probabilities for the pooled bootstrap.
        result_df = pd.DataFrame({"y_test": y_true, "y_pred": y_pred_prob})
        result_df.to_csv(os.path.join(save_path, f"fold{fold}_results.csv"), index=False)
        all_results.append(result_df)

    fold += 1

# ===================== 8. Bootstrap to calculate overall metrics =====================
def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Bootstrap a metric and return its mean with a 95% percentile interval.

    Rows are resampled with replacement ``n_bootstrap`` times. Resamples whose
    labels contain fewer than two distinct values are skipped, so fewer than
    ``n_bootstrap`` scores may contribute to the result.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted scores, same length as ``y_true``.
        metric_fn: Callable ``metric_fn(y_true, y_pred)`` returning a number.
        n_bootstrap: Number of resamples to draw.
        seed: Seed for the private ``np.random.RandomState`` used for resampling.

    Returns:
        Tuple ``(mean, lower, upper)``: the mean of the bootstrap scores and
        their 2.5th and 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            contained at least two distinct labels.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_metric_ci needs at least one sample.")
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}."
        )

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        resampled_true = y_true[idx]
        # Skip resamples with a single label value.
        if len(np.unique(resampled_true)) < 2:
            continue
        scores.append(metric_fn(resampled_true, y_pred[idx]))

    # Without this guard an empty score list gives NaN or an obscure NumPy error.
    if not scores:
        raise ValueError(
            "No bootstrap resample contained at least two distinct labels; "
            "cannot compute a confidence interval."
        )
    return np.mean(scores), np.percentile(scores, 2.5), np.percentile(scores, 97.5)

# Stack the per-fold prediction frames into one table with a fresh 0..n-1 index.
all_results_df = pd.concat(all_results, ignore_index=True)
y_all_true = all_results_df["y_test"].to_numpy()
y_all_pred = all_results_df["y_pred"].to_numpy()

# Pooled bootstrap with the function's default resample count and seed.
mean_auroc, auc_lower, auc_upper = bootstrap_metric_ci(
    y_all_true, y_all_pred, metric_fn=metrics.roc_auc_score
)
mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
    y_all_true, y_all_pred, metric_fn=metrics.average_precision_score
)

print("\n===== 5-fold cross-validation results (Bootstrap) =====")
print(f"AUROC: Mean = {mean_auroc:.4f}, 95% CI = ({auc_lower:.4f}, {auc_upper:.4f})")
print(f"AUPRC: Mean = {mean_auprc:.4f}, 95% CI = ({auprc_lower:.4f}, {auprc_upper:.4f})")

# --- 9. Write the pooled per-patient predictions next to the per-fold files ---
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)
print(f"\n✅ All fold prediction results have been merged and saved to: {all_results_path}")


Random seed fixed as 42
Using device: cuda

===== Fold 1 =====
Random seed fixed as 43
Epoch 001 | Loss: 0.6937 | Val AUROC: 0.2430 | LR=0.000500
Epoch 002 | Loss: 0.5409 | Val AUROC: 0.3315 | LR=0.000500
Epoch 003 | Loss: 0.4296 | Val AUROC: 0.4298 | LR=0.000500
Epoch 004 | Loss: 0.3486 | Val AUROC: 0.5281 | LR=0.000500
Epoch 005 | Loss: 0.2919 | Val AUROC: 0.6404 | LR=0.000500
Epoch 006 | Loss: 0.2752 | Val AUROC: 0.7135 | LR=0.000500
Epoch 007 | Loss: 0.2385 | Val AUROC: 0.7458 | LR=0.000500
Epoch 008 | Loss: 0.2313 | Val AUROC: 0.7921 | LR=0.000500
Epoch 009 | Loss: 0.2161 | Val AUROC: 0.8244 | LR=0.000500
Epoch 010 | Loss: 0.2032 | Val AUROC: 0.8497 | LR=0.000500
Epoch 011 | Loss: 0.1952 | Val AUROC: 0.8581 | LR=0.000500
Epoch 012 | Loss: 0.1904 | Val AUROC: 0.8708 | LR=0.000500
Epoch 013 | Loss: 0.1696 | Val AUROC: 0.8778 | LR=0.000500
Epoch 014 | Loss: 0.1621 | Val AUROC: 0.8876 | LR=0.000500
Epoch 015 | Loss: 0.1679 | Val AUROC: 0.8834 | LR=0.000500
Epoch 016 | Loss: 0.1881 | V