## Step 1: Data Preprocessing

In [None]:
import pandas as pd 
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the long-format table (one row per patient per test time point).
df_series_filled = pd.read_csv(
    './Poisoning_Prediction/LSTM/clean_data/poisoning_data_sequence_filled_onehot.csv'
)

# Keep the first row of every patient so each patient is counted once below.
df_unique = df_series_filled.drop_duplicates(subset='Hospital ID', ignore_index=True)

# Per-patient class balance of the two candidate labels (NaN shown as its own class).
print(df_unique["Outcome_other"].value_counts(dropna=False))
print(df_unique["Outcome"].value_counts(dropna=False))

# Parse the test-time column; values that cannot be parsed become NaT instead of raising.
df_series_filled['检验时间'] = pd.to_datetime(
    df_series_filled['检验时间'],
    errors='coerce',
)

# Static (per-patient) feature columns fed to the model alongside the sequence.
# The list is assembled from (feature, levels) pairs; each pair expands to the
# one-hot column names "<feature>_<level>". Order matters: it fixes the layout
# of the static input vector.
_NO_YES = ('No', 'Yes')
_NO_UNKNOWN_YES = ('No', 'Unknown', 'Yes')

_continuous_static = ['Age', 'Length of Stay', 'Weight']

_one_hot_static = [
    ('Gender', ('Female', 'Male')),
    ('Education Level', (
        'Illiterate',
        'Junior High School',
        'Primary School',
        'Senior High School',
        'University Degree',
        'Unknown',
    )),
    ('Type of Poisoning', ('Alcohol', 'Industrial', 'Pesticide', 'Pharmaceutical')),
    # Medical history: three-level encoding.
    ('Hypertension', _NO_UNKNOWN_YES),
    ('Hyperlipidemia', _NO_UNKNOWN_YES),
    ('Diabetes Mellitus', _NO_UNKNOWN_YES),
    ('Cerebrovascular Disease', _NO_UNKNOWN_YES),
    ('Heart Disease', _NO_UNKNOWN_YES),
    ('Allergy History', _NO_UNKNOWN_YES),
    ('Cancer', _NO_UNKNOWN_YES),
    ('Poisoning', _NO_UNKNOWN_YES),
    ('degree of poisoning', ('High', 'Low', 'Moderate', 'Undetermined')),
    # Habits and symptoms: two-level encoding.
    ('Smoking Status', _NO_YES),
    ('Alcohol Consumption Status', _NO_YES),
    ('Shortness of Breath', _NO_YES),
    ('Chest Pain', _NO_YES),
    ('Cough', _NO_YES),
    ('Pre-syncope', _NO_YES),
    ('Altered Consciousness or Syncope', _NO_YES),
    ('Sore Throat', _NO_YES),
    ('Fever', _NO_YES),
    ('Fatigue', _NO_YES),
    ('Lower Limb Edema', _NO_YES),
    ('Palpitations', _NO_YES),
    ('Vomiting', _NO_YES),
    ('Nausea', _NO_YES),
    ('Weakness', _NO_YES),
    ('Headache', _NO_YES),
    ('Residence', ('Rural', 'Unknown', 'Urban')),
]

static_cols = _continuous_static + [
    f'{feature}_{level}'
    for feature, levels in _one_hot_static
    for level in levels
]

# Time-varying feature columns that form each patient's input sequence.
# They are grouped only for readability; the concatenation order fixes the
# layout of the per-time-step feature vector.
_vital_signs = [
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Heart Rate',
    'Respiratory Rate',
]
_blood_counts = [
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hemoglobin Concentration',
    'Mean Corpuscular Volume',
    'Mean Corpuscular Hemoglobin',
    'Mean Corpuscular Hemoglobin Concentration',
    'Platelet Count',
    'Mean Platelet Volume',
]
_chemistry_and_markers = [
    'Albumin',
    'Alanine Aminotransferase',
    'Aspartate Aminotransferase',
    'Total Bilirubin',
    'Direct Bilirubin',
    'Lactate Dehydrogenase',
    'Urea',
    'Uric Acid',
    'Creatine Kinase',
    'Creatine Kinase-MB',
    'Troponin I',
    'High-Sensitivity C-Reactive Protein',
    'Homocysteine',
]
_treatment_counts = [
    'Number of Hemoperfusion Sessions',
    'Number of Blood Purification Sessions',
]

dynamic_cols = [
    *_vital_signs,
    *_blood_counts,
    *_chemistry_and_markers,
    *_treatment_counts,
]


# Standardize the three continuous static features in place.
# NOTE(review): both scalers in this cell are fitted on ALL rows, i.e. before
# the cross-validation split performed later, so test-fold statistics leak into
# the features. Fitting them inside each fold would remove that leakage.
continuous_static_cols = ['Age', 'Length of Stay', 'Weight']
static_scaler = StandardScaler()
df_series_filled[continuous_static_cols] = static_scaler.fit_transform(
    df_series_filled[continuous_static_cols]
)

# One row of static features per patient (first non-null value of each column).
static_df = df_series_filled.groupby("Hospital ID")[static_cols].first().reset_index()
# Indexed view of the same rows, so the per-patient lookup in the loop below is
# a hash lookup instead of a boolean-mask scan over every patient.
static_lookup = static_df.set_index("Hospital ID")[static_cols]

# 9. build sequence
# `scaler` is the scaler of the dynamic features from here on.
scaler = StandardScaler()
scaler.fit(df_series_filled[dynamic_cols])
all_sequences, all_statics, all_labels = [], [], []

# Label column. Per the recorded notebook output, "Outcome" marks non-recovery;
# set this to "Outcome_other" to predict death instead.
label_col = "Outcome"

for pid, group in df_series_filled.groupby("Hospital ID"):
    if pid not in static_lookup.index:
        continue

    # Stable sort: rows with identical (or NaT) timestamps keep their file
    # order, so the sequence order is reproducible.
    group_sorted = group.sort_values("检验时间", kind="stable")
    X_dyn = scaler.transform(group_sorted[dynamic_cols])
    # The label is read from the chronologically last row of the patient.
    y = group_sorted[label_col].iloc[-1]
    static_values = static_lookup.loc[pid].to_numpy()

    all_sequences.append(torch.tensor(X_dyn, dtype=torch.float32))
    all_statics.append(torch.tensor(static_values, dtype=torch.float32))
    all_labels.append(y)

# 10. Padding
# Shorter sequences are right-padded with 0.0, which after standardization is
# the feature mean; the true lengths are not stored here.
X_padded = pad_sequence(all_sequences, batch_first=True, padding_value=0.0)
X_static = torch.stack(all_statics)
y_tensor = torch.tensor(all_labels, dtype=torch.float32).unsqueeze(1)

print(f"number of patients: {len(all_sequences)}")
print(f"Dynamic Feature Input Shape: {X_padded.shape}")
print(f"static feature input shape: {X_static.shape}")
print(f"tag shape: {y_tensor.shape}")

print('dynamic feature dimension:',len(dynamic_cols))



Outcome_other 分布（是否死亡）：
Outcome_other
0.0    889
1.0     82
Name: count, dtype: int64

Outcome 分布（是否未治愈）：
Outcome
0.0    731
1.0    240
Name: count, dtype: int64
患者数: 971
动态特征输入形状: torch.Size([971, 25, 27])
静态特征输入形状: torch.Size([971, 78])
标签形状: torch.Size([971, 1])


#### Step 2: Five-fold cross-validation. Within each fold, 1/8 of the training portion is held out as a validation set (i.e., 70% training, 10% validation, 20% test)

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to deterministic kernels with
    auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

# Output directory for this experiment: per-fold checkpoints and per-fold /
# pooled prediction CSVs are written here. Created if it does not exist yet.
save_path = './Poisoning_Prediction/LSTM/predict_non-recovery_valid_test_5cv/'
os.makedirs(save_path, exist_ok=True)

class LSTMWithStatic(nn.Module):
    """LSTM over a feature sequence, fused with static features, producing one logit.

    The final hidden state of the top LSTM layer is concatenated with the
    static feature vector and passed through a two-layer MLP head. The output
    is a raw logit (no sigmoid applied).

    Args:
        input_dim: Number of features per time step.
        static_dim: Number of static features per sample.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers. The MLP head uses a
            fixed dropout of 0.5 regardless of this value.
    """

    def __init__(self, input_dim, static_dim, hidden_dim=64, num_layers=2, dropout=0.5):
        super().__init__()
        # Inter-layer dropout has no effect with a single layer and makes
        # PyTorch emit a warning, so it is only passed for stacked LSTMs.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim + static_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1)
        )

    def forward(self, x_seq, x_static, lengths=None):
        """Return one logit per sample.

        Args:
            x_seq: Tensor of shape (batch, time, input_dim).
            x_static: Tensor of shape (batch, static_dim).
            lengths: Optional true (unpadded) length of every sequence, as a
                sequence of ints or a 1-D tensor; every length must be >= 1.
                When given, the batch is packed so the hidden state is taken
                at each sequence's last real step. When omitted, the hidden
                state after the final time step of ``x_seq`` is used, padding
                included (the original behaviour).

        Returns:
            Tensor of shape (batch, 1) with raw logits.
        """
        if lengths is None:
            _, (h_n, _) = self.lstm(x_seq)
        else:
            # pack_padded_sequence requires the lengths on the CPU as int64.
            lengths_cpu = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                x_seq, lengths_cpu, batch_first=True, enforce_sorted=False
            )
            # With enforce_sorted=False the returned hidden state is already
            # restored to the original batch order.
            _, (h_n, _) = self.lstm(packed)
        h_last = h_n[-1]  # hidden state of the top layer: (batch, hidden_dim)
        combined = torch.cat([h_last, x_static], dim=1)
        out = self.fc(combined)
        return out

def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42):
    """Estimate a metric and its 95% percentile confidence interval by bootstrap.

    Samples are drawn with replacement ``n_bootstrap`` times. Resamples whose
    labels contain fewer than two distinct values are skipped, because ranking
    metrics such as AUROC are undefined for them.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted scores, same length as ``y_true``.
        metric_fn: Callable ``metric_fn(y_true, y_pred) -> float``.
        n_bootstrap: Number of bootstrap resamples to draw.
        seed: Seed of the private random generator used for resampling.

    Returns:
        Tuple ``(mean, lower, upper)``: the mean of the bootstrap scores and
        their 2.5th and 97.5th percentiles.

    Raises:
        ValueError: If the inputs are empty, differ in length, or no resample
            contained at least two distinct labels.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_metric_ci requires at least one sample")
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred differ in length: {n_samples} vs {len(y_pred)}"
        )

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        resampled_true = y_true[idx]
        if len(np.unique(resampled_true)) < 2:
            continue
        scores.append(metric_fn(resampled_true, y_pred[idx]))

    if not scores:
        # Without this check the percentile of an empty list would fail with
        # an unrelated-looking error (or silently yield NaN).
        raise ValueError(
            "no bootstrap resample contained two distinct labels; "
            "cannot estimate a confidence interval"
        )

    lower, upper = np.percentile(scores, [2.5, 97.5])
    return np.mean(scores), lower, upper

# Cross-validation layout: 5 shuffled folds with a fixed seed; the predictions
# of every fold are collected in `all_results`.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
all_results = []

# Optimisation hyper-parameters.
max_epochs, batch_size = 100, 16
lr = weight_decay = 5e-4
patience = 12  # epochs without validation-AUROC improvement before stopping

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Outer loop: one iteration per cross-validation fold. The fold's held-out
# part is the test set; 1/8 of the remainder becomes the validation set used
# for early stopping and checkpoint selection.
for fold, (train_val_idx, test_idx) in enumerate(kf.split(X_padded)):
    print(f"\n===== Fold {fold+1}/{n_splits} =====")
    set_seed(42 + fold)
    X_train_val_seq, X_test_seq = X_padded[train_val_idx], X_padded[test_idx]
    X_train_val_static, X_test_static = X_static[train_val_idx], X_static[test_idx]
    y_train_val, y_test = y_tensor[train_val_idx], y_tensor[test_idx]

    # Stratified so the validation set keeps the class ratio of the training part.
    X_train_seq, X_val_seq, X_train_static, X_val_static, y_train, y_val = train_test_split(
        X_train_val_seq, X_train_val_static, y_train_val,
        test_size=1/8, random_state=42, stratify=y_train_val
    )
    # pos_weight: up-weight positives by the negative/positive ratio of the
    # training split (denominator clamped to 1 to avoid division by zero).
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / max(num_pos,1), dtype=torch.float32).to(device)
    print(f"pos_weight = {pos_weight:.2f}  (neg={num_neg}, pos={num_pos})")
    train_loader = DataLoader(TensorDataset(X_train_seq, X_train_static, y_train),
                              batch_size=batch_size, shuffle=True)
    model = LSTMWithStatic(
        input_dim=X_padded.shape[2],
        static_dim=X_static.shape[1],
        hidden_dim=64,
        dropout=0.5
    ).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Start below any attainable AUROC so the first epoch always writes a
    # checkpoint; otherwise a stale file from a previous run could be loaded.
    best_auroc = float("-inf")
    wait = 0
    best_model_path = os.path.join(save_path, f"fold_{fold+1}_best_model.pt")

    # The validation set is evaluated as one batch every epoch: move it to the
    # device once instead of once per epoch.
    X_val_seq_dev = X_val_seq.to(device)
    X_val_static_dev = X_val_static.to(device)
    y_val_np = y_val.numpy().reshape(-1)

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0
        for batch_seq, batch_static, batch_y in train_loader:
            batch_seq, batch_static, batch_y = batch_seq.to(device), batch_static.to(device), batch_y.to(device)
            optimizer.zero_grad()
            logits = model(batch_seq, batch_static)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        model.eval()
        with torch.no_grad():
            # torch.sigmoid is numerically stable; the hand-written
            # 1 / (1 + exp(-x)) overflows for large negative logits.
            val_logits = model(X_val_seq_dev, X_val_static_dev)
            y_pred = torch.sigmoid(val_logits).cpu().numpy().flatten()
            y_true = y_val_np
            auroc_val = roc_auc_score(y_true, y_pred)
        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f}")
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # Score the test fold with the checkpoint that had the best validation AUROC.
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        test_logits = model(X_test_seq.to(device), X_test_static.to(device))
        y_pred = torch.sigmoid(test_logits).cpu().numpy().flatten()
        y_true = y_test.numpy().reshape(-1)
        result_df = pd.DataFrame({"y_test": y_true, "y_pred": y_pred})
        result_df.to_csv(os.path.join(save_path, f"fold_{fold+1}_results.csv"), index=False)
        all_results.append(result_df)

# Pool the test-fold predictions of all folds into a single table.
all_results_df = pd.concat(all_results, axis=0, ignore_index=True)
y_all_true = all_results_df["y_test"].to_numpy()
y_all_pred = all_results_df["y_pred"].to_numpy()

# Bootstrap the pooled predictions for AUROC and AUPRC with 95% intervals.
mean_auroc, auc_lower, auc_upper = bootstrap_metric_ci(
    y_all_true, y_all_pred, roc_auc_score
)
mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(
    y_all_true, y_all_pred, average_precision_score
)

print("\n===== 5-Fold Cross Validation Results (Bootstrap) =====")
print(f"AUROC: Mean = {mean_auroc:.4f}, 95% CI = ({auc_lower:.4f}–{auc_upper:.4f})")
print(f"AUPRC: Mean = {mean_auprc:.4f}, 95% CI = ({auprc_lower:.4f}–{auprc_upper:.4f})")

# Persist the pooled predictions next to the per-fold files.
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)

