## Step 1: Data Preprocessing

In [None]:
import pandas as pd

# Longitudinal table: a patient appears once per examination (examination time).
# The Chinese admission-number and residence headers get English names.
df_series = pd.read_csv(
    './Poisoning_Prediction/LSTM/all_poisoning_data_sequence.csv'
).rename(columns={'住院号': 'Hospital ID', '居住地（1农村 2城市）': 'Residence'})

# Wide table: one row per patient; only the residence header is renamed here.
df_wide = pd.read_excel(
    './Poisoning_Prediction/rawdata_中毒数据_patient_last_1114.xlsx'
).rename(columns={'居住地（1农村 2城市）': 'Residence'})

- Distinguish between continuous and categorical indicators

In [4]:
# Categorical (coded) indicators, assembled group by group.
# The final order is significant: it fixes the column order of later tables.
_baseline_features = ['Gender', 'Education Level', 'Type of Poisoning']

_history_features = [
    'Hypertension',
    'Hyperlipidemia',
    'Diabetes Mellitus',
    'Cerebrovascular Disease',
    'Heart Disease',
    'Allergy History',
    'Cancer',
]

_exposure_features = [
    'Poisoning',
    'degree of poisoning',
    'Smoking Status',
    'Alcohol Consumption Status',
]

_symptom_features = [
    'Shortness of Breath',
    'Chest Pain',
    'Cough',
    'Pre-syncope',
    'Altered Consciousness or Syncope',
    'Sore Throat',
    'Fever',
    'Fatigue',
    'Lower Limb Edema',
    'Palpitations',
    'Vomiting',
    'Nausea',
    'Weakness',
    'Headache',
]

features_categorical = [
    *_baseline_features,
    *_history_features,
    *_exposure_features,
    *_symptom_features,
    'Residence',
]

In [5]:
# Code book shared by every 0/1 indicator column.
_YES_NO_LABELS = {1: "Yes", 0: "No"}

# Columns whose codes are not a plain Yes/No flag.
_MULTI_LEVEL_LABELS = {
    "Gender": {1: "Male", 0: "Female"},
    "Education Level": {
        1: "Illiterate",
        2: "Primary School",
        3: "Junior High School",
        4: "Senior High School",
        5: "University Degree",
    },
    "Type of Poisoning": {
        1: "Industrial",
        2: "Pharmaceutical",
        3: "Pesticide",
        4: "Alcohol",
        0: "Uncertain",
    },
    "degree of poisoning": {
        0: "Undetermined",
        1: "Low",
        2: "Moderate",
        3: "High",
    },
    "Residence": {1: "Rural", 2: "Urban"},
}

# Every column that has a code book, in the order the entries are registered.
# "Smoking" and "Altered Mental Status or Syncope(AMS or Sync)" have no
# counterpart in features_categorical as defined in this notebook; they are
# kept so the dictionary content is unchanged.
_MAPPED_COLUMNS = [
    "Gender",
    "Education Level",
    "Type of Poisoning",
    "Hypertension",
    "Hyperlipidemia",
    "Diabetes Mellitus",
    "Cerebrovascular Disease",
    "Heart Disease",
    "Allergy History",
    "Cancer",
    "Poisoning",
    "degree of poisoning",
    "Smoking",
    "Alcohol Consumption Status",
    "Shortness of Breath",
    "Chest Pain",
    "Cough",
    "Pre-syncope",
    "Altered Mental Status or Syncope(AMS or Sync)",
    "Sore Throat",
    "Fever",
    "Fatigue",
    "Lower Limb Edema",
    "Palpitations",
    "Vomiting",
    "Nausea",
    "Weakness",
    "Headache",
    "Residence",
    "Smoking Status",
    "Altered Consciousness or Syncope",
]

# column name -> {numeric code -> English label}; each column gets its own dict.
value_mappings_en = {
    column: dict(_MULTI_LEVEL_LABELS.get(column, _YES_NO_LABELS))
    for column in _MAPPED_COLUMNS
}

In [None]:
# Replace numeric codes with English labels, column by column.
# Only columns that have a code book AND exist in the wide table are touched;
# Series.map turns any code that is absent from the code book into NaN.
df_mapped_wide = df_wide.copy()
mappable_cols = [
    name for name in features_categorical
    if name in value_mappings_en and name in df_mapped_wide.columns
]
for name in mappable_cols:
    code_book = value_mappings_en[name]
    df_mapped_wide[name] = df_mapped_wide[name].map(code_book)

In [None]:
# Columns taken from the wide table: the patient key, three numeric
# baseline variables, then every categorical feature.
useful_cols_from_wide_df = ['Hospital ID', 'Age', 'Length of Stay', 'Weight', *features_categorical]

In [9]:
# Keep only the selected columns of the label-mapped wide table.
df_wide_remain = df_mapped_wide.loc[:, useful_cols_from_wide_df]

In [10]:
# Preview the first five rows of the reduced wide table.
df_wide_remain.head(n=5)

Unnamed: 0,Hospital ID,Age,Length of Stay,Weight,Gender,Education Level,Type of Poisoning,Hypertension,Hyperlipidemia,Diabetes Mellitus,...,Sore Throat,Fever,Fatigue,Lower Limb Edema,Palpitations,Vomiting,Nausea,Weakness,Headache,Residence
0,5305990,48,6.0,65.0,Male,Junior High School,Pesticide,No,No,No,...,No,No,No,No,No,Yes,Yes,No,No,Rural
1,5483805,69,6.0,65.0,Female,Illiterate,Pesticide,Yes,No,No,...,No,No,No,No,No,Yes,Yes,No,No,Rural
2,5386113,21,3.0,90.0,Male,Junior High School,Pesticide,No,No,No,...,No,No,No,No,No,Yes,Yes,No,No,Rural
3,5069871,25,3.0,60.0,Female,Junior High School,Pesticide,No,No,No,...,No,No,No,No,No,No,No,No,No,Rural
4,5173751,65,3.0,65.0,Male,Junior High School,Pesticide,No,No,No,...,No,No,No,No,No,No,Yes,Yes,Yes,


In [None]:
# Columns kept from the longitudinal table, assembled group by group.
# Headers are still the original Chinese ones at this point; a later cell
# renames them to English through rename_mapping.
_series_key_cols = ['Hospital ID', '检验时间']  # patient key, examination time

# Vital signs
_series_vital_cols = ['收缩压', '舒张压', '心率', '呼吸频率']

# Lab tests
_series_lab_cols = [
    '血胆碱酯酶检测结果',
    '白细胞计数',
    '红细胞计数',
    '血红蛋白浓度',
    '平均红细胞体积',
    '平均红细胞血红蛋白含量',
    '平均红细胞血红蛋白浓度',
    '血小板计数',
    '平均血小板体积',
    '白蛋白',
    '谷丙转氨酶',
    '谷草转氨酶',
    '总胆红素',
    '直接胆红素',
    '乳酸脱氢酶',
    '尿素',
    '血肌酐',
    '尿酸',
    '肌酸激酶',
    '肌酸激酶-MB 同工酶',
    '肌钙蛋白Ⅰ',
    '血沉',
    '超敏C反应蛋白',
    '同型半胱氨酸',
    '降钙素原',
    '氨基末端脑利钠肽前体测定',
    '钾',
    '钠',
    '氯',
    '二氧化碳',
    '凝血酶原时间',
    'D二聚体',
    '乳酸',
    # '血红蛋白' is deliberately left out: its values are malformed
    # (e.g. "129,30.5,328"), i.e. data-entry errors.
    '红细胞压积',
]

# Interventions
_series_intervention_cols = [
    '血液灌流（次数）',
    '血液净化（次数）',
    '阿托品使用剂量',
    '长托宁使用剂量',
    '碘解磷定使用剂量、频次',
    '高压氧治疗时间、频次',
]

# Outcome labels
_series_outcome_cols = ['Outcome_other', 'Outcome']

useful_cols_from_series_df = (
    _series_key_cols
    + _series_vital_cols
    + _series_lab_cols
    + _series_intervention_cols
    + _series_outcome_cols
)

In [None]:
# Chinese header -> English column name, built per group and merged in order.
# The order of the merged dict matters: features_continuous is derived from
# list(rename_mapping.values()).
_vital_sign_names = {
    '收缩压': 'Systolic Blood Pressure',
    '舒张压': 'Diastolic Blood Pressure',
    '心率': 'Heart Rate',
    '呼吸频率': 'Respiratory Rate',
}

_lab_test_names = {
    '血胆碱酯酶检测结果': 'Blood Cholinesterase Test Results',
    '白细胞计数': 'White Blood Cell Count',
    '红细胞计数': 'Red Blood Cell Count',
    '血红蛋白浓度': 'Hemoglobin Concentration',
    '平均红细胞体积': 'Mean Corpuscular Volume',
    '平均红细胞血红蛋白含量': 'Mean Corpuscular Hemoglobin',
    '平均红细胞血红蛋白浓度': 'Mean Corpuscular Hemoglobin Concentration',
    '血小板计数': 'Platelet Count',
    '平均血小板体积': 'Mean Platelet Volume',
    # The raw data may split albumin into First/Last columns; a generic name is used here.
    '白蛋白': 'Albumin',
    '谷丙转氨酶': 'Alanine Aminotransferase',
    '谷草转氨酶': 'Aspartate Aminotransferase',
    '总胆红素': 'Total Bilirubin',
    '直接胆红素': 'Direct Bilirubin',
    '乳酸脱氢酶': 'Lactate Dehydrogenase',
    '尿素': 'Urea',
    '血肌酐': 'Serum Creatinine',
    '尿酸': 'Uric Acid',
    '肌酸激酶': 'Creatine Kinase',
    '肌酸激酶-MB 同工酶': 'Creatine Kinase-MB',
    '肌钙蛋白Ⅰ': 'Troponin I',
    '血沉': 'Erythrocyte Sedimentation Rate',
    '超敏C反应蛋白': 'High-Sensitivity C-Reactive Protein',
    '同型半胱氨酸': 'Homocysteine',
    '降钙素原': 'Procalcitonin',
    '氨基末端脑利钠肽前体测定': 'Amino-Terminal Pro-B-Type Natriuretic Peptide',
    '钾': 'Potassium',
    '钠': 'Sodium',
    '氯': 'Chloride',
    '二氧化碳': 'Carbon Dioxide',
    '凝血酶原时间': 'Prothrombin Time',
    'D二聚体': 'D-Dimer',
    '乳酸': 'Lactate',
    '红细胞压积': 'Hematocrit',
}

_intervention_names = {
    '血液灌流（次数）': 'Number of Hemoperfusion Sessions',
    '血液净化（次数）': 'Number of Blood Purification Sessions',
    '阿托品使用剂量': 'Atropine Dosage',
    # NOTE(review): 长托宁 appears to be the trade name of penehyclidine
    # hydrochloride, so this English label looks like a mistranslation.
    # It is left unchanged because saved column names depend on it - confirm.
    '长托宁使用剂量': 'Long-acting Nitroglycerin Dosage',
    '碘解磷定使用剂量、频次': 'Pralidoxime Dosage',
    '高压氧治疗时间、频次': 'Hyperbaric Oxygen Therapy Duration and Frequency',
}

rename_mapping = {**_vital_sign_names, **_lab_test_names, **_intervention_names}

In [None]:
# Restrict the longitudinal table to the selected columns.
df_series_remain = df_series.loc[:, useful_cols_from_series_df]

In [None]:
# Switch the remaining Chinese headers to their English names.
df_series_remain = df_series_remain.rename(rename_mapping, axis="columns")

In [17]:
# Continuous features: the three wide-table numerics followed by every
# renamed longitudinal measurement (in rename_mapping order).
features_continuous = ['Age', 'Length of Stay', 'Weight', *rename_mapping.values()]

In [None]:
# Right join on the patient key (left: wide table, right: longitudinal table):
# every examination row is kept and receives the patient's wide-table columns.
df_merged = df_wide_remain.merge(df_series_remain, how='right', on='Hospital ID')

In [None]:
# List the merged column names.
list(df_merged.columns)

In [None]:
# Force every continuous column to a numeric dtype; anything that cannot be
# parsed as a number becomes NaN.
# Only the continuous features actually present in df_merged are processed.
merged_column_names = set(df_merged.columns)
continuous_cols = [name for name in features_continuous if name in merged_column_names]

for name in continuous_cols:
    df_merged[name] = pd.to_numeric(df_merged[name], errors='coerce')

In [None]:
# Share of missing values per continuous column (columns absent from
# df_merged are skipped).
continuous_cols = [col for col in features_continuous if col in df_merged.columns]

# Fraction of NaN per column, in [0, 1].
missing_ratios = df_merged[continuous_cols].isna().mean()

# Same figures as percentages, rounded and sorted from most to least missing.
missing_summary = missing_ratios.mul(100).round(2).sort_values(ascending=False)

print("Proportion of missing continuous variables ( %):")
print(missing_summary)

In [None]:
# Continuous features whose missing rate is strictly above 90%.
high_missing_features = missing_ratios.loc[missing_ratios.gt(0.90)].index.tolist()

# Report each of them with its missing percentage.
print("Continuous variables with missing rate> 90%:")
for feat, ratio in missing_ratios[high_missing_features].items():
    print(f"{feat}: {ratio*100:.2f}%")

In [None]:
# Keep the continuous features whose missing rate is at most 90%.
_dropped_features = set(high_missing_features)
features_continuous_clean = [
    name for name in features_continuous if name not in _dropped_features
]

In [None]:
# Remove the high-missing columns from the merged table.
df_merged_clean = df_merged.drop(high_missing_features, axis="columns")

In [26]:
# Parse the examination time (unparseable values become NaT), then order the
# rows by patient and time with a fresh 0..n-1 index.
df_merged_clean = df_merged_clean.assign(
    **{"检验时间": pd.to_datetime(df_merged_clean["检验时间"], errors="coerce")}
).sort_values(by=["Hospital ID", "检验时间"], ignore_index=True)

In [None]:
# Drop repeated examinations: rows sharing the same patient AND examination
# time collapse to their first occurrence. A patient can still have several
# rows afterwards, one per distinct examination time.
exam_key = ["Hospital ID", "检验时间"]
print(f"Data dimension before demultiplexing：{df_merged_clean.shape}")
df_merged_clean = df_merged_clean.drop_duplicates(subset=exam_key, keep="first", ignore_index=True)
print(f"Data dimension after demultiplexing：{df_merged_clean.shape}")

去重前数据维度：(4267, 63)
去重后数据维度：(3074, 63)


In [None]:
# One row per patient (the first row of each Hospital ID is kept).
df_unique = df_merged_clean.drop_duplicates(subset=['Hospital ID']).reset_index(drop=True)

# Patient-level class counts of both outcome labels, NaN included.
for heading, outcome_col in (
    ("\nOutcome_other Distribution (Mortality):", "Outcome_other"),
    ("\nOutcome Distribution (Not Cured):", "Outcome"),
):
    print(heading)
    print(df_unique[outcome_col].value_counts(dropna=False))

In [None]:
# ---------------------- Outcome distribution per poisoning type ----------------------
def _outcome_rate_by_group(frame, group_col, outcome_col, labels, rate_col):
    """Cross-tabulate an outcome by group and append the positive-outcome rate.

    Parameters
    ----------
    frame : DataFrame with one row per patient.
    group_col : column to group by.
    outcome_col : outcome column, coded 0 / 1.
    labels : dict mapping the codes 0 and 1 to display labels; ``labels[1]``
        is the label of the positive outcome.
    rate_col : name of the percentage column that is appended.

    Returns
    -------
    DataFrame indexed by group, with one count column per outcome value and
    ``rate_col`` = positive count / row total * 100, rounded to 2 decimals.
    """
    positive_label = labels[1]
    table = (
        frame.groupby(group_col)[outcome_col]
        .value_counts()
        .unstack(fill_value=0)
        .rename(columns=labels)
    )
    # When no row has the positive outcome, the unstacked table lacks that
    # column; add it as zeros instead of failing with a KeyError below.
    if positive_label not in table.columns:
        table[positive_label] = 0
    # Row totals are taken before the rate column is appended, so the
    # percentage column never contaminates its own denominator.
    totals = table.sum(axis=1)
    table[rate_col] = (table[positive_label] / totals * 100).round(2)
    return table


# Mortality (Outcome_other) per poisoning type.
outcome_by_type = _outcome_rate_by_group(
    df_unique, "Type of Poisoning", "Outcome_other",
    {0: "非死亡", 1: "死亡"}, "死亡比例(%)",
)

print(outcome_by_type)

# Same table for the composite outcome (Outcome).
outcome_by_type2 = _outcome_rate_by_group(
    df_unique, "Type of Poisoning", "Outcome",
    {0: "治愈/好转", 1: "死亡或未愈"}, "不良结局比例(%)",
)

print(outcome_by_type2)

In [52]:
# Sanity check: (rows, columns) of the label-mapped wide table.
df_mapped_wide.shape

(971, 106)

- Save unfilled time series data

In [None]:
# Persist the cleaned, still unfilled longitudinal table (no index column).
unfilled_sequence_csv = './Poisoning_Prediction/LSTM/clean_data/poisoning_data_sequence_clean_without_filled.csv'
df_merged_clean.to_csv(unfilled_sequence_csv, index=False)

In [None]:
## Filling missing values in time series data
import pandas as pd
import numpy as np

# Reload the unfilled table from the same relative location the notebook
# saves it to. The previous absolute '/home/<user>/...' path only worked on
# one machine and could silently point at a different file than the one
# just written.
df_merged_clean = pd.read_csv('./Poisoning_Prediction/LSTM/clean_data/poisoning_data_sequence_clean_without_filled.csv')

# Order rows by patient and examination time so that the per-patient
# forward/backward fills follow time order.
# NOTE(review): after read_csv the time column is text, so this sort is
# lexicographic - presumably fine for ISO-formatted timestamps; confirm.
df_merged_clean = df_merged_clean.sort_values(["Hospital ID", "检验时间"]).reset_index(drop=True)

# Work on a copy so the unfilled table stays available for comparison.
df_filled = df_merged_clean.copy()

# Column groups used by the filling steps that follow.

# Patient-level (static) numerics: filled with the global median.
common_continuous_cols = ['Age', 'Length of Stay', 'Weight']

# Repeated measurements: filled per patient by forward then backward fill,
# with the global median as a last resort (no interpolation is performed).
_vital_sign_series = [
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Heart Rate',
    'Respiratory Rate',
]
_lab_series = [
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hemoglobin Concentration',
    'Mean Corpuscular Volume',
    'Mean Corpuscular Hemoglobin',
    'Mean Corpuscular Hemoglobin Concentration',
    'Platelet Count',
    'Mean Platelet Volume',
    'Albumin',
    'Alanine Aminotransferase',
    'Aspartate Aminotransferase',
    'Total Bilirubin',
    'Direct Bilirubin',
    'Lactate Dehydrogenase',
    'Urea',
    'Uric Acid',
    'Creatine Kinase',
    'Creatine Kinase-MB',
    'Troponin I',
    'High-Sensitivity C-Reactive Protein',
    'Homocysteine',
]
_intervention_series = [
    'Number of Hemoperfusion Sessions',
    'Number of Blood Purification Sessions',
]
series_continuous_cols = _vital_sign_series + _lab_series + _intervention_series

In [None]:
# Step 1 - static variables: fill with the column's global median.
for static_name in (c for c in common_continuous_cols if c in df_filled.columns):
    df_filled[static_name] = df_filled[static_name].fillna(df_filled[static_name].median())

# The sequential columns that actually exist in df_filled.
present_series_cols = [c for c in series_continuous_cols if c in df_filled.columns]

# Step 2 - sequential variables: carry the last observation forward within each patient.
for series_name in present_series_cols:
    df_filled[series_name] = df_filled.groupby('Hospital ID')[series_name].ffill()

# Step 3 - sequential variables: back-fill the leading gaps within each patient.
for series_name in present_series_cols:
    df_filled[series_name] = df_filled.groupby('Hospital ID')[series_name].bfill()

# Step 4 - fallback: whatever is still missing gets the global median of the
# column; prints "<number of missing values> : <median>" per affected column.
for series_name in present_series_cols:
    n_missing = df_filled[series_name].isna().sum()
    if n_missing == 0:
        continue
    global_median = df_filled[series_name].median()
    print(f"{n_missing} : {global_median:.2f}")
    df_filled[series_name] = df_filled[series_name].fillna(global_median)

In [None]:
# Missingness of the static numeric columns in the UNFILLED table.
cols = [col for col in common_continuous_cols if col in df_merged_clean.columns]
null_flags = df_merged_clean[cols].isna()

# 1. Missing count per column
missing_counts = null_flags.sum()
print("Missing value statistics:")
print(missing_counts)

# 2. Rows with at least one of these columns missing
mask = null_flags.any(axis=1)
df_missing = df_merged_clean.loc[mask]

print(f"\n {len(df_missing)} row missing.")

In [None]:
# Final check: does any column of df_filled still contain NaN?
missing_counts = df_filled.isna().sum()
missing_cols = missing_counts.loc[missing_counts > 0]

if not missing_cols.empty:
    print("⚠️ The following columns still contain missing values:")
    for col, count in missing_cols.items():
        print(f"  {col}: {count} missing values")
else:
    print("✅ No missing values in df_filled.")


In [None]:
## One-hot encoding of the categorical indicators

# Redefined here (same content as at the top of the notebook) so this part
# can run after reloading the saved CSV.
features_categorical = [
    'Gender', 'Education Level', 'Type of Poisoning',
    'Hypertension', 'Hyperlipidemia', 'Diabetes Mellitus',
    'Cerebrovascular Disease', 'Heart Disease', 'Allergy History', 'Cancer',
    'Poisoning', 'degree of poisoning',
    'Smoking Status', 'Alcohol Consumption Status',
    'Shortness of Breath', 'Chest Pain', 'Cough', 'Pre-syncope',
    'Altered Consciousness or Syncope', 'Sore Throat', 'Fever', 'Fatigue',
    'Lower Limb Edema', 'Palpitations', 'Vomiting', 'Nausea', 'Weakness',
    'Headache', 'Residence',
]

# Missing categories become their own explicit level, "Unknown".
for categorical_name in features_categorical:
    if categorical_name not in df_filled.columns:
        continue
    df_filled[categorical_name] = df_filled[categorical_name].fillna('Unknown')

# drop_first=False: every level keeps its own indicator column.
df_final_filled = pd.get_dummies(df_filled, columns=features_categorical, drop_first=False)

# Cast bool / int / float columns to float; other dtypes (e.g. the text
# examination time) are left untouched.
numeric_like_cols = df_final_filled.select_dtypes(include=['bool', 'int', 'float']).columns
df_final_filled[numeric_like_cols] = df_final_filled[numeric_like_cols].astype(float)


In [None]:
# Move the two outcome labels to the end of the table.
outcome_cols = ['Outcome_other', 'Outcome']
feature_cols = [name for name in df_final_filled.columns if name not in outcome_cols]
df_final_filled = df_final_filled[feature_cols + outcome_cols]

In [None]:
# Preview the first five rows of the filled, one-hot-encoded table.
df_final_filled.head(n=5)

In [None]:
# List the final column names (useful for writing the static/dynamic column lists).
list(df_final_filled.columns)

- Save the missing-value-filled, one-hot-encoded data

In [None]:
# Persist the filled, one-hot-encoded table (no index column).
filled_onehot_csv = './Poisoning_Prediction/LSTM/clean_data/poisoning_data_sequence_filled_onehot.csv'
df_final_filled.to_csv(filled_onehot_csv, index=False)

## Step 2: Preparing the LSTM Input (Sequence Input)

Each patient is one time series, sorted by examination time ("检验时间"). Each time point is a vector of numerical features, and each patient has a single final label (e.g., in-hospital death).

In [None]:
import pandas as pd 
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the filled, one-hot-encoded longitudinal table.
df_series_filled = pd.read_csv('./Poisoning_Prediction/LSTM/clean_data/poisoning_data_sequence_filled_onehot.csv')

# One row per patient (first row of each Hospital ID) for label statistics.
df_unique = df_series_filled.drop_duplicates(subset=['Hospital ID']).reset_index(drop=True)

# Patient-level class counts of both outcome labels, NaN included.
for heading, outcome_col in (
    ("\nStatistics Distribution of Outcome_other（是否死亡）：", "Outcome_other"),
    ("\nStatistics Distribution of Outcome（是否未治愈）：", "Outcome"),
):
    print(heading)
    print(df_unique[outcome_col].value_counts(dropna=False))

# The examination time is read back as text; parse it (invalid values -> NaT).
df_series_filled['检验时间'] = pd.to_datetime(df_series_filled['检验时间'], errors='coerce')

## Static (per-patient) feature columns
# Three numeric columns followed by the one-hot indicator columns, which are
# named "<feature>_<level>". The indicator names are generated from the
# feature -> levels table below, in table order.
_static_numeric_cols = ['Age', 'Length of Stay', 'Weight']

_no_unknown_yes = ['No', 'Unknown', 'Yes']
_no_yes = ['No', 'Yes']

_static_onehot_levels = {
    'Gender': ['Female', 'Male'],
    'Education Level': [
        'Illiterate',
        'Junior High School',
        'Primary School',
        'Senior High School',
        'University Degree',
        'Unknown',
    ],
    'Type of Poisoning': ['Alcohol', 'Industrial', 'Pesticide', 'Pharmaceutical'],
    **{
        feature: _no_unknown_yes
        for feature in (
            'Hypertension',
            'Hyperlipidemia',
            'Diabetes Mellitus',
            'Cerebrovascular Disease',
            'Heart Disease',
            'Allergy History',
            'Cancer',
            'Poisoning',
        )
    },
    'degree of poisoning': ['High', 'Low', 'Moderate', 'Undetermined'],
    **{
        feature: _no_yes
        for feature in (
            'Smoking Status',
            'Alcohol Consumption Status',
            'Shortness of Breath',
            'Chest Pain',
            'Cough',
            'Pre-syncope',
            'Altered Consciousness or Syncope',
            'Sore Throat',
            'Fever',
            'Fatigue',
            'Lower Limb Edema',
            'Palpitations',
            'Vomiting',
            'Nausea',
            'Weakness',
            'Headache',
        )
    },
    'Residence': ['Rural', 'Unknown', 'Urban'],
}

static_cols = _static_numeric_cols + [
    f"{feature}_{level}"
    for feature, levels in _static_onehot_levels.items()
    for level in levels
]

## Dynamic (per-examination) feature columns, assembled group by group
_dynamic_vital_signs = [
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Heart Rate',
    'Respiratory Rate',
]
_dynamic_lab_tests = [
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hemoglobin Concentration',
    'Mean Corpuscular Volume',
    'Mean Corpuscular Hemoglobin',
    'Mean Corpuscular Hemoglobin Concentration',
    'Platelet Count',
    'Mean Platelet Volume',
    'Albumin',
    'Alanine Aminotransferase',
    'Aspartate Aminotransferase',
    'Total Bilirubin',
    'Direct Bilirubin',
    'Lactate Dehydrogenase',
    'Urea',
    'Uric Acid',
    'Creatine Kinase',
    'Creatine Kinase-MB',
    'Troponin I',
    'High-Sensitivity C-Reactive Protein',
    'Homocysteine',
]
_dynamic_interventions = [
    'Number of Hemoperfusion Sessions',
    'Number of Blood Purification Sessions',
]
dynamic_cols = _dynamic_vital_signs + _dynamic_lab_tests + _dynamic_interventions

## Standardize the three numeric static columns (zero mean, unit variance)
# NOTE(review): the scaler is fit on every examination row of every patient,
# before any train/test split - patients with more examinations weigh more
# and held-out patients contribute to the statistics.
static_numeric_cols = ['Age', 'Length of Stay', 'Weight']
scaler = StandardScaler()
df_series_filled[static_numeric_cols] = scaler.fit_transform(df_series_filled[static_numeric_cols])

## Categorical variables were one-hot encoded earlier, so no encoding is needed here.
## One static feature row per patient: first non-null value of each column.
static_df = (
    df_series_filled
    .groupby("Hospital ID")[static_cols]
    .first()
    .reset_index()
)

# 9. Build one sequence per patient
# Fit a single normalizer on the dynamic columns of all rows (global
# standardization), then transform each patient's examinations with it.
scaler = StandardScaler().fit(df_series_filled[dynamic_cols])
all_sequences, all_statics, all_labels = [], [], []

for pid, group in df_series_filled.groupby("Hospital ID"):
    # Static feature row of this patient; skip the patient if there is none.
    static_values = static_df.loc[static_df["Hospital ID"].eq(pid), static_cols].to_numpy()
    if static_values.shape[0] == 0:
        continue

    # Examinations in time order; the label is read from the last one.
    visits = group.sort_values("检验时间")
    dynamic_scaled = scaler.transform(visits[dynamic_cols])
    final_label = visits["Outcome_other"].iloc[-1]

    all_sequences.append(torch.tensor(dynamic_scaled, dtype=torch.float32))
    all_statics.append(torch.tensor(static_values.squeeze(), dtype=torch.float32))
    all_labels.append(final_label)

# 10. Padding
# Sequences are right-padded with 0.0 up to the longest patient sequence.
X_padded = pad_sequence(all_sequences, batch_first=True, padding_value=0.0)
X_static = torch.stack(all_statics, dim=0)
y_tensor = torch.tensor(all_labels, dtype=torch.float32)[:, None]

# Shapes: X_padded (num_patients, seq_len, dyn_features),
#         X_static (num_patients, static_features), y_tensor (num_patients, 1)
for caption, value in (
    ("number of patients", len(all_sequences)),
    ("Dynamic Feature Input Shape", X_padded.shape),
    ("static feature input shape", X_static.shape),
    ("tag shape", y_tensor.shape),
):
    print(f"{caption}: {value}")


##### Five-fold cross-validation: in each fold, 1/8 of the non-test data is held out as a validation set (i.e., 70% training, 10% validation, and 20% test).

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
import numpy as np
import os
import random

# ===================== fixed random seed =====================
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also switches cuDNN to deterministic mode and disables its auto-tuner.
    """
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

# ===================== save path =====================
# Output directory for per-fold checkpoints and prediction CSVs.
# The trailing '/' matters: the per-fold results file name is appended to
# this string by plain concatenation further down.
save_path = './Poisoning_Prediction/LSTM/predict_death_valid_test_5cv/'
# Create the directory (and parents) if needed; no error when it already exists.
os.makedirs(save_path, exist_ok=True)

# ===================== model definition =====================
class LSTMWithStatic(nn.Module):
    """LSTM over the dynamic sequence, concatenated with static features.

    The hidden state of the top LSTM layer at each sequence's last real time
    step is concatenated with the static feature vector and passed through a
    two-layer MLP that returns one logit per sample.

    Parameters
    ----------
    input_dim : number of dynamic features per time step.
    static_dim : number of static features per sample.
    hidden_dim : LSTM hidden size.
    num_layers : number of stacked LSTM layers.
    dropout : dropout between LSTM layers (the MLP uses a fixed 0.5).
    """

    def __init__(self, input_dim, static_dim, hidden_dim=64, num_layers=2, dropout=0.5):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim + static_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1)
        )

    @staticmethod
    def _infer_lengths(x_seq):
        """Return, per sample, the 1-based position of its last non-zero step.

        Trailing time steps whose features are all exactly 0 are treated as
        padding; an all-zero sample yields length 0.
        """
        is_real_step = x_seq.ne(0).any(dim=2)  # (batch, time)
        positions = torch.arange(1, x_seq.size(1) + 1, device=x_seq.device)
        return (is_real_step.long() * positions).max(dim=1).values

    def forward(self, x_seq, x_static, lengths=None):
        """Compute one logit per sample.

        Parameters
        ----------
        x_seq : (batch, time, input_dim) tensor, right-padded with zeros.
        x_static : (batch, static_dim) tensor.
        lengths : optional true sequence lengths (tensor or sequence of int).
            When omitted they are inferred from the trailing all-zero steps.

        Returns
        -------
        (batch, 1) tensor of logits.
        """
        if lengths is None:
            lengths = self._infer_lengths(x_seq)
        # pack_padded_sequence needs CPU int64 lengths that are >= 1.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).reshape(-1).clamp(min=1).cpu()
        # Packing makes the LSTM stop at each sequence's last real step.
        # Without it, h_n is taken after the padded zeros and therefore
        # depends on how much padding the batch happens to contain.
        packed = nn.utils.rnn.pack_padded_sequence(
            x_seq, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        h_last = h_n[-1]  # top layer, last real step of every sequence
        combined = torch.cat([h_last, x_static], dim=1)
        out = self.fc(combined)
        return out

# ===================== Bootstrap function =====================
def bootstrap_metric_ci(y_true, y_pred, metric_fn, n_bootstrap=2000, seed=42, ci=95.0):
    """Bootstrap a metric and return its mean and percentile confidence interval.

    Parameters
    ----------
    y_true : array-like of ground-truth labels.
    y_pred : array-like of predictions, same length as ``y_true``.
    metric_fn : callable ``metric_fn(y_true, y_pred) -> float``.
    n_bootstrap : number of resamples drawn with replacement.
    seed : seed of the private random generator (results are reproducible).
    ci : confidence level in percent; the default 95.0 gives the
        2.5th / 97.5th percentiles.

    Returns
    -------
    (mean, lower, upper) over the valid resamples. Resamples containing a
    single class are skipped. If the input is empty or no resample is valid,
    ``(nan, nan, nan)`` is returned.

    Raises
    ------
    ValueError : if ``y_true`` and ``y_pred`` differ in length.
    """
    rng = np.random.RandomState(seed)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}"
        )

    undefined = (float("nan"), float("nan"), float("nan"))
    if n_samples == 0:
        return undefined

    scores = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n_samples, n_samples)
        # Ranking metrics such as AUROC are undefined on a single class.
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_pred[idx]))

    if not scores:
        return undefined

    tail = (100.0 - ci) / 2.0
    return np.mean(scores), np.percentile(scores, tail), np.percentile(scores, 100.0 - tail)

# ===================== five-fold cross-validation =====================
# Outer loop: 5 folds, each holding out 20% of the patients as the test set.
# Inner split: 1/8 of the remaining 80% (10% overall) becomes the validation
# set used for early stopping; the other 70% is used for training.
# NOTE(review): KFold is not stratified while the inner split is; with a rare
# positive class a test fold may contain very few positives - consider
# StratifiedKFold.
# NOTE(review): the scalers applied when building X_padded / X_static are fit
# on all patients, so held-out folds influence the normalization statistics.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
all_results = []

max_epochs = 100
batch_size = 16
lr = 5e-4
weight_decay = 5e-4
patience = 12  # epochs without validation improvement before stopping

device = torch.device("cuda" if torch.cuda.is_available() else
                      "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

for fold, (train_val_idx, test_idx) in enumerate(kf.split(X_padded)):
    print(f"\n===== Fold {fold+1}/{n_splits} =====")
    # A different but reproducible seed per fold.
    set_seed(42 + fold)

    X_train_val_seq, X_test_seq = X_padded[train_val_idx], X_padded[test_idx]
    X_train_val_static, X_test_static = X_static[train_val_idx], X_static[test_idx]
    y_train_val, y_test = y_tensor[train_val_idx], y_tensor[test_idx]

    # Stratified train / validation split inside the non-test data.
    X_train_seq, X_val_seq, X_train_static, X_val_static, y_train, y_val = train_test_split(
        X_train_val_seq, X_train_val_static, y_train_val,
        test_size=1/8, random_state=42, stratify=y_train_val
    )

    # pos_weight = negatives / positives of the training split, so the loss
    # up-weights the positive class; max(..., 1) avoids division by zero.
    num_pos = (y_train == 1).sum().item()
    num_neg = (y_train == 0).sum().item()
    pos_weight = torch.tensor(num_neg / max(num_pos,1), dtype=torch.float32).to(device)
    print(f"pos_weight = {pos_weight:.2f}  (neg={num_neg}, pos={num_pos})")

    train_loader = DataLoader(TensorDataset(X_train_seq, X_train_static, y_train),
                              batch_size=batch_size, shuffle=True)

    # A fresh model per fold.
    model = LSTMWithStatic(
        input_dim=X_padded.shape[2],
        static_dim=X_static.shape[1],
        hidden_dim=64,  # hidden_dim
        dropout=0.5     # add dropout
    ).to(device)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Start below any attainable AUROC so the first evaluated epoch always
    # writes a checkpoint. Starting at 0 with a strict '>' could leave no
    # checkpoint for this fold, or a stale one from an earlier run.
    best_auroc = float("-inf")
    wait = 0
    best_model_path = os.path.join(save_path, f"fold_{fold+1}_best_model.pt")

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0
        for batch_seq, batch_static, batch_y in train_loader:
            batch_seq, batch_static, batch_y = batch_seq.to(device), batch_static.to(device), batch_y.to(device)
            optimizer.zero_grad()
            logits = model(batch_seq, batch_static)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)

        # validation (whole validation split in a single forward pass)
        model.eval()
        with torch.no_grad():
            logits = model(X_val_seq.to(device), X_val_static.to(device)).cpu().numpy().flatten()
            y_pred = 1 / (1 + np.exp(-logits))  # sigmoid: logits -> probabilities
            y_true = y_val.numpy().reshape(-1)
            auroc_val = roc_auc_score(y_true, y_pred)

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | Val AUROC: {auroc_val:.4f}")

        # Checkpoint on improvement; otherwise count towards early stopping.
        if auroc_val > best_auroc:
            best_auroc = auroc_val
            wait = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best Val AUROC={best_auroc:.4f})")
                break

    # test set evaluation with the best validation checkpoint of this fold
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()
    with torch.no_grad():
        logits = model(X_test_seq.to(device), X_test_static.to(device)).cpu().numpy().flatten()
        y_pred = 1 / (1 + np.exp(-logits))
        y_true = y_test.numpy().reshape(-1)
        result_df = pd.DataFrame({"y_test": y_true, "y_pred": y_pred})
        # Built with os.path.join like the other output paths.
        result_df.to_csv(os.path.join(save_path, f"fold_{fold+1}_results.csv"), index=False)
        all_results.append(result_df)

# ===================== summary results =====================
# Pool the test predictions of all folds, then bootstrap AUROC and AUPRC on
# the pooled set.
all_results_df = pd.concat(all_results, axis=0, ignore_index=True)
y_all_true = all_results_df["y_test"].to_numpy()
y_all_pred = all_results_df["y_pred"].to_numpy()

mean_auroc, auc_lower, auc_upper = bootstrap_metric_ci(y_all_true, y_all_pred, roc_auc_score)
mean_auprc, auprc_lower, auprc_upper = bootstrap_metric_ci(y_all_true, y_all_pred, average_precision_score)

print("\n===== 5-Fold Cross Validation Results (Bootstrap) =====")
for metric_name, (mean_value, lower, upper) in (
    ("AUROC", (mean_auroc, auc_lower, auc_upper)),
    ("AUPRC", (mean_auprc, auprc_lower, auprc_upper)),
):
    print(f"{metric_name}: Mean = {mean_value:.4f}, 95% CI = ({lower:.4f}–{upper:.4f})")

# Save the pooled per-patient predictions next to the per-fold files.
all_results_path = os.path.join(save_path, "all_folds_results.csv")
all_results_df.to_csv(all_results_path, index=False)
print(f"\n✅ 5-Fold Cross Validation Results as：{all_results_path}")
