In [None]:
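# ==================== Library imports (grouped by purpose; NOTE: the list below still contains duplicate imports and is not yet deduplicated or sorted) ====================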

# 1. 基础库 (Core Libraries)
import numpy as np
import pandas as pd

import shap.plots

import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso  # 导入 Lasso
from sklearn.model_selection import train_test_split
import numpy as np

from joblib import dump

import shap
import lightgbm as lgb
import pandas as pd  # 假设您的 X_test_robust 是 DataFrame
import matplotlib.pyplot as plt
import numpy as np # 导入 numpy

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, average_precision_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn import set_config # For potential Cython optimization disabling

# 2. 数据预处理 (Data Preprocessing)
from sklearn.experimental import enable_iterative_imputer  # 允许使用 IterativeImputer
from sklearn.impute import IterativeImputer, SimpleImputer  # 缺失值填充：IterativeImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler  # 特征编码和缩放：LabelEncoder, OneHotEncoder, StandardScaler
from scipy import stats  # 统计函数，如 scipy.stats
from sklearn.preprocessing import MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer


# 3. 数据划分与模型验证 (Data Splitting & Model Validation)
from sklearn.model_selection import (
    GridSearchCV,  # 网格搜索
    KFold,  # K折交叉验证
    StratifiedKFold, # 分层K折交叉验证
    cross_validate,  # 交叉验证评估
    cross_val_score, # 交叉验证评分
    train_test_split # 训练集/测试集划分
)

# 4. 经典模型 (Classical Models)
from sklearn.linear_model import (
    BayesianRidge,  # 贝叶斯岭回归
    ElasticNet,     # 弹性网络回归
    Lasso,          # Lasso回归
    LogisticRegression, # 逻辑回归
    Ridge           # 岭回归
)
from sklearn.ensemble import (
    ExtraTreesRegressor, # 极端随机树回归
    RandomForestClassifier, # 随机森林分类器
    RandomForestRegressor, # 随机森林回归器 (虽然在导入部分没有直接使用，但可能在其他地方用到，保留)
)
from sklearn.neighbors import KNeighborsRegressor # K近邻回归
import xgboost as xgb # XGBoost

# 5. 数据平衡 (Data Balancing - imbalanced-learn库)
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE # 过采样：ADASYN, RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler # 欠采样：RandomUnderSampler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 6. 聚类 (Clustering - 虽然在导入部分没有直接使用，但可能在其他地方用到，保留)
from sklearn.cluster import DBSCAN, KMeans # 聚类算法：DBSCAN, KMeans

# 7. 模型评估指标 (Model Evaluation Metrics)
from sklearn.metrics import (
    accuracy_score,        # 准确率
    classification_report, # 分类报告
    confusion_matrix,     # 混淆矩阵
    roc_auc_score          # ROC AUC 分数
)

# 8. 其他 (Miscellaneous)
from sklearn.utils.class_weight import compute_class_weight # 计算类别权重
import matplotlib.pyplot as plt # Matplotlib 绘图 (虽然在导入部分没有直接使用，但可能在其他地方用到，保留)
import seaborn as sns # Seaborn 可视化 (虽然在导入部分没有直接使用，但可能在其他地方用到，保留)


# ==================== Pandas display settings ====================
# Lift the column limit so rendered DataFrames show every column.
pd.options.display.max_columns = None
# To lift the row limit as well, enable the line below:
# pd.options.display.max_rows = None

In [None]:
import pandas as pd

# Load the three pre-split CSV files: training, test and validation.
train_data = pd.read_csv('path/to/mimic_train.csv')
test_data = pd.read_csv('path/to/mimic_test.csv')
val_data = pd.read_csv('path/to/mimic_validation.csv')

# Report the (rows, columns) shape of every split, in train/test/validation order.
for _caption, _frame in (
    ("训练集形状:", train_data),
    ("测试集形状:", test_data),
    ("验证集形状:", val_data),
):
    print(_caption, _frame.shape)

# Preview the first five rows of every split, each under its own caption.
for _caption, _frame in (
    ("\n训练集前5行:", train_data),
    ("\n测试集前5行:", test_data),
    ("\n验证集前5行:", val_data),
):
    print(_caption)
    print(_frame.head())


In [None]:
# For each split, separate the feature columns (X_*) from the 'label' column (y_*).
X_train, y_train = train_data.drop(columns='label'), train_data['label']
X_test, y_test = test_data.drop(columns='label'), test_data['label']
X_val, y_val = val_data.drop(columns='label'), val_data['label']

In [None]:
# Iterative imputer driven by a 10-tree random-forest regressor, limited to
# 10 imputation rounds; both the imputer and its estimator are seeded with 42.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(random_state=42, n_estimators=10),
    max_iter=10,
    random_state=42,
)

# Fit on the training features only, so the imputer never sees the test or
# validation data while it is being fitted.
X_train_imputed = imputer.fit_transform(X_train)

# Apply the already-fitted imputer (transform only) to the held-out splits.
X_test_imputed, X_val_imputed = (
    imputer.transform(held_out) for held_out in (X_test, X_val)
)

In [None]:
# Fit the RobustScaler on the imputed training matrix only, then scale it.
scaler = RobustScaler()
X_train_scaled = scaler.fit(X_train_imputed).transform(X_train_imputed)

# Scale the imputed test and validation matrices with the statistics learned
# from the training data (transform only, no refit).
X_test_scaled, X_val_scaled = (
    scaler.transform(imputed) for imputed in (X_test_imputed, X_val_imputed)
)

In [None]:
# SMOTETomek is not imported by the notebook's import cell (only
# imblearn.over_sampling / imblearn.under_sampling are), so it is imported here;
# without this line the cell fails with NameError on a fresh kernel.
from imblearn.combine import SMOTETomek

# Combined over-/under-sampler, seeded so the resampling is reproducible.
smote_tomek = SMOTETomek(random_state=42)

# Rebalance the scaled training data only; held-out data is never resampled.
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train_scaled, y_train)

# The test and validation sets are passed through unchanged; the *_final names
# are aliases for the scaled features and the original labels.
X_test_final = X_test_scaled
y_test_final = y_test
X_val_final = X_val_scaled
y_val_final = y_val

In [None]:
# Display the shapes of the resampled training features and labels side by side.
(
    X_train_balanced.shape,
    y_train_balanced.shape,
)

In [None]:
import joblib

# Persist the fitted imputer and scaler to the working directory so the same
# preprocessing can be reloaded later.
joblib.dump(value=imputer, filename='imputer.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')