In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.linear_model import RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier,Perceptron,LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
import pickle
import os
#import shap

In [None]:
# Locations of the pre-standardized train / test CSV files.
base_dir = "data"
train_file = os.path.join(base_dir, "train_data_standardized.csv")
test_file = os.path.join(base_dir, "test_data_standardized.csv")

# Training split: 'dic' is the label; every column from the second one onward is a feature.
# NOTE(review): this assumes 'dic' is the first column (otherwise the label would
# also end up among the features) — confirm against the CSV layout.
dataset = pd.read_csv(train_file)
y_train = dataset['dic'].values
x_train = dataset.iloc[:, 1:]

# Test split, sliced exactly like the training split.
dataset_test = pd.read_csv(test_file)
y_test = dataset_test['dic'].values
x_test = dataset_test.iloc[:, 1:]

# Shared CV scheme: stratified 5-fold repeated 5 times, i.e. 25 fits per candidate.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
# Worker count handed to every GridSearchCV in this notebook.
n_jobs = 120

In [3]:
# A column with at most `threshold` distinct values is treated as categorical
# (with threshold = 2: binary or constant columns). Adjust as needed.
threshold = 2

# Distinct-value count per feature column drives the categorical / continuous split.
columns_by_unique_count = x_train.nunique()
categorical_cols = [
    col for col, n_unique in columns_by_unique_count.items() if n_unique <= threshold
]
continuous_cols = [
    col for col, n_unique in columns_by_unique_count.items() if n_unique > threshold
]
# Positional indices of the categorical columns, passed to SMOTENC(categorical_features=...).
categorical_indices = [x_train.columns.get_loc(col) for col in categorical_cols]
print(categorical_cols)
print(continuous_cols)

['gender', 'ventilation', 'vasopressin', 'crrt', 'myocardial_infarct', 'congestive_heart_failure', 'peripheral_vascular_disease', 'cerebrovascular_disease', 'dementia', 'chronic_pulmonary_disease', 'rheumatic_disease', 'peptic_ulcer_disease', 'mild_liver_disease', 'diabetes_without_cc', 'diabetes_with_cc', 'paraplegia', 'renal_disease', 'malignant_cancer', 'severe_liver_disease', 'metastatic_solid_tumor', 'aids']
['sofa_score', 'age', 'los', 'scr_min', 'mdrd_est', 'scr_baseline', 'alt_min', 'alp_min', 'ast_min', 'sirs', 'lods', 'apsiii', 'wbc_max', 'basophils_abs_max', 'eosinophils_abs_min', 'lymphocytes_abs_min', 'monocytes_abs_min', 'neutrophils_abs_max', 'basophils_min', 'eosinophils_max', 'lymphocytes_min', 'monocytes_min', 'neutrophils_max', 'age_score', 'charlson_comorbidity_index', 'albumin_min', 'creatinine_max', 'sodium_min', 'inr_max', 'pt_max', 'ptt_max', 'hemoglobin_min', 'platelet_min', 'rbc_min', 'potassium_max', 'glucose_max', 'bicarbonate_min', 'bun_max', 'chloride_max'

In [None]:
def predict_with_model(model, X, threshold=0.5):
    """Return continuous scores and hard 0/1 labels for ``X``.

    Parameters:
    - model: fitted estimator exposing ``decision_function`` or ``predict_proba``.
    - X: feature matrix forwarded unchanged to the model.
    - threshold: cut-off applied to the positive-class probability. It is only
      used on the ``predict_proba`` path; decision-function scores are always
      cut at 0.

    Returns:
    - (scores, labels): the raw scores and ``(scores > cutoff).astype(int)``.
    """
    if not hasattr(model, 'decision_function'):
        # Probability path: column 1 of predict_proba is used as the score.
        scores = model.predict_proba(X)[:, 1]
        return scores, (scores > threshold).astype(int)

    # Margin path: the sign of the decision function gives the label.
    scores = model.decision_function(X)
    return scores, (scores > 0).astype(int)

def evaluate_model(model, x_train, y_train, x_test, y_test):
    """Score a fitted model on the training and test splits.

    For each split, AUC is computed from the continuous scores returned by
    ``predict_with_model`` and F1 / accuracy from its hard labels. The six
    metrics are printed and returned. Model persistence is currently disabled
    (see the commented-out block below).

    Returns:
    - dict with keys ``train_auc``, ``train_f1``, ``train_acc``, ``test_auc``,
      ``test_f1``, ``test_acc``.
    """
    metrics = {}
    splits = (
        ('train', x_train, y_train),
        ('test', x_test, y_test),
    )
    for split_name, features, labels in splits:
        scores, predicted = predict_with_model(model, features)
        metrics[f'{split_name}_auc'] = roc_auc_score(labels, scores)
        metrics[f'{split_name}_f1'] = f1_score(labels, predicted)
        metrics[f'{split_name}_acc'] = accuracy_score(labels, predicted)

    # Disabled: pickle the pipeline to model/<EstimatorClass>.pkl.
    # os.makedirs("model", exist_ok=True)
    # model_name = type(model.named_steps["model"]).__name__
    # model_filename = os.path.join("model", f"{model_name}.pkl")
    # with open(model_filename, "wb") as f:
    #     pickle.dump(model, f)
    # print(f"模型已保存为: {model_filename}")

    # Report the metrics.
    print(f"Training AUC: {metrics['train_auc']}")
    print(f"Training ACC: {metrics['train_acc']}")
    print(f"Training F1 Score: {metrics['train_f1']}")
    print(f"Test AUC: {metrics['test_auc']}")
    print(f"Test ACC: {metrics['test_acc']}")
    print(f"Test F1 Score: {metrics['test_f1']}")

    return metrics

# def get_top_features_shap(model, X, output_file="./output/shap_feature_importance.xlsx"):
#     """
#     Parameters:
#     - model: 训练好的模型。
#     - X: 用于计算 SHAP 值的特征数据（通常是训练数据）。
#     - output_file: 保存 SHAP 排名和 SHAP 值的 Excel 文件名（默认为 "shap_feature_importance.xlsx"）。

#     Returns:
#     - top_feature_list: 包含按 SHAP 排序的特征名称的列表。
#     """
#     # 创建解释器
#     explainer = shap.Explainer(model.predict, X)
#     shap_values = explainer(X)

#     # 获取特征重要性
#     importance = np.abs(shap_values.values).mean(axis=0)
#     # 获取特征名称
#     feature_names = X.columns

#     # 创建一个 DataFrame 来存储特征及其重要性
#     importance_df = pd.DataFrame({
#         'Feature': feature_names,
#         'Importance': importance,
#         'SHAP Value': np.mean(shap_values.values, axis=0)
#     })

#     # 按重要性排序并提取前 N 个特征
#     importance_df = importance_df.sort_values(by='Importance', ascending=False)

#     # 将特征的名称保存到列表中
#     top_feature_list = importance_df['Feature'].tolist()

#     # 保存排名和 SHAP 值到 Excel 文件
#     importance_df.to_excel(output_file, index=False)

#     print(f"Top features based on SHAP values saved to {output_file}")
#     print(importance_df)

#     # SHAP 排序图
#     shap.plots.bar(shap_values)

#     return top_feature_list

# RandomForestClassifier

In [None]:
# Random forest tuned by exhaustive grid search; SMOTENC oversampling is the
# first pipeline step, the classifier the second.
rfc = RandomForestClassifier(random_state=1)
param_grid_rfc = dict(
    model__n_estimators=np.arange(80, 100),
    model__criterion=['gini', 'entropy', 'log_loss'],
    model__max_features=np.arange(0.1, 1, 0.1),
    model__max_depth=np.arange(1, 13),
)

pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', rfc),
])

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_rfc = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_rfc,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_rfc.fit(x_train, y_train)
best_model_rfc = grid_search_rfc.best_estimator_

Fitting 25 folds for each of 6480 candidates, totalling 162000 fits


In [None]:
# Summarize the RandomForest search: estimator name, winning grid point and
# its mean cross-validated F1.
print(type(rfc).__name__)
print(f"Best parameters: {grid_search_rfc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_rfc.best_score_}")

# evaluate_model prints the train/test AUC, ACC and F1 itself and returns them,
# so the metrics are not printed a second time here (consistent with the other
# model report cells). Individual values stay available in results_rfc.
results_rfc = evaluate_model(best_model_rfc, x_train, y_train, x_test, y_test)


# LogisticRegression

In [7]:
lr = LogisticRegression(random_state=1)

param_grid_lr = dict(
    model__fit_intercept=[True, False],
    model__penalty=['l1', 'l2'],
    model__solver=['liblinear', 'saga'],
    model__max_iter=[2500],  # raised iteration budget
)

# Pipeline: oversample with SMOTENC, then fit the classifier.
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', lr),
])

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_lr = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_lr,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_lr.fit(x_train, y_train)
best_model_lr = grid_search_lr.best_estimator_

Fitting 25 folds for each of 8 candidates, totalling 200 fits


In [8]:
# Summarize the LogisticRegression search, then score the best pipeline on both splits.
print(lr.__class__.__name__)
print(f"Best parameters: {grid_search_lr.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_lr.best_score_}")

results_lr = evaluate_model(
    model=best_model_lr,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

LogisticRegression
Best parameters: {'model__fit_intercept': True, 'model__max_iter': 2500, 'model__penalty': 'l1', 'model__solver': 'liblinear'}
Best F1 Score (cross-validation): 0.216483583345964
模型已保存为: model/LogisticRegression.pkl
Training AUC: 0.7518382122484799
Training ACC: 0.8581571473650994
Training F1 Score: 0.23748939779474132
Test AUC: 0.7484156202195438
Test ACC: 0.8694006309148264
Test F1 Score: 0.2935153583617747


# GradientBoostingClassifier

In [None]:
# Gradient boosting is slow to fit and its parameter space is large, so a coarse
# grid is searched here (coarse and fine searches are meant to be combined).
gbd = GradientBoostingClassifier(random_state=1)
param_grid_gbd = dict(
    model__learning_rate=np.arange(0.2, 0.4, 0.04),
    model__n_estimators=np.arange(50, 100, 4),
    model__subsample=np.arange(0.6, 0.8, 0.04),
    model__max_features=np.arange(0.01, 0.2, 0.04),
    model__max_depth=np.arange(1, 13),
)

pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', gbd),
])

# The bare string below is a no-op expression that keeps the original,
# finer-grained grid for reference.
'''
原始字典
param_grid_gbd={
    "model__learning_rate":np.arange(0.2,0.4,0.01),
    "model__n_estimators":np.arange(50,100,1),
    "model__subsample":np.arange(0.6,0.8,0.01),
    "model__max_features":np.arange(0.01,0.2,0.01),
    "model__max_depth":np.arange(1,13,1)
}
'''

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_gbd = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_gbd,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_gbd.fit(x_train, y_train)
best_model_gbd = grid_search_gbd.best_estimator_

In [None]:
# Summarize the GradientBoosting search, then score the best pipeline on both splits.
print(gbd.__class__.__name__)

print(f"Best parameters: {grid_search_gbd.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_gbd.best_score_}")

results_gbd = evaluate_model(
    model=best_model_gbd,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

#rfc_top_feature_list = get_top_features_shap(best_model_gbd, x_train)

# BernoulliNB

In [None]:
bnb = BernoulliNB()
# NOTE(review): the alpha grid starts at exactly 0 (no smoothing) — confirm this is intended.
param_grid_bnb = dict(
    model__alpha=np.linspace(0, 100, 100),
    model__fit_prior=[True, False],
    model__binarize=np.linspace(0, 10, 100),
    model__class_prior=[None],
)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', bnb),
])
# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_bnb = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_bnb,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_bnb.fit(x_train, y_train)
best_model_bnb = grid_search_bnb.best_estimator_

In [None]:
# Summarize the BernoulliNB search, then score the best pipeline on both splits.
print(bnb.__class__.__name__)

print(f"Best parameters: {grid_search_bnb.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_bnb.best_score_}")

results_bnb = evaluate_model(
    model=best_model_bnb,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

#rfc_top_feature_list = get_top_features_shap(best_model_bnb, x_train)

# KNeighborsClassifier

In [None]:
# 定义 KNN 模型
knn = KNeighborsClassifier()# 无需random_state

pipeline = Pipeline([
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', knn)
])
# 定义超参数搜索范围
param_grid_knn = {
    "model__n_neighbors": np.arange(1,16,1),
    "model__weights": ['uniform', 'distance'],
    "model__metric": ['euclidean', 'manhattan', 'minkowski'],
    "model__p": [1, 2]  # 1 = 曼哈顿距离, 2 = 欧几里得距离
}

# 执行超参数搜索
grid_search_knn = GridSearchCV(pipeline, param_grid_knn, cv=cv, scoring='f1', n_jobs=n_jobs, verbose=1)
grid_search_knn.fit(x_train, y_train)

# 获取最佳模型
best_model_knn = grid_search_knn.best_estimator_

In [None]:
# Summarize the KNN search, then score the best pipeline on both splits.
print(knn.__class__.__name__)

print(f"Best parameters: {grid_search_knn.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_knn.best_score_}")

results_knn = evaluate_model(
    model=best_model_knn,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

# ExtraTreeClassifier

In [None]:
# Single extremely-randomized tree (ExtraTreeClassifier, not the ensemble).
et = ExtraTreeClassifier(random_state=1)

pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', et),
])

# Hyperparameter search space.
param_grid_et = dict(
    model__criterion=["gini", "entropy", "log_loss"],
    model__splitter=["random", "best"],
    model__max_depth=[3, 5, 10, 20, 30],
    model__min_samples_split=np.arange(2, 11),
    model__min_samples_leaf=np.arange(1, 11),
    model__max_features=["sqrt", "log2"],
)

# Run the hyperparameter search (F1-scored).
grid_search_et = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_et,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_et.fit(x_train, y_train)

# Best pipeline, refit on the full training set.
best_model_et = grid_search_et.best_estimator_

Fitting 25 folds for each of 9000 candidates, totalling 225000 fits


In [None]:
# Summarize the ExtraTree search, then score the best pipeline on both splits.
print(f"Best parameters: {grid_search_et.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_et.best_score_}")

print(et.__class__.__name__)
results_et = evaluate_model(
    model=best_model_et,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

# ExtraTreesClassifier

In [None]:
# Extremely-randomized trees ensemble.
etc = ExtraTreesClassifier(random_state=1)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', etc),
])
param_grid_etc = dict(
    model__n_estimators=np.arange(1, 100),
    # Disabled: "model__max_samples":np.arange(0.01,0.1,0.01),
    model__criterion=['gini', 'entropy', 'log_loss'],
    model__max_features=np.arange(0.01, 0.1, 0.01),
    model__max_depth=np.arange(1, 13),
)


# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_etc = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_etc,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_etc.fit(x_train, y_train)
best_model_etc = grid_search_etc.best_estimator_

In [None]:
# Summarize the ExtraTrees search, then score the best pipeline on both splits.
print(etc.__class__.__name__)
print(f"Best parameters: {grid_search_etc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_etc.best_score_}")

results_etc = evaluate_model(
    model=best_model_etc,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
#rfc_top_feature_list = get_top_features_shap(best_model_etc, x_train)

# LinearSVC

In [6]:
# L1-penalized linear SVM (primal formulation, squared hinge loss); only C is tuned.
lsvc = LinearSVC(
    random_state=1,
    penalty='l1',
    dual=False,
    loss='squared_hinge',
)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', lsvc),
])
param_grid_lsvc = dict(model__C=np.arange(0.01, 1, 0.01))


# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_lsvc = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_lsvc,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_lsvc.fit(x_train, y_train)
best_model_lsvc = grid_search_lsvc.best_estimator_


Fitting 25 folds for each of 99 candidates, totalling 2475 fits


In [None]:
# Summarize the LinearSVC search, then score the best pipeline on both splits.
print(lsvc.__class__.__name__)

print(f"Best parameters: {grid_search_lsvc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_lsvc.best_score_}")

results_lsvc = evaluate_model(
    model=best_model_lsvc,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
#rfc_top_feature_list = get_top_features_shap(best_model_lsvc, x_train)

LinearSVC
Best parameters: {'model__C': 0.01}
Best F1 Score (cross-validation): 0.2155902391718928
Training AUC: 0.7643388797220122
Training F1 Score: 0.2259810554803789
Training ACC: 0.8404017857142857
External validation AUC: 0.7766296791079363
External validation F1 Score: 0.23497267759562843
External validation ACC: 0.84375


# NonLinear_SVC

In [None]:
# Kernel SVM; C, gamma and the kernel type are tuned.
svc = SVC(random_state=1)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', svc),
])
param_grid_svc = dict(
    model__C=np.arange(100, 150),
    model__gamma=np.arange(0.0001, 0.001, 0.0001),
    model__kernel=['poly', 'rbf', 'sigmoid'],
)

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_svc = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_svc,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_svc.fit(x_train, y_train)
best_model_svc = grid_search_svc.best_estimator_

In [None]:
# Summarize the SVC search, then score the best pipeline on both splits.
print(svc.__class__.__name__)

print(f"Best parameters: {grid_search_svc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_svc.best_score_}")

results_svc = evaluate_model(
    model=best_model_svc,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

#rfc_top_feature_list = get_top_features_shap(best_model_svc, x_train)

# AdaBoostClassifier

In [None]:
abc = AdaBoostClassifier(random_state=1)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', abc),
])
# NOTE(review): 'SAMME.R' is not accepted by every scikit-learn release —
# confirm the installed version still supports it.
param_grid_abc = dict(
    model__n_estimators=np.arange(400, 500),
    model__learning_rate=np.arange(0.1, 0.2, 0.01),
    model__algorithm=['SAMME', 'SAMME.R'],
)

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_abc = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_abc,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_abc.fit(x_train, y_train)
best_model_abc = grid_search_abc.best_estimator_

In [None]:
# Summarize the AdaBoost search, then score the best pipeline on both splits.
print(abc.__class__.__name__)

print(f"Best parameters: {grid_search_abc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_abc.best_score_}")

results_abc = evaluate_model(
    model=best_model_abc,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
#rfc_top_feature_list = get_top_features_shap(best_model_abc, x_train)

# DecisionTreeClassifier

In [6]:
dtc = DecisionTreeClassifier(random_state=1)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', dtc),
])
param_grid_dtc = dict(
    model__criterion=['gini', 'entropy', 'log_loss'],
    model__splitter=['best', 'random'],
    # Disabled: "model__min_samples_leaf":(0.001,0.01,0.001),
    # Disabled: "model__min_samples_split":(0.001,0.01,0.001),
    model__max_features=np.arange(1, 10),
    # Disabled: "model__max_features":['auto', 'sqrt', 'log2'],
    #   (whether max_features should be set at all is still to be tried)
    model__max_depth=np.arange(1, 13),
)

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_dtc = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_dtc,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_dtc.fit(x_train, y_train)
best_model_dtc = grid_search_dtc.best_estimator_

Fitting 25 folds for each of 648 candidates, totalling 16200 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Summarize the DecisionTree search, then score the best pipeline on both splits.
print(dtc.__class__.__name__)

print(f"Best parameters: {grid_search_dtc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_dtc.best_score_}")

results_dtc = evaluate_model(
    model=best_model_dtc,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
#rfc_top_feature_list = get_top_features_shap(best_model_dtc, x_train)

DecisionTreeClassifier
Best parameters: {'model__criterion': 'gini', 'model__max_depth': 2, 'model__max_features': 5, 'model__splitter': 'best'}
Best F1 Score (cross-validation): 0.1929815995653992
Training AUC: 0.7407050441809182
Training F1 Score: 0.19319227230910763
Training ACC: 0.7553013392857143
External validation AUC: 0.7688186700836991
External validation F1 Score: 0.23391812865497075
External validation ACC: 0.7806919642857143


# RidgeClassifierCV

In [8]:
rccv = RidgeClassifierCV()
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', rccv),
])
# Each grid candidate hands one scalar from this range to `alphas`.
param_grid_rccv = dict(
    model__fit_intercept=[True, False],
    model__alphas=np.arange(38, 40, 0.1),
)

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_rccv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_rccv,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_rccv.fit(x_train, y_train)
best_model_rccv = grid_search_rccv.best_estimator_

Fitting 25 folds for each of 40 candidates, totalling 1000 fits


In [None]:
# Summarize the RidgeClassifierCV search, then score the best pipeline on both splits.
print(rccv.__class__.__name__)

print(f"Best parameters: {grid_search_rccv.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_rccv.best_score_}")

results_rccv = evaluate_model(
    model=best_model_rccv,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
#rfc_top_feature_list = get_top_features_shap(best_model_rccv, x_train)

RidgeClassifierCV
Best parameters: {'model__alphas': 39.50000000000002, 'model__fit_intercept': False}
Best F1 Score (cross-validation): 0.2124291796099841
Training AUC: 0.7508340144236949
Training F1 Score: 0.2211126961483595
Training ACC: 0.84765625
External validation AUC: 0.7475871752228797
External validation F1 Score: 0.20869565217391303
External validation ACC: 0.84765625


# SGDClassifier

In [None]:
# Linear model trained with stochastic gradient descent.
sgdc=SGDClassifier(random_state=1)
pipeline = Pipeline([
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', sgdc)
])
param_grid_sgdc={
    "model__loss":['hinge','log_loss','modified_huber','squared_hinge','perceptron','squared_error','huber','epsilon_insensitive','squared_epsilon_insensitive'],
    "model__penalty":['l2','l1','elasticnet'],
    "model__alpha":np.arange(0.01,0.1,0.01),
    "model__l1_ratio":np.arange(0.1,0.5,0.1),
    "model__fit_intercept":[True,False],
    "model__learning_rate":['optimal','invscaling','adaptive'],
    # The 'invscaling' and 'adaptive' schedules need a strictly positive initial
    # learning rate. Where the estimator's default eta0 is 0.0, every candidate
    # using them fails to fit and is scored NaN. Pinning eta0 to a positive value
    # makes those candidates valid; 'optimal' does not use eta0, so its results
    # are unaffected. 0.01 is a starting point — widen this list to tune it.
    "model__eta0":[0.01]
    #"learning_rate":np.arange(0.01,1,0.01)
}

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_sgdc = GridSearchCV(pipeline, param_grid_sgdc, cv=cv, scoring='f1', n_jobs=n_jobs, verbose=1)
grid_search_sgdc.fit(x_train, y_train)
best_model_sgdc = grid_search_sgdc.best_estimator_


In [None]:
# Summarize the SGDClassifier search, then score the best pipeline on both splits.
print(sgdc.__class__.__name__)

print(f"Best parameters: {grid_search_sgdc.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_sgdc.best_score_}")
results_sgdc = evaluate_model(
    model=best_model_sgdc,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

#rfc_top_feature_list = get_top_features_shap(best_model_sgdc, x_train)

# Perceptron

In [None]:
p = Perceptron(random_state=1)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', p),
])
param_grid_p = dict(
    model__penalty=['l2', 'l1', 'elasticnet'],
    model__alpha=np.arange(0.000001, 0.0001, 0.000001),
    model__fit_intercept=[True, False],
)

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_p = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_p,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_p.fit(x_train, y_train)
best_model_p = grid_search_p.best_estimator_

In [None]:
# Summarize the Perceptron search, then score the best pipeline on both splits.
print(p.__class__.__name__)
print(f"Best parameters: {grid_search_p.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_p.best_score_}")
results_p = evaluate_model(
    model=best_model_p,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)



#rfc_top_feature_list = get_top_features_shap(best_model_p, x_train)

# PassiveAggressiveClassifier

In [None]:
pac = PassiveAggressiveClassifier(random_state=1)
pipeline = Pipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', pac),
])
param_grid_pac = dict(
    model__C=np.arange(0.001, 1.0, 0.001),
    model__fit_intercept=[True, False],
)

# Run the hyperparameter search (F1-scored) and keep the refit best pipeline.
grid_search_pac = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_pac,
    scoring='f1',
    cv=cv,
    n_jobs=n_jobs,
    verbose=1,
)
grid_search_pac.fit(x_train, y_train)
best_model_pac = grid_search_pac.best_estimator_

In [None]:
# Summarize the PassiveAggressive search, then score the best pipeline on both splits.
print(pac.__class__.__name__)
print(f"Best parameters: {grid_search_pac.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_pac.best_score_}")
results_pac = evaluate_model(
    model=best_model_pac,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)


#rfc_top_feature_list = get_top_features_shap(best_model_pac, x_train)

# GaussianNB

In [18]:
# 定义 GaussianNB 模型
nb = GaussianNB()
pipeline = Pipeline([
    ('smote', SMOTENC(categorical_features=categorical_indices, random_state=1)),
    ('model', nb)
])
# 超参数搜索范围
param_grid_nb = {
    "model__var_smoothing": np.logspace(-9, 1, 10)  # 设定一个平滑参数，防止概率为 0
}

# 执行超参数搜索
grid_search_nb = GridSearchCV(pipeline, param_grid_nb, cv=cv, scoring='f1', n_jobs=n_jobs, verbose=1)
grid_search_nb.fit(x_train, y_train)

# 获取最佳模型
best_model_nb = grid_search_nb.best_estimator_

Fitting 25 folds for each of 10 candidates, totalling 250 fits


In [24]:
# Summarize the GaussianNB search.
print(nb.__class__.__name__)

print(f"Best parameters: {grid_search_nb.best_params_}")
print(f"Best F1 Score (cross-validation): {grid_search_nb.best_score_}")

# Score the best pipeline on both splits.
results_nb = evaluate_model(
    model=best_model_nb,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

GaussianNB
Best parameters: {'model__var_smoothing': 10.0}
Best F1 Score (cross-validation): 0.19897387176073134
Training AUC: 0.7891754946822959
Training ACC: 0.7869990533291259
Training F1 Score: 0.22145328719723184
Test AUC: 0.7832215442029351
Test ACC: 0.7867507886435331
Test F1 Score: 0.2028301886792453
