In [4]:
# ! pip install xgboost
# ! pip install lightgbm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lightgbm
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/42/86/dabda8fbcb1b00bcfb0003c3776e8ade1aa7b413dff0a2c08f457dace22f/lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[33mDEPRECATION: lmdeploy 0.1.0-git782048c.abi0.dtk2404.torch2.1. has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of lmdeploy or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: mmcv 2.0.1-gitc0ccf15.abi0.dtk2404.torch2.1. has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of mmcv o

In [5]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)

# 加载数据
X_train = pd.read_csv("X_train.csv")
X_val = pd.read_csv("X_val.csv")
y_train = pd.read_csv("y_train.csv").squeeze()
y_val = pd.read_csv("y_val.csv").squeeze()


In [6]:
lasso = LassoCV(cv=10, random_state=42, max_iter=10000)
lasso.fit(X_train, y_train)


0,1,2
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,10000
,tol,0.0001
,copy_X,True
,cv,10
,verbose,False


In [7]:
selected_features = X_train.columns[lasso.coef_ != 0].tolist()
print("🎯 LASSO保留的特征数量：", len(selected_features))
print(selected_features)


🎯 LASSO保留的特征数量： 13
['age', 'gender', 'AG', 'BUN', 'Ca', 'Cl', 'Glucose', 'Hb', 'Na', 'PLT', 'Phos', 'Scr', 'WBC']


In [8]:
X_train_sel = X_train[selected_features]
X_val_sel = X_val[selected_features]


In [11]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
    # "ComplementNB": ComplementNB(),
    "MLP": MLPClassifier(max_iter=1000),
    "SVM": SVC(probability=True)
}


In [12]:
results = []

for name, model in models.items():
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_val_sel)
    y_proba = model.predict_proba(X_val_sel)[:, 1]

    auc = roc_auc_score(y_val, y_proba)
    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    specificity = tn / (tn + fp)

    results.append({
        "Model": name,
        "AUC": auc,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "Specificity": specificity,
        "F1 Score": f1
    })


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 391, number of negative: 787
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1237
[LightGBM] [Info] Number of data points in the train set: 1178, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.331919 -> initscore=-0.699521
[LightGBM] [Info] Start training from score -0.699521




In [14]:
results_df = pd.DataFrame(results).sort_values(by='AUC', ascending=False)
print(results_df)
results_df.to_csv("model_results.csv", index=False)


                Model       AUC  Accuracy  Precision    Recall  Specificity  \
3            AdaBoost  0.819876  0.760766   0.686275  0.507246     0.885714   
0  LogisticRegression  0.793271  0.770335   0.769231  0.434783     0.935714   
6                 SVM  0.786698  0.760766   0.771429  0.391304     0.942857   
4          GaussianNB  0.754296  0.751196   0.742857  0.376812     0.935714   
1             XGBoost  0.753623  0.722488   0.617021  0.420290     0.871429   
2            LightGBM  0.748965  0.712919   0.600000  0.391304     0.871429   
5                 MLP  0.713665  0.712919   0.578947  0.478261     0.828571   

   F1 Score  
3  0.583333  
0  0.555556  
6  0.519231  
4  0.500000  
1  0.500000  
2  0.473684  
5  0.523810  
