In [None]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, matthews_corrcoef


# Train a stacking meta-classifier on the positive-class probabilities of the
# previously saved base models, and persist the best of several random restarts.

# Seed NumPy's global RNG; the per-round random_state values below are drawn from it.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Number of restarts of the meta-classifier and exclusive upper bound of the
# integer seeds drawn for them.
N_STACKING_ROUNDS = 10
SEED_UPPER_BOUND = 10000

# Directory the pickled base models are read from; the stacking model is written here too.
save_path = "Model"

# Feature table; this cell reads the columns 'SMILES', 'Label' and 'ref' from it.
data_path = "Features.csv"
data = pd.read_csv(data_path)

# Rows whose 'ref' equals 'DILIrank' form the hold-out set, all other rows the training set.
train_data = data[data['ref'] != 'DILIrank']
test_data = data[data['ref'] == 'DILIrank']

# Fail early with a clear message instead of an obscure estimator error further down.
if train_data.empty or test_data.empty:
    raise ValueError("训练集或测试集为空，请检查 'ref' 列是否包含 'DILIrank'")

# Every column except the identifier, the label and the source tag is used as a feature.
non_feature_columns = ['SMILES', 'Label', 'ref']
X_train = train_data.drop(non_feature_columns, axis=1)
y_train = train_data['Label']
X_test = test_data.drop(non_feature_columns, axis=1)
y_test = test_data['Label']

# Base models whose pickles are expected at <save_path>/best_model_<name>.pkl.
model_names = ['RF', 'ET', 'HistGB', 'XGBoost']

# One vector of positive-class probabilities per base model, in model_names order.
prob_features_train = []
prob_features_test = []

for model_name in model_names:
    model_file = os.path.join(save_path, f"best_model_{model_name}.pkl")

    print(f"加载模型: {model_name}")
    # SECURITY: unpickling can execute arbitrary code; only load files you created yourself.
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # NOTE(review): if the base models were fitted on these same training rows (not
    # visible in this cell), the training meta-features are in-sample probabilities;
    # out-of-fold predictions would be the usual remedy — TODO confirm.
    prob_features_train.append(model.predict_proba(X_train)[:, 1])
    prob_features_test.append(model.predict_proba(X_test)[:, 1])

# Meta-feature matrices: one row per sample, one column per base model.
X_train_prob = np.column_stack(prob_features_train)
X_test_prob = np.column_stack(prob_features_test)

assert X_train_prob.shape[1] == X_test_prob.shape[1], "训练和测试集特征数量不一致"
assert X_train_prob.shape[1] == len(model_names), "特征数量应与模型数量一致"

# The same estimator object is re-seeded and refitted in every round.
stacking_model = ExtraTreesClassifier(random_state=RANDOM_SEED)

best_stacking_auc = -np.inf
best_stacking_model = None  # pickled bytes of the best round so far

# NOTE(review): the winning round is chosen by AUC on the DILIrank hold-out set, so
# the hold-out metrics reported for the saved model are optimistically biased.
# Selecting on a validation split or cross-validation of the training data would avoid this.
for i in range(N_STACKING_ROUNDS):
    print(f"第 {i + 1} 次训练 Stacking 模型...")

    # Draw a fresh seed from the globally seeded RNG so the sequence is reproducible.
    stacking_model.set_params(random_state=np.random.randint(0, SEED_UPPER_BOUND))
    stacking_model.fit(X_train_prob, y_train)

    y_pred = stacking_model.predict(X_test_prob)
    y_prob = stacking_model.predict_proba(X_test_prob)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"第 {i + 1} 次 Stacking 模型结果: ACC: {acc:.4f}, AUC: {auc:.4f}, MCC: {mcc:.4f}")

    if auc > best_stacking_auc:
        best_stacking_auc = auc
        # Serialise immediately: stacking_model is refitted in place next round, so
        # holding a reference would not preserve this fit.
        best_stacking_model = pickle.dumps(stacking_model)

# Only reachable if no round produced an AUC greater than -inf; writing None would
# otherwise fail with a less helpful TypeError.
if best_stacking_model is None:
    raise RuntimeError("没有可保存的 Stacking 模型")

stacking_model_file = os.path.join(save_path, "best_model_stacking.pkl")
with open(stacking_model_file, 'wb') as f:
    f.write(best_stacking_model)

print(f"最佳 Stacking 模型的AUC: {best_stacking_auc:.4f}，已保存为 {stacking_model_file}")



In [None]:

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, matthews_corrcoef, 
    precision_score, recall_score, f1_score, confusion_matrix
)

# Evaluate every saved base model and the stacking model on the DILIrank hold-out set.

# Fix the random seed.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Directory containing the pickled models.
save_path = "Model"

# Feature table. The original path here was "Fearure.csv", which looks like a typo:
# the training cell of this notebook reads "Features.csv" with the same columns.
# NOTE(review): confirm the file name on disk.
data_path = "Features.csv"
data = pd.read_csv(data_path)

# Rows whose 'ref' equals 'DILIrank' form the hold-out set, all other rows the training set.
train_data = data[data['ref'] != 'DILIrank']
test_data = data[data['ref'] == 'DILIrank']

# Every column except the identifier, the label and the source tag is used as a feature.
X_train = train_data.drop(['SMILES', 'Label', 'ref'], axis=1)
y_train = train_data['Label']
X_test = test_data.drop(['SMILES', 'Label', 'ref'], axis=1)
y_test = test_data['Label']

# Base models feed the stacking model; the column order of the stacked features
# follows this list.
base_model_names = ['RF', 'ET', 'HistGB', 'XGBoost']
model_names = base_model_names + ['Stacking']

# Display name -> pickle file name. The training cell saves the stacking model as
# "best_model_stacking.pkl" (lower case), so the display name 'Stacking' cannot be
# interpolated into the file name on case-sensitive file systems.
model_files = {name: f"best_model_{name}.pkl" for name in base_model_names}
model_files['Stacking'] = "best_model_stacking.pkl"


def load_model(model_name):
    """Return the unpickled estimator stored for ``model_name`` under ``save_path``.

    SECURITY: unpickling can execute arbitrary code; only load files you created yourself.

    Raises:
        KeyError: if ``model_name`` has no entry in ``model_files``.
        FileNotFoundError: if the pickle file does not exist.
    """
    with open(os.path.join(save_path, model_files[model_name]), 'rb') as f:
        return pickle.load(f)


def stacking_features(features):
    """Column-stack the base models' positive-class probabilities for ``features``.

    Returns an array with one row per sample and one column per entry of
    ``base_model_names``, in that order.
    """
    return np.column_stack([
        load_model(name).predict_proba(features)[:, 1]
        for name in base_model_names
    ])


# One dict of metrics per evaluated model.
results = []

for model_name in model_names:
    print(f"加载模型: {model_name}")
    model = load_model(model_name)

    # The stacking model consumes base-model probabilities instead of the raw features.
    # A per-iteration variable is used so X_test is never overwritten: the result no
    # longer depends on 'Stacking' being last, and the loop can be re-run safely.
    # Only hold-out meta-features are built; training ones are not needed for evaluation.
    if model_name == 'Stacking':
        X_eval = stacking_features(X_test)
    else:
        X_eval = X_test

    y_pred = model.predict(X_eval)
    y_prob = model.predict_proba(X_eval)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)  # Sensitivity
    f1 = f1_score(y_test, y_pred)

    # Unpacking into four values requires a 2x2 confusion matrix, i.e. a binary problem.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)

    print(f"模型: {model_name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Sensitivity (Recall): {recall:.4f}")
    print(f"  Specificity: {specificity:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  AUC: {auc:.4f}")

    results.append({
        'Model': model_name,
        'Accuracy': acc,
        'Sensitivity': recall,
        'Specificity': specificity,
        'Precision': precision,
        'F1 Score': f1,
        'AUC': auc
    })

# Collect all metrics in one table, one row per model.
results_df = pd.DataFrame(results)








加载模型: RF
模型: RF
  Accuracy: 0.9159
  Sensitivity (Recall): 0.9511
  Specificity: 0.8918
  Precision: 0.8578
  F1 Score: 0.9021
  AUC: 0.9783
加载模型: ET
模型: ET
  Accuracy: 0.9226
  Sensitivity (Recall): 0.9620
  Specificity: 0.8955
  Precision: 0.8634
  F1 Score: 0.9100
  AUC: 0.9746
加载模型: HistGB
模型: HistGB
  Accuracy: 0.9071
  Sensitivity (Recall): 0.9457
  Specificity: 0.8806
  Precision: 0.8447
  F1 Score: 0.8923
  AUC: 0.9686
加载模型: XGBoost
模型: XGBoost
  Accuracy: 0.9137
  Sensitivity (Recall): 0.9457
  Specificity: 0.8918
  Precision: 0.8571
  F1 Score: 0.8992
  AUC: 0.9620
加载模型: Stacking
模型: Stacking
  Accuracy: 0.9270
  Sensitivity (Recall): 0.9620
  Specificity: 0.9030
  Precision: 0.8719
  F1 Score: 0.9147
  AUC: 0.9739
