# 1.Start

This script does not include the Mutual Information (MI) feature-selection stage. Run the MI pipeline separately, save each selected feature set to its own file, and then use this script for model evaluation. After MI-based feature selection, I saved separate files containing 5–20 features (e.g., the 15-feature file is named data_15); for example, set MI_number = 15 below so that the script automatically loads that specific file.

In [None]:
# ---- Shared experiment settings (read by every model cell below) ----

# Which MI-selected feature file to load.
# Author's note on other values used: 'correlation', 39, or 5 ~ 20.
MI_number = 12

# Reproducibility / cross-validation settings.
seed1 = 42              # random_state given to the estimators
seed2 = range(42, 43)   # seeds used for the outer CV shuffle (one run per seed)
Kfold = 5               # number of outer stratified folds

# Input files.
# NOTE(review): all three paths currently resolve to the same file although
# they are described as differently encoded data sets -- confirm this is intended.
file_path1 = f"C:/Users/User/Desktop/data_{MI_number}.CSV"  # file with label encoding
file_path2 = f"C:/Users/User/Desktop/data_{MI_number}.CSV"  # file with one-hot encoding and Z-score
file_path3 = f"C:/Users/User/Desktop/data_{MI_number}.CSV"  # one-hot only, for Naive Bayes

# Root folder under which each model creates its own output sub-folder.
output_base1 = f"C:/Users/User/Desktop/MI_{MI_number}"

In [None]:
#Gradient Boosting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# Gradient Boosting run: load file_path1, split off the "TMT" target column,
# and hand everything to the CV pipeline.
file_path = file_path1
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="gb",
    base_estimator=GradientBoostingClassifier(random_state=seed1),
    param_grid=dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        max_depth=[3, 5],
    ),
    output_dir=os.path.join(output_base, "GB"),
    seed_cv_range=seed2,
)


In [None]:
#Random Forest

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# Random Forest run: load file_path1, split off the "TMT" target column,
# and hand everything to the CV pipeline.
file_path = file_path1
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="rf",
    base_estimator=RandomForestClassifier(random_state=seed1),
    param_grid=dict(
        n_estimators=[100, 200],
        max_depth=[5, 10],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    output_dir=os.path.join(output_base, "RF"),
    seed_cv_range=seed2,
)


In [None]:
#Support Vector Machine

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# SVM run: load file_path2, split off the "TMT" target column, make sure
# every feature is numeric, and hand everything to the CV pipeline.
file_path = file_path2
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

# Coerce each column to numeric; values that cannot be parsed become NaN
# and are then replaced by 0.
X = X.apply(lambda col: pd.to_numeric(col, errors='coerce'))
if X.isna().to_numpy().any():
    print("⚠️ Detect NaN values and perform imputation")
    X = X.fillna(0)

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="svm",
    base_estimator=SVC(probability=True, random_state=seed1),
    param_grid=dict(
        C=[0.1, 1, 10],
        kernel=['linear'],
    ),
    output_dir=os.path.join(output_base, "SVM"),
    seed_cv_range=seed2,
)


In [None]:
#XGBoost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# XGBoost run: load file_path1, split off the "TMT" target column,
# and hand everything to the CV pipeline.
file_path = file_path1
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="xgb",
    # n_jobs=1 on the estimator; the grid search in the pipeline runs with n_jobs=-1.
    base_estimator=XGBClassifier(eval_metric='logloss', n_jobs=1, random_state=seed1),
    param_grid=dict(
        n_estimators=[50, 200],
        max_depth=[3, 5],
        learning_rate=[0.05, 0.1],
        subsample=[0.8, 1.0],
    ),
    output_dir=os.path.join(output_base, "XGB"),
    seed_cv_range=seed2,
)


In [None]:
#AdaBoost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# AdaBoost run: load file_path1, split off the "TMT" target column,
# and hand everything to the CV pipeline.
file_path = file_path1
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="ada",
    base_estimator=AdaBoostClassifier(random_state=seed1),
    param_grid=dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
        algorithm=['SAMME'],
    ),
    output_dir=os.path.join(output_base, "ADA"),
    seed_cv_range=seed2,
)


In [None]:
#Decision Tree

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# Decision Tree run: load file_path1, split off the "TMT" target column,
# and hand everything to the CV pipeline.
file_path = file_path1
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="dt",
    base_estimator=DecisionTreeClassifier(random_state=seed1),
    param_grid=dict(
        criterion=['gini', 'entropy'],
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    output_dir=os.path.join(output_base, "DT"),
    seed_cv_range=seed2,
)


In [None]:
#K-Nearest Neighbors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# K-Nearest Neighbors run: load file_path2, split off the "TMT" target
# column, make sure every feature is numeric, and run the CV pipeline.
file_path = file_path2
df = pd.read_csv(file_path)
y = df["TMT"].values
X = df.drop(columns=["TMT"])


# Coerce each column to numeric; values that cannot be parsed become NaN
# and are then replaced by 0.
X = X.apply(pd.to_numeric, errors='coerce')
if X.isnull().values.any():
    print("⚠️ Detect NaN values and perform imputation")
    X = X.fillna(0)
feature_names = X.columns.tolist()

output_base = output_base1


run_model_pipeline_cvroc_multiseed(
    X, y, feature_names,
    model_name="knn",
    base_estimator=KNeighborsClassifier(),
    param_grid={
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        # 'p' is intentionally not searched: it is the Minkowski power
        # parameter and has no effect when 'metric' is given explicitly as
        # 'euclidean' or 'manhattan'. Searching it only fitted every
        # candidate twice with identical scores.
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'metric': ['euclidean', 'manhattan']
    },
    output_dir=os.path.join(output_base, "KNN"),
    seed_cv_range=seed2
)


In [None]:
#Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# Naive Bayes run: load file_path3, split off the "TMT" target column, make
# sure every feature is numeric, and hand everything to the CV pipeline.
file_path = file_path3
df = pd.read_csv(file_path)
y = df["TMT"].values
feature_names = [col for col in df.columns if col != "TMT"]
X = df[feature_names]  # every column except the target, in file order

# Coerce each column to numeric; values that cannot be parsed become NaN
# and are then replaced by 0.
X = X.apply(lambda col: pd.to_numeric(col, errors='coerce'))
if X.isna().to_numpy().any():
    print("⚠️ Detect NaN values and perform imputation")
    X = X.fillna(0)

output_base = output_base1

run_model_pipeline_cvroc_multiseed(
    X,
    y,
    feature_names,
    model_name="nb",
    base_estimator=GaussianNB(),
    param_grid=dict(
        var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6],
    ),
    output_dir=os.path.join(output_base, "NB"),
    seed_cv_range=seed2,
)


In [None]:
#Multi-Layer Perceptron

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Evaluate one classifier with seeded stratified K-fold CV and save the metrics.

    For every seed in ``seed_cv_range`` the data are split into ``Kfold``
    stratified folds (``Kfold`` is a module-level setting). Inside each fold
    the hyper-parameters are tuned on the training part with a 3-fold
    ``GridSearchCV`` scored on accuracy, the winning estimator is wrapped in
    a sigmoid ``CalibratedClassifierCV`` (5-fold) and fitted on the training
    part, and the held-out part is scored.

    Args:
        X: Feature table. A DataFrame, or an array-like that is wrapped in a
            DataFrame using ``feature_names`` as column labels.
        y: Target labels, one per row of ``X``. The metric calls use their
            default settings and column 1 of ``predict_proba`` is treated as
            the positive class -- NOTE(review): assumes a binary 0/1 target,
            confirm against the data.
        feature_names: Column labels; used only when ``X`` is not a DataFrame.
        model_name: Short tag used in log lines and output names.
        base_estimator: Unfitted classifier handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        output_dir: Parent folder; one ``<model_name>_seed<seed>`` sub-folder
            is created per CV seed.
        seed_model: Only echoed in the log line; the estimator keeps whatever
            ``random_state`` the caller configured on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the outer ``StratifiedKFold``.

    Returns:
        None. Per-fold metrics plus "Mean" and "Std" rows are printed in
        summary form and written to
        ``<sub-folder>/<model_name>_cv_results.csv``.
    """
    # Fold indices are positional, so normalise the containers first:
    # X must support .iloc and y must support integer-array indexing.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=feature_names)
    y = np.asarray(y)

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []
        tprs = []
        # Common FPR grid so the per-fold ROC curves can be averaged point-wise.
        mean_fpr = np.linspace(0, 1, 100)

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Inner search: tune hyper-parameters on the training part only.
            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Calibrate the tuned estimator, again using the training part only.
            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0  # anchor the curve at the origin
            tprs.append(interp_tpr)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                # Previously computed and then dropped; now reported per fold.
                "AUC": auc(fpr, tpr)
            })

        # AUC of the point-wise averaged ROC curve (may differ slightly from
        # the mean of the per-fold AUC values).
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # anchor the curve at (1, 1)
        mean_auc = auc(mean_fpr, mean_tpr)

        results_df = pd.DataFrame(all_results)
        mean_row = results_df.mean(numeric_only=True)
        mean_row = mean_row.astype(object)  # allow a text label in the "Fold" slot
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True)
        std_row = std_row.astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)

        # Persist the table; without this the folder created above stays empty
        # and every computed metric is lost once the function returns.
        results_df.to_csv(os.path.join(this_output_dir, f"{model_name}_cv_results.csv"), index=False)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={ mean_row['MCC']:.4f}")
        print(f"🔍 AUC：fold-mean={mean_row['AUC']:.4f}｜mean-ROC={mean_auc:.4f}")

# === Main program ===
# Multi-Layer Perceptron run: load file_path2, split off the "TMT" target
# column, make sure every feature is numeric, and run the CV pipeline.
file_path = file_path2
df = pd.read_csv(file_path)
y = df["TMT"].values
X = df.drop(columns=["TMT"])


# Coerce each column to numeric; values that cannot be parsed become NaN
# and are then replaced by 0.
X = X.apply(pd.to_numeric, errors='coerce')
if X.isnull().values.any():
    print("⚠️ Detect NaN values and perform imputation")
    X = X.fillna(0)
feature_names = X.columns.tolist()

output_base = output_base1


run_model_pipeline_cvroc_multiseed(
    X, y, feature_names,
    model_name="mlp",
    base_estimator=MLPClassifier(random_state=seed1),
    param_grid={
        'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        # 'learning_rate' is intentionally not searched: the schedule is only
        # used by solver='sgd', and this grid only uses 'adam'. Searching it
        # fitted every candidate twice with identical results.
        'max_iter': [3000, 5000],
        'tol': [1e-4, 1e-3]
    },
    output_dir=os.path.join(output_base, "MLP"),
    seed_cv_range=seed2
)
