In [None]:
# --- Shared configuration, read by every model cell below ---
# Seed used for the estimators' random_state and as the default `seed_model` argument.
seed1 = 42
# Seeds for the StratifiedKFold shuffling; one full cross-validation run is done per value.
seed2 = range(42,43)
featuresnumber = 10  # number of features RFE keeps (n_features_to_select)
Kfold = 5  # number of StratifiedKFold splits
# NOTE(review): file_path1 looks like a directory rather than a CSV file, yet it is
# passed straight to pd.read_csv by the RF/GB/XGB cells - confirm the intended file name.
file_path1 = "C:/Users/admin/Desktop/"  # file with label encoding (read by the RF, GB and XGB cells)
file_path2 = "C:/Users/User/Desktop/.CSV"  # file with one-hot encoding and Z-score scaling (read by the SVM cell)
# Base directory under which each model cell creates its own sub-directory ("RF", "GB", ...).
output_base1 = "C:/Users/admin/Desktop/"

In [None]:
#Random Forest

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib
import os
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Cross-validate an RFE -> grid-search -> calibration pipeline for several CV seeds.

    For every seed in ``seed_cv_range`` the samples are split with a shuffled
    ``StratifiedKFold`` (``Kfold`` splits). Inside each fold, using the training
    part only, RFE keeps ``featuresnumber`` features, ``GridSearchCV`` (3-fold,
    accuracy) tunes ``base_estimator`` over ``param_grid`` and the winning model is
    wrapped in a sigmoid ``CalibratedClassifierCV`` (5-fold). The held-out part of
    the fold is then scored.

    Feature selection is fitted per training fold on purpose: fitting RFE on the
    full data set before splitting lets the held-out labels influence which
    features are kept, which makes every reported metric optimistic.

    Args:
        X: Feature table (DataFrame, or array-like whose columns match
            ``feature_names``).
        y: Target labels, one per row of ``X``. The probability column ``[:, 1]``
            and the default-averaged F1/recall/precision are used, so a binary
            target is assumed.
        feature_names: Column names of ``X``, in column order.
        model_name: Short tag used in log lines and in the per-seed directory name.
        base_estimator: Unfitted estimator; must expose ``coef_`` or
            ``feature_importances_`` so that RFE can rank features.
        param_grid: Grid handed to ``GridSearchCV``.
        output_dir: Parent directory; ``<model_name>_seed<seed>`` is created in it.
        seed_model: Only echoed in the log line. The model is seeded through the
            ``random_state`` already set on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the CV shuffling.

    Returns:
        pandas.DataFrame: one row per fold plus a "Mean" and a "Std" row for each
        seed, with columns Seed_CV, Fold, Accuracy, F1, Recall, Precision, MCC and
        AUC. Empty when ``seed_cv_range`` is empty.
    """
    # Work on positional containers so that `iloc` / integer-array indexing is
    # correct whatever index X or y carried on the way in.
    X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X), columns=feature_names)
    y_arr = np.asarray(y)
    names = np.asarray(feature_names)

    seed_tables = []

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        # Per-seed artifact directory; this function itself writes nothing into it.
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []

        for fold, (train_idx, test_idx) in enumerate(skf.split(X_df, y_arr)):
            X_train_full, X_test_full = X_df.iloc[train_idx], X_df.iloc[test_idx]
            y_train, y_test = y_arr[train_idx], y_arr[test_idx]

            # Select features on the training fold only (no test-fold leakage).
            rfe = RFE(base_estimator, n_features_to_select=featuresnumber)
            rfe.fit(X_train_full, y_train)
            selected_features = names[rfe.support_]
            X_train = pd.DataFrame(rfe.transform(X_train_full), columns=selected_features)
            X_test = pd.DataFrame(rfe.transform(X_test_full), columns=selected_features)

            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            # Probability of the positive class drives the ROC curve.
            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                "AUC": auc(fpr, tpr),
            })

        results_df = pd.DataFrame(all_results)
        # Cast to object so the numeric "Fold" slot can hold the row label.
        mean_row = results_df.mean(numeric_only=True).astype(object)
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True).astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)
        results_df.insert(0, "Seed_CV", seed_cv)
        seed_tables.append(results_df)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        # Single quotes inside the f-string keep this valid on Python < 3.12.
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={mean_row['MCC']:.4f}")

    if not seed_tables:
        return pd.DataFrame()
    return pd.concat(seed_tables, ignore_index=True)


# === Main program ===
# Load the label-encoded table and separate the "TMT" target from the predictors.
file_path = file_path1
df = pd.read_csv(file_path)
y = df.loc[:, "TMT"].values
X = df.drop("TMT", axis=1)
feature_names = X.columns.to_list()

output_base = output_base1

# Random forest: every argument is passed by keyword so the call reads as a spec.
run_model_pipeline_cvroc_multiseed(
    X=X,
    y=y,
    feature_names=feature_names,
    model_name="rf",
    base_estimator=RandomForestClassifier(random_state=seed1),
    param_grid={
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    },
    output_dir=os.path.join(output_base, "RF"),
    seed_cv_range=seed2,
)




In [None]:
#GB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
import joblib
import os
import shap
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Cross-validate an RFE -> grid-search -> calibration pipeline for several CV seeds.

    For every seed in ``seed_cv_range`` the samples are split with a shuffled
    ``StratifiedKFold`` (``Kfold`` splits). Inside each fold, using the training
    part only, RFE keeps ``featuresnumber`` features, ``GridSearchCV`` (3-fold,
    accuracy) tunes ``base_estimator`` over ``param_grid`` and the winning model is
    wrapped in a sigmoid ``CalibratedClassifierCV`` (5-fold). The held-out part of
    the fold is then scored.

    Feature selection is fitted per training fold on purpose: fitting RFE on the
    full data set before splitting lets the held-out labels influence which
    features are kept, which makes every reported metric optimistic.

    Args:
        X: Feature table (DataFrame, or array-like whose columns match
            ``feature_names``).
        y: Target labels, one per row of ``X``. The probability column ``[:, 1]``
            and the default-averaged F1/recall/precision are used, so a binary
            target is assumed.
        feature_names: Column names of ``X``, in column order.
        model_name: Short tag used in log lines and in the per-seed directory name.
        base_estimator: Unfitted estimator; must expose ``coef_`` or
            ``feature_importances_`` so that RFE can rank features.
        param_grid: Grid handed to ``GridSearchCV``.
        output_dir: Parent directory; ``<model_name>_seed<seed>`` is created in it.
        seed_model: Only echoed in the log line. The model is seeded through the
            ``random_state`` already set on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the CV shuffling.

    Returns:
        pandas.DataFrame: one row per fold plus a "Mean" and a "Std" row for each
        seed, with columns Seed_CV, Fold, Accuracy, F1, Recall, Precision, MCC and
        AUC. Empty when ``seed_cv_range`` is empty.
    """
    # Work on positional containers so that `iloc` / integer-array indexing is
    # correct whatever index X or y carried on the way in.
    X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X), columns=feature_names)
    y_arr = np.asarray(y)
    names = np.asarray(feature_names)

    seed_tables = []

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        # Per-seed artifact directory; this function itself writes nothing into it.
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []

        for fold, (train_idx, test_idx) in enumerate(skf.split(X_df, y_arr)):
            X_train_full, X_test_full = X_df.iloc[train_idx], X_df.iloc[test_idx]
            y_train, y_test = y_arr[train_idx], y_arr[test_idx]

            # Select features on the training fold only (no test-fold leakage).
            rfe = RFE(base_estimator, n_features_to_select=featuresnumber)
            rfe.fit(X_train_full, y_train)
            selected_features = names[rfe.support_]
            X_train = pd.DataFrame(rfe.transform(X_train_full), columns=selected_features)
            X_test = pd.DataFrame(rfe.transform(X_test_full), columns=selected_features)

            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            # Probability of the positive class drives the ROC curve.
            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                "AUC": auc(fpr, tpr),
            })

        results_df = pd.DataFrame(all_results)
        # Cast to object so the numeric "Fold" slot can hold the row label.
        mean_row = results_df.mean(numeric_only=True).astype(object)
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True).astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)
        results_df.insert(0, "Seed_CV", seed_cv)
        seed_tables.append(results_df)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        # Single quotes inside the f-string keep this valid on Python < 3.12.
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={mean_row['MCC']:.4f}")

    if not seed_tables:
        return pd.DataFrame()
    return pd.concat(seed_tables, ignore_index=True)


# === Main program ===
# Load the label-encoded table and separate the "TMT" target from the predictors.
file_path = file_path1
df = pd.read_csv(file_path)
y = df.loc[:, "TMT"].values
X = df.drop("TMT", axis=1)
feature_names = X.columns.to_list()

output_base = output_base1

# Gradient boosting: every argument is passed by keyword so the call reads as a spec.
run_model_pipeline_cvroc_multiseed(
    X=X,
    y=y,
    feature_names=feature_names,
    model_name="gb",
    base_estimator=GradientBoostingClassifier(random_state=seed1),
    param_grid={
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5],
    },
    output_dir=os.path.join(output_base, "GB"),
    seed_cv_range=seed2,
)


In [None]:
#XGBoost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import joblib
import os
import shap
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Cross-validate an RFE -> grid-search -> calibration pipeline for several CV seeds.

    For every seed in ``seed_cv_range`` the samples are split with a shuffled
    ``StratifiedKFold`` (``Kfold`` splits). Inside each fold, using the training
    part only, RFE keeps ``featuresnumber`` features, ``GridSearchCV`` (3-fold,
    accuracy) tunes ``base_estimator`` over ``param_grid`` and the winning model is
    wrapped in a sigmoid ``CalibratedClassifierCV`` (5-fold). The held-out part of
    the fold is then scored.

    Feature selection is fitted per training fold on purpose: fitting RFE on the
    full data set before splitting lets the held-out labels influence which
    features are kept, which makes every reported metric optimistic.

    Args:
        X: Feature table (DataFrame, or array-like whose columns match
            ``feature_names``).
        y: Target labels, one per row of ``X``. The probability column ``[:, 1]``
            and the default-averaged F1/recall/precision are used, so a binary
            target is assumed.
        feature_names: Column names of ``X``, in column order.
        model_name: Short tag used in log lines and in the per-seed directory name.
        base_estimator: Unfitted estimator; must expose ``coef_`` or
            ``feature_importances_`` so that RFE can rank features.
        param_grid: Grid handed to ``GridSearchCV``.
        output_dir: Parent directory; ``<model_name>_seed<seed>`` is created in it.
        seed_model: Only echoed in the log line. The model is seeded through the
            ``random_state`` already set on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the CV shuffling.

    Returns:
        pandas.DataFrame: one row per fold plus a "Mean" and a "Std" row for each
        seed, with columns Seed_CV, Fold, Accuracy, F1, Recall, Precision, MCC and
        AUC. Empty when ``seed_cv_range`` is empty.
    """
    # Work on positional containers so that `iloc` / integer-array indexing is
    # correct whatever index X or y carried on the way in.
    X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X), columns=feature_names)
    y_arr = np.asarray(y)
    names = np.asarray(feature_names)

    seed_tables = []

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        # Per-seed artifact directory; this function itself writes nothing into it.
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []

        for fold, (train_idx, test_idx) in enumerate(skf.split(X_df, y_arr)):
            X_train_full, X_test_full = X_df.iloc[train_idx], X_df.iloc[test_idx]
            y_train, y_test = y_arr[train_idx], y_arr[test_idx]

            # Select features on the training fold only (no test-fold leakage).
            rfe = RFE(base_estimator, n_features_to_select=featuresnumber)
            rfe.fit(X_train_full, y_train)
            selected_features = names[rfe.support_]
            X_train = pd.DataFrame(rfe.transform(X_train_full), columns=selected_features)
            X_test = pd.DataFrame(rfe.transform(X_test_full), columns=selected_features)

            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            # Probability of the positive class drives the ROC curve.
            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                "AUC": auc(fpr, tpr),
            })

        results_df = pd.DataFrame(all_results)
        # Cast to object so the numeric "Fold" slot can hold the row label.
        mean_row = results_df.mean(numeric_only=True).astype(object)
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True).astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)
        results_df.insert(0, "Seed_CV", seed_cv)
        seed_tables.append(results_df)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        # Single quotes inside the f-string keep this valid on Python < 3.12.
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={mean_row['MCC']:.4f}")

    if not seed_tables:
        return pd.DataFrame()
    return pd.concat(seed_tables, ignore_index=True)


# === Main program ===
# Load the label-encoded table and separate the "TMT" target from the predictors.
file_path = file_path1
df = pd.read_csv(file_path)
y = df.loc[:, "TMT"].values
X = df.drop("TMT", axis=1)
feature_names = X.columns.to_list()

output_base = output_base1

# XGBoost: every argument is passed by keyword so the call reads as a spec.
run_model_pipeline_cvroc_multiseed(
    X=X,
    y=y,
    feature_names=feature_names,
    model_name="xgb",
    base_estimator=XGBClassifier(eval_metric='logloss', random_state=seed1, n_jobs=1),
    param_grid={
        'n_estimators': [50, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0],
    },
    output_dir=os.path.join(output_base, "XGB"),
    seed_cv_range=seed2,
)


In [None]:
#SVM
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import joblib
import os
import shap
from numpy import interp
from itertools import cycle

def run_model_pipeline_cvroc_multiseed(X, y, feature_names, model_name, base_estimator, param_grid, output_dir, seed_model=seed1, seed_cv_range=seed2):
    """Cross-validate an RFE -> grid-search -> calibration pipeline for several CV seeds.

    For every seed in ``seed_cv_range`` the samples are split with a shuffled
    ``StratifiedKFold`` (``Kfold`` splits). Inside each fold, using the training
    part only, RFE keeps ``featuresnumber`` features, ``GridSearchCV`` (3-fold,
    accuracy) tunes ``base_estimator`` over ``param_grid`` and the winning model is
    wrapped in a sigmoid ``CalibratedClassifierCV`` (5-fold). The held-out part of
    the fold is then scored.

    Feature selection is fitted per training fold on purpose: fitting RFE on the
    full data set before splitting lets the held-out labels influence which
    features are kept, which makes every reported metric optimistic.

    Args:
        X: Feature table (DataFrame, or array-like whose columns match
            ``feature_names``).
        y: Target labels, one per row of ``X``. The probability column ``[:, 1]``
            and the default-averaged F1/recall/precision are used, so a binary
            target is assumed.
        feature_names: Column names of ``X``, in column order.
        model_name: Short tag used in log lines and in the per-seed directory name.
        base_estimator: Unfitted estimator; must expose ``coef_`` or
            ``feature_importances_`` so that RFE can rank features.
        param_grid: Grid handed to ``GridSearchCV``.
        output_dir: Parent directory; ``<model_name>_seed<seed>`` is created in it.
        seed_model: Only echoed in the log line. The model is seeded through the
            ``random_state`` already set on ``base_estimator``.
        seed_cv_range: Iterable of seeds for the CV shuffling.

    Returns:
        pandas.DataFrame: one row per fold plus a "Mean" and a "Std" row for each
        seed, with columns Seed_CV, Fold, Accuracy, F1, Recall, Precision, MCC and
        AUC. Empty when ``seed_cv_range`` is empty.
    """
    # Work on positional containers so that `iloc` / integer-array indexing is
    # correct whatever index X or y carried on the way in.
    X_df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X), columns=feature_names)
    y_arr = np.asarray(y)
    names = np.asarray(feature_names)

    seed_tables = []

    for seed_cv in seed_cv_range:
        print(f"[{model_name}]  running Seed_cv = {seed_cv}")
        # Per-seed artifact directory; this function itself writes nothing into it.
        this_output_dir = os.path.join(output_dir, f"{model_name}_seed{seed_cv}")
        os.makedirs(this_output_dir, exist_ok=True)

        skf = StratifiedKFold(n_splits=Kfold, shuffle=True, random_state=seed_cv)
        all_results = []

        for fold, (train_idx, test_idx) in enumerate(skf.split(X_df, y_arr)):
            X_train_full, X_test_full = X_df.iloc[train_idx], X_df.iloc[test_idx]
            y_train, y_test = y_arr[train_idx], y_arr[test_idx]

            # Select features on the training fold only (no test-fold leakage).
            rfe = RFE(base_estimator, n_features_to_select=featuresnumber)
            rfe.fit(X_train_full, y_train)
            selected_features = names[rfe.support_]
            X_train = pd.DataFrame(rfe.transform(X_train_full), columns=selected_features)
            X_test = pd.DataFrame(rfe.transform(X_test_full), columns=selected_features)

            grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            calibrated_model = CalibratedClassifierCV(best_model, cv=5, method='sigmoid')
            calibrated_model.fit(X_train, y_train)

            # Probability of the positive class drives the ROC curve.
            y_proba = calibrated_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)

            y_pred = calibrated_model.predict(X_test)
            all_results.append({
                "Fold": fold + 1,
                "Accuracy": accuracy_score(y_test, y_pred),
                "F1": f1_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred),
                "AUC": auc(fpr, tpr),
            })

        results_df = pd.DataFrame(all_results)
        # Cast to object so the numeric "Fold" slot can hold the row label.
        mean_row = results_df.mean(numeric_only=True).astype(object)
        mean_row["Fold"] = "Mean"
        std_row = results_df.std(numeric_only=True).astype(object)
        std_row["Fold"] = "Std"
        results_df = pd.concat([results_df, pd.DataFrame([mean_row, std_row])], ignore_index=True)
        results_df.insert(0, "Seed_CV", seed_cv)
        seed_tables.append(results_df)

        print(f"✅ [{model_name}] [Seed_CV = {seed_cv}, Seed_model = {seed_model}] done")
        # Single quotes inside the f-string keep this valid on Python < 3.12.
        print(f"🔍 Results：ACC={mean_row['Accuracy']:.4f}｜F1={mean_row['F1']:.4f}｜Recall={mean_row['Recall']:.4f}｜Precision={mean_row['Precision']:.4f}｜MCC={mean_row['MCC']:.4f}")

    if not seed_tables:
        return pd.DataFrame()
    return pd.concat(seed_tables, ignore_index=True)


# === Main program ===
# Load the one-hot / Z-scored table and separate the "TMT" target from the predictors.
file_path = file_path2
df = pd.read_csv(file_path)
y = df.loc[:, "TMT"].values
X = df.drop("TMT", axis=1)
feature_names = X.columns.to_list()

output_base = output_base1

# Linear SVM: every argument is passed by keyword so the call reads as a spec.
run_model_pipeline_cvroc_multiseed(
    X=X,
    y=y,
    feature_names=feature_names,
    model_name="svm",
    base_estimator=SVC(kernel='linear', probability=True, random_state=seed1),
    param_grid={'C': [0.1, 1, 10]},
    output_dir=os.path.join(output_base, "SVM"),
    seed_cv_range=seed2,
)
