In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import os  
import seaborn as sns  
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.tree import plot_tree
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from deap import base, creator, tools, algorithms
from sklearn.utils import resample
from sklearn.metrics import matthews_corrcoef
import random

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# GAnumber and Group are embedded in the results-folder name and in every
# per-model output filename; Y_output is used as the filename prefix of the
# per-model dataset copies.
GAnumber = "GA4"
Group = "_ALL"
Y_output = "TMT"

# Input file location and encoding
base_dir = "C:/Users/User/Desktop"
file_name = "CTMT_clean_c_lable_upload.CSV"
encoding_fmt = "ISO-8859-1"

# Cross-validation fold count and random seeds (model / CV splitter)
n_splits = 5
seed_ML = 42
seed_CV = 42

# Genetic-algorithm settings
POP_SIZE = 30
GENS = 30
CX_PB = 0.5
MUT_PB = 0.3
SEED = 42

# Results folder (edit custom_results_dir to save somewhere else)
folder_name = f"{GAnumber}{Group}({Y_output})"
custom_results_dir = f"C:/Users/User/Desktop/{folder_name}"

# Per-model filenames for Ada, GB, DT, J48, KNN, MLP, NB, RF, SVM and XGB:
# data_source_*     -> copy of the dataset that is fed to the model
# output_filename_* -> model-specific output file
data_source_Ada = f"{Y_output}_Ada{Group}_clean_nu_.csv"
output_filename_Ada = f"Ada_{GAnumber}{Group}.csv"

data_source_GB = f"{Y_output}_GB{Group}_clean_nu_.csv"
output_filename_GB = f"GB_{GAnumber}{Group}.csv"

data_source_DT = f"{Y_output}_DT{Group}_clean_nu_.csv"
output_filename_DT = f"DT_{GAnumber}{Group}.csv"

data_source_J48 = f"{Y_output}_J48{Group}_clean_nu_.csv"
output_filename_J48 = f"J48_{GAnumber}{Group}.csv"

data_source_KNN = f"{Y_output}_KNN{Group}_clean_nu_.csv"
output_filename_KNN = f"KNN_{GAnumber}{Group}.csv"

data_source_MLP = f"{Y_output}_MLP{Group}_clean_nu_.csv"
output_filename_MLP = f"MLP_{GAnumber}{Group}.csv"

data_source_NB = f"{Y_output}_NB{Group}_clean_nu_.csv"
output_filename_NB = f"NB_{GAnumber}{Group}.csv"

data_source_RF = f"{Y_output}_RF{Group}_clean_nu_.csv"
output_filename_RF = f"RF_{GAnumber}{Group}.csv"

data_source_SVM = f"{Y_output}_SVM{Group}_clean_nu_.csv"
output_filename_SVM = f"SVM_{GAnumber}{Group}.csv"

data_source_XGB = f"{Y_output}_XGB{Group}_clean_nu_.csv"
output_filename_XGB = f"XGB_{GAnumber}{Group}.csv"

# Full path of the input CSV; every model cell loads the data through it
data_path = os.path.join(base_dir, file_name)

# Shared list that the model cells append their result rows to
results_data = []

# Make sure the results folder and its sub-folders exist
os.makedirs(custom_results_dir, exist_ok=True)
for _subfolder in ("models", "figures", "text"):
    os.makedirs(os.path.join(custom_results_dir, _subfolder), exist_ok=True)


In [None]:
# GradientBoostingClassifier: load the table and split it into features/target
df = pd.read_csv(data_path, encoding=encoding_fmt)

# Feature frame and target vector
X = df.drop(columns=["TMT"])
y = df["TMT"].values

# Name fragments that mark a column as categorical (gender, education,
# smoking, drinking, self-rated health, self-rated happiness, exercise type,
# open/closed).
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]

# A column is treated as categorical when its name CONTAINS one of the
# fragments above (substring match); every other column is continuous.
categorical_features = []
continuous_features = []
for column_name in X.columns:
    if any(fragment in column_name for fragment in categorical_cols):
        categorical_features.append(column_name)
for column_name in X.columns:
    if column_name not in categorical_features:
        continuous_features.append(column_name)

# Report the feature inventory
print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
for heading, names in (
    ("\nCategorical features：", categorical_features),
    ("\nContinuous features：", continuous_features),
):
    print(heading)
    for feat in names:
        print(f"- {feat}")

# Write a copy of the dataset used by this model (features followed by the
# target column).  No scaling or encoding is applied in this cell.
target_series = pd.Series(y, name="TMT")
processed_df = pd.concat([X, target_series], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_GB )
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view of the features plus bookkeeping used by the GA
feature_names = X.columns.tolist()
X_np = X.values
num_features = X_np.shape[1]
# Estimator: GB accuracy
def evaluate(individual):
    """Score one GA individual with a gradient-boosting hold-out accuracy.

    ``individual`` is a sequence of 0/1 flags, one per column of ``X_np``;
    columns flagged with 1 are kept.  The kept columns are split 80/20
    (fixed ``random_state=42``), a ``GradientBoostingClassifier`` is fitted
    on the larger part and its accuracy on the smaller part is returned.

    Returns a one-element tuple ``(accuracy,)``; an individual that selects
    no column at all gets ``(0.0,)``.
    """
    # Nothing selected: there is no model to fit, so return the worst score.
    if not sum(individual):
        return (0.0,)

    chosen_columns = []
    for position, flag in enumerate(individual):
        if flag == 1:
            chosen_columns.append(position)

    subset = X_np[:, chosen_columns]
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        subset, y, test_size=0.2, random_state=42
    )

    model = GradientBoostingClassifier(random_state=seed_ML)
    model.fit(fit_X, fit_y)
    accuracy = model.score(holdout_X, holdout_y)
    return (accuracy,)


# ---- Genetic-algorithm feature selection (GradientBoosting) ----------------
# DEAP stores classes made by creator.create() in the creator module itself and
# emits a RuntimeWarning when a name is created twice (re-running this cell, or
# running it after another model cell).  Create the classes only when missing.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Individuals are lists of num_features random 0/1 flags
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Genetic operators
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG (the one the operators above draw from) before the
# population is created so that the run is repeatable.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# A stray full-width ")" that used to follow the eaSimple call made this cell
# a SyntaxError; it has been removed.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best bit mask back into column names
best_individual = hof[0]
selected_features = [name for name, bit in zip(feature_names, best_individual) if bit == 1]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selected feature names
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_GB.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")

# Grid Search
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
# Tuning uses the 80% training part only; X_test / y_test are created here but
# are not used anywhere in this block.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}

# Seeded, shuffled stratified K-fold splitter.  It is now actually handed to
# GridSearchCV: previously the splitter was built but cv=3 was passed, so the
# tuning ran a plain 3-fold split instead of the intended n_splits-fold one.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=seed_ML),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Cross-validate the best configuration (with sigmoid calibration) and collect
# the per-fold metrics.
best_params = grid_search.best_params_

# Metric key -> scoring function; insertion order is the reporting order.
scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {name: [] for name in scorers}

# With a fixed random_state the splitter yields the same folds on every call,
# so the same object can be reused for the evaluation loop.
for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Train uncalibrated model
    clf = GradientBoostingClassifier(**best_params, random_state=seed_ML)
    clf.fit(X_tr, y_tr)

    # Calibrate model
    # NOTE(review): the calibrator is fitted on the same rows the model was
    # trained on; scikit-learn asks for disjoint data with cv='prefit'.
    # Left unchanged to keep results comparable - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    # Validate and compute metrics
    y_pred = calibrator.predict(X_val)
    for name, scorer in scorers.items():
        metrics[name].append(scorer(y_val, y_pred))

# Compute mean and standard deviation
print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")

# One display label per metric key, shared by the summary rows and the
# per-fold rows (str.capitalize() would spell 'mcc' as 'Mcc' in one place while
# the fold rows use 'MCC').
metric_labels = {
    'accuracy': 'Accuracy',
    'f1': 'F1',
    'recall': 'Recall',
    'precision': 'Precision',
    'mcc': 'MCC',
}

# Rows belonging to THIS model only.  The CSV is built from these rows rather
# than from the shared results_data list, which also holds rows appended by
# other model cells (or by earlier runs of this cell) with nothing in the row
# that identifies the model.
model_rows = []
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)
    label = metric_labels.get(name, name.capitalize())
    print(f"{label:<10}: {mean_val:.4f} ± {std_val:.4f}")
    model_rows.append({
        'Fold': 'Mean ± Std',
        'Metric': label,
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# Per-fold values, grouped by fold, metrics in the order of metric_labels
for i in range(n_splits):
    for name, label in metric_labels.items():
        model_rows.append({
            'Fold': f"Fold {i+1}",
            'Metric': label,
            'Mean': metrics[name][i],
            'Std': ''
        })

# Keep feeding the shared list for any code that reads it afterwards
results_data.extend(model_rows)

# CSV
result_df = pd.DataFrame(model_rows)
csv_save_path = os.path.join(custom_results_dir, "text", "GB_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# Random Forest: load the table and split it into features/target
df = pd.read_csv(data_path, encoding=encoding_fmt)

# Target vector and feature frame
y = df["TMT"].values
X = df.drop(columns=["TMT"])

# Name fragments that mark a column as categorical
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]

# A column is treated as categorical when its name CONTAINS one of the
# fragments above (substring match); every other column is continuous.
categorical_features = [col for col in X.columns if any(cat_col in col for cat_col in categorical_cols)]
continuous_features = [col for col in X.columns if col not in categorical_features]

# Report the feature inventory
print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
print("\ncategorical features：")
for feat in categorical_features:
    print(f"- {feat}")
print("\ncontinuous features：")
for feat in continuous_features:
    print(f"- {feat}")

# Write a copy of the dataset used by this model (features followed by the
# target column).  No scaling or encoding is applied in this cell.
processed_df = pd.concat([X, pd.Series(y, name="TMT")], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_RF)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
# The message used to say "Selected features saved to", but the file written
# here is the full dataset copy, not the GA feature list.
print(f"\n✅ Processed dataset saved to：{processed_data_path}")

# Array view of the features plus bookkeeping used by the GA
X_np = X.values
feature_names = X.columns.tolist()
num_features = X_np.shape[1]

# Random Forest
def evaluate(individual):
    """Score one GA individual with a random-forest hold-out accuracy.

    ``individual`` is a sequence of 0/1 flags, one per column of ``X_np``;
    columns flagged with 1 are kept.  The kept columns are split 80/20
    (fixed ``random_state=42``), a 100-tree ``RandomForestClassifier`` is
    fitted on the larger part and its accuracy on the smaller part is
    returned.

    Returns a one-element tuple ``(accuracy,)``; an individual that selects
    no column at all gets ``(0.0,)``.
    """
    # Nothing selected: there is no model to fit, so return the worst score.
    if not sum(individual):
        return (0.0,)

    kept = []
    for column_index, flag in enumerate(individual):
        if flag == 1:
            kept.append(column_index)

    reduced = X_np[:, kept]
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        reduced, y, test_size=0.2, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=seed_ML)
    forest.fit(fit_X, fit_y)
    accuracy = forest.score(holdout_X, holdout_y)
    return (accuracy,)


# ---- Genetic-algorithm feature selection (Random Forest) -------------------
# DEAP stores classes made by creator.create() in the creator module itself and
# emits a RuntimeWarning when a name is created twice (re-running this cell, or
# running it after another model cell).  Create the classes only when missing.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Individuals are lists of num_features random 0/1 flags
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Genetic operators
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG (the one the operators above draw from) before the
# population is created so that the run is repeatable.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# Echo the GA settings next to the results
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best bit mask back into column names
best_individual = hof[0]
selected_features = [name for name, bit in zip(feature_names, best_individual) if bit == 1]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selected feature names
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_RF.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")

# Perform hyperparameter tuning via Grid Search
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
# Tuning uses the 80% training part only; X_test / y_test are created here but
# are not used anywhere in this block.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Seeded, shuffled stratified K-fold splitter.  It is now actually handed to
# GridSearchCV: previously the splitter was built but cv=3 was passed, so the
# tuning ran a plain 3-fold split instead of the intended n_splits-fold one.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=seed_ML),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Cross-validate the best configuration (with sigmoid calibration) and collect
# the per-fold metrics.
best_params = grid_search.best_params_

# Metric key -> scoring function; insertion order is the reporting order.
scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {name: [] for name in scorers}

# With a fixed random_state the splitter yields the same folds on every call,
# so the same object can be reused for the evaluation loop.
for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Train uncalibrated model
    clf = RandomForestClassifier(**best_params, random_state=seed_ML)
    clf.fit(X_tr, y_tr)

    # Calibrate model
    # NOTE(review): the calibrator is fitted on the same rows the model was
    # trained on; scikit-learn asks for disjoint data with cv='prefit'.
    # Left unchanged to keep results comparable - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    # Validate and compute metrics
    y_pred = calibrator.predict(X_val)
    for name, scorer in scorers.items():
        metrics[name].append(scorer(y_val, y_pred))


# Compute mean and standard deviation
print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")

# One display label per metric key, shared by the summary rows and the
# per-fold rows (str.capitalize() would spell 'mcc' as 'Mcc' in one place while
# the fold rows use 'MCC').
metric_labels = {
    'accuracy': 'Accuracy',
    'f1': 'F1',
    'recall': 'Recall',
    'precision': 'Precision',
    'mcc': 'MCC',
}

# Rows belonging to THIS model only.  The CSV is built from these rows rather
# than from the shared results_data list, which also holds rows appended by
# other model cells (or by earlier runs of this cell) with nothing in the row
# that identifies the model.
model_rows = []
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)
    label = metric_labels.get(name, name.capitalize())
    print(f"{label:<10}: {mean_val:.4f} ± {std_val:.4f}")
    model_rows.append({
        'Fold': 'Mean ± Std',
        'Metric': label,
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# Per-fold values, grouped by fold, metrics in the order of metric_labels
for i in range(n_splits):
    for name, label in metric_labels.items():
        model_rows.append({
            'Fold': f"Fold {i+1}",
            'Metric': label,
            'Mean': metrics[name][i],
            'Std': ''
        })

# Keep feeding the shared list for any code that reads it afterwards
results_data.extend(model_rows)

# CSV
result_df = pd.DataFrame(model_rows)
csv_save_path = os.path.join(custom_results_dir, "text", "RF_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# SVM: load the table, one-hot encode categoricals, standardize the rest
df = pd.read_csv(data_path, encoding=encoding_fmt)

# Feature frame and target vector
X = df.drop(columns=["TMT"])
y = df["TMT"].values

# One-hot encode every listed categorical column that is present.  The first
# level of each column is dropped, the original column is removed and the
# dummy columns (prefixed with the column name) are appended on the right.
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]
for col in categorical_cols:
    if col not in X.columns:
        continue
    dummies = pd.get_dummies(X[col], prefix=col, drop_first=True).astype(int)
    X = pd.concat([X.drop(col, axis=1), dummies], axis=1)

# Columns whose name contains none of the categorical fragments are continuous
continuous_cols = []
for column_name in X.columns:
    if all(fragment not in column_name for fragment in categorical_cols):
        continuous_cols.append(column_name)

# Standardize the continuous columns in place.
# NOTE(review): the scaler is fitted on all rows before any train/validation
# split is made - confirm this is acceptable for the later evaluation.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[continuous_cols])
X[continuous_cols] = scaled_values

# Keep the fitted scaler next to the other model artifacts
scaler_path = os.path.join(custom_results_dir, "models", "standard_scaler_SVM.pkl")
joblib.dump(scaler, scaler_path)

# Feature inventory after encoding (substring match on the column names)
categorical_features = []
continuous_features = []
for column_name in X.columns:
    if any(fragment in column_name for fragment in categorical_cols):
        categorical_features.append(column_name)
for column_name in X.columns:
    if column_name not in categorical_features:
        continuous_features.append(column_name)

print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
for heading, names in (
    ("\ncategorical features：", categorical_features),
    ("\ncontinuous features：", continuous_features),
):
    print(heading)
    for feat in names:
        print(f"- {feat}")

# Write the encoded + standardized dataset (features followed by the target)
target_series = pd.Series(y, name="TMT")
processed_df = pd.concat([X, target_series], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_SVM)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view of the features plus bookkeeping used by the GA
feature_names = X.columns.tolist()
X_np = X.values
num_features = X_np.shape[1]

# SVM
def evaluate(individual):
    """Score one GA individual with an SVM hold-out accuracy.

    ``individual`` is a sequence of 0/1 flags, one per column of ``X_np``;
    columns flagged with 1 are kept.  The kept columns are split 80/20
    (fixed ``random_state=42``), an ``SVC`` with default hyperparameters is
    fitted on the larger part and its accuracy on the smaller part is
    returned.

    Returns a one-element tuple ``(accuracy,)``; an individual that selects
    no column at all gets ``(0.0,)``.
    """
    # Nothing selected: there is no model to fit, so return the worst score.
    if not sum(individual):
        return (0.0,)

    active = []
    for column_index, flag in enumerate(individual):
        if flag == 1:
            active.append(column_index)

    reduced = X_np[:, active]
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        reduced, y, test_size=0.2, random_state=42
    )

    svm_model = SVC(random_state=seed_ML)
    svm_model.fit(fit_X, fit_y)
    accuracy = svm_model.score(holdout_X, holdout_y)
    return (accuracy,)


# ---- Genetic-algorithm feature selection (SVM) ------------------------------
# DEAP stores classes made by creator.create() in the creator module itself and
# emits a RuntimeWarning when a name is created twice (re-running this cell, or
# running it after another model cell).  Create the classes only when missing.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Individuals are lists of num_features random 0/1 flags
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Genetic operators
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG (the one the operators above draw from) before the
# population is created so that the run is repeatable.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# Echo the GA settings next to the results
print("\n🔧 GA:\n")
print(f"(Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best bit mask back into column names
best_individual = hof[0]
selected_features = [name for name, bit in zip(feature_names, best_individual) if bit == 1]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selected feature names
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_SVM.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")


# Perform hyperparameter tuning via Grid Search
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
# Tuning uses the 80% training part only; X_test / y_test are created here but
# are not used anywhere in this block.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Only C is searched; kernel and gamma stay at the SVC defaults.
param_grid = {
    'C': [0.1, 1, 10],
    #'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    #'gamma': ['scale', 'auto', 0.1, 1, 10],
}

# Seeded, shuffled stratified K-fold splitter.  It is now actually handed to
# GridSearchCV: previously the splitter was built but cv=3 was passed, so the
# tuning ran a plain 3-fold split instead of the intended n_splits-fold one.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)
grid_search = GridSearchCV(
    SVC(random_state=seed_ML),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Cross-validate the best configuration (with sigmoid calibration) and collect
# the per-fold metrics.
best_params = grid_search.best_params_

# Metric key -> scoring function; insertion order is the reporting order.
scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {name: [] for name in scorers}

# With a fixed random_state the splitter yields the same folds on every call,
# so the same object can be reused for the evaluation loop.
for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Train uncalibrated model
    clf = SVC(**best_params, random_state=seed_ML)
    clf.fit(X_tr, y_tr)

    # Calibrate model
    # NOTE(review): the calibrator is fitted on the same rows the model was
    # trained on; scikit-learn asks for disjoint data with cv='prefit'.
    # Left unchanged to keep results comparable - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    # Validate and compute metrics
    y_pred = calibrator.predict(X_val)
    for name, scorer in scorers.items():
        metrics[name].append(scorer(y_val, y_pred))


# Compute mean and standard deviation
print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")

# One display label per metric key, shared by the summary rows and the
# per-fold rows (str.capitalize() would spell 'mcc' as 'Mcc' in one place while
# the fold rows use 'MCC').
metric_labels = {
    'accuracy': 'Accuracy',
    'f1': 'F1',
    'recall': 'Recall',
    'precision': 'Precision',
    'mcc': 'MCC',
}

# Rows belonging to THIS model only.  The CSV is built from these rows rather
# than from the shared results_data list, which also holds rows appended by
# other model cells (or by earlier runs of this cell) with nothing in the row
# that identifies the model.
model_rows = []
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)
    label = metric_labels.get(name, name.capitalize())
    print(f"{label:<10}: {mean_val:.4f} ± {std_val:.4f}")
    model_rows.append({
        'Fold': 'Mean ± Std',
        'Metric': label,
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# Per-fold values, grouped by fold, metrics in the order of metric_labels
for i in range(n_splits):
    for name, label in metric_labels.items():
        model_rows.append({
            'Fold': f"Fold {i+1}",
            'Metric': label,
            'Mean': metrics[name][i],
            'Std': ''
        })

# Keep feeding the shared list for any code that reads it afterwards
results_data.extend(model_rows)

# CSV
result_df = pd.DataFrame(model_rows)
csv_save_path = os.path.join(custom_results_dir, "text", "SVM_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# XGBoost: load the table and split it into features/target
df = pd.read_csv(data_path, encoding=encoding_fmt)

# Feature frame and target vector
X = df.drop(columns=["TMT"])
y = df["TMT"].values

# Name fragments that mark a column as categorical
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]

# A column is treated as categorical when its name CONTAINS one of the
# fragments above (substring match); every other column is continuous.
categorical_features = []
continuous_features = []
for column_name in X.columns:
    if any(fragment in column_name for fragment in categorical_cols):
        categorical_features.append(column_name)
for column_name in X.columns:
    if column_name not in categorical_features:
        continuous_features.append(column_name)

# Report the feature inventory
print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
for heading, names in (
    ("\ncategorical features：", categorical_features),
    ("\ncontinuous features：", continuous_features),
):
    print(heading)
    for feat in names:
        print(f"- {feat}")

# Write a copy of the dataset used by this model (features followed by the
# target column).  No scaling or encoding is applied in this cell.
target_series = pd.Series(y, name="TMT")
processed_df = pd.concat([X, target_series], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_XGB)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view of the features plus bookkeeping used by the GA
feature_names = X.columns.tolist()
X_np = X.values
num_features = X_np.shape[1]

# XGBoost
def evaluate(individual):
    """Score one GA individual with an XGBoost hold-out accuracy.

    ``individual`` is a sequence of 0/1 flags, one per column of ``X_np``;
    columns flagged with 1 are kept.  The kept columns are split 80/20
    (fixed ``random_state=42``), an ``xgb.XGBClassifier`` is fitted on the
    larger part and its accuracy on the smaller part is returned.

    Returns a one-element tuple ``(accuracy,)``; an individual that selects
    no column at all gets ``(0.0,)``.
    """
    # Nothing selected: there is no model to fit, so return the worst score.
    if not sum(individual):
        return (0.0,)

    picked = []
    for column_index, flag in enumerate(individual):
        if flag == 1:
            picked.append(column_index)

    reduced = X_np[:, picked]
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        reduced, y, test_size=0.2, random_state=42
    )

    booster = xgb.XGBClassifier(random_state=seed_ML)
    booster.fit(fit_X, fit_y)
    accuracy = booster.score(holdout_X, holdout_y)
    return (accuracy,)


# ---- Genetic-algorithm feature selection (XGBoost) --------------------------
# DEAP stores classes made by creator.create() in the creator module itself and
# emits a RuntimeWarning when a name is created twice (re-running this cell, or
# running it after another model cell).  Create the classes only when missing.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Individuals are lists of num_features random 0/1 flags
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Genetic operators
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG (the one the operators above draw from) before the
# population is created so that the run is repeatable.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# A stray full-width ")" that used to follow the eaSimple call made this cell
# a SyntaxError; it has been removed.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best bit mask back into column names
best_individual = hof[0]
selected_features = [name for name, bit in zip(feature_names, best_individual) if bit == 1]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selected feature names
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_XGB.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")


# Perform hyperparameter tuning via Grid Search
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
# Tuning uses the 80% training part only; X_test / y_test are created here but
# are not used anywhere in this block.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
}

# Seeded, shuffled stratified K-fold splitter.  It is now actually handed to
# GridSearchCV: previously the splitter was built but cv=3 was passed, so the
# tuning ran a plain 3-fold split instead of the intended n_splits-fold one.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)
grid_search = GridSearchCV(
    xgb.XGBClassifier(random_state=seed_ML),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

# Cross-validate the best configuration (with sigmoid calibration) and collect
# the per-fold metrics.
best_params = grid_search.best_params_

# Metric key -> scoring function; insertion order is the reporting order.
scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {name: [] for name in scorers}

# With a fixed random_state the splitter yields the same folds on every call,
# so the same object can be reused for the evaluation loop.
for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Train uncalibrated model
    clf = xgb.XGBClassifier(**best_params, random_state=seed_ML, n_jobs=1)
    clf.fit(X_tr, y_tr)

    # Calibrate model
    # NOTE(review): the calibrator is fitted on the same rows the model was
    # trained on; scikit-learn asks for disjoint data with cv='prefit'.
    # Left unchanged to keep results comparable - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    # Validate and compute metrics
    y_pred = calibrator.predict(X_val)
    for name, scorer in scorers.items():
        metrics[name].append(scorer(y_val, y_pred))


# Compute mean and standard deviation
print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")

# One display label per metric key, shared by the summary rows and the
# per-fold rows (str.capitalize() would spell 'mcc' as 'Mcc' in one place while
# the fold rows use 'MCC').
metric_labels = {
    'accuracy': 'Accuracy',
    'f1': 'F1',
    'recall': 'Recall',
    'precision': 'Precision',
    'mcc': 'MCC',
}

# Rows belonging to THIS model only.  The CSV is built from these rows rather
# than from the shared results_data list, which also holds rows appended by
# other model cells (or by earlier runs of this cell) with nothing in the row
# that identifies the model.
model_rows = []
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)
    label = metric_labels.get(name, name.capitalize())
    print(f"{label:<10}: {mean_val:.4f} ± {std_val:.4f}")
    model_rows.append({
        'Fold': 'Mean ± Std',
        'Metric': label,
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# Per-fold values, grouped by fold, metrics in the order of metric_labels
for i in range(n_splits):
    for name, label in metric_labels.items():
        model_rows.append({
            'Fold': f"Fold {i+1}",
            'Metric': label,
            'Mean': metrics[name][i],
            'Std': ''
        })

# Keep feeding the shared list for any code that reads it afterwards
results_data.extend(model_rows)

# CSV
result_df = pd.DataFrame(model_rows)
csv_save_path = os.path.join(custom_results_dir, "text", "XGB_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")



In [None]:
# AdaBoost: load the table and split it into features/target
df = pd.read_csv(data_path, encoding=encoding_fmt)

# Feature frame and target vector
X = df.drop(columns=["TMT"])
y = df["TMT"].values

# Name fragments that mark a column as categorical
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]

# A column is treated as categorical when its name CONTAINS one of the
# fragments above (substring match); every other column is continuous.
categorical_features = []
continuous_features = []
for column_name in X.columns:
    if any(fragment in column_name for fragment in categorical_cols):
        categorical_features.append(column_name)
for column_name in X.columns:
    if column_name not in categorical_features:
        continuous_features.append(column_name)

# Report the feature inventory
print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
for heading, names in (
    ("\ncategorical features：", categorical_features),
    ("\ncontinuous features：", continuous_features),
):
    print(heading)
    for feat in names:
        print(f"- {feat}")

# Write a copy of the dataset used by this model (features followed by the
# target column).  No scaling or encoding is applied in this cell.
target_series = pd.Series(y, name="TMT")
processed_df = pd.concat([X, target_series], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_Ada )
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view of the features plus bookkeeping used by the GA
feature_names = X.columns.tolist()
X_np = X.values
num_features = X_np.shape[1]


def evaluate(individual):
    """GA fitness: hold-out accuracy of AdaBoost on the selected columns.

    ``individual`` is a sequence of 0/1 genes indexing the columns of
    ``X_np``. The result is a one-element tuple; an individual that selects
    no column scores ``0.0`` without training a model.
    """
    if sum(individual) == 0:
        return (0.0,)
    chosen_cols = [pos for pos, gene in enumerate(individual) if gene == 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X_np[:, chosen_cols], y, test_size=0.2, random_state=42
    )
    model = AdaBoostClassifier(algorithm='SAMME', random_state=seed_ML)
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    return (accuracy,)


# Create the DEAP classes only when they do not exist yet. Every model cell
# issues the same creator.create() calls; repeating them for an existing name
# makes DEAP emit a RuntimeWarning and overwrite the class.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Genome: num_features random 0/1 genes; a 1 keeps the matching column.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG right before the population is drawn, so the GA's random
# draws start from the same state on every run.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# Echo the GA configuration next to the evolution log.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best genome back into column names.
best_individual = hof[0]
selected_features = [
    name for name, gene in zip(feature_names, best_individual) if gene == 1
]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selection; the "text" subfolder is created on demand.
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_Ada.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")


# AdaBoost: hyperparameter search, then stratified K-fold scoring of the
# calibrated model on the GA-selected features.
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME'],
}

# Splitter for the evaluation loop below.
# NOTE(review): the grid search is given cv=3, so it does not use this
# splitter - confirm that is intended.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)

grid_search = GridSearchCV(
    AdaBoostClassifier(random_state=seed_ML),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# One list of per-fold scores per metric, filled in this key order.
fold_scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {metric_name: [] for metric_name in fold_scorers}

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    clf = AdaBoostClassifier(**best_params, random_state=seed_ML)
    clf.fit(X_tr, y_tr)

    # Sigmoid calibration around the already-fitted model.
    # NOTE(review): the calibrator is fitted on the same fold the model was
    # trained on - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    y_pred = calibrator.predict(X_val)
    for metric_name, scorer in fold_scorers.items():
        metrics[metric_name].append(scorer(y_val, y_pred))


# Start from an empty list so the CSV written below holds only AdaBoost rows.
# Without this reset, results_data carried over whatever earlier cells had
# appended, and the cell raised NameError when run on its own.
results_data = []

print("🎯 5-Fold Compute mean and standard deviation：")
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)  # numpy default: population std (ddof=0)
    print(f"{name.capitalize():<10}: {mean_val:.4f} ± {std_val:.4f}")

    # Summary row; values are stored as 4-decimal strings.
    # NOTE(review): capitalize() yields 'Mcc' here while the per-fold rows
    # below are labelled 'MCC'.
    results_data.append({
        'Fold': 'Mean ± Std',
        'Metric': name.capitalize(),
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# (metrics key, CSV label) pairs in the order the per-fold rows are written.
fold_metric_labels = (
    ('accuracy', 'Accuracy'),
    ('f1', 'F1'),
    ('recall', 'Recall'),
    ('precision', 'Precision'),
    ('mcc', 'MCC'),
)
for i in range(n_splits):
    for metric_key, metric_label in fold_metric_labels:
        results_data.append({
            'Fold': f"Fold {i+1}",
            'Metric': metric_label,
            'Mean': metrics[metric_key][i],
            'Std': ''
        })

# Save summary and per-fold rows to CSV.
result_df = pd.DataFrame(results_data)
csv_save_path = os.path.join(custom_results_dir, "text", "Ada_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# Decision Tree: load the dataset, report the feature groups and save a copy.
df = pd.read_csv(data_path, encoding=encoding_fmt)

# "TMT" is the target column; every other column is a candidate feature.
X = df.drop(columns=["TMT"])
y = df["TMT"].values

# Names of the categorical variables. A column is treated as categorical when
# any of these names occurs as a substring of the column name.
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]

categorical_features = [
    col
    for col in X.columns
    if any(cat_col in col for cat_col in categorical_cols)
]
continuous_features = [col for col in X.columns if col not in categorical_features]

print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
for section_title, section_feats in (
    ("\ncategorical features：", categorical_features),
    ("\ncontinuous features：", continuous_features),
):
    print(section_title)
    for feat in section_feats:
        print(f"- {feat}")

# Features plus target, written next to the other outputs of this run.
# NOTE: nothing is scaled in this cell; the "Standardized" wording of the
# message below is kept unchanged.
processed_data_path = os.path.join(custom_results_dir, data_source_DT)
processed_df = pd.concat([X, pd.Series(y, name="TMT")], axis=1)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view and column names used by the GA fitness function.
feature_names = X.columns.tolist()
X_np = X.values
num_features = X_np.shape[1]

# Decision Tree 
def evaluate(individual):
    """GA fitness: hold-out accuracy of a decision tree on the selected columns.

    ``individual`` is a sequence of 0/1 genes indexing the columns of
    ``X_np``. The result is a one-element tuple; an individual that selects
    no column scores ``0.0`` without training a model.
    """
    if sum(individual) == 0:
        return (0.0,)
    chosen_cols = [pos for pos, gene in enumerate(individual) if gene == 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X_np[:, chosen_cols], y, test_size=0.2, random_state=42
    )
    model = DecisionTreeClassifier(random_state=seed_ML)
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    return (accuracy,)


# Create the DEAP classes only when they do not exist yet. Every model cell
# issues the same creator.create() calls; repeating them for an existing name
# makes DEAP emit a RuntimeWarning and overwrite the class.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Genome: num_features random 0/1 genes; a 1 keeps the matching column.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG right before the population is drawn, so the GA's random
# draws start from the same state on every run.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# Echo the GA configuration next to the evolution log.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best genome back into column names.
best_individual = hof[0]
selected_features = [
    name for name, gene in zip(feature_names, best_individual) if gene == 1
]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selection; the "text" subfolder is created on demand.
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_DT.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")


# Decision Tree: hyperparameter search, then stratified K-fold scoring of the
# calibrated model on the GA-selected features.
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

# Splitter for the evaluation loop below.
# NOTE(review): the grid search is given cv=3, so it does not use this
# splitter - confirm that is intended.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)

grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=seed_ML),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# One list of per-fold scores per metric, filled in this key order.
fold_scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {metric_name: [] for metric_name in fold_scorers}

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    clf = DecisionTreeClassifier(**best_params, random_state=seed_ML)
    clf.fit(X_tr, y_tr)

    # Sigmoid calibration around the already-fitted model.
    # NOTE(review): the calibrator is fitted on the same fold the model was
    # trained on - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    y_pred = calibrator.predict(X_val)
    for metric_name, scorer in fold_scorers.items():
        metrics[metric_name].append(scorer(y_val, y_pred))


# Start from an empty list so the CSV written below holds only Decision Tree
# rows. Without this reset, results_data carried over whatever earlier cells
# had appended, and the cell raised NameError when run on its own.
results_data = []

print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)  # numpy default: population std (ddof=0)
    print(f"{name.capitalize():<10}: {mean_val:.4f} ± {std_val:.4f}")

    # Summary row; values are stored as 4-decimal strings.
    # NOTE(review): capitalize() yields 'Mcc' here while the per-fold rows
    # below are labelled 'MCC'.
    results_data.append({
        'Fold': 'Mean ± Std',
        'Metric': name.capitalize(),
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# (metrics key, CSV label) pairs in the order the per-fold rows are written.
fold_metric_labels = (
    ('accuracy', 'Accuracy'),
    ('f1', 'F1'),
    ('recall', 'Recall'),
    ('precision', 'Precision'),
    ('mcc', 'MCC'),
)
for i in range(n_splits):
    for metric_key, metric_label in fold_metric_labels:
        results_data.append({
            'Fold': f"Fold {i+1}",
            'Metric': metric_label,
            'Mean': metrics[metric_key][i],
            'Std': ''
        })

# Save summary and per-fold rows to CSV.
result_df = pd.DataFrame(results_data)
csv_save_path = os.path.join(custom_results_dir, "text", "DT_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# KNN: load the dataset, one-hot encode the categorical columns, z-score the
# remaining ones, then report the feature groups and save a copy.
df = pd.read_csv(data_path, encoding=encoding_fmt)

# "TMT" is the target column; every other column is a candidate feature.
y = df["TMT"].values
X = df.drop(columns=["TMT"])

# Replace each categorical column that is present by its dummy columns
# (first level dropped, values cast to int); the dummies are appended to X.
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]
for col in categorical_cols:
    if col in X.columns:
        dummies = pd.get_dummies(X[col], prefix=col, drop_first=True).astype(int)
        X = X.drop(col, axis=1)
        X = pd.concat([X, dummies], axis=1)

# Columns whose name contains none of the categorical names are scaled.
continuous_cols = [col for col in X.columns if not any(cat_col in col for cat_col in categorical_cols)]

# NOTE(review): the scaler is fitted on the full dataset, before any
# train/validation split is made further down.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[continuous_cols] = scaler.fit_transform(X[continuous_cols])

# Create the "models" subfolder before saving: nothing in this cell creates
# it, so joblib.dump failed whenever the folder was missing.
scaler_path = os.path.join(custom_results_dir, "models", "standard_scaler.pkl")
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)

# Same substring rule as above, now applied to the encoded column names.
categorical_features = [col for col in X.columns if any(cat_col in col for cat_col in categorical_cols)]
continuous_features = [col for col in X.columns if col not in categorical_features]

print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
print("\ncategorical features：")
for feat in categorical_features:
    print(f"- {feat}")
print("\ncontinuous features：")
for feat in continuous_features:
    print(f"- {feat}")

# Encoded and scaled features plus target, saved for reference.
processed_df = pd.concat([X, pd.Series(y, name="TMT")], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_KNN)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view and column names used by the GA fitness function.
X_np = X.values
feature_names = X.columns.tolist()
num_features = X_np.shape[1]

#KNN
def evaluate(individual):
    """GA fitness: hold-out accuracy of default KNN on the selected columns.

    ``individual`` is a sequence of 0/1 genes indexing the columns of
    ``X_np``. The result is a one-element tuple; an individual that selects
    no column scores ``0.0`` without training a model.
    """
    if sum(individual) == 0:
        return (0.0,)
    chosen_cols = [pos for pos, gene in enumerate(individual) if gene == 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X_np[:, chosen_cols], y, test_size=0.2, random_state=42
    )
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    return (accuracy,)


# Create the DEAP classes only when they do not exist yet. Every model cell
# issues the same creator.create() calls; repeating them for an existing name
# makes DEAP emit a RuntimeWarning and overwrite the class.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Genome: num_features random 0/1 genes; a 1 keeps the matching column.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG right before the population is drawn, so the GA's random
# draws start from the same state on every run.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# Echo the GA configuration next to the evolution log.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best genome back into column names.
best_individual = hof[0]
selected_features = [
    name for name, gene in zip(feature_names, best_individual) if gene == 1
]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selection; the "text" subfolder is created on demand.
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_KNN.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")


# KNN: hyperparameter search, then stratified K-fold scoring of the
# calibrated model on the GA-selected features.
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Splitter for the evaluation loop below.
# NOTE(review): the grid search is given cv=3, so it does not use this
# splitter - confirm that is intended.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# One list of per-fold scores per metric, filled in this key order.
fold_scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {metric_name: [] for metric_name in fold_scorers}

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    clf = KNeighborsClassifier(**best_params)
    clf.fit(X_tr, y_tr)

    # Sigmoid calibration around the already-fitted model.
    # NOTE(review): the calibrator is fitted on the same fold the model was
    # trained on - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    y_pred = calibrator.predict(X_val)
    for metric_name, scorer in fold_scorers.items():
        metrics[metric_name].append(scorer(y_val, y_pred))


# Start from an empty list so the CSV written below holds only KNN rows.
# Without this reset, results_data carried over whatever earlier cells had
# appended, and the cell raised NameError when run on its own.
results_data = []

print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)  # numpy default: population std (ddof=0)
    print(f"{name.capitalize():<10}: {mean_val:.4f} ± {std_val:.4f}")

    # Summary row; values are stored as 4-decimal strings.
    # NOTE(review): capitalize() yields 'Mcc' here while the per-fold rows
    # below are labelled 'MCC'.
    results_data.append({
        'Fold': 'Mean ± Std',
        'Metric': name.capitalize(),
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# (metrics key, CSV label) pairs in the order the per-fold rows are written.
fold_metric_labels = (
    ('accuracy', 'Accuracy'),
    ('f1', 'F1'),
    ('recall', 'Recall'),
    ('precision', 'Precision'),
    ('mcc', 'MCC'),
)
for i in range(n_splits):
    for metric_key, metric_label in fold_metric_labels:
        results_data.append({
            'Fold': f"Fold {i+1}",
            'Metric': metric_label,
            'Mean': metrics[metric_key][i],
            'Std': ''
        })

# Save summary and per-fold rows to CSV.
result_df = pd.DataFrame(results_data)
csv_save_path = os.path.join(custom_results_dir, "text", "KNN_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# Naive Bayes: load the dataset, report the feature groups and save a copy.
df = pd.read_csv(data_path, encoding=encoding_fmt)

# "TMT" is the target column; every other column is a candidate feature.
X = df.drop(columns=["TMT"])
y = df["TMT"].values

# Names of the categorical variables. A column is treated as categorical when
# any of these names occurs as a substring of the column name.
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]

categorical_features = [
    col
    for col in X.columns
    if any(cat_col in col for cat_col in categorical_cols)
]
continuous_features = [col for col in X.columns if col not in categorical_features]

print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
for section_title, section_feats in (
    ("\ncategorical features：", categorical_features),
    ("\ncontinuous features：", continuous_features),
):
    print(section_title)
    for feat in section_feats:
        print(f"- {feat}")

# Features plus target, written next to the other outputs of this run.
# NOTE: nothing is scaled in this cell; the "Standardized" wording of the
# message below is kept unchanged.
processed_data_path = os.path.join(custom_results_dir, data_source_NB)
processed_df = pd.concat([X, pd.Series(y, name="TMT")], axis=1)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view and column names used by the GA fitness function.
feature_names = X.columns.tolist()
X_np = X.values
num_features = X_np.shape[1]


def evaluate(individual):
    """GA fitness: hold-out accuracy of Gaussian NB on the selected columns.

    ``individual`` is a sequence of 0/1 genes indexing the columns of
    ``X_np``. The result is a one-element tuple; an individual that selects
    no column scores ``0.0`` without training a model.
    """
    if sum(individual) == 0:
        return (0.0,)
    chosen_cols = [pos for pos, gene in enumerate(individual) if gene == 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X_np[:, chosen_cols], y, test_size=0.2, random_state=42
    )
    model = GaussianNB()
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    return (accuracy,)


# Create the DEAP classes only when they do not exist yet. Every model cell
# issues the same creator.create() calls; repeating them for an existing name
# makes DEAP emit a RuntimeWarning and overwrite the class.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Genome: num_features random 0/1 genes; a 1 keeps the matching column.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG right before the population is drawn, so the GA's random
# draws start from the same state on every run.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# Echo the GA configuration next to the evolution log.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best genome back into column names.
best_individual = hof[0]
selected_features = [
    name for name, gene in zip(feature_names, best_individual) if gene == 1
]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selection; the "text" subfolder is created on demand.
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_NB.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")

# Naive Bayes: no hyperparameter search; fit once on the hold-out split, then
# score the calibrated model with stratified K-fold on the selected features.
print("\nNB ...")
X_selected = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Model fitted on the 80% split; it is not used by the fold loop below.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)

# One list of per-fold scores per metric, filled in this key order.
fold_scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {metric_name: [] for metric_name in fold_scorers}

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    clf = GaussianNB()
    clf.fit(X_tr, y_tr)

    # Sigmoid calibration around the already-fitted model.
    # NOTE(review): the calibrator is fitted on the same fold the model was
    # trained on - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    y_pred = calibrator.predict(X_val)
    for metric_name, scorer in fold_scorers.items():
        metrics[metric_name].append(scorer(y_val, y_pred))


# Start from an empty list so the CSV written below holds only Naive Bayes
# rows. Without this reset, results_data carried over whatever earlier cells
# had appended, and the cell raised NameError when run on its own.
results_data = []

print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)  # numpy default: population std (ddof=0)
    print(f"{name.capitalize():<10}: {mean_val:.4f} ± {std_val:.4f}")

    # Summary row; values are stored as 4-decimal strings.
    # NOTE(review): capitalize() yields 'Mcc' here while the per-fold rows
    # below are labelled 'MCC'.
    results_data.append({
        'Fold': 'Mean ± Std',
        'Metric': name.capitalize(),
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# (metrics key, CSV label) pairs in the order the per-fold rows are written.
fold_metric_labels = (
    ('accuracy', 'Accuracy'),
    ('f1', 'F1'),
    ('recall', 'Recall'),
    ('precision', 'Precision'),
    ('mcc', 'MCC'),
)
for i in range(n_splits):
    for metric_key, metric_label in fold_metric_labels:
        results_data.append({
            'Fold': f"Fold {i+1}",
            'Metric': metric_label,
            'Mean': metrics[metric_key][i],
            'Std': ''
        })

# Save summary and per-fold rows to CSV.
result_df = pd.DataFrame(results_data)
csv_save_path = os.path.join(custom_results_dir, "text", "NB_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")


In [None]:
# MLP: load the dataset, one-hot encode the categorical columns, z-score the
# remaining ones, then report the feature groups and save a copy.
df = pd.read_csv(data_path, encoding=encoding_fmt)

# "TMT" is the target column; every other column is a candidate feature.
y = df["TMT"].values
X = df.drop(columns=["TMT"])

# Replace each categorical column that is present by its dummy columns
# (first level dropped, values cast to int); the dummies are appended to X.
categorical_cols = ["GEN", "Edu", "Smoke", "Drink", "S-HLTH", "S-HAP", "EX_TYPE", "O/C"]
for col in categorical_cols:
    if col in X.columns:
        dummies = pd.get_dummies(X[col], prefix=col, drop_first=True).astype(int)
        X = X.drop(col, axis=1)
        X = pd.concat([X, dummies], axis=1)

# Columns whose name contains none of the categorical names are scaled.
continuous_cols = [col for col in X.columns if not any(cat_col in col for cat_col in categorical_cols)]

# NOTE(review): the scaler is fitted on the full dataset, before any
# train/validation split is made further down.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[continuous_cols] = scaler.fit_transform(X[continuous_cols])

# Create the "models" subfolder before saving: nothing in this cell creates
# it, so joblib.dump failed whenever the folder was missing.
# NOTE(review): the KNN cell writes to this same file name.
scaler_path = os.path.join(custom_results_dir, "models", "standard_scaler.pkl")
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)

# Same substring rule as above, now applied to the encoded column names.
categorical_features = [col for col in X.columns if any(cat_col in col for cat_col in categorical_cols)]
continuous_features = [col for col in X.columns if col not in categorical_features]

print(f"\nTotal number of features：{len(X.columns)}")
print(f"Number of categorical features：{len(categorical_features)}")
print(f"Number of continuous features：{len(continuous_features)}")
print("\ncategorical features：")
for feat in categorical_features:
    print(f"- {feat}")
print("\ncontinuous features：")
for feat in continuous_features:
    print(f"- {feat}")

# Encoded and scaled features plus target, saved for reference.
processed_df = pd.concat([X, pd.Series(y, name="TMT")], axis=1)
processed_data_path = os.path.join(custom_results_dir, data_source_MLP)
processed_df.to_csv(processed_data_path, index=False, encoding="utf-8")
print(f"\n✅ Standardized dataset saved to：{processed_data_path}")

# Array view and column names used by the GA fitness function.
X_np = X.values
feature_names = X.columns.tolist()
num_features = X_np.shape[1]

def evaluate(individual):
    """GA fitness: hold-out accuracy of an MLP on the selected columns.

    ``individual`` is a sequence of 0/1 genes indexing the columns of
    ``X_np``. The result is a one-element tuple; an individual that selects
    no column scores ``0.0`` without training a model.
    """
    if sum(individual) == 0:
        return (0.0,)
    chosen_cols = [pos for pos, gene in enumerate(individual) if gene == 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X_np[:, chosen_cols], y, test_size=0.2, random_state=42
    )
    # Fixed network settings used for every fitness evaluation.
    mlp_settings = {
        'random_state': seed_ML,
        'max_iter': 3000,
        'learning_rate_init': 0.01,
        'tol': 1e-4,
    }
    model = MLPClassifier(**mlp_settings)
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    return (accuracy,)


# Create the DEAP classes only when they do not exist yet. Every model cell
# issues the same creator.create() calls; repeating them for an existing name
# makes DEAP emit a RuntimeWarning and overwrite the class.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Genome: num_features random 0/1 genes; a 1 keeps the matching column.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Seed Python's RNG right before the population is drawn, so the GA's random
# draws start from the same state on every run.
random.seed(SEED)

pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the single best individual seen
algorithms.eaSimple(pop, toolbox, cxpb=CX_PB, mutpb=MUT_PB, ngen=GENS, halloffame=hof, verbose=True)

# A stray full-width ")" used to sit on its own line here; it was a
# SyntaxError that kept the whole cell from running.
# Echo the GA configuration next to the evolution log.
print("\n🔧 GA :\n")
print(f" (Population Size): {POP_SIZE}")
print(f" (Generations): {GENS}")
print(f" (Crossover Probability): {CX_PB}")
print(f" (Mutation Probability): {MUT_PB}")
print(f" (Random Seed): {SEED}")
print(" (Selection): Tournament (tournsize=3)")
print(" (Crossover): Two-point crossover")
print(" (Mutation): Flip bit (indpb=0.05)")

# Translate the best genome back into column names.
best_individual = hof[0]
selected_features = [
    name for name, gene in zip(feature_names, best_individual) if gene == 1
]
print("\n✅ Selected features:")
for feat in selected_features:
    print(f"- {feat}")

# Persist the selection; the "text" subfolder is created on demand.
selected_features_df = pd.DataFrame({'Selected_Features': selected_features})
selected_features_path = os.path.join(custom_results_dir, "text", "Selected_Features_MLP.csv")
os.makedirs(os.path.dirname(selected_features_path), exist_ok=True)
selected_features_df.to_csv(selected_features_path, index=False, encoding="utf-8")
print(f"\n✅ Selected features saved to：{selected_features_path}")


# MLP: two hyperparameter searches, then stratified K-fold scoring of the
# calibrated model on the GA-selected features.
print("\n🔍 Perform hyperparameter tuning via Grid Search...")
X_selected = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [3000, 5000],
    'tol': [1e-4, 1e-3],
}

# First search (cv=3): only its printed report survives.
# NOTE(review): grid_search is overwritten by the second search below, which
# repeats the full grid - confirm both runs are needed.
grid_search = GridSearchCV(
    MLPClassifier(random_state=seed_ML, max_iter=5000),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("\n✅ Optimal parameter set:")
print(grid_search.best_params_)
print(f"Best accuracy: {grid_search.best_score_:.4f}")

# Second search, on the stratified splitter; its best parameters are the ones
# used by the fold loop.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)
grid_search = GridSearchCV(
    MLPClassifier(
        random_state=seed_ML,
        max_iter=3000,
        learning_rate_init=0.01,
        tol=1e-4,
    ),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Fresh splitter with the same settings for the evaluation loop.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_CV)

# One list of per-fold scores per metric, filled in this key order.
fold_scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'recall': recall_score,
    'precision': precision_score,
    'mcc': matthews_corrcoef,
}
metrics = {metric_name: [] for metric_name in fold_scorers}

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    clf = MLPClassifier(**best_params, random_state=seed_ML)
    clf.fit(X_tr, y_tr)

    # Sigmoid calibration around the already-fitted model.
    # NOTE(review): the calibrator is fitted on the same fold the model was
    # trained on - confirm this is intended.
    calibrator = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    calibrator.fit(X_tr, y_tr)

    y_pred = calibrator.predict(X_val)
    for metric_name, scorer in fold_scorers.items():
        metrics[metric_name].append(scorer(y_val, y_pred))


# Start from an empty list so the CSV written below holds only MLP rows.
# Without this reset, results_data carried over whatever earlier cells had
# appended, and the cell raised NameError when run on its own.
results_data = []

print("🎯 5-Fold Mean and standard deviation of calibrated model metrics：")
for name, vals in metrics.items():
    mean_val = np.mean(vals)
    std_val = np.std(vals)  # numpy default: population std (ddof=0)
    print(f"{name.capitalize():<10}: {mean_val:.4f} ± {std_val:.4f}")

    # Summary row; values are stored as 4-decimal strings.
    # NOTE(review): capitalize() yields 'Mcc' here while the per-fold rows
    # below are labelled 'MCC'.
    results_data.append({
        'Fold': 'Mean ± Std',
        'Metric': name.capitalize(),
        'Mean': f"{mean_val:.4f}",
        'Std': f"{std_val:.4f}"
    })

# (metrics key, CSV label) pairs in the order the per-fold rows are written.
fold_metric_labels = (
    ('accuracy', 'Accuracy'),
    ('f1', 'F1'),
    ('recall', 'Recall'),
    ('precision', 'Precision'),
    ('mcc', 'MCC'),
)
for i in range(n_splits):
    for metric_key, metric_label in fold_metric_labels:
        results_data.append({
            'Fold': f"Fold {i+1}",
            'Metric': metric_label,
            'Mean': metrics[metric_key][i],
            'Std': ''
        })

# Save summary and per-fold rows to CSV.
result_df = pd.DataFrame(results_data)
csv_save_path = os.path.join(custom_results_dir, "text", "MLP_Fold_Calibration_Results.csv")
result_df.to_csv(csv_save_path, index=False, encoding="utf-8")
print(f"✅ Results saved to：{csv_save_path}")
