In [3]:
# Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (confusion_matrix, roc_auc_score, average_precision_score, accuracy_score,
                             precision_score, recall_score, f1_score)
from catboost import CatBoostClassifier
import openpyxl

# Repeated hold-out evaluation of a CatBoost classifier.
#
# For each of `n_runs` random stratified 80/20 splits the script
#   1. runs 5-fold stratified cross-validation on the training part, and
#   2. refits on the whole training part and scores the held-out test part.
# Per-run CV and test metrics plus a summary over all runs are written to one
# Excel workbook, one sheet per table.

# Loading the preprocessed data
data = pd.read_csv('DPPIV descriptors.csv')
X = data.drop(columns=['label'])
y = data['label']

# Number of runs
n_runs = 10

# Hyperparameters for the CatBoost classifier
# NOTE(review): CatBoost prints one line per boosting iteration by default
# (see the cell output); adding 'verbose': False here would silence it. Left
# unchanged so the console output stays as before.
user_params = {
    'depth': 5,
    'learning_rate': 0.1,
    'l2_leaf_reg': 2,
}

# Column order shared by every metrics table written to the workbook.
METRIC_COLUMNS = [
    "TPR", "FPR", "Accuracy", "Precision", "Recall", "Sensitivity", "Specificity", "AUC-ROC", "AUC-PR", "F1",
    "TN", "FP", "FN", "TP",
]


def _compute_metrics(y_true, y_pred, y_proba):
    """Return the evaluation metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels.
        y_proba: Predicted scores for the positive class (used for the
            ranking metrics AUC-ROC and AUC-PR).

    Returns:
        A list of values ordered exactly like ``METRIC_COLUMNS``.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    sensitivity = recall  # Same as recall; kept as its own output column
    specificity = tn / (tn + fp)
    auc_roc = roc_auc_score(y_true, y_proba)
    auc_pr = average_precision_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    return [tpr, fpr, accuracy, precision, recall, sensitivity, specificity, auc_roc, auc_pr, f1, tn, fp, fn, tp]


# Metrics storage
summary_results = []

# Sheets are collected in memory (insertion order = sheet order) and written
# only after every run has finished, so a failure part-way through neither
# leaks an open writer nor truncates an existing results workbook.
sheets = {}

for run in range(1, n_runs + 1):
    print(f"Run {run}/{n_runs}")

    # Train-test split; random_state=run gives a different split on every run
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=run)

    # Cross-validation on the training part
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=run)
    cv_results = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fit the scaler on the fold's training rows only, so validation rows
        # never influence the preprocessing (no leakage into the CV estimate).
        fold_scaler = MinMaxScaler(feature_range=(-1, 1))
        X_fold_train_scaled = fold_scaler.fit_transform(X_fold_train)
        X_fold_val_scaled = fold_scaler.transform(X_fold_val)

        # A fresh estimator per fold keeps the folds independent of each other
        fold_model = CatBoostClassifier(**user_params)
        fold_model.fit(X_fold_train_scaled, y_fold_train)

        # Predict
        y_fold_pred = fold_model.predict(X_fold_val_scaled)
        y_fold_proba = fold_model.predict_proba(X_fold_val_scaled)[:, 1]

        # Evaluation metrics
        fold_metrics = _compute_metrics(y_fold_val, y_fold_pred, y_fold_proba)
        cv_results.append(fold_metrics)
        tpr, fpr, accuracy = fold_metrics[:3]
        print(f"Run {run} - Fold {fold}: TPR={tpr}, FPR={fpr}, Accuracy={accuracy}")

    # Keep cross-validation results for this run
    cv_results_df = pd.DataFrame(cv_results, columns=METRIC_COLUMNS)
    sheets[f"Run_{run}_CV"] = cv_results_df

    # Model testing: scale with statistics of the full training part, refit on
    # it, and score the held-out test part.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = CatBoostClassifier(**user_params)
    model.fit(X_train_scaled, y_train)
    y_test_pred = model.predict(X_test_scaled)
    y_test_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Evaluation metrics for testing
    test_results = _compute_metrics(y_test, y_test_pred, y_test_proba)
    summary_results.append([run] + test_results)

    # Keep test results for this run
    test_results_df = pd.DataFrame([test_results], columns=METRIC_COLUMNS)
    sheets[f"Run_{run}_Test"] = test_results_df

# Summary of the test metrics over all runs
summary_df = pd.DataFrame(summary_results, columns=["Run"] + METRIC_COLUMNS)
sheets["Test_Summary"] = summary_df

# Write the workbook; the context manager saves and closes the file even if
# writing one of the sheets fails.
with pd.ExcelWriter("seqDPPIV Evaluation Metrics.xlsx", engine="openpyxl") as excel_writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(excel_writer, sheet_name=sheet_name, index=False)
print("Evaluation metrics saved")


Run 1/10
0:	learn: 0.6069599	total: 474us	remaining: 474ms
1:	learn: 0.5519896	total: 794us	remaining: 396ms
2:	learn: 0.5024118	total: 1.39ms	remaining: 461ms
3:	learn: 0.4684757	total: 1.8ms	remaining: 447ms
4:	learn: 0.4438028	total: 2.13ms	remaining: 424ms
5:	learn: 0.4169670	total: 2.46ms	remaining: 407ms
6:	learn: 0.4009970	total: 2.76ms	remaining: 392ms
7:	learn: 0.3841784	total: 3.17ms	remaining: 393ms
8:	learn: 0.3712521	total: 3.58ms	remaining: 394ms
9:	learn: 0.3614407	total: 3.88ms	remaining: 384ms
10:	learn: 0.3500785	total: 4.31ms	remaining: 387ms
11:	learn: 0.3422866	total: 4.67ms	remaining: 385ms
12:	learn: 0.3345424	total: 4.97ms	remaining: 378ms
13:	learn: 0.3252493	total: 5.27ms	remaining: 371ms
14:	learn: 0.3148694	total: 5.57ms	remaining: 366ms
15:	learn: 0.3102246	total: 5.86ms	remaining: 360ms
16:	learn: 0.3041478	total: 6.34ms	remaining: 367ms
17:	learn: 0.2993151	total: 6.73ms	remaining: 367ms
18:	learn: 0.2949953	total: 7.2ms	remaining: 372ms
19:	learn: 0.2911