In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, f1_score,
                             recall_score, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import StandardScaler

# Function to select features from a single CSV file using ExtraTreesClassifier
def select_features(data_file, y, max_features=90, n_estimators=100,
                    random_state=None):
    """Keep the most important columns of one CSV file, ranked by extra trees.

    The file is read with its first column as the index, an
    ``ExtraTreesClassifier`` is fitted on every row against ``y``, and the
    already-fitted forest is handed to ``SelectFromModel``. No explicit
    ``threshold`` is passed, so ``SelectFromModel``'s default importance
    threshold applies in addition to the ``max_features`` cap; fewer than
    ``max_features`` columns may therefore be returned.

    Args:
        data_file: Path (or buffer) accepted by ``pandas.read_csv``.
        y: Class labels, one per row of the file, in the same row order.
        max_features: Upper bound on the number of columns kept.
        n_estimators: Number of trees in the forest.
        random_state: Seed forwarded to ``ExtraTreesClassifier``. The default
            ``None`` leaves the forest unseeded, so the selected columns can
            differ between runs; pass an int for a reproducible selection.

    Returns:
        A ``DataFrame`` holding only the selected columns, in their original
        column order and with the original index.
    """
    X = pd.read_csv(data_file, index_col=0)

    etc = ExtraTreesClassifier(n_estimators=n_estimators,
                               random_state=random_state)
    etc.fit(X, y)

    # prefit=True: reuse the forest fitted above instead of refitting it.
    selector = SelectFromModel(etc, max_features=max_features, prefit=True)

    # Boolean-mask column selection; one mask entry per column of X.
    return X.loc[:, selector.get_support()]

# Function to run SVM with feature selection and hyperparameter tuning on the combined features
def run_svm_with_combined_features(data_files, label_file):
    """Tune and cross-validate an RBF SVM on features pooled from several files.

    Steps:
      1. Read the labels and binarise them (2 -> 1, anything else -> 0).
      2. Run ``select_features`` on every file and concatenate the selected
         columns side by side.
      3. Grid-search ``C`` and ``gamma`` by accuracy using 5-fold stratified
         cross-validation repeated 10 times (seeded with 42).
      4. Refit the winning configuration on each of those same splits and
         collect accuracy, precision, F1, sensitivity, specificity and AUC.

    Standardisation lives inside a ``Pipeline`` with the SVM, so the scaler is
    fitted on the training part of each split only and never sees the rows it
    is evaluated on.

    NOTE: two sources of optimistic bias remain. Feature selection (step 2)
    is fitted on all rows and labels before any splitting, and the splits used
    for tuning are reused for the final evaluation (no nested CV).

    Args:
        data_files: Iterable of CSV paths; each file's first column is the index.
        label_file: CSV path with the index in the first column and the class
            labels in the remaining column(s).

    Returns:
        A 15-tuple:
        ``(mean_accuracy, std_accuracy, mean_precision, std_precision,
        mean_f1, std_f1, mean_sensitivity, std_sensitivity,
        mean_specificity, std_specificity, mean_auc, std_auc, best_params,
        tprs, fprs)`` where ``best_params`` is a dict with keys ``'C'`` and
        ``'gamma'`` and ``tprs`` / ``fprs`` are lists holding one ROC-curve
        array per split.

    Raises:
        ValueError: If the combined feature table does not have exactly one
            row per label (e.g. the files' indexes do not match).
    """
    # Local import: Pipeline is not among the file's top-level imports.
    from sklearn.pipeline import Pipeline

    # Read the label file and convert labels from {1, 2} to {0, 1}
    y = pd.read_csv(label_file, index_col=0).values.ravel()
    y = np.where(y == 2, 1, 0)

    # Select features for each data file, then combine them column-wise
    selected_dataframes = [select_features(data_file, y) for data_file in data_files]
    X_combined = pd.concat(selected_dataframes, axis=1)

    # concat aligns on the index; differing indexes add rows instead of failing,
    # so catch the mismatch here rather than deep inside the model fit.
    if X_combined.shape[0] != len(y):
        raise ValueError(
            f"Combined features have {X_combined.shape[0]} rows but there are "
            f"{len(y)} labels; check that all data files share the same index."
        )
    X_values = X_combined.to_numpy()

    # Scaler + SVM as one estimator so scaling is refitted on every training fold
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel='rbf', probability=True)),
    ])

    # Hyperparameter grid; the "svm__" prefix routes each value to the SVC step
    param_grid = {
        'svm__C': [0.1, 1, 10, 100],
        'svm__gamma': [1, 0.1, 0.01, 0.001],
    }

    # Cross-validation procedure shared by tuning and evaluation
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

    grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
    grid_search.fit(X_values, y)

    # Report the parameters under their plain SVC names ('C', 'gamma')
    best_params = {
        name.split("__", 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }
    best_model = grid_search.best_estimator_

    # Per-split metric storage
    accuracies = []
    precisions = []
    f1_scores = []
    sensitivities = []
    specificities = []
    aucs = []
    tprs = []
    fprs = []

    # Evaluate the best configuration on every cross-validation split
    for train_index, test_index in cv.split(X_values, y):
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y[train_index], y[test_index]

        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        y_prob = best_model.predict_proba(X_test)[:, 1]  # Probabilities for ROC

        accuracies.append(accuracy_score(y_test, y_pred))
        # zero_division=0 scores an undefined ratio as 0 without a warning.
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))
        sensitivities.append(recall_score(y_test, y_pred, zero_division=0))

        # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn = cm[0, 0]  # True Negatives
        fp = cm[0, 1]  # False Positives
        specificities.append(tn / (tn + fp) if (tn + fp) > 0 else 0)

        fpr, tpr, _ = roc_curve(y_test, y_prob)
        aucs.append(auc(fpr, tpr))
        tprs.append(tpr)
        fprs.append(fpr)

    # Mean and standard deviation of each metric across all splits
    mean_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
    mean_precision, std_precision = np.mean(precisions), np.std(precisions)
    mean_f1, std_f1 = np.mean(f1_scores), np.std(f1_scores)
    mean_sensitivity, std_sensitivity = np.mean(sensitivities), np.std(sensitivities)
    mean_specificity, std_specificity = np.mean(specificities), np.std(specificities)
    mean_auc, std_auc = np.mean(aucs), np.std(aucs)

    # Print the selected features for each data file
    for data_file, frame in zip(data_files, selected_dataframes):
        print(f"Selected features from {data_file}: {', '.join(map(str, frame.columns))}")

    return (mean_accuracy, std_accuracy, mean_precision, std_precision,
            mean_f1, std_f1, mean_sensitivity, std_sensitivity,
            mean_specificity, std_specificity, mean_auc, std_auc, best_params,
            tprs, fprs)

# Input files: one CSV per data source plus the label CSV
data_files = ["mRNA296.csv", "mirna296.csv", "meth296.csv"]
label_file = "Labels.csv"

# Run the full pipeline, then unpack the 15-element result tuple by name
results = run_svm_with_combined_features(data_files, label_file)
(acc_mean, acc_std,
 prec_mean, prec_std,
 f1_mean, f1_std,
 sens_mean, sens_std,
 spec_mean, spec_std,
 auc_mean, auc_std,
 tuned_params,
 fold_tprs, fold_fprs) = results

# Report each metric as mean ± standard deviation over the CV splits
print(f"Mean Accuracy: {acc_mean:.2f} ± {acc_std:.2f}")
print(f"Mean Precision: {prec_mean:.2f} ± {prec_std:.2f}")
print(f"Mean F1-Score: {f1_mean:.2f} ± {f1_std:.2f}")
print(f"Mean Sensitivity (Recall): {sens_mean:.2f} ± {sens_std:.2f}")
print(f"Mean Specificity: {spec_mean:.2f} ± {spec_std:.2f}")
print(f"Mean AUC: {auc_mean:.2f} ± {auc_std:.2f}")
print(f"Best Parameters: {tuned_params}")

# Average each split's ROC-curve arrays, then average those per-split means.
# NOTE(review): this averages over every ROC threshold point, so it is not the
# TPR/FPR at the classifier's decision threshold -- confirm this is intended.
per_fold_tpr_means = [np.mean(curve_tpr) for curve_tpr in fold_tprs]
per_fold_fpr_means = [np.mean(curve_fpr) for curve_fpr in fold_fprs]
mean_tpr = np.mean(per_fold_tpr_means)
mean_fpr = np.mean(per_fold_fpr_means)

# Print the averaged TPR and FPR
print(f"Mean TPR: {mean_tpr:.2f}")
print(f"Mean FPR: {mean_fpr:.2f}")