In [None]:
## DT model

from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Load the development cohort and the external validation cohort
data_path = "path/to/your/data/"
df = pd.read_excel(data_path + "merged_data.xlsx")
df_validation = pd.read_excel(data_path + "df_validationnew.xlsx")

# Encode the target variable; the validation labels reuse the mapping fitted on df
label_encoder = LabelEncoder()
y = df["no-prog/prog"]
y_encoded = label_encoder.fit_transform(y)

y_validation_encoded = label_encoder.transform(df_validation['no-prog/prog'])

# Define selected features
selected_features = ['Age', 'hsa-miR-556-3p', 'hsa-miR-3667-5p', 'hsa-miR-141-3p', 'hsa-miR-224-5p',
                     'hsa-let-7c-5p', 'hsa-miR-3157-5p', 'hsa-miR-200a-5p']

# The hyperparameter grid and the CV splitter do not depend on the feature subset,
# so they are built once instead of once per iteration
param_grid_subset = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
cv_subset = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One entry per feature subset: the tuned estimator plus its test/validation metrics
best_models = []

# Loop through all possible combinations of the selected features, starting from 3 up to the total number of features
for i in range(3, len(selected_features) + 1):
    for subset in combinations(selected_features, i):
        # Select the current subset of features
        X_subset = df[list(subset)]
        X_validation_subset = df_validation[list(subset)]

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_subset, y_encoded, test_size=0.2, random_state=42)

        # Scale the data; the scaler is fitted on the training rows only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_validation_subset_scaled = scaler.transform(X_validation_subset)

        # Apply SMOTE to the training data only
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

        # Tune a fresh Decision Tree on the resampled training data
        dt_clf_subset = DecisionTreeClassifier(random_state=42)
        grid_search_subset = GridSearchCV(dt_clf_subset, param_grid_subset, cv=cv_subset, scoring='roc_auc', n_jobs=-1)
        grid_search_subset.fit(X_train_resampled, y_train_resampled)

        # Get the best model from the grid search
        best_dt_clf_subset = grid_search_subset.best_estimator_

        # Evaluate the model on the test and validation sets
        y_pred_test = best_dt_clf_subset.predict(X_test_scaled)
        y_pred_proba_test = best_dt_clf_subset.predict_proba(X_test_scaled)[:, 1]
        accuracy_test = accuracy_score(y_test, y_pred_test)
        auc_test = roc_auc_score(y_test, y_pred_proba_test)

        y_pred_validation = best_dt_clf_subset.predict(X_validation_subset_scaled)
        y_pred_proba_validation = best_dt_clf_subset.predict_proba(X_validation_subset_scaled)[:, 1]
        accuracy_validation = accuracy_score(y_validation_encoded, y_pred_validation)
        auc_validation = roc_auc_score(y_validation_encoded, y_pred_proba_validation)

        # Keep the tuned estimator next to its metrics so the final report below
        # describes the same model that produced these numbers
        best_models.append({
            'features': subset,
            'best_estimator': best_dt_clf_subset,
            'accuracy_test': accuracy_test,
            'auc_test': auc_test,
            'accuracy_validation': accuracy_validation,
            'auc_validation': auc_validation
        })

# Find the best model based on the combined AUC for test and validation datasets
# NOTE(review): ranking on test + validation AUC lets both sets take part in model
# selection, so the figures printed below are optimistic — confirm this is intended.
best_model_auc = max(best_models, key=lambda x: (x['auc_test'] + x['auc_validation']))

# Retrieve the feature set and the tuned estimator of the best model
best_features = best_model_auc['features']
best_dt_clf = best_model_auc['best_estimator']
X_best_subset = df[list(best_features)]
X_validation_best_subset = df_validation[list(best_features)]

# Same split arguments as inside the loop, so the winner's train/test rows are reproduced
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best_subset, y_encoded, test_size=0.2, random_state=42)

# Refit a fresh scaler on the training rows of the best feature set
scaler = StandardScaler()
X_train_best_scaled = scaler.fit_transform(X_train_best)
X_test_best_scaled = scaler.transform(X_test_best)
X_validation_best_scaled = scaler.transform(X_validation_best_subset)

# Predict and evaluate on the test set with the tuned estimator (no retraining).
# labels=[0, 1] keeps the matrix 2x2 even if one class is never observed/predicted.
y_pred_test_best = best_dt_clf.predict(X_test_best_scaled)
cm_test_best = confusion_matrix(y_test_best, y_pred_test_best, labels=[0, 1])
sensitivity_test_best = cm_test_best[1, 1] / (cm_test_best[1, 1] + cm_test_best[1, 0])
specificity_test_best = cm_test_best[0, 0] / (cm_test_best[0, 0] + cm_test_best[0, 1])

# Predict and evaluate on the validation set with the tuned estimator
y_pred_validation_best = best_dt_clf.predict(X_validation_best_scaled)
cm_validation_best = confusion_matrix(y_validation_encoded, y_pred_validation_best, labels=[0, 1])
sensitivity_validation_best = cm_validation_best[1, 1] / (cm_validation_best[1, 1] + cm_validation_best[1, 0])
specificity_validation_best = cm_validation_best[0, 0] / (cm_validation_best[0, 0] + cm_validation_best[0, 1])

# Print the results for the best model
print(f"Best Model Based on Combined AUC:")
print(f"Features: {best_features}")
print(f"Test - Accuracy: {best_model_auc['accuracy_test']:.2f}, AUC: {best_model_auc['auc_test']:.2f}, Sensitivity: {sensitivity_test_best:.2f}, Specificity: {specificity_test_best:.2f}")
print(f"Validation - Accuracy: {best_model_auc['accuracy_validation']:.2f}, AUC: {best_model_auc['auc_validation']:.2f}, Sensitivity: {sensitivity_validation_best:.2f}, Specificity: {specificity_validation_best:.2f}\n")


In [None]:
## GBM model

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations

# Load the development cohort and the external validation cohort
data_path = "path/to/your/data/"
df = pd.read_excel(data_path + "merged_data.xlsx")
df_validation = pd.read_excel(data_path + "df_validationnew.xlsx")

RANDOM_STATE = 42

# Take the target from the frame loaded above instead of relying on a `y`
# left behind by another cell
y = df["no-prog/prog"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_validation_encoded = label_encoder.transform(df_validation['no-prog/prog'])

# Define selected features
selected_features = ['Age', 'hsa-miR-556-3p', 'hsa-miR-3667-5p', 'hsa-miR-141-3p', 'hsa-miR-224-5p',
                     'hsa-let-7c-5p', 'hsa-miR-3157-5p', 'hsa-miR-200a-5p']

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# The CV splitter is the same for every subset, so it is built once
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# One entry per feature subset: the tuned estimator plus its test/validation metrics
best_models = []

# Every combination of at least 3 of the selected features
for i in range(3, len(selected_features) + 1):
    for subset in combinations(selected_features, i):
        X_subset = df[list(subset)]
        X_validation_subset = df_validation[list(subset)]

        X_train, X_test, y_train, y_test = train_test_split(X_subset, y_encoded, test_size=0.2, random_state=RANDOM_STATE)

        # Scaler is fitted on the training rows only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_validation_scaled = scaler.transform(X_validation_subset)

        # Oversample the training rows only
        smote = SMOTE(random_state=RANDOM_STATE)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

        gbm_clf = GradientBoostingClassifier(random_state=RANDOM_STATE, verbose=1)
        grid_search = GridSearchCV(gbm_clf, param_grid, cv=cv_splitter, scoring='roc_auc', n_jobs=-1, verbose=2)
        grid_search.fit(X_train_resampled, y_train_resampled)

        best_gbm = grid_search.best_estimator_

        y_pred_test = best_gbm.predict(X_test_scaled)
        y_pred_proba_test = best_gbm.predict_proba(X_test_scaled)[:, 1]
        accuracy_test = accuracy_score(y_test, y_pred_test)
        auc_test = roc_auc_score(y_test, y_pred_proba_test)

        y_pred_validation = best_gbm.predict(X_validation_scaled)
        y_pred_proba_validation = best_gbm.predict_proba(X_validation_scaled)[:, 1]
        accuracy_validation = accuracy_score(y_validation_encoded, y_pred_validation)
        auc_validation = roc_auc_score(y_validation_encoded, y_pred_proba_validation)

        best_models.append({
            'features': subset,
            'best_estimator': best_gbm,  # Store the best estimator
            'accuracy_test': accuracy_test,
            'auc_test': auc_test,
            'accuracy_validation': accuracy_validation,
            'auc_validation': auc_validation
        })

# NOTE(review): ranking on test + validation AUC lets both sets take part in model
# selection, so the figures printed below are optimistic — confirm this is intended.
best_model_auc = max(best_models, key=lambda x: (x['auc_test'] + x['auc_validation']))

# Extract the features and best estimator (model) of the best model
best_features = best_model_auc['features']
best_gbm = best_model_auc['best_estimator']

# Prepare the dataset with the best features
X_best_subset = df[list(best_features)]
X_validation_best_subset = df_validation[list(best_features)]

# Same split arguments as inside the loop, so the winner's train/test rows are reproduced;
# a fresh scaler is used rather than the instance left over from the last loop iteration
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best_subset, y_encoded, test_size=0.2, random_state=RANDOM_STATE)
scaler = StandardScaler()
X_train_best_scaled = scaler.fit_transform(X_train_best)
X_test_best_scaled = scaler.transform(X_test_best)
X_validation_best_scaled = scaler.transform(X_validation_best_subset)

# Use the best estimator for predictions without retraining
y_pred_test_best = best_gbm.predict(X_test_best_scaled)
y_pred_proba_test_best = best_gbm.predict_proba(X_test_best_scaled)[:, 1]
y_pred_validation_best = best_gbm.predict(X_validation_best_scaled)
y_pred_proba_validation_best = best_gbm.predict_proba(X_validation_best_scaled)[:, 1]

# Calculate metrics
accuracy_test_best = accuracy_score(y_test_best, y_pred_test_best)
auc_test_best = roc_auc_score(y_test_best, y_pred_proba_test_best)
accuracy_validation_best = accuracy_score(y_validation_encoded, y_pred_validation_best)
auc_validation_best = roc_auc_score(y_validation_encoded, y_pred_proba_validation_best)

# labels=[0, 1] keeps the matrices 2x2 even if one class is never observed/predicted
cm_test_best = confusion_matrix(y_test_best, y_pred_test_best, labels=[0, 1])
cm_validation_best = confusion_matrix(y_validation_encoded, y_pred_validation_best, labels=[0, 1])

# Calculate sensitivity (row 1) and specificity (row 0)
sensitivity_test = cm_test_best[1, 1] / (cm_test_best[1, 1] + cm_test_best[1, 0])
specificity_test = cm_test_best[0, 0] / (cm_test_best[0, 0] + cm_test_best[0, 1])
sensitivity_validation = cm_validation_best[1, 1] / (cm_validation_best[1, 1] + cm_validation_best[1, 0])
specificity_validation = cm_validation_best[0, 0] / (cm_validation_best[0, 0] + cm_validation_best[0, 1])

# Print the results
print("Best Model Based on Combined AUC:")
print(f"Features: {best_features}")
print(f"Test - Accuracy: {accuracy_test_best:.2f}, AUC: {auc_test_best:.2f}, Sensitivity: {sensitivity_test:.2f}, Specificity: {specificity_test:.2f}")
print(f"Validation - Accuracy: {accuracy_validation_best:.2f}, AUC: {auc_validation_best:.2f}, Sensitivity: {sensitivity_validation:.2f}, Specificity: {specificity_validation:.2f}")


In [None]:
## penalized logistic regression

data_path = "path/to/your/data/"
df = pd.read_excel(data_path + "merged_data.xlsx")
df_validation = pd.read_excel(data_path + "df_validationnew.xlsx")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from itertools import combinations

# The initial set of features
all_features = ['Age', 'hsa-miR-556-3p', 'hsa-miR-3667-5p', 'hsa-miR-200a-5p', 'hsa-miR-3157-5p', 'hsa-miR-141-3p',
                'hsa-miR-224-5p', 'hsa-let-7c-5p']

assert set(all_features).issubset(df_validation.columns), "Validation data is missing some features"

# Define features and target. X is restricted to the model features, in the order
# of all_features, so every frame split from it has exactly these columns.
X = df[all_features]
y = df["no-prog/prog"]

# Dictionary to store performance metrics for each feature subset
performance_metrics = {}

# Split the data into training, validation, and test sets (60 / 20 / 20).
# NOTE(review): "validation" in this cell is an internal hold-out of df; the external
# df_validation cohort is loaded and schema-checked above but not scored here.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validation, y_train, y_validation = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Standardize the full feature set (scaler fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)

# Oversample the minority class in the training rows
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_scaled, y_train)

assert len(X_train_resampled) == len(y_train_resampled), "Mismatch in resampled training set size"

# Elastic-net logistic regression with built-in CV. It is fitted inside the loop below;
# the last iteration is the full feature set, so no separate full-feature fit is needed here.
logreg = LogisticRegressionCV(cv=5, scoring='roc_auc', l1_ratios=[0.5],
                              penalty='elasticnet', solver='saga',
                              class_weight={0: 1, 1: 1}, random_state=42, max_iter=10000)

# Iterate over all possible non-empty combinations of features
for i in range(1, len(all_features) + 1):
    for subset in combinations(all_features, i):
        # Select columns by name: positional indices taken from all_features are only
        # valid if the frame's column order matches that list exactly
        subset_columns = list(subset)

        # Standardize the subset (scaler fitted on the training rows only)
        scaler = StandardScaler()
        X_train_subset_scaled = scaler.fit_transform(X_train[subset_columns])
        X_validation_subset_scaled = scaler.transform(X_validation[subset_columns])
        X_test_subset_scaled = scaler.transform(X_test[subset_columns])

        # Resample the training subset
        ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
        X_train_subset_resampled, y_train_subset_resampled = ros.fit_resample(X_train_subset_scaled, y_train)

        # Fit the model on the resampled training subset
        logreg.fit(X_train_subset_resampled, y_train_subset_resampled)

        # Predict on the validation and test subsets
        y_pred_proba_validation = logreg.predict_proba(X_validation_subset_scaled)[:, 1]
        y_pred_validation = logreg.predict(X_validation_subset_scaled)
        y_pred_proba_test = logreg.predict_proba(X_test_subset_scaled)[:, 1]
        y_pred_test = logreg.predict(X_test_subset_scaled)

        # Calculate performance metrics for validation data
        auc_validation = roc_auc_score(y_validation, y_pred_proba_validation)
        accuracy_validation = accuracy_score(y_validation, y_pred_validation)
        cm_validation = confusion_matrix(y_validation, y_pred_validation)
        sensitivity_validation = cm_validation[1, 1] / (cm_validation[1, 1] + cm_validation[1, 0])
        specificity_validation = cm_validation[0, 0] / (cm_validation[0, 0] + cm_validation[0, 1])

        # Calculate performance metrics for test data
        auc_test = roc_auc_score(y_test, y_pred_proba_test)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        cm_test = confusion_matrix(y_test, y_pred_test)
        sensitivity_test = cm_test[1, 1] / (cm_test[1, 1] + cm_test[1, 0])
        specificity_test = cm_test[0, 0] / (cm_test[0, 0] + cm_test[0, 1])

        # Store the performance metrics
        performance_metrics[subset] = {'AUC Validation': auc_validation, 'Accuracy Validation': accuracy_validation,
                                       'Sensitivity Validation': sensitivity_validation, 'Specificity Validation': specificity_validation,
                                       'AUC Test': auc_test, 'Accuracy Test': accuracy_test,
                                       'Sensitivity Test': sensitivity_test, 'Specificity Test': specificity_test}

# Identify the feature subset with the highest AUC and accuracy on validation data
best_auc_validation_features = max(performance_metrics, key=lambda x: performance_metrics[x]['AUC Validation'])
best_accuracy_validation_features = max(performance_metrics, key=lambda x: performance_metrics[x]['Accuracy Validation'])

# Sort the performance metrics dictionary by a combined score of AUC and accuracy for both test and validation data
top_combined_models = sorted(performance_metrics.items(), key=lambda x: x[1]['AUC Validation'] * x[1]['Accuracy Validation'] * x[1]['AUC Test'] * x[1]['Accuracy Test'], reverse=True)[:5]

# Print the top 5 models with the highest combined score
print("Top 5 models with the highest combined score:")
for i, (features, metrics) in enumerate(top_combined_models, 1):
    print(f"Model {i}:")
    print(f"Features: {features}")
    print(f"AUC Test: {metrics['AUC Test']:.2f}, AUC Validation: {metrics['AUC Validation']:.2f}")
    print(f"Accuracy Test: {metrics['Accuracy Test']:.2f}, Accuracy Validation: {metrics['Accuracy Validation']:.2f}")
    print(f"Sensitivity Test: {metrics['Sensitivity Test']:.2f}, Sensitivity Validation: {metrics['Sensitivity Validation']:.2f}")
    print(f"Specificity Test: {metrics['Specificity Test']:.2f}, Specificity Validation: {metrics['Specificity Validation']:.2f}")
    print()

In [None]:
## Random forest model
from sklearn.ensemble import RandomForestClassifier

data_path = "path/to/your/data/"
df = pd.read_excel(data_path + "merged_data.xlsx")
df_validation = pd.read_excel(data_path + "df_validationnew.xlsx")

# Feature columns, listed in the same order as in the penalized logistic regression cell
all_features = ['Age', 'hsa-miR-556-3p', 'hsa-miR-3667-5p', 'hsa-miR-200a-5p', 'hsa-miR-3157-5p', 'hsa-miR-141-3p',
                'hsa-miR-224-5p', 'hsa-let-7c-5p']

# Build the train / validation / test split from the frame loaded above rather than
# reusing variables left behind by an earlier cell. The split arguments match the
# logistic regression cell (60 / 20 / 20, random_state=42).
# NOTE(review): "validation" here is an internal hold-out of df; the external
# df_validation cohort is loaded but not scored in this cell.
X = df[all_features]
y = df["no-prog/prog"]
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validation, y_train, y_validation = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Create an instance of StandardScaler
scaler = StandardScaler()

# Standardize the features (scaler fitted on the training rows only)
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)

# Create the Random Forest classifier; seeded so repeated runs give the same forest
rf_clf = RandomForestClassifier(random_state=42)

# Perform grid search to find the best parameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(rf_clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best Random Forest classifier from the grid search; it is refitted for
# every feature subset in the loop below, so no separate pre-fit is needed here
best_rf_clf = grid_search.best_estimator_

# Start from an empty results dict so entries from another model cannot leak in
performance_metrics = {}

# Iterate over all possible non-empty combinations of features
for i in range(1, len(all_features) + 1):
    for subset in combinations(all_features, i):
        # Select columns by name rather than by position
        subset_columns = list(subset)

        # Standardize the subset (scaler refitted on the training rows only)
        X_train_subset_scaled = scaler.fit_transform(X_train[subset_columns])
        X_validation_subset_scaled = scaler.transform(X_validation[subset_columns])
        X_test_subset_scaled = scaler.transform(X_test[subset_columns])

        # Create an instance of RandomOverSampler
        ros = RandomOverSampler(sampling_strategy='auto', random_state=42)

        # Resample the training subset
        X_train_subset_resampled, y_train_subset_resampled = ros.fit_resample(X_train_subset_scaled, y_train)

        # Fit the model on the resampled training subset
        best_rf_clf.fit(X_train_subset_resampled, y_train_subset_resampled)

        # Predict on the validation and test subsets
        y_pred_proba_validation = best_rf_clf.predict_proba(X_validation_subset_scaled)[:, 1]
        y_pred_validation = best_rf_clf.predict(X_validation_subset_scaled)
        y_pred_proba_test = best_rf_clf.predict_proba(X_test_subset_scaled)[:, 1]
        y_pred_test = best_rf_clf.predict(X_test_subset_scaled)

        # Calculate performance metrics for validation data; sensitivity and specificity
        # are read off the confusion matrix, as in the logistic regression cell
        auc_validation = roc_auc_score(y_validation, y_pred_proba_validation)
        accuracy_validation = accuracy_score(y_validation, y_pred_validation)
        cm_validation = confusion_matrix(y_validation, y_pred_validation)
        sensitivity_validation = cm_validation[1, 1] / (cm_validation[1, 1] + cm_validation[1, 0])
        specificity_validation = cm_validation[0, 0] / (cm_validation[0, 0] + cm_validation[0, 1])

        # Calculate performance metrics for test data
        auc_test = roc_auc_score(y_test, y_pred_proba_test)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        cm_test = confusion_matrix(y_test, y_pred_test)
        sensitivity_test = cm_test[1, 1] / (cm_test[1, 1] + cm_test[1, 0])
        specificity_test = cm_test[0, 0] / (cm_test[0, 0] + cm_test[0, 1])

        # Store the performance metrics
        performance_metrics[subset] = {
            'AUC Validation': auc_validation, 'Accuracy Validation': accuracy_validation,
            'Sensitivity Validation': sensitivity_validation, 'Specificity Validation': specificity_validation,
            'AUC Test': auc_test, 'Accuracy Test': accuracy_test,
            'Sensitivity Test': sensitivity_test, 'Specificity Test': specificity_test
        }

# Sort the performance metrics dictionary by a combined score of AUC and accuracy for both test and validation data
top_combined_models = sorted(performance_metrics.items(), key=lambda x: (x[1]['AUC Validation'] * x[1]['Accuracy Validation']) + (x[1]['AUC Test'] * x[1]['Accuracy Test']), reverse=True)[:5]

# Print the top 5 models with the highest combined score
print("Top 5 models with the highest combined score:")
for i, (features, metrics) in enumerate(top_combined_models, 1):
    print(f"Model {i}:")
    print(f"Features: {features}")
    print(f"AUC Test: {metrics['AUC Test']:.2f}, AUC Validation: {metrics['AUC Validation']:.2f}")
    print(f"Accuracy Test: {metrics['Accuracy Test']:.2f}, Accuracy Validation: {metrics['Accuracy Validation']:.2f}")
    print(f"Sensitivity Test: {metrics['Sensitivity Test']:.2f}, Sensitivity Validation: {metrics['Sensitivity Validation']:.2f}")
    print(f"Specificity Test: {metrics['Specificity Test']:.2f}, Specificity Validation: {metrics['Specificity Validation']:.2f}")
    print()


In [None]:
# Support Vector Machine and Kernel Support Vector Machine

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import numpy as np
from itertools import combinations

# Load the development cohort and the external validation cohort
data_path = "path/to/your/data/"
df = pd.read_excel(data_path + "merged_data.xlsx")
df_validation = pd.read_excel(data_path + "df_validationnew.xlsx")

RANDOM_STATE = 42  # Fixed random state for consistency

# Take the target from the frame loaded above instead of relying on a `y`
# left behind by another cell
y = df["no-prog/prog"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_validation_encoded = label_encoder.transform(df_validation['no-prog/prog'])

# Define selected features
selected_features = ['Age', 'hsa-miR-556-3p', 'hsa-miR-3667-5p', 'hsa-miR-141-3p', 'hsa-miR-224-5p',
                     'hsa-let-7c-5p', 'hsa-miR-3157-5p', 'hsa-miR-200a-5p']

param_grid = {
    'C': [0.1,0.5, 1, 5, 10]
}

# The CV splitter is the same for every subset, so it is built once
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# One entry per feature subset: the tuned estimator plus its test/validation metrics
best_models = []

# Every combination of at least 3 of the selected features
for i in range(3, len(selected_features) + 1):
    for subset in combinations(selected_features, i):
        X_subset = df[list(subset)]
        X_validation_subset = df_validation[list(subset)]

        X_train, X_test, y_train, y_test = train_test_split(X_subset, y_encoded, test_size=0.2, random_state=RANDOM_STATE)

        # Scaler is fitted on the training rows only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_validation_scaled = scaler.transform(X_validation_subset)

        # Oversample the training rows only
        smote = SMOTE(random_state=RANDOM_STATE)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

        svm_clf = SVC(probability=True, random_state=RANDOM_STATE, kernel='rbf', class_weight={0: 1, 1: 1})
        grid_search = GridSearchCV(svm_clf, param_grid, cv=cv_splitter, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train_resampled, y_train_resampled)

        best_svm = grid_search.best_estimator_

        y_pred_test = best_svm.predict(X_test_scaled)
        y_pred_proba_test = best_svm.predict_proba(X_test_scaled)[:, 1]
        accuracy_test = accuracy_score(y_test, y_pred_test)
        auc_test = roc_auc_score(y_test, y_pred_proba_test)

        y_pred_validation = best_svm.predict(X_validation_scaled)
        y_pred_proba_validation = best_svm.predict_proba(X_validation_scaled)[:, 1]
        accuracy_validation = accuracy_score(y_validation_encoded, y_pred_validation)
        auc_validation = roc_auc_score(y_validation_encoded, y_pred_proba_validation)

        best_models.append({
            'features': subset,
            'best_estimator': best_svm,
            'accuracy_test': accuracy_test,
            'auc_test': auc_test,
            'accuracy_validation': accuracy_validation,
            'auc_validation': auc_validation
        })

# NOTE(review): ranking on test + validation AUC lets both sets take part in model
# selection, so the figures printed below are optimistic — confirm this is intended.
best_model_auc = max(best_models, key=lambda x: (x['auc_test'] + x['auc_validation']))

# Retrieve the features and best estimator (model) of the best model
best_features = best_model_auc['features']
best_svm_estimator = best_model_auc['best_estimator']

# Extract the 'C' parameter from the best estimator
C_value = best_svm_estimator.get_params()['C']

# Prepare the dataset with the best features
X_best_subset = df[list(best_features)]
X_validation_best_subset = df_validation[list(best_features)]

# Same split arguments as inside the loop; fresh scaler and SMOTE instances are used
# rather than the ones left over from the last loop iteration
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best_subset, y_encoded, test_size=0.2, random_state=RANDOM_STATE)
scaler = StandardScaler()
X_train_best_scaled = scaler.fit_transform(X_train_best)
X_test_best_scaled = scaler.transform(X_test_best)
X_validation_best_scaled = scaler.transform(X_validation_best_subset)

# SMOTE application
smote = SMOTE(random_state=RANDOM_STATE)
X_train_best_resampled, y_train_best_resampled = smote.fit_resample(X_train_best_scaled, y_train_best)

# Retrain the model using the same 'C' value
best_svm = SVC(probability=True, random_state=RANDOM_STATE, C=C_value, kernel='rbf')
best_svm.fit(X_train_best_resampled, y_train_best_resampled)

# Predict on test and validation set
y_pred_test_best = best_svm.predict(X_test_best_scaled)
y_pred_proba_test_best = best_svm.predict_proba(X_test_best_scaled)[:, 1]
y_pred_validation_best = best_svm.predict(X_validation_best_scaled)
y_pred_proba_validation_best = best_svm.predict_proba(X_validation_best_scaled)[:, 1]

# Calculate metrics
accuracy_test_best = accuracy_score(y_test_best, y_pred_test_best)
auc_test_best = roc_auc_score(y_test_best, y_pred_proba_test_best)
accuracy_validation_best = accuracy_score(y_validation_encoded, y_pred_validation_best)
auc_validation_best = roc_auc_score(y_validation_encoded, y_pred_proba_validation_best)

# Calculate sensitivity (row 1) and specificity (row 0).
# labels=[0, 1] keeps the matrices 2x2 even if one class is never observed/predicted.
cm_test_best = confusion_matrix(y_test_best, y_pred_test_best, labels=[0, 1])
cm_validation_best = confusion_matrix(y_validation_encoded, y_pred_validation_best, labels=[0, 1])
sensitivity_test = cm_test_best[1, 1] / (cm_test_best[1, 1] + cm_test_best[1, 0])
specificity_test = cm_test_best[0, 0] / (cm_test_best[0, 0] + cm_test_best[0, 1])
sensitivity_validation = cm_validation_best[1, 1] / (cm_validation_best[1, 1] + cm_validation_best[1, 0])
specificity_validation = cm_validation_best[0, 0] / (cm_validation_best[0, 0] + cm_validation_best[0, 1])

# Print the results
print("Best Model Based on Combined AUC:")
print(f"Features: {best_features}")
print(f"Test - Accuracy: {accuracy_test_best:.2f}, AUC: {auc_test_best:.2f}, Sensitivity: {sensitivity_test:.2f}, Specificity: {specificity_test:.2f}")
print(f"Validation - Accuracy: {accuracy_validation_best:.2f}, AUC: {auc_validation_best:.2f}, Sensitivity: {sensitivity_validation:.2f}, Specificity: {specificity_validation:.2f}")


In [None]:
# Sort the best_models list based on the sum of AUC for test and validation
sorted_best_models = sorted(best_models, key=lambda x: (x['auc_test'] + x['auc_validation']), reverse=True)

# Print the top 5 models. Sensitivity and specificity are recomputed for each model
# from its own stored estimator; the module-level sensitivity_*/specificity_* values
# belong to the single best model only and must not be reused for every row.
print("Top 5 Models Based on Combined AUC:")
for i, model in enumerate(sorted_best_models[:5], start=1):
    model_columns = list(model['features'])
    model_estimator = model['best_estimator']

    # Rebuild the split and scaling with the same arguments the SVM cell used for
    # this feature subset, so the estimator sees inputs scaled as during training
    X_train_model, X_test_model, _, y_test_model = train_test_split(
        df[model_columns], y_encoded, test_size=0.2, random_state=RANDOM_STATE)
    model_scaler = StandardScaler().fit(X_train_model)
    X_test_model_scaled = model_scaler.transform(X_test_model)
    X_validation_model_scaled = model_scaler.transform(df_validation[model_columns])

    # labels=[0, 1] keeps the matrices 2x2 even if one class is never observed/predicted
    cm_test_model = confusion_matrix(y_test_model, model_estimator.predict(X_test_model_scaled), labels=[0, 1])
    cm_validation_model = confusion_matrix(y_validation_encoded, model_estimator.predict(X_validation_model_scaled), labels=[0, 1])

    model_sensitivity_test = cm_test_model[1, 1] / (cm_test_model[1, 1] + cm_test_model[1, 0])
    model_specificity_test = cm_test_model[0, 0] / (cm_test_model[0, 0] + cm_test_model[0, 1])
    model_sensitivity_validation = cm_validation_model[1, 1] / (cm_validation_model[1, 1] + cm_validation_model[1, 0])
    model_specificity_validation = cm_validation_model[0, 0] / (cm_validation_model[0, 0] + cm_validation_model[0, 1])

    print(f"Model {i}:")
    print(f"    Features: {model['features']}")
    print(f"    Test - Accuracy: {model['accuracy_test']:.2f}, AUC: {model['auc_test']:.2f}, Sensitivity: {model_sensitivity_test:.2f}, Specificity: {model_specificity_test:.2f}")
    print(f"    Validation - Accuracy: {model['accuracy_validation']:.2f}, AUC: {model['auc_validation']:.2f}, Sensitivity: {model_sensitivity_validation:.2f}, Specificity: {model_specificity_validation:.2f}")
    print(f"    Best 'C' Value: {model['best_estimator'].get_params()['C']}")
    print()