In [1]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import networkx as nx
import itertools
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import os
import joblib

In [2]:
ctl_samples = pd.read_csv('../data/control/control_samples.csv')
print(ctl_samples.shape)
ctl_samples.columns = ctl_samples.columns.str.strip()
ctl_samples['Stage'] = ctl_samples['Stage'].str.replace('Stage: NA', 'Stage: CTL')
print(ctl_samples['Stage'])

(5643, 2570)
0       Stage: CTL
1       Stage: CTL
2       Stage: CTL
3       Stage: CTL
4       Stage: CTL
           ...    
5638    Stage: CTL
5639    Stage: CTL
5640    Stage: CTL
5641    Stage: CTL
5642    Stage: CTL
Name: Stage, Length: 5643, dtype: object


In [3]:
ctl_samples = ctl_samples[ctl_samples['Sex'] == 'Sex: Male']
print(ctl_samples.shape)

(2468, 2570)


In [4]:
stage_1_samples = pd.read_csv('../data/cancer/stage_1_prostate_cancer_samples.csv')
stage_1_samples.shape

(173, 2570)

In [5]:
# Age-related operations for control samples
ctl_samples = ctl_samples.dropna(subset=['Age'])
ctl_samples['Age'] = ctl_samples['Age'].astype(str).str.replace('age: ', '', regex=False)
ctl_samples['Age'] = pd.to_numeric(ctl_samples['Age'], errors='coerce')
ctl_samples = ctl_samples.dropna(subset=['Age'])
ctl_samples['Age'] = ctl_samples['Age'].astype(int)

# Age-related operations for stage 1 samples
stage_1_samples['Age'] = stage_1_samples['Age'].astype(str).str.replace('age: ', '', regex=False)
stage_1_samples['Age'] = pd.to_numeric(stage_1_samples['Age'], errors='coerce')
stage_1_samples = stage_1_samples.dropna(subset=['Age'])
stage_1_samples['Age'] = stage_1_samples['Age'].astype(int)

# Print average age of control samples
print("Average age of control samples:", ctl_samples['Age'].mean())

# Print average age of stage 1 samples
print("Average age of stage 1 samples:", stage_1_samples['Age'].mean())

Average age of control samples: 65.28183292781833
Average age of stage 1 samples: 66.61849710982659


In [6]:
combined_dataset = pd.concat([ctl_samples, stage_1_samples], ignore_index=True)

In [7]:
# Verify and clean the Stage column
print("Unique values in Stage column:", combined_dataset['Stage'].unique())
combined_dataset['Stage'] = combined_dataset['Stage'].str.strip()
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: CTL', 0, 1)
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: 1', 1, combined_dataset['ID_REF'])

# Print class distribution to ensure both classes are present
print("Class distribution in ID_REF column:")
print(combined_dataset['ID_REF'].value_counts())

Unique values in Stage column: ['Stage: CTL' 'Stage: 1']
Class distribution in ID_REF column:
ID_REF
0    2466
1     173
Name: count, dtype: int64


In [8]:

def process_data(data, under_sample_factor, over_sample_factor=None):
    columns_to_drop = ['Sample_ID', 'Sex', 'Age', 'Stage', 'Disease']
    data = data.drop(columns=columns_to_drop, axis=1)
    
    id_ref = data["ID_REF"]
    data = data.drop(["ID_REF"], axis=1)
    data = data.apply(pd.to_numeric, errors='coerce')
    data = data.fillna(data.mean())
    data["ID_REF"] = id_ref

    x = np.array(data.drop(["ID_REF"], axis=1)).astype('float')
    y = np.array(data["ID_REF"]).astype('int')
    feature_names = data.columns[:-1]

    if under_sample_factor is not None and isinstance(under_sample_factor, float) and 0 < under_sample_factor <= 1:
        under_sampler = RandomUnderSampler(sampling_strategy=under_sample_factor)
        x, y = under_sampler.fit_resample(x, y)

    if over_sample_factor is not None and isinstance(over_sample_factor, float) and 0 < over_sample_factor <= 1:
        over_sampler = RandomOverSampler(sampling_strategy=over_sample_factor)
        x, y = over_sampler.fit_resample(x, y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test, feature_names

In [9]:
# Define parameters
feature_selection_num = 50
feature_importance_num = 10

num_control_samples = combined_dataset['ID_REF'].value_counts()[0]
nums_s1_samples = combined_dataset['ID_REF'].value_counts()[1]
under_sample_factor = nums_s1_samples / num_control_samples

# Process data
x_train, x_test, y_train, y_test, feature_names = process_data(combined_dataset, under_sample_factor=under_sample_factor)

  data["ID_REF"] = id_ref


In [10]:
def svm_objective(trial):
    k = feature_selection_num
    
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    kernel = 'linear'  # Use only linear kernel for feature extraction
    
    # Relaxed LassoCV with a wider range of smaller alphas
    lasso = SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('lasso', lasso),
        ('estimator', SVC(C=C, kernel=kernel, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    try:
        scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
        return scores.mean()
    except ValueError as e:
        print(f"ValueError: {e}")
        print("Selected features by SelectKBest:", skb.get_support(indices=True))
        if hasattr(lasso.estimator_, 'coef_'):
            print("Lasso coefficients:", lasso.estimator_.coef_)
        return np.nan

In [11]:
def rf_objective(trial):
    k = feature_selection_num
    
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    
    lasso = SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('lasso', lasso),
        ('estimator', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, criterion=criterion, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [12]:
def xgboost_objective(trial):
    k = feature_selection_num
    
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    
    lasso = SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('lasso', lasso),
        ('estimator', XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, random_state=0, use_label_encoder=False, eval_metric='logloss'))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [13]:
# Function to save study
def save_study(study, filename):
    joblib.dump(study, filename)

# Function to load study
def load_study(filename):
    return joblib.load(filename)

In [14]:
# Optimize hyperparameters using Optuna
svm_study_filename = '50_svm_study_ctl_s1.pkl'
rf_study_filename = '50_rf_study_ctl_s1.pkl'
xgboost_study_filename = '50_xgboost_study_ctl_s1.pkl'

In [15]:
# Optimize hyperparameters using Optuna with early stopping
def optimize_with_early_stopping(objective, study_filename, n_trials=50, patience=10):
    if os.path.exists(study_filename):
        study = load_study(study_filename)
        return study
    else:
        study = optuna.create_study(direction='maximize')
    
    best_value = -np.inf
    trials_without_improvement = 0
    
    for trial in range(n_trials):
        study.optimize(objective, n_trials=1)
        
        current_best_value = study.best_value
        if current_best_value > best_value:
            best_value = current_best_value
            trials_without_improvement = 0
        else:
            trials_without_improvement += 1
        
        if trials_without_improvement >= patience:
            print(f"Early stopping at trial {trial + 1}")
            break
        
        save_study(study, study_filename)
    
    return study

### Find Hyperparmeters if not trained already

In [16]:
svm_study = optimize_with_early_stopping(svm_objective, svm_study_filename, n_trials=50, patience=10)

In [17]:
rf_study = optimize_with_early_stopping(rf_objective, rf_study_filename, n_trials=100, patience=10)

In [18]:
xgboost_study = optimize_with_early_stopping(xgboost_objective, xgboost_study_filename, n_trials=100, patience=10)

### Print the best trial for each study

For SVM, RF, and XGBoost

In [19]:
print("Best SVM trial:")
svm_trial = svm_study.best_trial
print("  Value: ", svm_trial.value)
print("  Params: ")
for key, value in svm_trial.params.items():
    print(f"    {key}: {value}")

Best SVM trial:
  Value:  1.0
  Params: 
    C: 0.004378045466885949


In [20]:
print("Best Random Forest trial:")
rf_trial = rf_study.best_trial
print("  Value: ", rf_trial.value)
print("  Params: ")
for key, value in rf_trial.params.items():
    print(f"    {key}: {value}")

Best Random Forest trial:
  Value:  1.0
  Params: 
    n_estimators: 448
    max_depth: 20
    max_features: sqrt
    criterion: gini


In [21]:
print("Best XGBoost trial:")
xgboost_trial = xgboost_study.best_trial
print("  Value: ", xgboost_trial.value)
print("  Params: ")
for key, value in xgboost_trial.params.items():
    print(f"    {key}: {value}")

Best XGBoost trial:
  Value:  0.9985781990521326
  Params: 
    learning_rate: 0.6589707437394396
    max_depth: 4
    n_estimators: 584


## Train and evaluate the models with the best hyperparameters

In [22]:
# Train and evaluate the models with the best hyperparameters
def train_and_evaluate(pipe, x_train, y_train, x_test, y_test):
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    print(f'Testing accuracy {accuracy_score(y_test, y_pred)}')
    print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')

### SVM

In [23]:
best_svm_params = svm_trial.params
svm_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('lasso', SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))),
    ('estimator', SVC(C=best_svm_params['C'], kernel='linear', random_state=0))  # Ensure linear kernel
])
train_and_evaluate(svm_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 1.0
Confusion matrix: 
[[493   0]
 [  0  35]]


In [24]:
# svm_accuracies = []

# # Loop through the number of features from 1 to 150
# for k in range(1, 121):
#     best_svm_params = svm_trial.params
#     svm_pipe = Pipeline([
#         ('skb', SelectKBest(f_classif, k=k)),
#         ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
#         ('estimator', SVC(C=best_svm_params['C'], kernel=best_svm_params['kernel'], random_state=0))
#     ])
    
#     # Train and evaluate the pipeline
#     svm_pipe.fit(x_train, y_train)
#     y_pred = svm_pipe.predict(x_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     svm_accuracies.append(accuracy)
#     print(f'Number of features: {k}, Accuracy: {accuracy}')

# # Plot the number of features vs accuracy
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 121), svm_accuracies, marker='o')
# plt.title('Accuracy vs Number of Features (SVM)')
# plt.xlabel('Number of Features')
# plt.ylabel('Accuracy')
# plt.grid(True)
# plt.show()

### Random Forest

In [25]:
best_rf_params = rf_trial.params
rf_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('lasso', SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))),
    ('estimator', RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
                                         max_depth=best_rf_params['max_depth'],
                                         max_features=best_rf_params['max_features'],
                                         criterion=best_rf_params['criterion'],
                                         random_state=0))
])
train_and_evaluate(rf_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 1.0
Confusion matrix: 
[[493   0]
 [  0  35]]


In [26]:
# rf_accuracies = []

# # Loop through the number of features from 1 to 200
# for k in range(1, 121):
#     best_rf_params = rf_trial.params
#     rf_pipe = Pipeline([
#         ('skb', SelectKBest(f_classif, k=k)),
#         ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
#         ('estimator', RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
#                                              max_depth=best_rf_params['max_depth'],
#                                              max_features=best_rf_params['max_features'],
#                                              criterion=best_rf_params['criterion'],
#                                              random_state=0))
#     ])
    
#     # Train and evaluate the pipeline
#     rf_pipe.fit(x_train, y_train)
#     y_pred = rf_pipe.predict(x_test)
#     rf_accuracy = accuracy_score(y_test, y_pred)
#     rf_accuracies.append(rf_accuracy)
#     print(f'Number of features: {k}, Accuracy: {rf_accuracy}')

# # Plot the number of features vs rf_accuracy
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 121), rf_accuracies, marker='o')
# plt.title('Accuracy vs Number of Features')
# plt.xlabel('Number of Features')
# plt.ylabel('Accuracy')
# plt.grid(True)
# plt.show()

### XGBoost

In [27]:
best_xgboost_params = xgboost_trial.params
xgboost_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
    ('estimator', XGBClassifier(learning_rate=best_xgboost_params['learning_rate'],
                                max_depth=best_xgboost_params['max_depth'],
                                n_estimators=best_xgboost_params['n_estimators'],
                                random_state=0,
                                use_label_encoder=False,
                                eval_metric='logloss'))
])
train_and_evaluate(xgboost_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 1.0
Confusion matrix: 
[[493   0]
 [  0  35]]


In [28]:
# xgboost_accuracies = []

# # Loop through the number of features from 1 to 150
# for k in range(1, 121):
#     best_xgboost_params = xgboost_trial.params
#     xgboost_pipe = Pipeline([
#         ('skb', SelectKBest(f_classif, k=k)),
#         ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
#         ('estimator', XGBClassifier(learning_rate=best_xgboost_params['learning_rate'],
#                                     max_depth=best_xgboost_params['max_depth'],
#                                     n_estimators=best_xgboost_params['n_estimators'],
#                                     random_state=0,
#                                     use_label_encoder=False,
#                                     eval_metric='logloss'))
#     ])

#     # Train and evaluate the pipeline
#     xgboost_pipe.fit(x_train, y_train)
#     y_pred = xgboost_pipe.predict(x_test)
#     xgboost_accuracy = accuracy_score(y_test, y_pred)
#     xgboost_accuracies.append(xgboost_accuracy)
#     print(f'Number of features: {k}, Accuracy: {xgboost_accuracy}') 

# # Plot the number of features vs xgboost_accuracy
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 121), xgboost_accuracies, marker='o')
# plt.title('Accuracy vs Number of Features (XGBoost)')
# plt.xlabel('Number of Features')
# plt.ylabel('Accuracy')
# plt.grid(True)
# plt.show()

In [29]:
def get_top_features(pipe, feature_names, top_feature_num):
    if isinstance(pipe.named_steps['estimator'], SVC):
        if pipe.named_steps['estimator'].kernel != 'linear':
            raise ValueError("Feature importance is not available for non-linear SVM kernels.")
        feature_scores = pipe.named_steps['estimator'].coef_[0]
    elif isinstance(pipe.named_steps['estimator'], XGBClassifier):
        feature_scores = pipe.named_steps['estimator'].feature_importances_
    else:
        raise ValueError("Unsupported estimator type for feature extraction.")
    
    # Get selected features from SelectKBest
    skb_support = pipe.named_steps['skb'].get_support(indices=True)
    print("Selected features from SelectKBest:", skb_support)
    
    # Transform the features using SelectKBest
    skb_features = pipe.named_steps['skb'].transform(x_train)
    
    # Get selected features from LassoCV
    lasso = LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0)
    lasso.fit(skb_features, y_train)
    lasso_support = np.where(lasso.coef_ != 0)[0]
    print("Selected features from LassoCV:", lasso_support)
    
    if len(lasso_support) == 0:
        print("No features selected after LassoCV.")
        return []
    
    # Ensure indices match feature_scores
    top_indices = np.argsort(np.abs(lasso.coef_[lasso_support]))[::-1][:top_feature_num]
    top_features = [(feature_names[skb_support[i]], lasso.coef_[lasso_support[i]]) for i in top_indices]
    
    return top_features

In [30]:
try:
    svm_top_features = get_top_features(svm_pipe, feature_names, feature_importance_num)
    print("Top SVM features:", svm_top_features)
except ValueError as e:
    print("SVM feature extraction error:", e)

xgboost_top_features = get_top_features(xgboost_pipe, feature_names, feature_importance_num)
print("Top XGBoost features:", xgboost_top_features)

Selected features from SelectKBest: [  76  388  423  463  473  549  590  697  726  739  785  821  832  837
  926  931  947  975 1010 1018 1044 1109 1171 1174 1187 1214 1254 1273
 1361 1386 1483 1493 1511 1560 1616 1724 1754 1781 1812 2058 2066 2123
 2131 2214 2285 2294 2309 2386 2460 2473]
Selected features from LassoCV: [ 1  2  4  7 10 11 12 13 14 15 16 18 19 20 21 23 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39 40 41 43 46 48]
Top SVM features: [('hsa-miR-614', 0.05421848013118901), ('hsa-miR-663a', -0.04925091627328719), ('hsa-miR-4532', -0.04385655172711609), ('hsa-miR-1228-5p', 0.03579125332342492), ('hsa-miR-1203', 0.02360807541776545), ('hsa-miR-4758-5p', 0.022224716940457023), ('hsa-miR-6090', 0.018070419990390968), ('hsa-miR-6729-5p', 0.01671793570372341), ('hsa-miR-6131', 0.01603751997805587), ('hsa-miR-6756-5p', -0.014503054277587992)]
Selected features from SelectKBest: [  76  388  423  463  473  549  590  697  726  739  785  821  832  837
  926  931  947  975 1010 1018

In [31]:
def normalize_importance(importance_scores):
    max_score = max(importance_scores, key=lambda x: abs(x[1]))[1]
    min_score = min(importance_scores, key=lambda x: abs(x[1]))[1]
    normalized_scores = [(feature, (score - min_score) / (max_score - min_score)) for feature, score in importance_scores]
    return normalized_scores

In [32]:
# Normalize the top features for SVM and XGBoost
normalized_svm_features = normalize_importance(svm_top_features)
normalized_xgboost_features = normalize_importance(xgboost_top_features)

# Compile normalized top features
def compile_normalized_top_features(normalized_svm_features, normalized_xgboost_features):
    top_features = []
    for feature, score in normalized_svm_features:
        top_features.append((f'SVM_{feature}', score))
    for feature, score in normalized_xgboost_features:
        top_features.append((f'XGB_{feature}', score))
    return top_features

normalized_top_features = compile_normalized_top_features(normalized_svm_features, normalized_xgboost_features)
print("Compiled normalized top features:", normalized_top_features)

Compiled normalized top features: [('SVM_hsa-miR-614', 1.0), ('SVM_hsa-miR-663a', -0.505632801925058), ('SVM_hsa-miR-4532', -0.42713681674981807), ('SVM_hsa-miR-1228-5p', 0.7318565866391571), ('SVM_hsa-miR-1203', 0.5545733229508035), ('SVM_hsa-miR-4758-5p', 0.5344434104101466), ('SVM_hsa-miR-6090', 0.47399224345343977), ('SVM_hsa-miR-6729-5p', 0.45431159606535076), ('SVM_hsa-miR-6131', 0.4444105405734839), ('SVM_hsa-miR-6756-5p', 0.0), ('XGB_hsa-miR-614', 1.0), ('XGB_hsa-miR-663a', -0.505632801925058), ('XGB_hsa-miR-4532', -0.42713681674981807), ('XGB_hsa-miR-1228-5p', 0.7318565866391571), ('XGB_hsa-miR-1203', 0.5545733229508035), ('XGB_hsa-miR-4758-5p', 0.5344434104101466), ('XGB_hsa-miR-6090', 0.47399224345343977), ('XGB_hsa-miR-6729-5p', 0.45431159606535076), ('XGB_hsa-miR-6131', 0.4444105405734839), ('XGB_hsa-miR-6756-5p', 0.0)]


In [33]:
normalized_top_features_df = pd.DataFrame(normalized_top_features, columns=['Feature', 'Normalized Importance'])
normalized_top_features_df.to_csv('../GSEA/miRNA/ctl_s1_miRNA.csv', index=False)
print("Compiled normalized top features saved to '../GSEA/miRNA/ctl_s1_miRNA.csv'")

Compiled normalized top features saved to '../GSEA/miRNA/ctl_s1_miRNA.csv'


In [34]:
def get_all_selected_features(pipe, feature_names, classifier_name):
    if isinstance(pipe.named_steps['estimator'], SVC):
        if pipe.named_steps['estimator'].kernel != 'linear':
            raise ValueError("Feature importance is not available for non-linear SVM kernels.")
        feature_scores = pipe.named_steps['estimator'].coef_[0]
    elif isinstance(pipe.named_steps['estimator'], XGBClassifier):
        feature_scores = pipe.named_steps['estimator'].feature_importances_
    else:
        raise ValueError("Unsupported estimator type for feature extraction.")
    
    # Get selected features from SelectKBest
    skb_support = pipe.named_steps['skb'].get_support(indices=True)
    print("Selected features from SelectKBest:", skb_support)
    
    # Transform the features using SelectKBest
    skb_features = pipe.named_steps['skb'].transform(x_train)
    
    # Get selected features from LassoCV
    lasso = LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0)
    lasso.fit(skb_features, y_train)
    lasso_support = np.where(lasso.coef_ != 0)[0]
    print("Selected features from LassoCV:", lasso_support)
    
    # Map the selected LassoCV features back to the original feature names
    lasso_support_mapped = skb_support[lasso_support]
    
    # Ensure the index does not go out of bounds
    all_features = [(feature_names[i], feature_scores[j], classifier_name) for j, i in enumerate(lasso_support_mapped) if j < len(feature_scores)]
    
    return all_features

In [35]:
# Extract all selected features from SVM and XGBoost
all_svm_features = get_all_selected_features(svm_pipe, feature_names, 'SVM')
print("All SVM features:", all_svm_features)

all_xgboost_features = get_all_selected_features(xgboost_pipe, feature_names, 'XGBoost')
print("All XGBoost features:", all_xgboost_features)

# Combine the features and save to CSV
all_features_df = pd.DataFrame(all_svm_features + all_xgboost_features, columns=['Feature', 'Importance', 'Classifier'])
all_features_df.to_csv('../GSEA/miRNA/50_ctl_s1.csv', index=False)
print("All selected features saved to '../GSEA/miRNA/50_ctl_s1.csv'")

Selected features from SelectKBest: [  76  388  423  463  473  549  590  697  726  739  785  821  832  837
  926  931  947  975 1010 1018 1044 1109 1171 1174 1187 1214 1254 1273
 1361 1386 1483 1493 1511 1560 1616 1724 1754 1781 1812 2058 2066 2123
 2131 2214 2285 2294 2309 2386 2460 2473]
Selected features from LassoCV: [ 1  2  4  7 10 11 12 13 14 15 16 18 19 20 21 23 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39 40 41 43 46 48]
All SVM features: [('hsa-miR-6802-5p', -0.005375705860577825, 'SVM'), ('hsa-miR-1307-3p', 0.07427224816901697, 'SVM'), ('hsa-miR-6858-5p', -0.013651691888493524, 'SVM'), ('hsa-miR-6131', 0.06812683755921892, 'SVM'), ('hsa-miR-6766-5p', 0.00467043824980559, 'SVM'), ('hsa-miR-663a', 0.032036513116655396, 'SVM'), ('hsa-miR-1228-5p', -0.06698378960784022, 'SVM'), ('hsa-miR-642b-3p', 0.03436914585615807, 'SVM'), ('hsa-miR-4728-5p', -0.0029306639388156416, 'SVM'), ('hsa-miR-4532', 0.0376111713385699, 'SVM'), ('hsa-miR-6756-5p', 0.022835877647829426, 'SVM'), ('hsa