In [80]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import networkx as nx
import itertools
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import os
import joblib

In [81]:
benign_samples = pd.read_csv('../data/benign/benign_prostate_samples.csv')
print(benign_samples.shape)

(230, 2570)


In [82]:
benign_samples.columns = benign_samples.columns.str.strip()

In [83]:
benign_samples['Stage'] = benign_samples['Stage'].str.replace('Stage: NA', 'Stage: B')

# Verify the replacements
print(benign_samples['Stage'])

0      Stage: B
1      Stage: B
2      Stage: B
3      Stage: B
4      Stage: B
         ...   
225    Stage: B
226    Stage: B
227    Stage: B
228    Stage: B
229    Stage: B
Name: Stage, Length: 230, dtype: object


In [84]:
stage_1_samples = pd.read_csv('../data/cancer/stage_1_prostate_cancer_samples.csv')
stage_1_samples.shape

(173, 2570)

In [85]:
combined_dataset = pd.concat([benign_samples, stage_1_samples], ignore_index=True)

In [86]:
# Verify and clean the Stage column
print("Unique values in Stage column:", combined_dataset['Stage'].unique())
combined_dataset['Stage'] = combined_dataset['Stage'].str.strip()
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: B', 0, 1)
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: 1', 1, combined_dataset['ID_REF'])

# Print class distribution to ensure both classes are present
print("Class distribution in ID_REF column:")
print(combined_dataset['ID_REF'].value_counts())


Unique values in Stage column: ['Stage: B' 'Stage: 1']
Class distribution in ID_REF column:
ID_REF
0    230
1    173
Name: count, dtype: int64


In [87]:
def process_data(data, under_sample_factor=None, over_sample_factor=None):
    columns_to_drop = ['Sample_ID', 'Sex', 'Age', 'Stage', 'Disease']
    data = data.drop(columns=columns_to_drop, axis=1)
    
    # Ensure ID_REF is handled separately
    id_ref = data["ID_REF"]
    data = data.drop(["ID_REF"], axis=1)
    
    # Convert all remaining columns to numeric, coercing errors to NaN
    data = data.apply(pd.to_numeric, errors='coerce')
    
    # Handle NaN values, e.g., fill with the mean of each column
    data = data.fillna(data.mean())
    
    # Combine ID_REF back to the DataFrame
    data["ID_REF"] = id_ref

    # Convert to numpy arrays
    x = np.array(data.drop(["ID_REF"], axis=1)).astype('float')
    y = np.array(data["ID_REF"]).astype('int')
    feature_names = data.columns[:-1]

    if under_sample_factor is not None and isinstance(under_sample_factor, float) and 0 < under_sample_factor <= 1:
        under_sampler = RandomUnderSampler(sampling_strategy=under_sample_factor)
        x, y = under_sampler.fit_resample(x, y)

    if over_sample_factor is not None and isinstance(over_sample_factor, float) and 0 < over_sample_factor <= 1:
        over_sampler = RandomOverSampler(sampling_strategy=over_sample_factor)
        x, y = over_sampler.fit_resample(x, y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test, feature_names

In [88]:
# Define parameters
feature_selection_num = 50
feature_importance_num = 10

# Process data
x_train, x_test, y_train, y_test, feature_names = process_data(combined_dataset)

  data["ID_REF"] = id_ref


In [89]:
def svm_objective(trial):
    k = feature_selection_num
    
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    kernel = 'linear'  # Use only linear kernel for feature extraction
    
    # Relaxed LassoCV with a wider range of smaller alphas
    lasso = SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('lasso', lasso),
        ('estimator', SVC(C=C, kernel=kernel, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    try:
        scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
        return scores.mean()
    except ValueError as e:
        print(f"ValueError: {e}")
        print("Selected features by SelectKBest:", skb.get_support(indices=True))
        if hasattr(lasso.estimator_, 'coef_'):
            print("Lasso coefficients:", lasso.estimator_.coef_)
        return np.nan

In [90]:
def rf_objective(trial):
    k = feature_selection_num
    
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    
    lasso = SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('lasso', lasso),
        ('estimator', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, criterion=criterion, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [91]:
def xgboost_objective(trial):
    k = feature_selection_num
    
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    
    lasso = SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('lasso', lasso),
        ('estimator', XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, random_state=0, use_label_encoder=False, eval_metric='logloss'))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [92]:
# Function to save study
def save_study(study, filename):
    joblib.dump(study, filename)

# Function to load study
def load_study(filename):
    return joblib.load(filename)

In [93]:
# Optimize hyperparameters using Optuna
svm_study_filename = '50_svm_study_b_s1.pkl'
rf_study_filename = '50_rf_study_b_s1.pkl'
xgboost_study_filename = '50_xgboost_study_b_s1.pkl'

In [94]:
# Optimize hyperparameters using Optuna with early stopping
def optimize_with_early_stopping(objective, study_filename, n_trials=50, patience=10):
    if os.path.exists(study_filename):
        study = load_study(study_filename)
        return study
    else:
        study = optuna.create_study(direction='maximize')
    
    best_value = -np.inf
    trials_without_improvement = 0
    
    for trial in range(n_trials):
        study.optimize(objective, n_trials=1)
        
        current_best_value = study.best_value
        if current_best_value > best_value:
            best_value = current_best_value
            trials_without_improvement = 0
        else:
            trials_without_improvement += 1
        
        if trials_without_improvement >= patience:
            print(f"Early stopping at trial {trial + 1}")
            break
        
        save_study(study, study_filename)
    
    return study

### Find Hyperparmeters if not trained already

In [95]:
svm_study = optimize_with_early_stopping(svm_objective, svm_study_filename, n_trials=50, patience=10)

[I 2024-06-19 15:13:23,041] A new study created in memory with name: no-name-3d8768eb-5153-4257-890b-25091b3d1dc6
[I 2024-06-19 15:13:24,584] Trial 0 finished with value: 1.0 and parameters: {'C': 0.003450066235204475}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:26,519] Trial 1 finished with value: 1.0 and parameters: {'C': 0.030024738425880833}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:28,381] Trial 2 finished with value: 1.0 and parameters: {'C': 0.02697599910238836}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:29,967] Trial 3 finished with value: 1.0 and parameters: {'C': 0.002229532074061881}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:31,863] Trial 4 finished with value: 1.0 and parameters: {'C': 0.004045743594431839}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:33,630] Trial 5 finished with value: 1.0 and parameters: {'C': 1.3588500637391627}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:35,478] Trial 6 finished with value

Early stopping at trial 11


In [96]:
rf_study = optimize_with_early_stopping(rf_objective, rf_study_filename, n_trials=100, patience=10)

[I 2024-06-19 15:13:42,742] A new study created in memory with name: no-name-6b082e6d-cfe2-4d03-8937-7def234be4ea
[I 2024-06-19 15:13:47,060] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 975, 'max_depth': 7, 'max_features': 'log2', 'criterion': 'gini'}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:51,321] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 946, 'max_depth': 27, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:53,241] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 121, 'max_depth': 5, 'max_features': 'sqrt', 'criterion': 'gini'}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:57,219] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 860, 'max_depth': 32, 'max_features': 'log2', 'criterion': 'gini'}. Best is trial 0 with value: 1.0.
[I 2024-06-19 15:13:59,794] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 403, 'max_d

Early stopping at trial 11


In [97]:
xgboost_study = optimize_with_early_stopping(xgboost_objective, xgboost_study_filename, n_trials=100, patience=10)

[I 2024-06-19 15:14:21,279] A new study created in memory with name: no-name-90206921-226e-4a13-bda5-aec533ab7e7d
[I 2024-06-19 15:14:23,466] Trial 0 finished with value: 0.996875 and parameters: {'learning_rate': 0.0015987480151282833, 'max_depth': 4, 'n_estimators': 216}. Best is trial 0 with value: 0.996875.
[I 2024-06-19 15:14:25,724] Trial 1 finished with value: 0.996875 and parameters: {'learning_rate': 0.2938447125063232, 'max_depth': 2, 'n_estimators': 768}. Best is trial 0 with value: 0.996875.
[I 2024-06-19 15:14:27,852] Trial 2 finished with value: 0.996875 and parameters: {'learning_rate': 0.8796183222025359, 'max_depth': 10, 'n_estimators': 204}. Best is trial 0 with value: 0.996875.
[I 2024-06-19 15:14:30,255] Trial 3 finished with value: 0.996875 and parameters: {'learning_rate': 0.013693960046169268, 'max_depth': 2, 'n_estimators': 711}. Best is trial 0 with value: 0.996875.
[I 2024-06-19 15:14:32,601] Trial 4 finished with value: 0.996875 and parameters: {'learning_rat

Early stopping at trial 11


### Print the best trial for each study

For SVM, RF, and XGBoost

In [98]:
print("Best SVM trial:")
svm_trial = svm_study.best_trial
print("  Value: ", svm_trial.value)
print("  Params: ")
for key, value in svm_trial.params.items():
    print(f"    {key}: {value}")

Best SVM trial:
  Value:  1.0
  Params: 
    C: 0.003450066235204475


In [99]:
print("Best Random Forest trial:")
rf_trial = rf_study.best_trial
print("  Value: ", rf_trial.value)
print("  Params: ")
for key, value in rf_trial.params.items():
    print(f"    {key}: {value}")

Best Random Forest trial:
  Value:  1.0
  Params: 
    n_estimators: 975
    max_depth: 7
    max_features: log2
    criterion: gini


In [100]:
print("Best XGBoost trial:")
xgboost_trial = xgboost_study.best_trial
print("  Value: ", xgboost_trial.value)
print("  Params: ")
for key, value in xgboost_trial.params.items():
    print(f"    {key}: {value}")

Best XGBoost trial:
  Value:  0.996875
  Params: 
    learning_rate: 0.0015987480151282833
    max_depth: 4
    n_estimators: 216


## Train and evaluate the models with the best hyperparameters

In [101]:
# Train and evaluate the models with the best hyperparameters
def train_and_evaluate(pipe, x_train, y_train, x_test, y_test):
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    print(f'Testing accuracy {accuracy_score(y_test, y_pred)}')
    print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')

### SVM

In [102]:
best_svm_params = svm_trial.params
svm_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('lasso', SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))),
    ('estimator', SVC(C=best_svm_params['C'], kernel='linear', random_state=0))  # Ensure linear kernel
])
train_and_evaluate(svm_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 1.0
Confusion matrix: 
[[46  0]
 [ 0 35]]


In [103]:
# svm_accuracies = []

# # Loop through the number of features from 1 to 150
# for k in range(1, 121):
#     best_svm_params = svm_trial.params
#     svm_pipe = Pipeline([
#         ('skb', SelectKBest(f_classif, k=k)),
#         ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
#         ('estimator', SVC(C=best_svm_params['C'], kernel=best_svm_params['kernel'], random_state=0))
#     ])
    
#     # Train and evaluate the pipeline
#     svm_pipe.fit(x_train, y_train)
#     y_pred = svm_pipe.predict(x_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     svm_accuracies.append(accuracy)
#     print(f'Number of features: {k}, Accuracy: {accuracy}')

# # Plot the number of features vs accuracy
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 121), svm_accuracies, marker='o')
# plt.title('Accuracy vs Number of Features (SVM)')
# plt.xlabel('Number of Features')
# plt.ylabel('Accuracy')
# plt.grid(True)
# plt.show()

### Random Forest

In [104]:
best_rf_params = rf_trial.params
rf_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('lasso', SelectFromModel(LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0))),
    ('estimator', RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
                                         max_depth=best_rf_params['max_depth'],
                                         max_features=best_rf_params['max_features'],
                                         criterion=best_rf_params['criterion'],
                                         random_state=0))
])
train_and_evaluate(rf_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 0.9876543209876543
Confusion matrix: 
[[46  0]
 [ 1 34]]


In [105]:
# rf_accuracies = []

# # Loop through the number of features from 1 to 200
# for k in range(1, 121):
#     best_rf_params = rf_trial.params
#     rf_pipe = Pipeline([
#         ('skb', SelectKBest(f_classif, k=k)),
#         ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
#         ('estimator', RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
#                                              max_depth=best_rf_params['max_depth'],
#                                              max_features=best_rf_params['max_features'],
#                                              criterion=best_rf_params['criterion'],
#                                              random_state=0))
#     ])
    
#     # Train and evaluate the pipeline
#     rf_pipe.fit(x_train, y_train)
#     y_pred = rf_pipe.predict(x_test)
#     rf_accuracy = accuracy_score(y_test, y_pred)
#     rf_accuracies.append(rf_accuracy)
#     print(f'Number of features: {k}, Accuracy: {rf_accuracy}')

# # Plot the number of features vs rf_accuracy
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 121), rf_accuracies, marker='o')
# plt.title('Accuracy vs Number of Features')
# plt.xlabel('Number of Features')
# plt.ylabel('Accuracy')
# plt.grid(True)
# plt.show()

### XGBoost

In [106]:
best_xgboost_params = xgboost_trial.params
xgboost_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
    ('estimator', XGBClassifier(learning_rate=best_xgboost_params['learning_rate'],
                                max_depth=best_xgboost_params['max_depth'],
                                n_estimators=best_xgboost_params['n_estimators'],
                                random_state=0,
                                use_label_encoder=False,
                                eval_metric='logloss'))
])
train_and_evaluate(xgboost_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 0.9876543209876543
Confusion matrix: 
[[46  0]
 [ 1 34]]


In [107]:
# xgboost_accuracies = []

# # Loop through the number of features from 1 to 150
# for k in range(1, 121):
#     best_xgboost_params = xgboost_trial.params
#     xgboost_pipe = Pipeline([
#         ('skb', SelectKBest(f_classif, k=k)),
#         ('lasso', SelectFromModel(LassoCV(cv=5, random_state=0))),
#         ('estimator', XGBClassifier(learning_rate=best_xgboost_params['learning_rate'],
#                                     max_depth=best_xgboost_params['max_depth'],
#                                     n_estimators=best_xgboost_params['n_estimators'],
#                                     random_state=0,
#                                     use_label_encoder=False,
#                                     eval_metric='logloss'))
#     ])

#     # Train and evaluate the pipeline
#     xgboost_pipe.fit(x_train, y_train)
#     y_pred = xgboost_pipe.predict(x_test)
#     xgboost_accuracy = accuracy_score(y_test, y_pred)
#     xgboost_accuracies.append(xgboost_accuracy)
#     print(f'Number of features: {k}, Accuracy: {xgboost_accuracy}') 

# # Plot the number of features vs xgboost_accuracy
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 121), xgboost_accuracies, marker='o')
# plt.title('Accuracy vs Number of Features (XGBoost)')
# plt.xlabel('Number of Features')
# plt.ylabel('Accuracy')
# plt.grid(True)
# plt.show()

In [108]:
def get_top_features(pipe, feature_names, top_feature_num):
    if isinstance(pipe.named_steps['estimator'], SVC):
        if pipe.named_steps['estimator'].kernel != 'linear':
            raise ValueError("Feature importance is not available for non-linear SVM kernels.")
        feature_scores = pipe.named_steps['estimator'].coef_[0]
    elif isinstance(pipe.named_steps['estimator'], XGBClassifier):
        feature_scores = pipe.named_steps['estimator'].feature_importances_
    else:
        raise ValueError("Unsupported estimator type for feature extraction.")
    
    # Get selected features from SelectKBest
    skb_support = pipe.named_steps['skb'].get_support(indices=True)
    print("Selected features from SelectKBest:", skb_support)
    
    # Transform the features using SelectKBest
    skb_features = pipe.named_steps['skb'].transform(x_train)
    
    # Get selected features from LassoCV
    lasso = LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0)
    lasso.fit(skb_features, y_train)
    lasso_support = np.where(lasso.coef_ != 0)[0]
    print("Selected features from LassoCV:", lasso_support)
    
    if len(lasso_support) == 0:
        print("No features selected after LassoCV.")
        return []
    
    # Ensure indices match feature_scores
    top_indices = np.argsort(np.abs(lasso.coef_[lasso_support]))[::-1][:top_feature_num]
    top_features = [(feature_names[skb_support[i]], lasso.coef_[lasso_support[i]]) for i in top_indices]
    
    return top_features

In [109]:
try:
    svm_top_features = get_top_features(svm_pipe, feature_names, feature_importance_num)
    print("Top SVM features:", svm_top_features)
except ValueError as e:
    print("SVM feature extraction error:", e)

xgboost_top_features = get_top_features(xgboost_pipe, feature_names, feature_importance_num)
print("Top XGBoost features:", xgboost_top_features)

Selected features from SelectKBest: [  69  139  159  221  231  248  262  290  331  332  344  531  639  654
  671  778  792  802  869  990  992 1056 1090 1091 1095 1151 1283 1287
 1301 1385 1576 1587 1667 1746 1925 1934 1935 1945 1988 1993 2027 2087
 2095 2190 2241 2273 2302 2309 2381 2560]
Selected features from LassoCV: [ 0  2  3  4  5  8 12 16 18 20 21 22 25 28 29 31 32 33 34 38 39 40 47 49]
Top SVM features: [('hsa-miR-6836-3p', 0.2949321869366091), ('hsa-miR-4417', -0.06826909362398707), ('hsa-miR-6805-3p', -0.06135703180436593), ('hsa-miR-4456', -0.04703965335515834), ('hsa-miR-1185-1-3p', 0.021090670940266013), ('hsa-miR-1225-3p', 0.020981720538378615), ('hsa-miR-6839-5p', -0.01417069469956532), ('hsa-miR-3141', -0.013864032777814094), ('hsa-miR-504-3p', 0.012390546675234777), ('hsa-miR-4459', -0.01171079935147726)]
Selected features from SelectKBest: [  69  139  159  221  231  248  262  290  331  332  344  531  639  654
  671  778  792  802  869  990  992 1056 1090 1091 1095 115

In [110]:
def normalize_importance(importance_scores):
    max_score = max(importance_scores, key=lambda x: abs(x[1]))[1]
    min_score = min(importance_scores, key=lambda x: abs(x[1]))[1]
    normalized_scores = [(feature, (score - min_score) / (max_score - min_score)) for feature, score in importance_scores]
    return normalized_scores

In [111]:
# Normalize the top features for SVM and XGBoost
normalized_svm_features = normalize_importance(svm_top_features)
normalized_xgboost_features = normalize_importance(xgboost_top_features)

# Compile normalized top features
def compile_normalized_top_features(normalized_svm_features, normalized_xgboost_features):
    top_features = []
    for feature, score in normalized_svm_features:
        top_features.append((f'SVM_{feature}', score))
    for feature, score in normalized_xgboost_features:
        top_features.append((f'XGB_{feature}', score))
    return top_features

normalized_top_features = compile_normalized_top_features(normalized_svm_features, normalized_xgboost_features)
print("Compiled normalized top features:", normalized_top_features)

Compiled normalized top features: [('SVM_hsa-miR-6836-3p', 1.0), ('SVM_hsa-miR-4417', -0.18444346292458216), ('SVM_hsa-miR-6805-3p', -0.1619023903134272), ('SVM_hsa-miR-4456', -0.11521168128231755), ('SVM_hsa-miR-1185-1-3p', 0.10696957621240616), ('SVM_hsa-miR-1225-3p', 0.1066142757269581), ('SVM_hsa-miR-6839-5p', -0.008022017323354091), ('SVM_hsa-miR-3141', -0.007021955572510323), ('SVM_hsa-miR-504-3p', 0.07859741492365063), ('SVM_hsa-miR-4459', 0.0), ('XGB_hsa-miR-6836-3p', 1.0), ('XGB_hsa-miR-4417', -0.18444346292458216), ('XGB_hsa-miR-6805-3p', -0.1619023903134272), ('XGB_hsa-miR-4456', -0.11521168128231755), ('XGB_hsa-miR-1185-1-3p', 0.10696957621240616), ('XGB_hsa-miR-1225-3p', 0.1066142757269581), ('XGB_hsa-miR-6839-5p', -0.008022017323354091), ('XGB_hsa-miR-3141', -0.007021955572510323), ('XGB_hsa-miR-504-3p', 0.07859741492365063), ('XGB_hsa-miR-4459', 0.0)]


In [112]:
normalized_top_features_df = pd.DataFrame(normalized_top_features, columns=['Feature', 'Normalized Importance'])
normalized_top_features_df.to_csv('../GSEA/miRNA/s1_s2_miRNA.csv', index=False)
print("Compiled normalized top features saved to '../GSEA/miRNA/s1_s2_miRNA.csv'")

Compiled normalized top features saved to '../GSEA/miRNA/s1_s2_miRNA.csv'


In [114]:
def get_all_selected_features(pipe, feature_names):
    if isinstance(pipe.named_steps['estimator'], SVC):
        if pipe.named_steps['estimator'].kernel != 'linear':
            raise ValueError("Feature importance is not available for non-linear SVM kernels.")
        feature_scores = pipe.named_steps['estimator'].coef_[0]
    elif isinstance(pipe.named_steps['estimator'], XGBClassifier):
        feature_scores = pipe.named_steps['estimator'].feature_importances_
    else:
        raise ValueError("Unsupported estimator type for feature extraction.")
    
    # Get selected features from SelectKBest
    skb_support = pipe.named_steps['skb'].get_support(indices=True)
    print("Selected features from SelectKBest:", skb_support)
    
    # Transform the features using SelectKBest
    skb_features = pipe.named_steps['skb'].transform(x_train)
    
    # Get selected features from LassoCV
    lasso = LassoCV(alphas=np.logspace(-6, -1, 50), cv=5, max_iter=10000, random_state=0)
    lasso.fit(skb_features, y_train)
    lasso_support = np.where(lasso.coef_ != 0)[0]
    print("Selected features from LassoCV:", lasso_support)
    
    # Map the selected LassoCV features back to the original feature names
    lasso_support_mapped = skb_support[lasso_support]
    
    # Ensure the index does not go out of bounds
    all_features = [(feature_names[i], feature_scores[j]) for j, i in enumerate(lasso_support_mapped) if j < len(feature_scores)]
    
    return all_features

# Extract all selected features from SVM and XGBoost
all_svm_features = get_all_selected_features(svm_pipe, feature_names)
print("All SVM features:", all_svm_features)

all_xgboost_features = get_all_selected_features(xgboost_pipe, feature_names)
print("All XGBoost features:", all_xgboost_features)

# Save all selected features to CSV
all_features_df = pd.DataFrame(all_svm_features + all_xgboost_features, columns=['Feature', 'Importance'])
all_features_df.to_csv('../GSEA/miRNA/50_b_s1.csv', index=False)
print("All selected features saved to '../GSEA/miRNA/50_b_s1.csv'")

Selected features from SelectKBest: [  69  139  159  221  231  248  262  290  331  332  344  531  639  654
  671  778  792  802  869  990  992 1056 1090 1091 1095 1151 1283 1287
 1301 1385 1576 1587 1667 1746 1925 1934 1935 1945 1988 1993 2027 2087
 2095 2190 2241 2273 2302 2309 2381 2560]
Selected features from LassoCV: [ 0  2  3  4  5  8 12 16 18 20 21 22 25 28 29 31 32 33 34 38 39 40 47 49]
All SVM features: [('hsa-miR-6805-3p', -0.08555831121511477), ('hsa-miR-6836-3p', -0.04165675791029131), ('hsa-miR-5100', 0.24909653849298777), ('hsa-miR-504-3p', 0.05578176049200083), ('hsa-miR-17-3p', 0.14271696427948427), ('hsa-miR-1290', 0.14413002400904676), ('hsa-miR-6839-5p', 0.03419987195636753), ('hsa-miR-6741-5p', 0.09221891321873943), ('hsa-miR-6780a-5p', 0.032526412064578195), ('hsa-miR-1225-3p', -0.027921543236082296), ('hsa-miR-516a-5p', -0.05131814595537483), ('hsa-miR-4456', -0.20390865287052515), ('hsa-miR-1224-5p', -0.11168865122296627), ('hsa-miR-129-1-3p', -0.05135816980856107