In [1]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import networkx as nx
import itertools
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from catboost import CatBoostClassifier
import os
import joblib

In [2]:
stage_3_samples = pd.read_csv('../data/cancer/stage_3_prostate_cancer_samples.csv')
stage_3_samples.shape

(150, 2570)

In [3]:
stage_4_samples = pd.read_csv('../data/cancer/stage_4_prostate_cancer_samples.csv')
stage_4_samples.shape

(91, 2570)

In [4]:
combined_dataset = pd.concat([stage_3_samples, stage_4_samples], ignore_index=True)

In [5]:
# Verify and clean the Stage column
print("Unique values in Stage column:", combined_dataset['Stage'].unique())
combined_dataset['Stage'] = combined_dataset['Stage'].str.strip()
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: 3', 0, 1)
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: 4', 1, combined_dataset['ID_REF'])

# Print class distribution to ensure both classes are present
print("Class distribution in ID_REF column:")
print(combined_dataset['ID_REF'].value_counts())


Unique values in Stage column: ['Stage: 3' 'Stage: 4']
Class distribution in ID_REF column:
ID_REF
0    150
1     91
Name: count, dtype: int64


In [6]:
# Define process_data function
def process_data(data, under_sample_factor=None, over_sample_factor=None):
    columns_to_drop = ['Sample_ID', 'Sex', 'Age', 'Stage', 'Disease']
    data = data.drop(columns=columns_to_drop, axis=1)
    
    x = np.array(data.drop(["ID_REF"], axis=1)).astype('float')
    y = np.array(data["ID_REF"]).astype('int')
    feature_names = data.columns[1:]

    if under_sample_factor is not None and isinstance(under_sample_factor, float) and 0 < under_sample_factor <= 1:
        under_sampler = RandomUnderSampler(sampling_strategy=under_sample_factor)
        x, y = under_sampler.fit_resample(x, y)

    if over_sample_factor is not None and isinstance(over_sample_factor, float) and 0 < over_sample_factor <= 1:
        over_sampler = RandomOverSampler(sampling_strategy=over_sample_factor)
        x, y = over_sampler.fit_resample(x, y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test, feature_names

In [7]:
# Define parameters
feature_selection_num = 500
feature_importance_num = 20
pca_components = 100  # Number of principal components for PCA

# Process data
x_train, x_test, y_train, y_test, feature_names = process_data(combined_dataset)

In [8]:
# Define the objective function for Optuna - SVM
def svm_objective(trial):
    k = feature_selection_num
    
    C = trial.suggest_loguniform('C', 1e-3, 1e3)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('pca', PCA(n_components=pca_components)),
        ('estimator', SVC(C=C, kernel=kernel, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [9]:
def rf_objective(trial):
    k = feature_selection_num
    
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('pca', PCA(n_components=pca_components)),
        ('estimator', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, criterion=criterion, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [10]:
def catboost_objective(trial):
    k = feature_selection_num
    
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1.0)
    depth = trial.suggest_int('depth', 2, 10)
    iterations = trial.suggest_int('iterations', 100, 1000)
    
    pipe = Pipeline([
        ('skb', SelectKBest(f_classif, k=k)),
        ('pca', PCA(n_components=pca_components)),
        ('estimator', CatBoostClassifier(learning_rate=learning_rate, depth=depth, iterations=iterations, verbose=0, random_state=0))
    ])
    
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(pipe, x_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

In [11]:
# Function to save study
def save_study(study, filename):
    joblib.dump(study, filename)

# Function to load study
def load_study(filename):
    return joblib.load(filename)

In [12]:
# Optimize hyperparameters using Optuna
svm_study_filename = 'svm_study_s3_s4.pkl'
rf_study_filename = 'rf_study_s3_s4.pkl'
catboost_study_filename = 'catboost_study_s3_s4.pkl'

### Find Hyperparmeters if not trained already

In [13]:
# SVM Hyperparameter Optimization
if os.path.exists(svm_study_filename):
    svm_study = load_study(svm_study_filename)
else:
    svm_study = optuna.create_study(direction='maximize')
    svm_study.optimize(svm_objective, n_trials=50)
    save_study(svm_study, svm_study_filename)

In [14]:
# RF Hyperparameter Optimization
if os.path.exists(rf_study_filename):
    rf_study = load_study(rf_study_filename)
else:
    rf_study = optuna.create_study(direction='maximize')
    rf_study.optimize(rf_objective, n_trials=50)
    save_study(rf_study, rf_study_filename)

In [15]:
# CatBoost Hyperparameter Optimization
if os.path.exists(catboost_study_filename):
    catboost_study = load_study(catboost_study_filename)
else:
    catboost_study = optuna.create_study(direction='maximize')
    catboost_study.optimize(catboost_objective, n_trials=50)
    save_study(catboost_study, catboost_study_filename)

### Print the best trial for each study

For SVM, RF, and Catboost

In [16]:
print("Best SVM trial:")
svm_trial = svm_study.best_trial
print("  Value: ", svm_trial.value)
print("  Params: ")
for key, value in svm_trial.params.items():
    print(f"    {key}: {value}")

Best SVM trial:
  Value:  0.6353576248313091
  Params: 
    C: 1.8492946891319226
    kernel: poly


In [17]:
print("Best Random Forest trial:")
rf_trial = rf_study.best_trial
print("  Value: ", rf_trial.value)
print("  Params: ")
for key, value in rf_trial.params.items():
    print(f"    {key}: {value}")

Best Random Forest trial:
  Value:  0.6354925775978407
  Params: 
    n_estimators: 585
    max_depth: 14
    max_features: sqrt
    criterion: entropy


In [18]:
print("Best CatBoost trial:")
catboost_trial = catboost_study.best_trial
print("  Value: ", catboost_trial.value)
print("  Params: ")
for key, value in catboost_trial.params.items():
    print(f"    {key}: {value}")

Best CatBoost trial:
  Value:  0.6356275303643726
  Params: 
    learning_rate: 0.01462911525171483
    depth: 10
    iterations: 368


## Train and evaluate the models with the best hyperparameters

In [19]:
# Train and evaluate the models with the best hyperparameters
def train_and_evaluate(pipe, x_train, y_train, x_test, y_test):
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    print(f'Testing accuracy {accuracy_score(y_test, y_pred)}')
    print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')

### SVM

In [20]:
best_svm_params = svm_trial.params
svm_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('pca', PCA(n_components=pca_components)),
    ('estimator', SVC(C=best_svm_params['C'], kernel=best_svm_params['kernel'], random_state=0))
])
train_and_evaluate(svm_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 0.5918367346938775
Confusion matrix: 
[[29  1]
 [19  0]]


### Random Forest

In [21]:
best_rf_params = rf_trial.params
rf_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('pca', PCA(n_components=pca_components)),
    ('estimator', RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
                                         max_depth=best_rf_params['max_depth'],
                                         max_features=best_rf_params['max_features'],
                                         criterion=best_rf_params['criterion'],
                                         random_state=0))
])
train_and_evaluate(rf_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 0.6530612244897959
Confusion matrix: 
[[30  0]
 [17  2]]


### CatBoost

In [22]:
best_catboost_params = catboost_trial.params
catboost_pipe = Pipeline([
    ('skb', SelectKBest(f_classif, k=feature_selection_num)),
    ('pca', PCA(n_components=pca_components)),
    ('estimator', CatBoostClassifier(learning_rate=best_catboost_params['learning_rate'],
                                     depth=best_catboost_params['depth'],
                                     iterations=best_catboost_params['iterations'],
                                     verbose=0,
                                     random_state=0))
])
train_and_evaluate(catboost_pipe, x_train, y_train, x_test, y_test)

Testing accuracy 0.5918367346938775
Confusion matrix: 
[[28  2]
 [18  1]]


In [23]:
# Feature importance and top features can be extracted similarly to the previous script
def get_top_features(pipe, feature_names, top_feature_num):
    if isinstance(pipe.named_steps['estimator'], SVC):
        if pipe.named_steps['estimator'].kernel != 'linear':
            raise ValueError("Feature importance is not available for non-linear SVM kernels.")
        feature_scores = pipe.named_steps['estimator'].coef_[0]
    elif isinstance(pipe.named_steps['estimator'], RandomForestClassifier) or isinstance(pipe.named_steps['estimator'], CatBoostClassifier):
        feature_scores = pipe.named_steps['estimator'].feature_importances_
    features = pipe.named_steps['skb'].get_support(indices=True)
    top_indices = np.argsort(np.abs(feature_scores))[::-1][:top_feature_num]
    top_features = [(feature_names[i], feature_scores[i]) for i in top_indices]
    return top_features

In [24]:
# Get top features for each model
try:
    svm_top_features = get_top_features(svm_pipe, feature_names, feature_importance_num)
    print("Top SVM features:", svm_top_features)
except ValueError as e:
    print("SVM feature extraction error:", e)

rf_top_features = get_top_features(rf_pipe, feature_names, feature_importance_num)
catboost_top_features = get_top_features(catboost_pipe, feature_names, feature_importance_num)

print("Top Random Forest features:", rf_top_features)
print("Top CatBoost features:", catboost_top_features)

SVM feature extraction error: Feature importance is not available for non-linear SVM kernels.
Top Random Forest features: [('hsa-miR-106a-3p', 0.03830524499573508), ('hsa-miR-548l', 0.03151128579083724), ('hsa-miR-124-5p', 0.031072531952670023), ('hsa-miR-512-3p', 0.027539310522499534), ('hsa-miR-1207-3p', 0.024001600606678185), ('hsa-miR-27a-5p', 0.02247050653083074), ('hsa-miR-498', 0.018352545805131026), ('hsa-miR-6829-3p', 0.01623961783500971), ('hsa-miR-4680-5p', 0.015200772536816948), ('hsa-miR-490-3p', 0.014342863051951774), ('hsa-miR-5583-3p', 0.013941718885508962), ('hsa-miR-3124-3p', 0.013474662817620052), ('hsa-miR-4802-3p', 0.012682279240234155), ('hsa-miR-6716-3p', 0.012332769282919613), ('hsa-miR-1254', 0.011906986089753118), ('hsa-miR-1208', 0.011800972090224989), ('hsa-miR-3973', 0.011605988377774795), ('hsa-miR-6717-5p', 0.011418844437422512), ('hsa-miR-4722-3p', 0.011277881685529609), ('hsa-miR-196a-3p', 0.011081294595093476)]
Top CatBoost features: [('hsa-miR-106a-3p

In [25]:
# Compile and print the list of top features
def compile_top_features_list(rf_features, cb_features):
    top_features = rf_features + cb_features
    feature_names = [feature[0] for feature in top_features]
    return '\n'.join(feature_names)

top_features_list = compile_top_features_list(rf_top_features, catboost_top_features)
print(top_features_list)

hsa-miR-106a-3p
hsa-miR-548l
hsa-miR-124-5p
hsa-miR-512-3p
hsa-miR-1207-3p
hsa-miR-27a-5p
hsa-miR-498
hsa-miR-6829-3p
hsa-miR-4680-5p
hsa-miR-490-3p
hsa-miR-5583-3p
hsa-miR-3124-3p
hsa-miR-4802-3p
hsa-miR-6716-3p
hsa-miR-1254
hsa-miR-1208
hsa-miR-3973
hsa-miR-6717-5p
hsa-miR-4722-3p
hsa-miR-196a-3p
hsa-miR-106a-3p
hsa-miR-124-5p
hsa-miR-548l
hsa-miR-1207-3p
hsa-miR-512-3p
hsa-miR-490-3p
hsa-miR-498
hsa-miR-27a-5p
hsa-miR-4680-5p
hsa-miR-6717-5p
hsa-miR-518e-3p
hsa-miR-4281
hsa-miR-1255a
hsa-miR-4483
hsa-miR-4285
hsa-miR-4802-3p
hsa-miR-2277-3p
hsa-miR-451a
hsa-miR-8072
hsa-miR-3140-5p


In [26]:
# Save top features list to a file using pandas
top_features_df = pd.DataFrame(top_features_list.split('\n'), columns=['Feature'])
top_features_df.to_csv('../GSEA/miRNA/s3_s4_miRNA.csv', index=False)