In [363]:
import numpy as np
import pandas as pd

In [364]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [365]:
dataset = pd.read_csv("dataset.csv")
print(dataset)

                                           ID_REF  hsa-miR-28-3p  \
0      Benign Disease In The Bone And Soft Tissue       2.793825   
1      Benign Disease In The Bone And Soft Tissue       5.461192   
2      Benign Disease In The Bone And Soft Tissue       1.819089   
3      Benign Disease In The Bone And Soft Tissue       4.317970   
4      Benign Disease In The Bone And Soft Tissue       5.520351   
...                                           ...            ...   
16185                             Prostate Cancer       3.686871   
16186                             Prostate Cancer       0.121448   
16187                             Prostate Cancer      -0.661008   
16188                             Prostate Cancer      -1.769902   
16189                             Prostate Cancer       2.596167   

       hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  hsa-miR-498  \
0           -1.981370     -1.981370     -1.981370     5.857238   
1            6.619583      6.130041      3.067365  

In [366]:
def process_data(data):
    x = np.array(data.drop(["ID_REF"], axis=1)).astype('float')
    y = np.array(data["ID_REF"]).astype('int')
    feature_names = data.columns[1:]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0, stratify=y)

    return x_train, y_train, x_test, y_test, feature_names

In [367]:
def support_vector_machine(x_train, y_train, feature_num):
    pipe = Pipeline([('skb', SelectKBest(f_classif, k=feature_num)), ('estimator', SVC())])

    pipe_parameters = {'skb__k' : [feature_num],
                       'estimator__C': [0.25, 0.5, 0.75, 1],
                       'estimator__kernel': ['linear']
                       }

    svm_grid_search = GridSearchCV(pipe, pipe_parameters, scoring='accuracy', cv=10, n_jobs=-1)
    svm_grid_search.fit(x_train, y_train)

    return svm_grid_search

def get_svm_metrics(svm_grid_search, x_test, y_test):
    best_accuracy = svm_grid_search.best_score_
    best_parameters = svm_grid_search.best_params_
    print("Training Accuracy:", best_accuracy)
    print("Best Parameters:", best_parameters)

    y_pred = svm_grid_search.predict(x_test)
    matrix = confusion_matrix(y_test, y_pred)
    print("Testing Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(matrix)

def get_top_svm_features(svm_grid_search, feature_names, top_feature_num):
    coef_list = svm_grid_search.best_estimator_.named_steps['estimator'].coef_[0]
    features = svm_grid_search.best_estimator_.named_steps['skb'].get_support()
    selected_features_list = feature_names[features].tolist()

    coef_list, selected_features_list = zip(*sorted(zip(abs(coef_list), selected_features_list), reverse=True))
    coef_list, selected_features_list = list(coef_list), list(selected_features_list)

    return coef_list[:top_feature_num], selected_features_list[:top_feature_num]

In [368]:
def random_forest(x_train, y_train, feature_num):
    pipe = Pipeline([('skb', SelectKBest(f_classif, k=feature_num)), ('estimator', RandomForestClassifier(random_state=0))])

    pipe_parameters = {'skb__k': [feature_num],
                       'estimator__n_estimators': [100, 500],
                       'estimator__max_features': ['auto', 'sqrt', 'log2'],
                       'estimator__criterion' :['gini', 'entropy']
                       }

    rf_grid_search = GridSearchCV(pipe, pipe_parameters, scoring='accuracy', cv=10, n_jobs=-1)
    rf_grid_search.fit(x_train, y_train)

    return rf_grid_search

def get_rf_metrics(rf_grid_search, x_test, y_test):
    best_accuracy = rf_grid_search.best_score_
    best_parameters = rf_grid_search.best_params_
    print("Training Accuracy:", best_accuracy)
    print("Best Parameters:", best_parameters)

    y_pred = rf_grid_search.predict(x_test)
    matrix = confusion_matrix(y_test, y_pred)
    print("Testing Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(matrix)

def get_top_rf_features(rf_grid_search, feature_names, top_feature_num):
    importance_list = rf_grid_search.best_estimator_.named_steps['estimator'].feature_importances_
    features = rf_grid_search.best_estimator_.named_steps['skb'].get_support()
    selected_features_list = feature_names[features].tolist()

    importance_list, selected_features_list = zip(*sorted(zip(importance_list, selected_features_list), reverse=True))
    importance_list, selected_features_list = list(importance_list), list(selected_features_list)

    return importance_list[:top_feature_num], selected_features_list[:top_feature_num]

In [369]:
def gradient_boosting(x_train, y_train, feature_num):
    pipe = Pipeline([('skb', SelectKBest(f_classif)), ('estimator', GradientBoostingClassifier())])

    pipe_parameters = {'skb__k': [feature_num],
                       'estimator__learning_rate': [0.5, 1],
                       'estimator__n_estimators': [50],
                       }

    gb_grid_search = GridSearchCV(pipe, pipe_parameters, scoring='accuracy', cv=10, n_jobs=-1)
    gb_grid_search.fit(x_train, y_train)

    return gb_grid_search

def get_gb_metrics(gb_grid_search, x_test, y_test):
    best_accuracy = gb_grid_search.best_score_
    best_parameters = gb_grid_search.best_params_
    print("Training Accuracy:", best_accuracy)
    print("Best Parameters:", best_parameters)

    y_pred = gb_grid_search.predict(x_test)
    matrix = confusion_matrix(y_test, y_pred)
    print("Testing Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(matrix)

def get_top_gb_features(gb_grid_search, feature_names, top_feature_num):
    importance_list = gb_grid_search.best_estimator_.named_steps['estimator'].feature_importances_
    features = gb_grid_search.best_estimator_.named_steps['skb'].get_support()
    selected_features_list = feature_names[features].tolist()

    importance_list, selected_features_list = zip(*sorted(zip(importance_list, selected_features_list), reverse=True))
    importance_list, selected_features_list = list(importance_list), list(selected_features_list)

    return importance_list[:top_feature_num], selected_features_list[:top_feature_num]

In [370]:
feature_selection_num = 500
feature_importance_num = 100

Normal vs Cancer Dataset

In [371]:
normal_cancer_dataset = dataset.copy()

normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] == "Benign Disease In The Bone And Soft Tissue", "ID_REF"] = 0
normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] == "Benign Disease In The Breast", "ID_REF"] = 0
normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] == "Benign Disease In The Ovary", "ID_REF"] = 0
normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] == "Benign Disease In The Prostate", "ID_REF"] = 0
normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] == "Extraparenchymal Brain Tumor And Benign Disease In The Brain", "ID_REF"] = 0
normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] == "No Cancer", "ID_REF"] = 0

normal_cancer_dataset.loc[normal_cancer_dataset["ID_REF"] != 0, "ID_REF"] = 1

print(normal_cancer_dataset, "\n")
print("Cancer Sample Number:", normal_cancer_dataset["ID_REF"].value_counts()[1])
print("Non-Cancer Sample Number:", normal_cancer_dataset["ID_REF"].value_counts()[0])
print("All Sample Number:", normal_cancer_dataset["ID_REF"].value_counts()[1] + normal_cancer_dataset["ID_REF"].value_counts()[0])

x_train_normal, y_train_normal, x_test_normal, y_test_normal, feature_names_normal = process_data(normal_cancer_dataset)

      ID_REF  hsa-miR-28-3p  hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  \
0          0       2.793825       -1.981370     -1.981370     -1.981370   
1          0       5.461192        6.619583      6.130041      3.067365   
2          0       1.819089       -2.105596      2.259142     -2.105596   
3          0       4.317970        2.289873      4.857213      4.956817   
4          0       5.520351        0.131023      0.131023      1.981760   
...      ...            ...             ...           ...           ...   
16185      1       3.686871        5.402453      5.126641      6.142145   
16186      1       0.121448        0.121448      0.121448      0.121448   
16187      1      -0.661008        4.343310      4.541016      3.901700   
16188      1      -1.769902       -1.769902      0.133393      2.717976   
16189      1       2.596167       -0.620166      3.208942      1.720689   

       hsa-miR-498  hsa-miR-512-3p  hsa-miR-491-5p  hsa-miR-490-3p  \
0         5.857238        4.2

In [372]:
svm_grid_search_normal = support_vector_machine(x_train_normal, y_train_normal, feature_selection_num)

In [373]:
get_svm_metrics(svm_grid_search_normal, x_test_normal, y_test_normal)
svm_top_coef_normal, svm_top_features_normal = get_top_svm_features(svm_grid_search_normal, feature_names_normal, feature_importance_num)
svm_top_features_normal = svm_top_features_normal
print(svm_top_features_normal)
print(svm_top_coef_normal)

Training Accuracy: 0.9681903331903332
Best Parameters: {'estimator__C': 0.25, 'estimator__kernel': 'linear', 'skb__k': 500}
Testing Accuracy: 0.968190240889438
Confusion Matrix:
[[1205   49]
 [  54 1930]]
['hsa-miR-1228-5p', 'hsa-miR-6765-5p', 'hsa-miR-8069', 'hsa-miR-5100', 'hsa-miR-1343-5p', 'hsa-miR-8059', 'hsa-miR-128-2-5p', 'hsa-miR-6789-5p', 'hsa-miR-6752-5p', 'hsa-miR-614', 'hsa-miR-4697-5p', 'hsa-miR-6805-5p', 'hsa-miR-6769a-5p', 'hsa-miR-642b-3p', 'hsa-miR-4649-5p', 'hsa-miR-6798-5p', 'hsa-miR-6088', 'hsa-miR-3180', 'hsa-miR-6875-5p', 'hsa-miR-6729-5p', 'hsa-miR-6787-5p', 'hsa-miR-6786-5p', 'hsa-miR-6800-5p', 'hsa-miR-6756-5p', 'hsa-miR-7975', 'hsa-miR-744-5p', 'hsa-miR-3663-3p', 'hsa-miR-1469', 'hsa-miR-6090', 'hsa-miR-4294', 'hsa-miR-6861-5p', 'hsa-miR-320a', 'hsa-miR-4258', 'hsa-miR-4687-5p', 'hsa-miR-1233-5p', 'hsa-miR-4787-3p', 'hsa-miR-4690-5p', 'hsa-miR-663b', 'hsa-miR-4454', 'hsa-miR-4286', 'hsa-miR-4530', 'hsa-miR-6746-5p', 'hsa-miR-6784-5p', 'hsa-miR-575', 'hsa-miR-4

In [374]:
rf_grid_search_normal = random_forest(x_train_normal, y_train_normal, feature_selection_num)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
19 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 420, in fi

In [375]:
get_rf_metrics(rf_grid_search_normal, x_test_normal, y_test_normal)
rf_top_importance_normal, rf_top_features_normal = get_top_rf_features(rf_grid_search_normal, feature_names_normal, feature_importance_num)
print(rf_top_features_normal)
print(rf_top_importance_normal)

Training Accuracy: 0.9722045855379188
Best Parameters: {'estimator__criterion': 'gini', 'estimator__max_features': 'sqrt', 'estimator__n_estimators': 100, 'skb__k': 500}
Testing Accuracy: 0.9706609017912291
Confusion Matrix:
[[1161   93]
 [   2 1982]]
['hsa-miR-663a', 'hsa-miR-1307-3p', 'hsa-miR-4730', 'hsa-miR-6784-5p', 'hsa-miR-3184-5p', 'hsa-miR-320a', 'hsa-miR-548ah-5p', 'hsa-miR-1228-5p', 'hsa-miR-376c-5p', 'hsa-miR-3940-5p', 'hsa-miR-1233-5p', 'hsa-miR-1469', 'hsa-miR-3686', 'hsa-miR-1258', 'hsa-miR-3672', 'hsa-miR-4783-3p', 'hsa-miR-3923', 'hsa-miR-561-5p', 'hsa-miR-3115', 'hsa-miR-4477b', 'hsa-miR-320b', 'hsa-miR-513c-3p', 'hsa-miR-4782-5p', 'hsa-miR-548au-5p', 'hsa-miR-548i', 'hsa-miR-1290', 'hsa-miR-6802-5p', 'hsa-miR-8073', 'hsa-miR-1203', 'hsa-miR-5100', 'hsa-miR-620', 'hsa-miR-3683', 'hsa-miR-4464', 'hsa-miR-4678', 'hsa-miR-1262', 'hsa-miR-548aw', 'hsa-miR-1537-5p', 'hsa-miR-8061', 'hsa-miR-4696', 'hsa-miR-548av-5p', 'hsa-miR-1277-3p', 'hsa-miR-3924', 'hsa-miR-5579-3p', 'h

In [376]:
gb_grid_search_normal = gradient_boosting(x_train_normal, y_train_normal, feature_selection_num)



In [377]:
get_gb_metrics(gb_grid_search_normal, x_test_normal, y_test_normal)
gb_top_importance_normal, gb_top_features_normal = get_top_gb_features(gb_grid_search_normal, feature_names_normal, feature_importance_num)
print(gb_top_features_normal)
print(gb_top_importance_normal)

Training Accuracy: 0.9769148314981649
Best Parameters: {'estimator__learning_rate': 0.5, 'estimator__n_estimators': 50, 'skb__k': 500}
Testing Accuracy: 0.9756022235948116
Confusion Matrix:
[[1197   57]
 [  22 1962]]
['hsa-miR-663a', 'hsa-miR-3184-5p', 'hsa-miR-5100', 'hsa-miR-1228-5p', 'hsa-miR-6729-5p', 'hsa-miR-4787-3p', 'hsa-miR-6784-5p', 'hsa-miR-4532', 'hsa-miR-29b-1-5p', 'hsa-miR-29b-3p', 'hsa-miR-4730', 'hsa-miR-4697-5p', 'hsa-miR-1203', 'hsa-miR-6756-5p', 'hsa-miR-1290', 'hsa-miR-8073', 'hsa-miR-320a', 'hsa-miR-4783-3p', 'hsa-miR-320b', 'hsa-miR-6885-5p', 'hsa-miR-4258', 'hsa-miR-17-3p', 'hsa-miR-575', 'hsa-miR-4505', 'hsa-miR-6090', 'hsa-miR-6875-5p', 'hsa-miR-6821-5p', 'hsa-miR-4677-3p', 'hsa-miR-4675', 'hsa-miR-7977', 'hsa-miR-6798-5p', 'hsa-miR-4718', 'hsa-miR-1307-3p', 'hsa-miR-5706', 'hsa-miR-1343-5p', 'hsa-miR-3119', 'hsa-miR-6079', 'hsa-miR-151b', 'hsa-miR-4417', 'hsa-miR-4429', 'hsa-miR-548aw', 'hsa-miR-4740-5p', 'hsa-miR-4696', 'hsa-miR-8059', 'hsa-miR-4525', 'hsa-mi

Lung Cancer Dataset

In [378]:
lung_cancer_dataset = dataset.copy()

lung_cancer_dataset.loc[lung_cancer_dataset["ID_REF"] == "Lung Cancer", "ID_REF"] = 1
lung_cancer_dataset.loc[lung_cancer_dataset["ID_REF"] == "No Cancer", "ID_REF"] = 0
lung_cancer_dataset = lung_cancer_dataset[(lung_cancer_dataset["ID_REF"] == 0) | (lung_cancer_dataset["ID_REF"] == 1)]

print(lung_cancer_dataset, "\n")
print("Lung Cancer Sample Number:", lung_cancer_dataset["ID_REF"].value_counts()[1])
print("Non-Cancer Sample Number:", lung_cancer_dataset["ID_REF"].value_counts()[0])
print("All Sample Number:", lung_cancer_dataset["ID_REF"].value_counts()[1] + lung_cancer_dataset["ID_REF"].value_counts()[0])

x_train_lung, y_train_lung, x_test_lung, y_test_lung, feature_names_lung = process_data(lung_cancer_dataset)

      ID_REF  hsa-miR-28-3p  hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  \
6570       1      -0.912744       -0.912744     -0.912744     -0.912744   
6571       1       5.943489        6.645981      5.537396      4.770859   
6572       1       4.266132        4.008883      4.540517     -0.268746   
6573       1      -1.140124        2.646410      2.768230      2.434285   
6574       1       0.292553        0.292553      2.881859      0.292553   
...      ...            ...             ...           ...           ...   
13907      0       0.592899        1.892105      3.260905     -3.842394   
13908      0      -3.083570       -3.083570      3.472014     -3.083570   
13909      0       2.879964       -1.514097      2.997163      0.251458   
13910      0       5.509703        5.171011      5.667294      5.430266   
13911      0       1.268251        1.757426      2.885682      0.769274   

       hsa-miR-498  hsa-miR-512-3p  hsa-miR-491-5p  hsa-miR-490-3p  \
6570      6.620132        3.6

In [379]:
svm_grid_search_lung = support_vector_machine(x_train_lung, y_train_lung, feature_selection_num)

In [380]:
get_svm_metrics(svm_grid_search_lung, x_test_lung, y_test_lung)
svm_top_coef_lung, svm_top_features_lung = get_top_svm_features(svm_grid_search_lung, feature_names_lung, feature_importance_num)
print(svm_top_features_lung)
print(svm_top_coef_lung)

Training Accuracy: 0.999488926746167
Best Parameters: {'estimator__C': 0.25, 'estimator__kernel': 'linear', 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  340]]
['hsa-miR-125a-3p', 'hsa-miR-4783-3p', 'hsa-miR-4730', 'hsa-miR-4648', 'hsa-miR-1587', 'hsa-miR-1307-3p', 'hsa-miR-744-5p', 'hsa-miR-4787-3p', 'hsa-miR-3184-5p', 'hsa-miR-1203', 'hsa-miR-663a', 'hsa-miR-6875-5p', 'hsa-miR-1246', 'hsa-miR-575', 'hsa-miR-6784-5p', 'hsa-miR-1469', 'hsa-miR-4258', 'hsa-miR-6857-5p', 'hsa-miR-3927-3p', 'hsa-miR-4481', 'hsa-miR-7110-5p', 'hsa-miR-151b', 'hsa-miR-6869-5p', 'hsa-miR-642b-3p', 'hsa-miR-4744', 'hsa-miR-92a-2-5p', 'hsa-miR-128-1-5p', 'hsa-miR-3940-5p', 'hsa-miR-1343-3p', 'hsa-miR-92b-5p', 'hsa-miR-4532', 'hsa-miR-4666a-5p', 'hsa-miR-6825-5p', 'hsa-miR-8069', 'hsa-miR-6765-5p', 'hsa-miR-6802-5p', 'hsa-miR-6746-5p', 'hsa-miR-6501-3p', 'hsa-miR-3135b', 'hsa-miR-4757-5p', 'hsa-miR-1228-5p', 'hsa-miR-1268b', 'hsa-miR-3160-5p', 'hsa-miR-4635', 'hsa-miR-1914-5p', 'hs

In [381]:
rf_grid_search_lung = random_forest(x_train_lung, y_train_lung, feature_selection_num)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 420, in fi

In [382]:
get_rf_metrics(rf_grid_search_lung, x_test_lung, y_test_lung)
rf_top_importance_lung, rf_top_features_lung = get_top_rf_features(rf_grid_search_lung, feature_names_lung, feature_importance_num)
print(rf_top_features_lung)
print(rf_top_importance_lung)

Training Accuracy: 0.9993185689948894
Best Parameters: {'estimator__criterion': 'entropy', 'estimator__max_features': 'sqrt', 'estimator__n_estimators': 100, 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  340]]
['hsa-miR-4783-3p', 'hsa-miR-663a', 'hsa-miR-1228-5p', 'hsa-miR-8073', 'hsa-miR-1307-3p', 'hsa-miR-3184-5p', 'hsa-miR-6802-5p', 'hsa-miR-4787-3p', 'hsa-miR-1290', 'hsa-miR-6765-5p', 'hsa-miR-5100', 'hsa-miR-3940-5p', 'hsa-miR-6131', 'hsa-miR-6784-5p', 'hsa-miR-320a', 'hsa-miR-4706', 'hsa-miR-6717-5p', 'hsa-miR-4429', 'hsa-miR-320b', 'hsa-miR-1343-3p', 'hsa-miR-1469', 'hsa-miR-4727-3p', 'hsa-miR-4732-5p', 'hsa-miR-4286', 'hsa-miR-4259', 'hsa-miR-4730', 'hsa-miR-6787-5p', 'hsa-miR-4258', 'hsa-miR-6825-5p', 'hsa-miR-1238-5p', 'hsa-miR-668-5p', 'hsa-miR-3658', 'hsa-miR-4532', 'hsa-miR-4687-5p', 'hsa-miR-17-3p', 'hsa-miR-4758-5p', 'hsa-miR-211-3p', 'hsa-miR-614', 'hsa-miR-4690-5p', 'hsa-miR-4771', 'hsa-miR-4635', 'hsa-miR-8059', 'hsa-miR-7975', 'hsa-miR-2

In [383]:
gb_grid_search_lung = gradient_boosting(x_train_lung, y_train_lung, feature_selection_num)

In [384]:
get_gb_metrics(gb_grid_search_lung, x_test_lung, y_test_lung)
gb_top_importance_lung, gb_top_features_lung = get_top_gb_features(gb_grid_search_lung, feature_names_lung, feature_importance_num)
print(gb_top_features_lung)
print(gb_top_importance_lung)

Training Accuracy: 0.9991485009676783
Best Parameters: {'estimator__learning_rate': 1, 'estimator__n_estimators': 50, 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  340]]
['hsa-miR-1228-5p', 'hsa-miR-4783-3p', 'hsa-miR-663a', 'hsa-miR-1343-3p', 'hsa-miR-651-3p', 'hsa-miR-6816-5p', 'hsa-miR-4730', 'hsa-miR-2114-5p', 'hsa-miR-4635', 'hsa-miR-3663-3p', 'hsa-miR-4490', 'hsa-miR-6769a-5p', 'hsa-miR-6885-5p', 'hsa-miR-937-5p', 'hsa-miR-619-3p', 'hsa-miR-4787-3p', 'hsa-miR-4530', 'hsa-miR-4417', 'hsa-miR-8069', 'hsa-miR-6869-5p', 'hsa-miR-296-5p', 'hsa-miR-3135b', 'hsa-miR-4257', 'hsa-miR-4734', 'hsa-miR-3688-5p', 'hsa-miR-3184-5p', 'hsa-miR-4685-5p', 'hsa-miR-6075', 'hsa-miR-551b-5p', 'hsa-miR-6765-5p', 'hsa-miR-296-3p', 'hsa-miR-92b-5p', 'hsa-miR-4515', 'hsa-miR-6786-5p', 'hsa-miR-889-3p', 'hsa-miR-663b', 'hsa-miR-6781-5p', 'hsa-miR-4706', 'hsa-miR-4532', 'hsa-miR-6794-5p', 'hsa-miR-4757-5p', 'hsa-miR-548at-5p', 'hsa-miR-365a-5p', 'hsa-miR-7977', 'hsa-miR-1273g-

Colorectal Cancer Dataset

In [385]:
colorectal_cancer_dataset = dataset.copy()

colorectal_cancer_dataset.loc[colorectal_cancer_dataset["ID_REF"] == "Colorectal Cancer", "ID_REF"] = 1
colorectal_cancer_dataset.loc[colorectal_cancer_dataset["ID_REF"] == "No Cancer", "ID_REF"] = 0
colorectal_cancer_dataset = colorectal_cancer_dataset[(colorectal_cancer_dataset["ID_REF"] == 0) | (colorectal_cancer_dataset["ID_REF"] == 1)]

print(colorectal_cancer_dataset, "\n")
print("Colorectal Cancer Sample Number:", colorectal_cancer_dataset["ID_REF"].value_counts()[1])
print("Non-Cancer Sample Number:", colorectal_cancer_dataset["ID_REF"].value_counts()[0])
print("All Sample Number:", colorectal_cancer_dataset["ID_REF"].value_counts()[1] + colorectal_cancer_dataset["ID_REF"].value_counts()[0])

x_train_colorectal, y_train_colorectal, x_test_colorectal, y_test_colorectal, feature_names_colorectal = process_data(colorectal_cancer_dataset)

      ID_REF  hsa-miR-28-3p  hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  \
2377       1       6.867068        6.701113      6.734917      7.008525   
2378       1       6.374151        4.943700      4.150693      5.422829   
2379       1       6.151220        5.059069      5.764714      2.619580   
2380       1       3.588643        4.604267      4.742507      4.468745   
2381       1       7.119708        6.839405      4.780490      6.296561   
...      ...            ...             ...           ...           ...   
13907      0       0.592899        1.892105      3.260905     -3.842394   
13908      0      -3.083570       -3.083570      3.472014     -3.083570   
13909      0       2.879964       -1.514097      2.997163      0.251458   
13910      0       5.509703        5.171011      5.667294      5.430266   
13911      0       1.268251        1.757426      2.885682      0.769274   

       hsa-miR-498  hsa-miR-512-3p  hsa-miR-491-5p  hsa-miR-490-3p  \
2377      7.973037        7.3

In [386]:
svm_grid_search_colorectal = support_vector_machine(x_train_colorectal, y_train_colorectal, feature_selection_num)

In [387]:
get_svm_metrics(svm_grid_search_colorectal, x_test_colorectal, y_test_colorectal)
svm_top_coef_colorectal, svm_top_features_colorectal = get_top_svm_features(svm_grid_search_colorectal, feature_names_colorectal, feature_importance_num)
print(svm_top_features_colorectal)
print(svm_top_coef_colorectal)

Training Accuracy: 0.9996545768566495
Best Parameters: {'estimator__C': 0.25, 'estimator__kernel': 'linear', 'skb__k': 500}
Testing Accuracy: 0.9993093922651933
Confusion Matrix:
[[1129    0]
 [   1  318]]
['hsa-miR-125a-3p', 'hsa-miR-4730', 'hsa-miR-4783-3p', 'hsa-miR-1307-3p', 'hsa-miR-885-3p', 'hsa-miR-744-5p', 'hsa-miR-3184-5p', 'hsa-miR-1587', 'hsa-miR-4532', 'hsa-miR-1246', 'hsa-miR-575', 'hsa-miR-6132', 'hsa-miR-4481', 'hsa-miR-642b-3p', 'hsa-miR-1233-5p', 'hsa-miR-4787-3p', 'hsa-miR-3194-5p', 'hsa-miR-663a', 'hsa-miR-7114-5p', 'hsa-miR-92b-5p', 'hsa-miR-1203', 'hsa-miR-4674', 'hsa-miR-4258', 'hsa-miR-4327', 'hsa-miR-3158-5p', 'hsa-miR-1469', 'hsa-miR-320b', 'hsa-miR-92a-2-5p', 'hsa-miR-1185-1-3p', 'hsa-miR-6727-5p', 'hsa-miR-4690-5p', 'hsa-miR-4476', 'hsa-miR-411-5p', 'hsa-miR-6869-5p', 'hsa-miR-8069', 'hsa-miR-887-3p', 'hsa-miR-6825-5p', 'hsa-miR-4648', 'hsa-miR-4662b', 'hsa-miR-211-3p', 'hsa-miR-501-3p', 'hsa-miR-514a-3p', 'hsa-miR-6088', 'hsa-miR-145-3p', 'hsa-miR-4435', 'hs

In [388]:
rf_grid_search_colorectal = random_forest(x_train_colorectal, y_train_colorectal, feature_selection_num)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 420, in fit

In [389]:
get_rf_metrics(rf_grid_search_colorectal, x_test_colorectal, y_test_colorectal)
rf_top_importance_colorectal, rf_top_features_colorectal = get_top_rf_features(rf_grid_search_colorectal, feature_names_colorectal, feature_importance_num)
print(rf_top_features_colorectal)
print(rf_top_importance_colorectal)

Training Accuracy: 0.9993091537132989
Best Parameters: {'estimator__criterion': 'entropy', 'estimator__max_features': 'sqrt', 'estimator__n_estimators': 500, 'skb__k': 500}
Testing Accuracy: 0.9993093922651933
Confusion Matrix:
[[1129    0]
 [   1  318]]
['hsa-miR-4783-3p', 'hsa-miR-3184-5p', 'hsa-miR-663a', 'hsa-miR-3940-5p', 'hsa-miR-1228-5p', 'hsa-miR-4730', 'hsa-miR-6784-5p', 'hsa-miR-1469', 'hsa-miR-1307-3p', 'hsa-miR-6825-5p', 'hsa-miR-6088', 'hsa-miR-6802-5p', 'hsa-miR-4532', 'hsa-miR-1233-5p', 'hsa-miR-320a', 'hsa-miR-4787-3p', 'hsa-miR-4675', 'hsa-miR-6787-5p', 'hsa-miR-8073', 'hsa-miR-8069', 'hsa-miR-885-3p', 'hsa-miR-320b', 'hsa-miR-575', 'hsa-miR-92a-2-5p', 'hsa-miR-6781-5p', 'hsa-miR-4258', 'hsa-miR-1203', 'hsa-miR-6765-5p', 'hsa-miR-6800-5p', 'hsa-miR-128-1-5p', 'hsa-miR-642b-3p', 'hsa-miR-373-5p', 'hsa-miR-191-5p', 'hsa-miR-6717-5p', 'hsa-miR-4530', 'hsa-miR-6729-5p', 'hsa-miR-6861-5p', 'hsa-miR-17-3p', 'hsa-miR-5195-3p', 'hsa-miR-6805-5p', 'hsa-miR-1914-5p', 'hsa-miR-31

In [390]:
gb_grid_search_colorectal = gradient_boosting(x_train_colorectal, y_train_colorectal, feature_selection_num)

In [391]:
get_gb_metrics(gb_grid_search_colorectal, x_test_colorectal, y_test_colorectal)
gb_top_importance_colorectal, gb_top_features_colorectal = get_top_gb_features(gb_grid_search_colorectal, feature_names_colorectal, feature_importance_num)
print(gb_top_features_colorectal)
print(gb_top_importance_colorectal)

Training Accuracy: 0.9994818652849743
Best Parameters: {'estimator__learning_rate': 0.5, 'estimator__n_estimators': 50, 'skb__k': 500}
Testing Accuracy: 0.9993093922651933
Confusion Matrix:
[[1129    0]
 [   1  318]]
['hsa-miR-3184-5p', 'hsa-miR-3940-5p', 'hsa-miR-4787-3p', 'hsa-miR-4730', 'hsa-miR-1469', 'hsa-miR-4783-3p', 'hsa-miR-4532', 'hsa-miR-4649-5p', 'hsa-miR-1307-3p', 'hsa-miR-663a', 'hsa-miR-6784-5p', 'hsa-miR-342-5p', 'hsa-miR-128-2-5p', 'hsa-miR-876-5p', 'hsa-miR-7114-5p', 'hsa-miR-6787-5p', 'hsa-miR-3158-5p', 'hsa-miR-6088', 'hsa-miR-1203', 'hsa-miR-6798-5p', 'hsa-miR-1228-5p', 'hsa-miR-4790-5p', 'hsa-miR-7977', 'hsa-miR-8073', 'hsa-miR-4690-5p', 'hsa-miR-6875-5p', 'hsa-miR-6857-5p', 'hsa-miR-23a-3p', 'hsa-miR-302e', 'hsa-miR-92b-5p', 'hsa-miR-148b-5p', 'hsa-miR-6825-5p', 'hsa-miR-1233-5p', 'hsa-miR-7110-5p', 'hsa-miR-155-5p', 'hsa-miR-103a-3p', 'hsa-miR-6729-5p', 'hsa-miR-8069', 'hsa-miR-587', 'hsa-miR-6786-5p', 'hsa-miR-3917', 'hsa-miR-135a-3p', 'hsa-miR-3663-3p', 'hsa-m

Gastric Cancer Dataset

In [392]:
gastric_cancer_dataset = dataset.copy()

gastric_cancer_dataset.loc[gastric_cancer_dataset["ID_REF"] == "Gastric Cancer", "ID_REF"] = 1
gastric_cancer_dataset.loc[gastric_cancer_dataset["ID_REF"] == "No Cancer", "ID_REF"] = 0
gastric_cancer_dataset = gastric_cancer_dataset[(gastric_cancer_dataset["ID_REF"] == 0) | (gastric_cancer_dataset["ID_REF"] == 1)]

print(gastric_cancer_dataset, "\n")
print("Gastric Cancer Sample Number:", gastric_cancer_dataset["ID_REF"].value_counts()[1])
print("Non-Cancer Sample Number:", gastric_cancer_dataset["ID_REF"].value_counts()[0])
print("All Sample Number:", gastric_cancer_dataset["ID_REF"].value_counts()[1] + gastric_cancer_dataset["ID_REF"].value_counts()[0])

x_train_gastric, y_train_gastric, x_test_gastric, y_test_gastric, feature_names_gastric = process_data(gastric_cancer_dataset)

      ID_REF  hsa-miR-28-3p  hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  \
4563       1       3.979417        3.469969      6.192362      6.725776   
4564       1      -0.194516       -0.194516     -0.194516     -0.194516   
4565       1       5.109533        5.073537      5.726348      2.670043   
4566       1       4.327134        3.516137      4.996419      3.682273   
4567       1       5.144090        4.915262      5.271562      5.682571   
...      ...            ...             ...           ...           ...   
13907      0       0.592899        1.892105      3.260905     -3.842394   
13908      0      -3.083570       -3.083570      3.472014     -3.083570   
13909      0       2.879964       -1.514097      2.997163      0.251458   
13910      0       5.509703        5.171011      5.667294      5.430266   
13911      0       1.268251        1.757426      2.885682      0.769274   

       hsa-miR-498  hsa-miR-512-3p  hsa-miR-491-5p  hsa-miR-490-3p  \
4563      7.707212        6.3

In [393]:
svm_grid_search_gastric = support_vector_machine(x_train_gastric, y_train_gastric, feature_selection_num)

In [394]:
get_svm_metrics(svm_grid_search_gastric, x_test_gastric, y_test_gastric)
svm_top_coef_gastric, svm_top_features_gastric = get_top_svm_features(svm_grid_search_gastric, feature_names_gastric, feature_importance_num)
print(svm_top_features_gastric)
print(svm_top_coef_gastric)

Training Accuracy: 0.9994683989204795
Best Parameters: {'estimator__C': 0.25, 'estimator__kernel': 'linear', 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  284]]
['hsa-miR-125a-3p', 'hsa-miR-4783-3p', 'hsa-miR-4730', 'hsa-miR-1246', 'hsa-miR-4648', 'hsa-miR-4787-3p', 'hsa-miR-1587', 'hsa-miR-1203', 'hsa-miR-744-5p', 'hsa-miR-1307-3p', 'hsa-miR-3184-5p', 'hsa-miR-4481', 'hsa-miR-575', 'hsa-miR-4532', 'hsa-miR-124-3p', 'hsa-miR-663a', 'hsa-miR-6857-5p', 'hsa-miR-6132', 'hsa-miR-6825-5p', 'hsa-miR-191-5p', 'hsa-miR-6784-5p', 'hsa-miR-6131', 'hsa-miR-885-3p', 'hsa-miR-1343-3p', 'hsa-miR-3135b', 'hsa-miR-6765-5p', 'hsa-miR-128-1-5p', 'hsa-miR-1469', 'hsa-miR-3940-5p', 'hsa-miR-1228-5p', 'hsa-miR-4258', 'hsa-miR-26a-5p', 'hsa-miR-887-3p', 'hsa-miR-499a-3p', 'hsa-miR-4783-5p', 'hsa-miR-6875-5p', 'hsa-miR-4460', 'hsa-miR-642b-3p', 'hsa-miR-668-5p', 'hsa-miR-4635', 'hsa-miR-2115-3p', 'hsa-miR-92a-2-5p', 'hsa-miR-8069', 'hsa-miR-1290', 'hsa-miR-1206', 'hsa-miR-6781-5

In [395]:
rf_grid_search_gastric = random_forest(x_train_gastric, y_train_gastric, feature_selection_num)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 420, in fi

In [396]:
get_rf_metrics(rf_grid_search_gastric, x_test_gastric, y_test_gastric)
rf_top_importance_gastric, rf_top_features_gastric = get_top_rf_features(rf_grid_search_gastric, feature_names_gastric, feature_importance_num)
print(rf_top_features_gastric)
print(rf_top_importance_gastric)

Training Accuracy: 0.999646017699115
Best Parameters: {'estimator__criterion': 'gini', 'estimator__max_features': 'sqrt', 'estimator__n_estimators': 100, 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  284]]
['hsa-miR-1228-5p', 'hsa-miR-5100', 'hsa-miR-1307-3p', 'hsa-miR-4787-3p', 'hsa-miR-663a', 'hsa-miR-8073', 'hsa-miR-3940-5p', 'hsa-miR-4706', 'hsa-miR-1343-3p', 'hsa-miR-6765-5p', 'hsa-miR-6784-5p', 'hsa-miR-1246', 'hsa-miR-1290', 'hsa-miR-4429', 'hsa-miR-320b', 'hsa-miR-4259', 'hsa-miR-3184-5p', 'hsa-miR-4690-5p', 'hsa-miR-6802-5p', 'hsa-miR-4730', 'hsa-miR-191-5p', 'hsa-miR-4732-5p', 'hsa-miR-4727-3p', 'hsa-miR-124-3p', 'hsa-miR-8060', 'hsa-miR-17-3p', 'hsa-miR-614', 'hsa-miR-6717-5p', 'hsa-miR-6787-5p', 'hsa-miR-6746-5p', 'hsa-miR-6781-5p', 'hsa-miR-1260b', 'hsa-miR-668-5p', 'hsa-miR-320a', 'hsa-miR-4687-5p', 'hsa-miR-3622a-5p', 'hsa-miR-4419b', 'hsa-miR-6861-5p', 'hsa-miR-8059', 'hsa-miR-23b-3p', 'hsa-miR-5001-5p', 'hsa-miR-4515', 'hsa-miR-4782-5p', '

In [397]:
gb_grid_search_gastric = gradient_boosting(x_train_gastric, y_train_gastric, feature_selection_num)

In [398]:
get_gb_metrics(gb_grid_search_gastric, x_test_gastric, y_test_gastric)
gb_top_importance_gastric, gb_top_features_gastric = get_top_gb_features(gb_grid_search_gastric, feature_names_gastric, feature_importance_num)
print(gb_top_features_gastric)
print(gb_top_importance_gastric)

Training Accuracy: 0.9976984874160546
Best Parameters: {'estimator__learning_rate': 0.5, 'estimator__n_estimators': 50, 'skb__k': 500}
Testing Accuracy: 0.9992922859164898
Confusion Matrix:
[[1128    1]
 [   0  284]]
['hsa-miR-1228-5p', 'hsa-miR-6784-5p', 'hsa-miR-6787-5p', 'hsa-miR-4454', 'hsa-miR-6079', 'hsa-miR-1290', 'hsa-miR-6800-5p', 'hsa-miR-614', 'hsa-miR-744-5p', 'hsa-miR-605-3p', 'hsa-miR-128-1-5p', 'hsa-miR-1469', 'hsa-miR-4530', 'hsa-miR-4705', 'hsa-miR-1343-3p', 'hsa-miR-4286', 'hsa-miR-619-3p', 'hsa-miR-454-3p', 'hsa-miR-4781-3p', 'hsa-miR-6131', 'hsa-miR-4481', 'hsa-miR-4708-3p', 'hsa-miR-548ad-5p, hsa-miR-548ae-5p', 'hsa-miR-1246', 'hsa-miR-548n', 'hsa-miR-4740-5p', 'hsa-miR-3115', 'hsa-miR-320d', 'hsa-miR-548al', 'hsa-miR-4755-3p', 'hsa-miR-4525', 'hsa-miR-548au-3p', 'hsa-miR-3613-5p', 'hsa-miR-6073', 'hsa-miR-6717-5p', 'hsa-miR-3136-3p', 'hsa-miR-8073', 'hsa-miR-5582-5p', 'hsa-miR-26a-5p', 'hsa-miR-4477b', 'hsa-miR-5100', 'hsa-miR-4666a-5p', 'hsa-miR-6769a-5p', 'hsa-m

Prostate Cancer

In [399]:
prostate_cancer_dataset = dataset.copy()

prostate_cancer_dataset.loc[prostate_cancer_dataset["ID_REF"] == "Prostate Cancer", "ID_REF"] = 1
prostate_cancer_dataset.loc[prostate_cancer_dataset["ID_REF"] == "No Cancer", "ID_REF"] = 0
prostate_cancer_dataset = prostate_cancer_dataset[(prostate_cancer_dataset["ID_REF"] == 0) | (prostate_cancer_dataset["ID_REF"] == 1)]

print(prostate_cancer_dataset, "\n")
print("Prostate Cancer Sample Number:", prostate_cancer_dataset["ID_REF"].value_counts()[1])
print("Non-Cancer Sample Number:", prostate_cancer_dataset["ID_REF"].value_counts()[0])
print("All Sample Number:", prostate_cancer_dataset["ID_REF"].value_counts()[1] + prostate_cancer_dataset["ID_REF"].value_counts()[0])

x_train_prostate, y_train_prostate, x_test_prostate, y_test_prostate, feature_names_prostate = process_data(prostate_cancer_dataset)

      ID_REF  hsa-miR-28-3p  hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  \
8269       0       4.854582       -0.300574      5.839072      4.140201   
8270       0       2.792939       -3.260724      1.681541     -3.260724   
8271       0       3.603448        1.875933     -0.914627      4.476280   
8272       0      -1.586508       -1.586508     -1.586508     -1.586508   
8273       0      -0.897651       -0.897651     -0.897651     -0.897651   
...      ...            ...             ...           ...           ...   
16185      1       3.686871        5.402453      5.126641      6.142145   
16186      1       0.121448        0.121448      0.121448      0.121448   
16187      1      -0.661008        4.343310      4.541016      3.901700   
16188      1      -1.769902       -1.769902      0.133393      2.717976   
16189      1       2.596167       -0.620166      3.208942      1.720689   

       hsa-miR-498  hsa-miR-512-3p  hsa-miR-491-5p  hsa-miR-490-3p  \
8269      6.209651        4.7

In [400]:
svm_grid_search_prostate = support_vector_machine(x_train_prostate, y_train_prostate, feature_selection_num)

In [401]:
get_svm_metrics(svm_grid_search_prostate, x_test_prostate, y_test_prostate)
svm_top_coef_prostate, svm_top_features_prostate = get_top_svm_features(svm_grid_search_prostate, feature_names_prostate, feature_importance_num)
print(svm_top_features_prostate)
print(svm_top_coef_prostate)

Training Accuracy: 0.999625468164794
Best Parameters: {'estimator__C': 0.25, 'estimator__kernel': 'linear', 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  205]]
['hsa-miR-4783-3p', 'hsa-miR-125b-1-3p', 'hsa-miR-125a-3p', 'hsa-miR-1307-3p', 'hsa-miR-4730', 'hsa-miR-4648', 'hsa-miR-1203', 'hsa-miR-4532', 'hsa-miR-3194-5p', 'hsa-miR-6131', 'hsa-miR-6073', 'hsa-miR-1246', 'hsa-miR-3184-5p', 'hsa-miR-744-5p', 'hsa-miR-663a', 'hsa-miR-4674', 'hsa-miR-1185-2-3p', 'hsa-miR-1185-1-3p', 'hsa-miR-602', 'hsa-miR-575', 'hsa-miR-17-3p', 'hsa-miR-668-5p', 'hsa-miR-1469', 'hsa-miR-6736-5p', 'hsa-miR-191-5p', 'hsa-miR-124-3p', 'hsa-miR-6869-5p', 'hsa-miR-4276', 'hsa-miR-873-3p', 'hsa-miR-887-3p', 'hsa-miR-642b-3p', 'hsa-miR-103a-3p', 'hsa-miR-1233-5p', 'hsa-miR-1290', 'hsa-miR-6784-5p', 'hsa-miR-6800-5p', 'hsa-miR-4489', 'hsa-miR-6132', 'hsa-miR-92a-2-5p', 'hsa-miR-92b-5p', 'hsa-miR-128-1-5p', 'hsa-miR-6727-5p', 'hsa-miR-4668-5p', 'hsa-miR-8069', 'hsa-miR-514a-3p', 'hsa-miR

In [402]:
rf_grid_search_prostate = random_forest(x_train_prostate, y_train_prostate, feature_selection_num)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 420, in fi

In [403]:
get_rf_metrics(rf_grid_search_prostate, x_test_prostate, y_test_prostate)
rf_top_importance_prostate, rf_top_features_prostate = get_top_rf_features(rf_grid_search_prostate, feature_names_prostate, feature_importance_num)
print(rf_top_features_prostate)
print(rf_top_importance_prostate)

Training Accuracy: 0.999812382739212
Best Parameters: {'estimator__criterion': 'entropy', 'estimator__max_features': 'sqrt', 'estimator__n_estimators': 500, 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  205]]
['hsa-miR-1307-3p', 'hsa-miR-4783-3p', 'hsa-miR-3184-5p', 'hsa-miR-3940-5p', 'hsa-miR-8073', 'hsa-miR-1238-5p', 'hsa-miR-6802-5p', 'hsa-miR-1228-5p', 'hsa-miR-6784-5p', 'hsa-miR-1203', 'hsa-miR-320a', 'hsa-miR-6787-5p', 'hsa-miR-1290', 'hsa-miR-320b', 'hsa-miR-4758-5p', 'hsa-miR-1233-5p', 'hsa-miR-6805-5p', 'hsa-miR-4532', 'hsa-miR-663a', 'hsa-miR-191-5p', 'hsa-miR-4730', 'hsa-miR-885-3p', 'hsa-miR-4419b', 'hsa-miR-1469', 'hsa-miR-17-3p', 'hsa-miR-6717-5p', 'hsa-miR-4690-5p', 'hsa-miR-4489', 'hsa-miR-6766-5p', 'hsa-miR-1246', 'hsa-miR-5100', 'hsa-miR-4675', 'hsa-miR-6769a-5p', 'hsa-miR-6861-5p', 'hsa-miR-6088', 'hsa-miR-197-5p', 'hsa-miR-6729-5p', 'hsa-miR-1343-3p', 'hsa-miR-6131', 'hsa-miR-614', 'hsa-miR-6786-5p', 'hsa-miR-342-5p', 'hsa-miR-92a-2-5p'

In [404]:
gb_grid_search_prostate = gradient_boosting(x_train_prostate, y_train_prostate, feature_selection_num)

In [405]:
get_gb_metrics(gb_grid_search_prostate, x_test_prostate, y_test_prostate)
gb_top_importance_prostate, gb_top_features_prostate = get_top_gb_features(gb_grid_search_prostate, feature_names_prostate, feature_importance_num)
print(gb_top_features_prostate)
print(gb_top_importance_prostate)

Training Accuracy: 0.9962525735888301
Best Parameters: {'estimator__learning_rate': 0.5, 'estimator__n_estimators': 50, 'skb__k': 500}
Testing Accuracy: 0.9970014992503748
Confusion Matrix:
[[1127    2]
 [   2  203]]
['hsa-miR-1307-3p', 'hsa-miR-1228-5p', 'hsa-miR-4783-3p', 'hsa-miR-4730', 'hsa-miR-6869-5p', 'hsa-miR-4454', 'hsa-miR-575', 'hsa-miR-3184-5p', 'hsa-miR-6741-5p', 'hsa-miR-744-5p', 'hsa-miR-3684', 'hsa-miR-3192-5p', 'hsa-miR-125b-1-3p', 'hsa-miR-4448', 'hsa-miR-1246', 'hsa-miR-663a', 'hsa-miR-4530', 'hsa-miR-4775', 'hsa-miR-328-5p', 'hsa-miR-5692a', 'hsa-miR-1233-5p', 'hsa-miR-617', 'hsa-miR-5691', 'hsa-miR-208b-5p', 'hsa-miR-190a-5p', 'hsa-miR-4536-5p', 'hsa-miR-1264', 'hsa-miR-6784-5p', 'hsa-miR-548b-5p', 'hsa-miR-548ad-5p, hsa-miR-548ae-5p', 'hsa-miR-4659a-3p', 'hsa-miR-3940-5p', 'hsa-miR-889-3p', 'hsa-miR-296-5p', 'hsa-miR-6765-5p', 'hsa-miR-3194-5p', 'hsa-miR-92a-2-5p', 'hsa-miR-4690-5p', 'hsa-miR-937-5p', 'hsa-miR-92b-5p', 'hsa-miR-92b-3p', 'hsa-miR-92a-3p', 'hsa-miR-

Pancreatic Cancer

In [406]:
pancreatic_cancer_dataset = dataset.copy()

pancreatic_cancer_dataset.loc[pancreatic_cancer_dataset["ID_REF"] == "Pancreatic Cancer", "ID_REF"] = 1
pancreatic_cancer_dataset.loc[pancreatic_cancer_dataset["ID_REF"] == "No Cancer", "ID_REF"] = 0
pancreatic_cancer_dataset = pancreatic_cancer_dataset[(pancreatic_cancer_dataset["ID_REF"] == 0) | (pancreatic_cancer_dataset["ID_REF"] == 1)]

print(pancreatic_cancer_dataset, "\n")
print("Pancreatic Cancer Sample Number:", pancreatic_cancer_dataset["ID_REF"].value_counts()[1])
print("Non-Cancer Sample Number:", pancreatic_cancer_dataset["ID_REF"].value_counts()[0])
print("All Sample Number:", pancreatic_cancer_dataset["ID_REF"].value_counts()[1] + pancreatic_cancer_dataset["ID_REF"].value_counts()[0])

x_train_pancreatic, y_train_pancreatic, x_test_pancreatic, y_test_pancreatic, feature_names_pancreatic = process_data(pancreatic_cancer_dataset)

      ID_REF  hsa-miR-28-3p  hsa-miR-27a-5p  hsa-miR-518b  hsa-miR-520b  \
8269       0       4.854582       -0.300574      5.839072      4.140201   
8270       0       2.792939       -3.260724      1.681541     -3.260724   
8271       0       3.603448        1.875933     -0.914627      4.476280   
8272       0      -1.586508       -1.586508     -1.586508     -1.586508   
8273       0      -0.897651       -0.897651     -0.897651     -0.897651   
...      ...            ...             ...           ...           ...   
15158      1       0.551267        0.551267      4.935549      0.551267   
15159      1      -0.962073        2.523465      2.636927      2.313804   
15160      1       4.216940        4.928627      3.498331      2.448755   
15161      1      -1.112312       -1.112312     -1.112312     -1.112312   
15162      1       3.624010        1.995368      5.029056      3.416968   

       hsa-miR-498  hsa-miR-512-3p  hsa-miR-491-5p  hsa-miR-490-3p  \
8269      6.209651        4.7

In [407]:
svm_grid_search_pancreatic = support_vector_machine(x_train_pancreatic, y_train_pancreatic, feature_selection_num)

In [408]:
get_svm_metrics(svm_grid_search_pancreatic, x_test_pancreatic, y_test_pancreatic)
svm_top_coef_pancreatic, svm_top_features_pancreatic = get_top_svm_features(svm_grid_search_pancreatic, feature_names_pancreatic, feature_importance_num)
print(svm_top_features_pancreatic)
print(svm_top_coef_pancreatic)

Training Accuracy: 0.9998076923076923
Best Parameters: {'estimator__C': 0.25, 'estimator__kernel': 'linear', 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  170]]
['hsa-miR-4730', 'hsa-miR-125a-3p', 'hsa-miR-4783-3p', 'hsa-miR-1203', 'hsa-miR-3184-5p', 'hsa-miR-4648', 'hsa-miR-885-3p', 'hsa-miR-744-5p', 'hsa-miR-320c', 'hsa-miR-1307-3p', 'hsa-miR-1273c', 'hsa-miR-6132', 'hsa-miR-4787-3p', 'hsa-miR-1233-5p', 'hsa-miR-663a', 'hsa-miR-125b-1-3p', 'hsa-miR-4481', 'hsa-miR-629-5p', 'hsa-miR-3158-5p', 'hsa-miR-6780b-5p', 'hsa-miR-6869-5p', 'hsa-miR-4725-3p', 'hsa-miR-4327', 'hsa-miR-320e', 'hsa-miR-3617-5p', 'hsa-miR-128-1-5p', 'hsa-miR-92b-5p', 'hsa-miR-1185-2-3p', 'hsa-miR-642b-3p', 'hsa-miR-575', 'hsa-miR-6880-5p', 'hsa-miR-92a-2-5p', 'hsa-miR-4504', 'hsa-miR-1185-1-3p', 'hsa-miR-1469', 'hsa-miR-6088', 'hsa-miR-6810-5p', 'hsa-miR-3156-5p', 'hsa-miR-6073', 'hsa-miR-4258', 'hsa-miR-3940-5p', 'hsa-miR-1273g-3p', 'hsa-miR-6825-5p', 'hsa-miR-7114-5p', 'hsa-miR-6839-

In [409]:
rf_grid_search_pancreatic = random_forest(x_train_pancreatic, y_train_pancreatic, feature_selection_num)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
23 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/thomastseng/Documents/Pioneer Computational Medicine/Classifiers/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 420, in fi

In [410]:
get_rf_metrics(rf_grid_search_pancreatic, x_test_pancreatic, y_test_pancreatic)
rf_top_importance_pancreatic, rf_top_features_pancreatic = get_top_rf_features(rf_grid_search_pancreatic, feature_names_pancreatic, feature_importance_num)
print(rf_top_features_pancreatic)
print(rf_top_importance_pancreatic)

Training Accuracy: 0.9996150140803319
Best Parameters: {'estimator__criterion': 'gini', 'estimator__max_features': 'sqrt', 'estimator__n_estimators': 100, 'skb__k': 500}
Testing Accuracy: 1.0
Confusion Matrix:
[[1129    0]
 [   0  170]]
['hsa-miR-4730', 'hsa-miR-663a', 'hsa-miR-4787-3p', 'hsa-miR-6784-5p', 'hsa-miR-92b-5p', 'hsa-miR-3184-5p', 'hsa-miR-1233-5p', 'hsa-miR-1469', 'hsa-miR-1203', 'hsa-miR-373-5p', 'hsa-miR-3940-5p', 'hsa-miR-1228-5p', 'hsa-miR-4258', 'hsa-miR-4783-3p', 'hsa-miR-1238-5p', 'hsa-miR-1307-3p', 'hsa-miR-320a', 'hsa-miR-6088', 'hsa-miR-4532', 'hsa-miR-92a-2-5p', 'hsa-miR-6802-5p', 'hsa-miR-320b', 'hsa-miR-6781-5p', 'hsa-miR-4687-5p', 'hsa-miR-5090', 'hsa-miR-6717-5p', 'hsa-miR-4648', 'hsa-miR-642b-3p', 'hsa-miR-3917', 'hsa-miR-6087', 'hsa-miR-7110-5p', 'hsa-miR-3180', 'hsa-miR-4675', 'hsa-miR-575', 'hsa-miR-320e', 'hsa-miR-6729-5p', 'hsa-miR-887-3p', 'hsa-miR-320d', 'hsa-miR-320c', 'hsa-miR-128-1-5p', 'hsa-miR-3156-5p', 'hsa-miR-885-3p', 'hsa-miR-6839-5p', 'hsa-

In [411]:
gb_grid_search_pancreatic = gradient_boosting(x_train_pancreatic, y_train_pancreatic, feature_selection_num)

In [412]:
get_gb_metrics(gb_grid_search_pancreatic, x_test_pancreatic, y_test_pancreatic)
gb_top_importance_pancreatic, gb_top_features_pancreatic = get_top_gb_features(gb_grid_search_pancreatic, feature_names_pancreatic, feature_importance_num)
print(gb_top_features_pancreatic)
print(gb_top_importance_pancreatic)

Training Accuracy: 0.9984596857862753
Best Parameters: {'estimator__learning_rate': 0.5, 'estimator__n_estimators': 50, 'skb__k': 500}
Testing Accuracy: 0.9992301770592764
Confusion Matrix:
[[1129    0]
 [   1  169]]
['hsa-miR-1228-5p', 'hsa-miR-4730', 'hsa-miR-1469', 'hsa-miR-4532', 'hsa-miR-301a-3p', 'hsa-miR-128-1-5p', 'hsa-miR-23b-3p', 'hsa-miR-718', 'hsa-miR-4693-5p', 'hsa-miR-3940-5p', 'hsa-miR-1233-5p', 'hsa-miR-663a', 'hsa-miR-1307-3p', 'hsa-miR-6090', 'hsa-miR-6131', 'hsa-miR-320a', 'hsa-miR-4417', 'hsa-miR-6805-5p', 'hsa-miR-4281', 'hsa-miR-4783-3p', 'hsa-miR-4648', 'hsa-miR-6780b-5p', 'hsa-miR-4787-3p', 'hsa-miR-940', 'hsa-miR-663b', 'hsa-miR-4708-3p', 'hsa-miR-4685-5p', 'hsa-miR-340-5p', 'hsa-miR-6869-5p', 'hsa-miR-744-5p', 'hsa-miR-544b', 'hsa-miR-4488', 'hsa-miR-3156-5p', 'hsa-miR-2392', 'hsa-miR-3184-5p', 'hsa-miR-4725-3p', 'hsa-miR-6731-5p', 'hsa-miR-328-5p', 'hsa-miR-7110-5p', 'hsa-miR-4258', 'hsa-miR-125a-3p', 'hsa-miR-4463', 'hsa-miR-148a-5p', 'hsa-miR-3656', 'hsa-mi