In [None]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, precision_recall_curve, confusion_matrix
from sklearn.svm import SVC

In [None]:
# Load the dataset from a CSV file located in the working directory.
dataset = pd.read_csv("ultimateData.csv")
# set_index returns a new DataFrame and the result is not assigned, so this
# line only displays the frame indexed by 'label'; `dataset` itself keeps its
# original index and still contains the 'label' column.
dataset.set_index('label')

First, train the SVM classifier on the full dataset.

In [None]:
# Split the frame into the predictor columns (everything except 'value')
# and the target column 'value', both as plain arrays.
features_full = dataset.drop(columns='value').values
labels_full = dataset.loc[:, 'value'].values

In [None]:
# Rebuild the feature matrix straight from `dataset` instead of re-slicing
# `features_full` in place, so re-running this cell cannot strip one more
# column each time. The leading column is dropped before the float32 cast
# (presumably a non-numeric identifier such as 'label' -- TODO confirm).
features_full = dataset.drop('value', axis=1).values[:, 1:].astype(np.float32)

In [None]:
# With both with_mean and with_std disabled, StandardScaler should neither
# centre nor scale, so this step is expected to pass the values through
# unchanged.
sc = StandardScaler(copy=True, with_mean=False, with_std=False)
# NOTE(review): the [:, 1:] slice drops the first remaining column in addition
# to the one already removed from features_full -- confirm this is intended.
X_full = sc.fit_transform(features_full[:,1:])

In [None]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_full, labels_full, test_size=0.33, random_state=42)

In [None]:
# Polynomial-kernel SVM with hard-coded hyperparameters (presumably the result
# of an earlier hyperparameter search -- TODO confirm their origin).
# `degree` and `max_iter` are written as ints: recent scikit-learn releases
# validate estimator parameters when fitting and reject a float max_iter.
best_classifier = SVC(C=77358.56620434714, cache_size=512, class_weight=None,
  coef0=5694.538815205853, decision_function_shape='ovr', degree=2,
  gamma=758.5493658267706, kernel='poly', max_iter=707238803,
  probability=False, random_state=1, shrinking=True,
  tol=1.621850765788926e-05, verbose=False)

In [None]:
# fit() returns the estimator itself, not scores. Store the decision-function
# values for the held-out set in y_score_full so that roc_curve receives
# continuous scores rather than the classifier object.
y_score_full = best_classifier.fit(X_train, y_train).decision_function(X_test)

In [None]:
# Hard class predictions on the held-out set (plotted later as a confusion matrix).
y_pred_full =best_classifier.predict(X_test)

In [None]:
#Plot the ROC curve for the best model
from sklearn.metrics import roc_curve, auc

# ROC points and area under the curve for the model fitted on the full data.
fpr_full, tpr_full, thresholds = roc_curve(y_test, y_score_full)
roc_auc_full = auc(fpr_full, tpr_full)

# Draw on an explicit Figure/Axes pair rather than the implicit pyplot state.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr_full, tpr_full, color='darkorange', linewidth=1,
            label='ROC curve (area = %0.2f)' % roc_auc_full)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', linewidth=1, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
roc_ax.legend(loc="lower right")
roc_fig.savefig("withcoregenes.png", dpi=600)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : sequence or None
        Tick labels for both axes. They are used for display only and are
        not passed to ``confusion_matrix``, so they should be listed in the
        same order as the matrix rows (see the scikit-learn documentation
        for how labels are ordered when none are given). ``None`` falls back
        to ``[False, True]``.
    normalize : bool, default False
        If True, each row is divided by its total so that cells show the
        fraction of the true class. Rows with no samples are left at zero
        instead of producing NaN.
    title : str or None, default None
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the matrix image.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Use the caller-supplied tick labels; fall back to the binary default
    # only when none are given.
    if classes is None:
        classes = [False, True]

    if normalize:
        # Row-wise normalisation; the `where` mask skips rows whose total is
        # zero so they stay at 0.0 rather than turning into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Class labels handed to plot_confusion_matrix as its `classes` argument.
classes = [False, True]

In [None]:
#Plot confusion matrix for the best model
plot_confusion_matrix(y_test, y_pred_full, classes, title="Confusion matrix for the best classifier")
# plt.savefig writes the current figure, i.e. the one just created inside
# plot_confusion_matrix.
plt.savefig("svm_confusionmatrix.png", dpi=600)

In [None]:
# Read the file into a list with one entry per line (presumably the core gene
# names -- TODO confirm). Splitting on '\n' leaves a trailing empty string
# when the file ends with a newline.
with open('CoreGenes.txt', 'r') as text_file:
    core_genes_list = text_file.read().split('\n')

In [None]:
# Inspect the last entry (an empty string if the file ended with a newline).
core_genes_list[-1]

In [None]:
#Create a subset of the dataset without core genes
# Skip blank entries (e.g. the empty string left by a trailing newline)
# instead of unconditionally discarding the last element, which would leave
# the final gene's column in place whenever the file has no trailing newline.
data_subset = dataset.drop([gene for gene in core_genes_list if gene.strip()], axis=1)

In [None]:
# Sanity check: (rows, columns) of the frame after dropping the listed columns.
data_subset.shape

In [None]:
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, precision_recall_curve, confusion_matrix
from sklearn.svm import SVC

Fit the SVM model chosen in the previous step to the modified dataset without the core genes. As before, the model is fitted on the training set and predictions are made on the test set.

In [None]:
# Split the reduced frame into the predictor columns (everything except
# 'value') and the target column 'value', both as plain arrays.
features = data_subset.drop(columns='value').values
labels = data_subset.loc[:, 'value'].values

In [None]:
# Rebuild the feature matrix straight from `data_subset` instead of re-slicing
# `features` in place, so re-running this cell cannot strip one more column
# each time. The leading column is dropped before the float32 cast
# (presumably a non-numeric identifier such as 'label' -- TODO confirm).
features = data_subset.drop('value', axis=1).values[:, 1:].astype(np.float32)

In [None]:
# With both with_mean and with_std disabled, StandardScaler should neither
# centre nor scale, so this step is expected to pass the values through
# unchanged. Note that `sc` is rebound here to a fresh scaler.
sc = StandardScaler(copy=True, with_mean=False, with_std=False)
# NOTE(review): the [:, 1:] slice drops the first remaining column in addition
# to the one already removed from features -- confirm this is intended.
X = sc.fit_transform(features[:,1:])

In [None]:
# Hold out 33% of the samples for testing with a fixed random_state; this
# rebinds X_train/X_test/y_train/y_test to the data without core genes.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.33, random_state=42)

In [None]:
# Fresh, unfitted polynomial-kernel SVM with the same hard-coded
# hyperparameters, to be fitted on the data without core genes.
# `degree` and `max_iter` are written as ints: recent scikit-learn releases
# validate estimator parameters when fitting and reject a float max_iter.
best_classifier = SVC(C=77358.56620434714, cache_size=512, class_weight=None,
  coef0=5694.538815205853, decision_function_shape='ovr', degree=2,
  gamma=758.5493658267706, kernel='poly', max_iter=707238803,
  probability=False, random_state=1, shrinking=True,
  tol=1.621850765788926e-05, verbose=False)

In [None]:
%%time
# fit() returns the fitted estimator itself, so at this point y_score refers
# to the classifier object rather than to any scores.
y_score = best_classifier.fit(X_train, y_train)

In [None]:
# Query best_classifier directly instead of rebinding y_score from itself, so
# this cell can be re-run safely: once y_score holds the score array it no
# longer has a decision_function method.
y_score = best_classifier.decision_function(X_test)

In [None]:
# Hard class predictions on the held-out set, used for the metrics below.
predictions = best_classifier.predict(X_test)

In [None]:
# Calculate the evaluation metrics for the model fitted on the subset of the
# main data; every metric compares the test labels with the hard predictions.
F1 = f1_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
# Computed and stored; not displayed in this cell.
conf_matrix = confusion_matrix(y_test, predictions)

In [None]:
# Values are printed in the order: F1, accuracy, precision, recall.
print(F1, accuracy, precision, recall)
#Output:0.6757164404223228 0.8 0.6726726726726727 0.6787878787878788

In [None]:
#Draw roc curve of the model fitted on the subset of the main data
from sklearn.metrics import roc_curve, auc

# ROC points and area under the curve for the model fitted without core genes.
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Draw on an explicit Figure/Axes pair rather than the implicit pyplot state.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', linewidth=1,
            label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', linewidth=1, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
roc_ax.legend(loc="lower right")
roc_fig.savefig("withoutcoregenes.png", dpi=600)

In [None]:
# Overlay two ROC curves on one plot: the model fitted on the full data
# and the same model fitted on the subset of the data.
roc_fig, roc_ax = plt.subplots(figsize=(7,5), dpi=600)
roc_ax.plot(fpr_full, tpr_full, color='darkorange', linewidth=1,
            label='With core genes ROC curve (area = %0.2f)' % roc_auc_full)
roc_ax.plot(fpr, tpr, color='magenta', linewidth=1,
            label='Without core genes ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', linewidth=1, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
roc_ax.legend(loc="lower right")
roc_fig.savefig("bestmodelwitnandwithoutcoregenes.png", dpi=600)

Train an SVM classifier specifically for the dataset without the core genes.

In [None]:
%%time
from hyperopt import hp, tpe
from hpsklearn import HyperoptEstimator, any_preprocessing, svc

In [None]:
%%time
# Hyperparameter search over SVC models and preprocessing steps driven by
# tpe.suggest: at most 100 evaluations, each trial limited by
# trial_timeout=120 (presumably seconds -- TODO confirm in the hpsklearn docs).
estim = HyperoptEstimator(algo=tpe.suggest, classifier = svc('svm'), preprocessing=any_preprocessing('pre'), max_evals=100, trial_timeout=120)

In [None]:
%%time
# Run the search on the current training split.
estim.fit(X_train, y_train)

In [None]:
#Get the accuracy score of the new model on the held-out test split
print(estim.score(X_test, y_test))

Output: 0.7981395348837209

In [None]:
#Get the new classifier: print the best configuration found by the search
print(estim.best_model())

Output: {'learner': SVC(C=72675.83342839088, cache_size=512, class_weight=None,
  coef0=47.835768423975146, decision_function_shape='ovr', degree=3.0,
  gamma=4.928279913317778, kernel='poly', max_iter=93037515.0,
  probability=False, random_state=0, shrinking=False,
  tol=0.0030041920162780875, verbose=False), 'preprocs': (MinMaxScaler(copy=True, feature_range=(0.0, 1.0)),), 'ex_preprocs': ()}

In [None]:
# New classifier optimized for the data without the core genes.
# `degree` and `max_iter` are written as ints: recent scikit-learn releases
# validate estimator parameters when fitting and reject a float max_iter.
newclassifier = SVC(C=72675.83342839088, cache_size=512, class_weight=None,
  coef0=47.835768423975146, decision_function_shape='ovr', degree=3,
  gamma=4.928279913317778, kernel='poly', max_iter=93037515,
  probability=False, random_state=0, shrinking=False,
  tol=0.0030041920162780875, verbose=False)

In [None]:
#Fit the data to the MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(copy=True)
# NOTE(review): the scaler is fitted on all samples before the train/test
# split, so the test rows influence the scaling range -- consider fitting on
# the training split only. The [:, 1:] slice again drops the first column of
# `features`; confirm this is intended.
X = sc.fit_transform(features[:,1:])

In [None]:
# Re-split the rescaled data with the same test_size and random_state as the
# earlier splits.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.33, random_state=42)

In [None]:
#Fit new classifier to the data without the core genes
# fit() returns the estimator itself, not scores. Store the decision-function
# values for the held-out set in y_score_new so that roc_curve receives
# continuous scores rather than the classifier object.
y_score_new = newclassifier.fit(X_train, y_train).decision_function(X_test)

In [None]:
#Predict the labels of the held-out test split with the new classifier
y_pred = newclassifier.predict(X_test)

In [None]:
# Calculate the metrics for the new model. These assignments rebind the
# F1/accuracy/precision/recall/conf_matrix names used for the previous model.
F1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Computed and stored; not displayed in this cell.
conf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Values are printed in the order: F1, accuracy, precision, recall.
print(F1, accuracy, precision, recall)

Output: 0.6756352765321374 0.7981395348837209 0.6666666666666666 0.6848484848484848


In [None]:
#Draw roc curve for the model optimized for the data without the core genes
from sklearn.metrics import roc_curve, auc

# ROC points and area under the curve for the newly optimized model.
fpr_new, tpr_new, thresholds = roc_curve(y_test, y_score_new)
roc_auc_new = auc(fpr_new, tpr_new)

# Draw on an explicit Figure/Axes pair rather than the implicit pyplot state.
roc_fig, roc_ax = plt.subplots(figsize=(7,5), dpi=600)
roc_ax.plot(fpr_new, tpr_new, color='darkorange', linewidth=1,
            label='ROC curve (area = %0.2f)' % roc_auc_new)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', linewidth=1, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
roc_ax.legend(loc="lower right")
roc_fig.savefig('newmodel.png', dpi=600)

In [None]:
# Overlay the ROC curves of all three models on one plot:
#   - the initial best model fitted on the full dataset,
#   - the initial best model fitted on the dataset without the core genes,
#   - the model optimized for the dataset without the core genes.
roc_fig, roc_ax = plt.subplots(figsize=(7,5), dpi=600)
roc_ax.plot(fpr_full, tpr_full, color='darkorange', linewidth=1,
            label='Best model ROC curve (area = %0.2f)' % roc_auc_full)
roc_ax.plot(fpr, tpr, color='magenta', linewidth=1,
            label='Best model without core genes ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot(fpr_new, tpr_new, color='purple', linewidth=1,
            label='New model without core genes ROC curve (area = %0.2f)' % roc_auc_new)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', linewidth=1, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic')
roc_ax.legend(loc="lower right")
roc_fig.savefig("allSVM.png", dpi=600)