## Create SVM Classifier

In [11]:
import re
from sklearn.metrics import accuracy_score
%matplotlib inline
import pandas as pd
import numpy as np
import glob
import os
from sklearn.svm import SVC
from sklearn import metrics

Create X_train, y_train, X_test, y_test

In [12]:
train_data = pd.read_csv('train-data.csv', index_col=[0])
test_data = pd.read_csv('test-data.csv', index_col=[0])

train_data.head()

y_train = train_data.target
y_test = test_data.target

X_train = train_data.drop(['target', 'target_name'], axis='columns')
X_test = test_data.drop(['target', 'target_name'], axis='columns')
print("X_train, X_test, y_train, y_test created")

X_train, X_test, y_train, y_test created


In [13]:
# from sklearn.model_selection import GridSearchCV
# Cs = np.logspace(-6, -1, 10)
# svc = SVC(kernel='rbf')
# clf = GridSearchCV(svc, param_grid=dict(C=Cs), n_jobs=-1)
# clf.fit(X_train, y_train)
# print(clf.best_score_)
# print(clf.best_estimator_.C)
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     return grid_search.best_params_


In [14]:
#svc_param_selection(X_train, y_train, 5)

Probabilistic Output Model

In [15]:
# TRAINING
# model w/ RBF kernel, prob output
p_clf = SVC(kernel='rbf', gamma=0.1, C=0.0077, probability=True)
print("Starting to train...")
p_clf.fit(X_train, y_train)
print("Training done")

Starting to train...
Training done


In [16]:
# TESTING
print("Starting to test...")
# binary output
y_pred_binary = p_clf.predict(X_test)
# probabilistic output
y_pred = p_clf.predict_proba(X_test)
print("Testing completed")
print("Testing Accuracy:", accuracy_score(y_test, y_pred_binary))

Starting to test...
Testing completed
Testing Accuracy: 0.8635361993669618


In [17]:
# VALIDATION
# use stratified k-fold as validation method
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(p_clf, X_train, y_train, cv=skf)
print("Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Validation Accuracy: 0.79 (+/- 0.16)


In [21]:

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# in this order to make sure false positive and false negative are correct
conf_matrix = confusion_matrix(y_test, y_pred_binary, labels=[0,1])
print(conf_matrix)
tn, fp, fn, tp = conf_matrix.ravel()
print("true negative: ", tn)
print("false positive: ", fp)
print("false negative: ", fn)
print("true positive: ", tp)
print("Specificity (true neg rate): ", tn / (tn + fp))
print("Sensitivity: (true pos rate)", tp / (tp + fn))
print(classification_report(y_test, y_pred_binary, target_names=['CONTROL', 'DISCRETE']))

cmtx = pd.DataFrame(
    confusion_matrix(y_test, y_pred_binary, labels=[0, 1]), 
    index=['true:pos', 'true:neg'], 
    columns=['pred:pos', 'pred:neg']
)
print(cmtx)

[[13697   756]
 [ 2650  7856]]
true negative:  13697
false positive:  756
false negative:  2650
true positive:  7856
Specificity (true neg rate):  0.9476925205839618
Sensitivity: (true pos rate) 0.7477631829430802
              precision    recall  f1-score   support

     CONTROL       0.84      0.95      0.89     14453
    DISCRETE       0.91      0.75      0.82     10506

    accuracy                           0.86     24959
   macro avg       0.88      0.85      0.86     24959
weighted avg       0.87      0.86      0.86     24959

0.8218432890469715
          pred:pos  pred:neg
true:pos     13697       756
true:neg      2650      7856


In [19]:
# import matplotlib.pyplot as plt
# auc = metrics.roc_auc_score(y_test, y_pred)
# print('AUC: %.3f' % auc)
# 
# fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred)
# auc = metrics.roc_auc_score(y_test, y_pred)
# plt.plot(fpr,tpr,label="ROC curve, auc="+str(auc)[0:5])
# plt.legend(loc=4)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic')
# plt.savefig('ROC_curve.png')


In [20]:
# save probabilistic model for use in testing.ipynb
import joblib
joblib.dump(p_clf, 'p-rbf-clf.joblib')

['p-rbf-clf.joblib']