In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Load the train/test features and labels from the pickle files under ./data.
# NOTE(review): unpickling can execute arbitrary code - only load files you created or trust.
pickle_paths = (
    './data/X_train_pickle.pkl',
    './data/X_test_pickle.pkl',
    './data/y_train_pickle.pkl',
    './data/y_test_pickle.pkl',
)
X_train, X_test, y_train, y_test = map(pd.read_pickle, pickle_paths)

# <span style="color:green">Beginning of SVM GridSearch</span>

#### Adjust the variables to your liking.

In [2]:
# Search-space settings read by the GridSearch cell further down.
cv = 5                                          # Number of cross-validation folds.
kernel = ['linear', 'poly', 'rbf', 'sigmoid']   # SVC kernels GridSearch will compare.
degree = range(1, 5)                            # Candidate degrees 1 through 4.

### The cell below takes about <span style="color:red">10 minutes</span> to run
The results will be saved to your disk, allowing you to explore further at a later time without needing to re-run this cell every time.

In [None]:
svm = SVC(gamma='scale',    # Set gamma to 'scale' since we scaled our data.
          probability=True) # We want to return the probability so that we can use it for the ROC curve in our comparison notebook.

# set grid search parameters based on variables assigned above.
param_grid = [{'kernel': kernel,
               'degree': degree}]

svm_grid_search = GridSearchCV(svm, param_grid, cv=cv,  # Just passing in the variables declared above.
                               scoring='f1',            # We want to refit based on better f1 scores.
                               n_jobs=-2,               # Uses all but one of machine's processors.
                               verbose=50)              # verbose > 0 gives us a progress bar to check on.

svm_grid_search.fit(X_train, y_train)
%store svm_grid_search

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-2)]: Done   6 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-2)]: Done   9 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-2)]: Done  12 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-2)]: Done  13 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-2)]: Done  14 tasks      | elapsed: 12.

# <span style="color:orange">Review GridSearch Results.</span>
#### Determine if there are any tunings we want to make before we test our model against the `test set`. 

In [None]:
%store -r svm_grid_search       # restores the results from our previous run of the above cell.
svm_grid_search.best_estimator_ # outputs the best parameters to use based on our GridSearch

# <span style="color:red">Final Test Calculation</span>

Now that we're comfortable with the tuning of our hyperparameters, we can finally run our model on the `test set` to truly challenge it and obtain an unbiased result.

In [None]:
y_pred = svm_grid_search.predict(X_test)                # predicting y hat
svm_model = svm_grid_search.best_params_['kernel']      # name which SVM model was the best parameter
svm_acc = round(accuracy_score(y_test, y_pred)*100, 3)  # calculating accuracy
svm_f1 = round(svm_grid_search.best_score_*100, 3)      # calculating F1 Score
print("After hyper tuning the {} model, the best accuracy we could compute was {} with a f1 score of {}".format(svm_model, svm_acc, svm_f1))

svm = {'accuracy': svm_acc,
       'f1': svm_f1 }
%store svm

# Below are examples of each individual SVM model

## Linear SVM

In [None]:
# Fit an SVC with a linear kernel on the training data; every other setting is left at its default.
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the classifier fitted in the previous cell.
y_pred = svclassifier.predict(X_test)

In [None]:
# Print the confusion matrix and the classification report for the test-set predictions.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

## Polynomial Kernel

In [None]:
# Sweep polynomial degrees 1-7 and record the test accuracy of each fit.
# NOTE(review): the degree is picked using the test set, so the winning accuracy is an
# optimistic estimate - treat this cell as an illustration, not as model selection.
scores = {}
for poly_degree in range(1, 8):
    svclassifier = SVC(kernel='poly', degree=poly_degree, gamma='scale')
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    scores[poly_degree] = accuracy_score(y_test, y_pred)

# Pick the best degree with max(); ties resolve to the lowest degree, as before.
# The loop variables are deliberately NOT named `degree`: that name holds the range
# used by the GridSearch cell, and overwriting it with an int would break a re-run
# of that cell.
deg = max(scores, key=scores.get)
high = scores[deg]
print("A degree of {} results in the highest accuracy of {}".format(deg, round(high, 3)))

In [None]:
# Fit a degree-1 polynomial SVC, predict on the test features, and print both reports.
# NOTE(review): degree=1 is hard-coded; presumably it is the degree the sweep in the
# previous cell reported as best - TODO confirm it still matches after data changes.
svclassifier = SVC(kernel='poly', degree=1, gamma='scale').fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

## Gaussian Kernel

In [None]:
# Fit an SVC with the Gaussian (RBF) kernel on the training data.
# gamma='scale' is passed explicitly so this example uses the same kernel coefficient
# as the grid search and the polynomial cells, instead of whatever the installed
# scikit-learn version uses by default.
svclassifier = SVC(kernel='rbf', gamma='scale')
svclassifier.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the classifier fitted in the previous cell.
y_pred = svclassifier.predict(X_test)

In [None]:
# Print the confusion matrix and the classification report for the test-set predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Sigmoid Kernel

In [None]:
# Fit an SVC with the sigmoid kernel on the training data.
# gamma='scale' is passed explicitly so this example uses the same kernel coefficient
# as the grid search and the polynomial cells, instead of whatever the installed
# scikit-learn version uses by default.
svclassifier = SVC(kernel='sigmoid', gamma='scale')
svclassifier.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the classifier fitted in the previous cell.
y_pred = svclassifier.predict(X_test)

In [None]:
# Print the confusion matrix and the classification report for the test-set predictions.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))