In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [2]:
# Read the pickled train/test splits: X_* are the inputs later passed to
# fit()/predict(), y_* are the matching label vectors.
X_train, X_test = (
    pd.read_pickle(path)
    for path in ('./data/X_train_pickle.pkl', './data/X_test_pickle.pkl')
)
y_train, y_test = (
    pd.read_pickle(path)
    for path in ('./data/y_train_pickle.pkl', './data/y_test_pickle.pkl')
)

# Linear SVM

In [3]:
# Support vector classifier with a linear kernel; every other hyper-parameter
# is left at the library default.
svclassifier = SVC(kernel='linear')
# fit() stays as the cell's final bare expression so the fitted estimator's
# repr is still displayed as the cell output.
svclassifier.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [6]:
# Display names handed to classification_report as target_names.
# NOTE(review): presumably label 0 is '<=50K' and label 1 is '>50K' - confirm
# against how the labels were encoded.
classes = ['<=50K', '>50K']

# Predict on the test split, then print the confusion matrix followed by the
# per-class precision/recall/F1 report.
y_pred = svclassifier.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred, target_names=classes),
    sep='\n',
)

[[4201  326]
 [ 624  848]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4527
           1       0.72      0.58      0.64      1472

    accuracy                           0.84      5999
   macro avg       0.80      0.75      0.77      5999
weighted avg       0.83      0.84      0.84      5999



# Polynomial Kernel

In [None]:
# This can be done using a GridSearch to expedite the process. 
# Because otherwise, the third cell in this section is essentially re-running the classifier using the optimal parameters that we already ran
scores = {}
for num in range(1,8):
    svclassifier = SVC(kernel='poly', degree=num, gamma='scale')
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    scores[num] = accuracy_score(y_test, y_pred)
%store scores

In [None]:
%store -r scores
high = 0
for degree, accuracy in scores.items():
    if accuracy > high:
        high = accuracy
        deg = degree
print("A degree of {} results in the highest accuracy of {}".format(deg, round(high, 3)))

In [None]:
# Refit the polynomial kernel at the best degree found by the selection cell
# (`deg`) rather than a hard-coded value, so this report always matches the
# sweep results.
svclassifier = SVC(kernel='poly', degree=deg, gamma='scale')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

# Confusion matrix, then per-class precision/recall/F1 on the test split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Gaussian Kernel

In [None]:
# Support vector classifier with the RBF (Gaussian) kernel and gamma='scale'.
svclassifier = SVC(gamma='scale', kernel='rbf')
# Left as the final bare expression so the fitted estimator's repr is shown.
svclassifier.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split with the currently fitted `svclassifier`, then
# print the confusion matrix followed by the classification report.
y_pred = svclassifier.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

# Sigmoid Kernel

In [None]:
# Support vector classifier with the sigmoid kernel and gamma='scale'.
svclassifier = SVC(gamma='scale', kernel='sigmoid')
# Left as the final bare expression so the fitted estimator's repr is shown.
svclassifier.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split with the currently fitted `svclassifier`, then
# print the confusion matrix followed by the classification report.
y_pred = svclassifier.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

# Fill in the variables below with actual results

In [None]:
SVM_f1 = 0
SVM_accuracy = 0
svm = {
    'accuracy': SVM_accuracy,
    'f1': SVM_f1
}
%store svm