In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [2]:
# Load the train/test features and labels from the pickles under ./data.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
X_train, X_test = (
    pd.read_pickle('./data/X_train_pickle.pkl'),
    pd.read_pickle('./data/X_test_pickle.pkl'),
)
y_train, y_test = (
    pd.read_pickle('./data/y_train_pickle.pkl'),
    pd.read_pickle('./data/y_test_pickle.pkl'),
)

# Linear SVM

In [3]:
# Baseline model: an SVC with a linear kernel; every other hyperparameter is
# left at the library default.
svclassifier = SVC(
    kernel='linear',
)
# Kept as the cell's last expression so the fitted estimator's repr is shown.
svclassifier.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [4]:
# Score the linear-kernel model on the test split. Accuracy and F1 are kept in
# `linear_accuracy` / `linear_f1` for the cross-kernel comparison further down.
classes = ['<=50K', '>50K']  # display names passed to the report as target_names
y_pred = svclassifier.predict(X_test)
linear_accuracy, linear_f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=classes))

[[4201  326]
 [ 624  848]]
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90      4527
        >50K       0.72      0.58      0.64      1472

    accuracy                           0.84      5999
   macro avg       0.80      0.75      0.77      5999
weighted avg       0.83      0.84      0.84      5999



# Polynomial Kernel

In [5]:
# This can be done using a GridSearch to expedite the process. 
# Because otherwise, the third cell in this section is essentially re-running the classifier using the optimal parameters that we already ran
scores = {}
for num in range(1,8):
    svclassifier = SVC(kernel='poly', degree=num, gamma='scale')
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    scores[num] = accuracy_score(y_test, y_pred)
%store scores

Stored 'scores' (dict)


In [6]:
%store -r scores
high = 0
for degree, accuracy in scores.items():
    if accuracy > high:
        high = accuracy
        deg = degree
print("A degree of {} results in the highest accuracy of {}".format(deg, round(high, 3)))

A degree of 1 results in the highest accuracy of 0.843


In [7]:
# Refit the polynomial-kernel SVC at the best degree and score it on the test
# split. The degree comes from `deg` (set by the degree-selection cell) rather
# than a hard-coded literal, so this cell stays in step with the sweep if the
# data or the degree range changes.
svclassifier = SVC(kernel='poly', degree=deg, gamma='scale')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

# Kept for the cross-kernel comparison further down.
poly_accuracy = accuracy_score(y_test, y_pred)
poly_f1 = f1_score(y_test, y_pred)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4208  319]
 [ 622  850]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4527
           1       0.73      0.58      0.64      1472

    accuracy                           0.84      5999
   macro avg       0.80      0.75      0.77      5999
weighted avg       0.84      0.84      0.84      5999



# Gaussian Kernel

In [8]:
# Gaussian (RBF) kernel SVC with gamma='scale'; other settings are left at
# their defaults.
svclassifier = SVC(
    gamma='scale',
    kernel='rbf',
)
# Kept as the cell's last expression so the fitted estimator's repr is shown.
svclassifier.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [9]:
# Score the RBF-kernel model on the test split. Accuracy and F1 are kept in
# `gaussian_accuracy` / `gaussian_f1` for the cross-kernel comparison.
y_pred = svclassifier.predict(X_test)
gaussian_accuracy, gaussian_f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[4209  318]
 [ 637  835]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4527
           1       0.72      0.57      0.64      1472

    accuracy                           0.84      5999
   macro avg       0.80      0.75      0.77      5999
weighted avg       0.83      0.84      0.83      5999



# Sigmoid Kernel

In [10]:
# Sigmoid-kernel SVC with gamma='scale'; other settings are left at their
# defaults.
svclassifier = SVC(
    gamma='scale',
    kernel='sigmoid',
)
# Kept as the cell's last expression so the fitted estimator's repr is shown.
svclassifier.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
# Score the sigmoid-kernel model on the test split. Accuracy and F1 are kept in
# `sigmoid_accuracy` / `sigmoid_f1` for the cross-kernel comparison.
y_pred = svclassifier.predict(X_test)
sigmoid_accuracy, sigmoid_f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[4098  429]
 [ 627  845]]
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      4527
           1       0.66      0.57      0.62      1472

    accuracy                           0.82      5999
   macro avg       0.77      0.74      0.75      5999
weighted avg       0.82      0.82      0.82      5999



## Compare accuracy and F1 scores across all SVC kernel variants

In [12]:
def _best_scoring_model(metric_scores):
    """Return ``(model_name, score)`` for the highest score in ``metric_scores``.

    ``metric_scores`` maps a model name to its score. When several models
    share the top score, the one that comes first in the dict is returned.
    Unlike a running maximum seeded with 0, this still names a model when
    every score is 0.
    """
    top_model = max(metric_scores, key=metric_scores.get)
    return top_model, metric_scores[top_model]


# Metric name -> {kernel name -> score}, built from the per-kernel results
# computed in the evaluation cells above.
comparison_dictionary = {
    'accuracy':{
        'linear': linear_accuracy,
        'poly': poly_accuracy,
        'gaussian': gaussian_accuracy,
        'sigmoid': sigmoid_accuracy,
    },
    'f1':{
        'linear': linear_f1,
        'poly': poly_f1,
        'gaussian': gaussian_f1,
        'sigmoid': sigmoid_f1,
    }
}

# compares accuracy between models
best_model, high_accuracy = _best_scoring_model(comparison_dictionary['accuracy'])
print("The best accuracy score that we compared was {} and was a result of the {} model".format(high_accuracy, best_model))

# compares f1 between models (`best_model` is reused and ends up naming the F1 winner)
best_model, high_f1 = _best_scoring_model(comparison_dictionary['f1'])
print("The best f1 score that we compared was {} and was a result from the {} model".format(high_f1, best_model))

The best accuracy score that we compared was 0.8431405234205701 and was a result of the poly model
The best f1 score that we compared was 0.6436955698599016 and was a result from the poly model


# Fill in the variables below with actual results

In [14]:
SVM_f1 = poly_f1
SVM_accuracy = poly_accuracy
svm = {
    'accuracy': SVM_accuracy,
    'f1': SVM_f1
}
%store svm

Stored 'svm' (dict)
