In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data preprocessing

In [4]:
df = pd.read_csv('Social_Network_Ads.csv')

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Training the SVM model

In [5]:
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', random_state=0)
svm_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

## Predicting the test set

In [7]:
cust = X_test[:1, :]

y_pred = svm_clf.predict(cust)
y_pred

array([0])

In [8]:
y_pred = svm_clf.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1])

In [9]:
np.set_printoptions(precision=2)
np.concatenate(
    (
        y_pred.reshape(len(y_pred), 1), 
        y_test.reshape(len(y_test), 1)
    ),
    1
)

array([[0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 0],
       [1, 1],
       [0, 1],
       [0, 0],
       [0, 0],
       [0, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0,

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score

c_matrix = confusion_matrix(y_test, y_pred)
c_matrix

array([[63,  5],
       [ 7, 25]])

In [11]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(tn, fp, fn, tp)

63 5 7 25


We can conclude from this that:

63 records were correctly predicted negative (true negative) i.e. predicted to not purchase and did not <br>
25 records were correctly predicted positive (true positives) i.e. predicted to purchase and did <br>

7 records were incorrectly predicted negative (false negative) i.e. predicted to not purchase but did <br>
5 records were incorrectly predicted positive (false positive) i.e. predicted to purchase but did not <br>

In [12]:
acc = accuracy_score(y_test, y_pred)
acc

0.88

Our accuracy is the same as the logistic regression model however, it did not manage to beat our K-NN model. So why did it not beat it? Well our kernel is linear, as K-NN is non-linear it is far more likely to make better predictions.