In [6]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [7]:
# Read data, clean up names

df = pd.read_csv("data/PurchaseData.csv")
df = df.drop(['User ID'], axis=1)
print(df.shape)
df.describe()

(400, 4)


Unnamed: 0,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   EstimatedSalary  400 non-null    int64 
 3   Purchased        400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


In [9]:
# Convert Categorical Features to numerical.
df.loc[df['Gender'] == 'Male', 'Gender'] = 0
df.loc[df['Gender'] == 'Female', 'Gender'] = 1

In [10]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,0,19,19000,0
1,0,35,20000,0
2,1,26,43000,0
3,1,27,57000,0
4,0,19,76000,0


### Use Gender, Age and EstimatedSalary as Independent Variables - Predict 'Purchased' ###

In [11]:
# Store relevant columns as variables
X = df[['Gender', 'Age', 'EstimatedSalary']]
y = df['Purchased']


In [12]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Run SVM Models Using Different Kernels (linear, poly & rbf) and Show Results ###

In [13]:
#Import svm model
from sklearn import svm
# For each of the kernels:
#     Create a svm Classifier
#     Train the model using the training sets
#     Predict the response for test dataset

# Dictionary to store trained models and predictions
models = {}
predictions = {}

# List of kernels to evaluate
kernels = ['linear', 'poly', 'rbf']

for kernel in kernels:
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    models[kernel] = clf
    predictions[kernel] = y_pred


In [14]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# For each of the models in the cell above:
#     Model Accuracy: how often is the classifier correct?
#     Model Precision: what percentage of positive tuples are labeled as such?
#     Model Recall: what percentage of positive tuples are labelled as such?
#     Print a Confusion Matrix

for kernel in kernels:
    print(f"\n--- SVM with {kernel} kernel ---")
    y_pred = predictions[kernel]

    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)

    print(f"Accuracy : {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall   : {recall:.2f}")
    print("Confusion Matrix:\n", conf_matrix)


--- SVM with linear kernel ---
Accuracy : 0.85
Precision: 0.85
Recall   : 0.68
Confusion Matrix:
 [[74  5]
 [13 28]]

--- SVM with poly kernel ---
Accuracy : 0.77
Precision: 0.88
Recall   : 0.37
Confusion Matrix:
 [[77  2]
 [26 15]]

--- SVM with rbf kernel ---
Accuracy : 0.79
Precision: 0.90
Recall   : 0.44
Confusion Matrix:
 [[77  2]
 [23 18]]
