In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the advertising dataset.
df = pd.read_csv('Advertising_data.csv')

# Select the features and the target by column name rather than by position,
# so that a reordered or extended CSV raises a KeyError instead of silently
# feeding the wrong columns into the model.
X = df[['Gender', 'Age', 'EstimatedSalary']]
y = df['Purchased']
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [3]:
# One-hot encode the categorical 'Gender' feature. drop_first=True discards
# the first dummy level (for a two-level column this leaves one indicator).
gender = pd.get_dummies(X['Gender'], drop_first=True)

# Replace the raw 'Gender' column with its encoded counterpart: drop the
# original column and append the dummy column(s) alongside the other features.
X = pd.concat([X.drop(columns=['Gender']), gender], axis=1)

In [4]:
# Hold out 20% of the rows as a test set; the remaining 80% is used for
# training. random_state is pinned so the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits, so the test
# set does not influence the scaling statistics.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Baseline model: a Support Vector Machine classifier (not a neural network).
# Uses a linear kernel with random_state pinned to 0, fitted on the scaled
# training data. The fitted estimator is the cell's displayed output.
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [7]:
# Predict labels for the held-out test set with the fitted classifier
y_pred = classifier.predict(X_test)

In [8]:
# Score the baseline predictions against the true test labels.
# classification_report is imported here as well so that it is available to
# later cells.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print("Confusion: \n", confusion)
print("Accuracy: ", accuracy)

Confusion: 
 [[57  1]
 [ 6 16]]
Accuracy:  0.9125


In [9]:
# Hyper-parameter tuning with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV

# 4 values of C x 4 values of gamma x 3 kernels = 48 candidate settings.
parameters = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

# Each candidate is scored by accuracy with 10-fold cross-validation on the
# training data, using all available cores (n_jobs=-1). refit=True is passed
# so the search exposes a best_estimator_ for later cells. fit() returns the
# search object itself, so the call is chained onto the constructor.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    refit=True,
    verbose=1,
).fit(X_train, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [10]:
# Mean cross-validated score of the best candidate found by the search
# (accuracy, per the `scoring` argument given to GridSearchCV).
grid_search.best_score_

0.9

In [11]:
# Hyper-parameter combination that achieved the best mean cross-validated
# score. NOTE: the search was fitted on X_train / y_train, so this is chosen
# on the cross-validation folds of the training data, not on the test set.
grid_search.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}

In [12]:
# Estimator chosen by the search, i.e. configured with the best parameters
grid_search.best_estimator_

SVC(C=0.1, gamma=1, random_state=0)

In [13]:
# Index of the best candidate parameter setting within the search results
# (the position in the grid_search.cv_results_ arrays).
grid_search.best_index_

0

In [14]:
# Evaluate the tuned model on the held-out test set.
# The grid search was run with refit=True, so best_estimator_ has already
# been refitted on the whole training set with the best parameters; fitting
# it again here would only repeat that work, so it is used as-is.
classifier = grid_search.best_estimator_
y_pred = classifier.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Confusion: \n",confusion)
print("Accuracy: ", accuracy)

Confusion: 
 [[55  3]
 [ 1 21]]
Accuracy:  0.95


In [15]:
# Per-class precision, recall, F1-score and support for the current
# predictions in y_pred, compared against the true test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96        58
           1       0.88      0.95      0.91        22

    accuracy                           0.95        80
   macro avg       0.93      0.95      0.94        80
weighted avg       0.95      0.95      0.95        80

