# Kernel SVM case study 

This case study uses a CSV file that records which users purchased (or did not purchase) a particular product.
I have used a Kernel SVM model to predict the purchase from the variables Gender, Age and Estimated Salary.

## Importing libraries 

In [39]:
import os

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

## Importing dataset

https://www.kaggle.com/rakeshrau/social-network-ads

In [40]:
# Location of the Social Network Ads CSV. The default is the original local
# path used when this notebook was written; set the SOCIAL_NETWORK_ADS_CSV
# environment variable to point at your own copy so the notebook can run on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'SOCIAL_NETWORK_ADS_CSV',
    'C:/Users/binayak/Videos/Machine Learning/Classification/Kernel SVM/Social_Network_Ads.csv',
)
dataset = pd.read_csv(DATA_PATH)

In [41]:
# Display the loaded frame; pandas abbreviates the middle rows with "..." for a frame this long.
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [42]:
# Column names, non-null counts and dtypes: confirms there are no missing values
# and that Gender is the only non-numeric (object) column.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 14.1+ KB


In [43]:
# Summary statistics for the numeric columns (Gender is omitted because it is an object column).
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


## Separate the dependent variable from the independent variables

In [44]:
# Features: every column between the first (User ID, an identifier that is
# dropped) and the last, i.e. Gender, Age and EstimatedSalary.
feature_frame = dataset.iloc[:, 1:-1]
# Target: the last column (Purchased).
target_series = dataset.iloc[:, -1]

X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [45]:
# Inspect the raw feature matrix: columns are Gender, Age, EstimatedSalary
# (object dtype because Gender is still a string).
X

array([['Male', 19, 19000],
       ['Male', 35, 20000],
       ['Female', 26, 43000],
       ...,
       ['Female', 50, 20000],
       ['Male', 36, 33000],
       ['Female', 49, 36000]], dtype=object)

In [46]:
# Inspect the target vector taken from the Purchased column.
y

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,

## Encoding the categorical variable

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of X (Gender); every other column is passed through
# untouched and placed after the encoded columns.
categorical_steps = [('encoder', OneHotEncoder(), [0])]
ct = ColumnTransformer(transformers=categorical_steps, remainder='passthrough')

In [48]:
# Replace the Gender string column with its one-hot columns.
# NOTE: this overwrites X with its own transformed version, so the cell is not
# idempotent -- running it a second time would one-hot encode the already
# encoded column 0 again. Re-run the cell that builds X from `dataset` first.
X= np.array(ct.fit_transform(X))

In [49]:
# Inspect the encoded matrix: the first two columns are the one-hot Gender
# indicators, followed by Age and EstimatedSalary.
X

array([[0.0, 1.0, 19, 19000],
       [0.0, 1.0, 35, 20000],
       [1.0, 0.0, 26, 43000],
       ...,
       [1.0, 0.0, 50, 20000],
       [0.0, 1.0, 36, 33000],
       [1.0, 0.0, 49, 36000]], dtype=object)

## Split into Training and Test set

In [50]:
from sklearn.model_selection import train_test_split

# Seed for the shuffle performed by train_test_split. Without a fixed seed the
# split -- and therefore the confusion matrix and accuracy reported further
# down -- changes on every run. The outputs recorded in this notebook came from
# an unseeded run, so re-executing will give slightly different figures.
SPLIT_SEED = 0

# Hold out a quarter of the observations as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/4, random_state=SPLIT_SEED
)

In [51]:
# Sanity check: report the dimensions of each split.
shape_report = (
    f'The shape of X_train: {X_train.shape}; y_train: {y_train.shape}; '
    f'X_test:{X_test.shape}; y_test:{y_test.shape}'
)
print(shape_report)

The shape of X_train: (300, 4); y_train: (300,); X_test:(100, 4); y_test:(100,)


## Feature Scaling 

In [52]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Only columns 2 onward (Age, EstimatedSalary) are standardized; columns 0-1 are
# the one-hot Gender indicators and are left as they are.
# The scaler is fitted on the training split only ...
X_train[:,2:] = ss.fit_transform(X_train[:,2:])
# ... and the same fitted scaler is then applied to the test split, so no
# statistics from the test data are used for scaling.
# Both assignments overwrite the arrays in place.
X_test[:,2:] = ss.transform(X_test[:,2:])

In [53]:
# Peek at the first 10 training rows: the last two columns are now scaled,
# the first two (one-hot Gender) are unchanged.
X_train[:10,:]

array([[0.0, 1.0, 0.7622909316130493, -1.3695908111329085],
       [1.0, 0.0, -0.2960010643158792, 0.8233349737318997],
       [1.0, 0.0, -0.39220942758214544, -0.7769081665748522],
       [1.0, 0.0, 0.37745747854798434, 1.1493104282388307],
       [0.0, 1.0, -0.199792701049613, -0.2731279187005044],
       [1.0, 0.0, 0.9547076581455817, 1.8308954694805952],
       [1.0, 0.0, -1.1618763337122753, -0.5102009765237269],
       [1.0, 0.0, 0.28124911528171814, 0.31955472585755185],
       [1.0, 0.0, 0.8584992948793155, -0.658371637663241],
       [0.0, 1.0, -0.199792701049613, -0.5102009765237269]], dtype=object)

In [54]:
# Peek at the first 10 test rows after applying the scaler fitted on the training data.
X_test[:10,:]

array([[1.0, 0.0, 0.7622909316130493, 0.14174993249013498],
       [1.0, 0.0, -1.931543239842405, 0.49735951922496874],
       [0.0, 1.0, -1.2580846969785415, -1.3992249433608113],
       [1.0, 0.0, 1.3395411112106466, 2.038334395075915],
       [0.0, 1.0, -0.2960010643158792, 0.1713840647180378],
       [1.0, 0.0, -0.10358433778334676, 2.2754074528991377],
       [1.0, 0.0, -1.7391265133098726, -1.3695908111329085],
       [0.0, 1.0, -0.5846261541146779, 1.5049200149736643],
       [1.0, 0.0, -1.065667970446009, -0.4509327120679213],
       [1.0, 0.0, -1.1618763337122753, -0.7769081665748522]], dtype=object)

## Training the Kernel SVM model on training set

`class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)`

In [55]:
from sklearn.svm import SVC
# RBF kernel gives a non-linear decision boundary. 'rbf' is also the default
# kernel in the signature quoted above; it is spelled out here for clarity.
# All other hyperparameters are left at their defaults.
svc = SVC(kernel='rbf')
# Fit on the training split; as the last expression of the cell, the fitted
# estimator's repr is what gets displayed.
svc.fit(X_train,y_train)

SVC()

## Predict Purchase on test set

In [56]:
# Predicted labels for the held-out test observations.
y_pred = svc.predict(X_test)

# Put the predictions (left column) next to the true labels (right column)
# so each row can be compared at a glance.
pred_vs_actual = np.column_stack((y_pred, y_test))
print(pred_vs_actual)

[[0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]]


## Making a Confusion Matrix & calculating the Accuracy Score

In [57]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
# Fraction of test observations whose predicted label matches the true label.
a_score = accuracy_score(y_test, y_pred)

print(cm)
print(a_score)

[[58  5]
 [ 4 33]]
0.91


58 : correct predictions of class 0 (true negatives)

33 : correct predictions of class 1 (true positives)

Hence (58 + 33) / 100 = 91% of the predictions are correct (the test set has 100 observations).

Thus,

5 : class 0 observations incorrectly predicted as class 1 (false positives)

4 : class 1 observations incorrectly predicted as class 0 (false negatives)


So, I find that with kernel='rbf' (i.e. a non-linear decision boundary), the Kernel SVM model beats the Logistic Regression and linear SVM models in accuracy, but it is slightly less accurate than the K-NN model.