# SVM case study 

CSV file which tells which of the users purchased/not purchased a particular product.
I have used a SVM model to predict the purchase based on certain variables like gender, Age and Estimated Salary 

## Importing libraries 

In [115]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

## Importing dataset

https://www.kaggle.com/rakeshrau/social-network-ads

In [116]:
dataset = pd.read_csv('C:/Users/binayak/Videos/Machine Learning/Classification/SVM/Social_Network_Ads.csv')

In [117]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [118]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 14.1+ KB


In [119]:
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


## Separate the dependent variable from independent 

In [120]:
X = dataset.iloc[:,1:-1].values
y = dataset.iloc[:,-1].values

In [121]:
X

array([['Male', 19, 19000],
       ['Male', 35, 20000],
       ['Female', 26, 43000],
       ...,
       ['Female', 50, 20000],
       ['Male', 36, 33000],
       ['Female', 49, 36000]], dtype=object)

In [122]:
y

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,

## Categorical variables : Encoded

In [123]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],
                       remainder='passthrough')

In [124]:
X= np.array(ct.fit_transform(X))

In [125]:
X

array([[0.0, 1.0, 19, 19000],
       [0.0, 1.0, 35, 20000],
       [1.0, 0.0, 26, 43000],
       ...,
       [1.0, 0.0, 50, 20000],
       [0.0, 1.0, 36, 33000],
       [1.0, 0.0, 49, 36000]], dtype=object)

## Split into Training and Test set

In [126]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/4)

In [127]:
print('The shape of X_train: {}; y_train: {}; X_test:{}; y_test:{}'.
      format(X_train.shape,y_train.shape,X_test.shape,y_test.shape))

The shape of X_train: (300, 4); y_train: (300,); X_test:(100, 4); y_test:(100,)


## Feature Scaling 

In [128]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train[:,2:] = ss.fit_transform(X_train[:,2:])
X_test[:,2:] = ss.transform(X_test[:,2:])

In [129]:
X_train[:10,:]

array([[1.0, 0.0, 1.556900888666933, -1.2854861434180087],
       [1.0, 0.0, 1.74867681418534, 1.008624271162635],
       [1.0, 0.0, -0.6485222547947461, -0.050195920182277476],
       [0.0, 1.0, 0.21446941003808484, -0.13843093612768687],
       [1.0, 0.0, -0.4567463292763393, -0.5501943438729306],
       [1.0, 0.0, 1.1733490376301192, 0.5380375194537851],
       [0.0, 1.0, 0.1185814472788814, 1.8909744306167289],
       [0.0, 1.0, -0.07319447823952549, -0.4325476559457181],
       [0.0, 1.0, 0.21446941003808484, 0.15568578369034441],
       [0.0, 1.0, -0.2649704037579324, 0.6262725353991945]], dtype=object)

In [130]:
X_test[:10,:]

array([[0.0, 1.0, 0.21446941003808484, 0.24392079963575378],
       [0.0, 1.0, -0.2649704037579324, 0.09686243972673815],
       [0.0, 1.0, -1.6074018823867806, 0.06745076774493503],
       [1.0, 0.0, -1.3197379941091703, -1.2560744714362055],
       [1.0, 0.0, -1.1279620685907634, 0.06745076774493503],
       [0.0, 1.0, 0.31035737279728826, -0.5207826718911275],
       [1.0, 0.0, 0.7897971865933056, 0.3615674875629663],
       [0.0, 1.0, 0.21446941003808484, -0.6678410318001431],
       [1.0, 0.0, -0.7444102175539496, 0.2733324716175569],
       [0.0, 1.0, 0.02269348451967795, -0.25607762405489937]],
      dtype=object)

## Training the SVM model on training set

class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=- 1, decision_function_shape='ovr', break_ties=False, random_state=None)

In [131]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(X_train,y_train)

SVC(kernel='linear')

## Predict Purchase in test set

In [132]:
y_pred = svc.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),
                    axis=1))

[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [1 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]]


## Making a Confusion Matrix & calculate Accuracy Score

In [133]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print(cm)

from sklearn.metrics import accuracy_score
a_score = accuracy_score(y_test,y_pred)
print(a_score)

[[62  5]
 [10 23]]
0.85


62 : correct prediction of class 0 

23 : correct prediction of class 1 

Hence 85% are correct (since test set has 100 observations)

Thus,

5 : incorrect prediction of class 1

10 : incorrect prediction of class 0


So, I find that with kernel='linear' i.e. a linear decision boundary the SVM model is very similar to the Logistic Regression Model in accuracy but it does not beat the K-NN model since it is non-linear classifier model i.e. with a non linear decision boundary and hence captures more accurate datapoints.