# KERNEL SUPPORT VECTOR MACHINE (SVM)

## DATA PREPROCESSING

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
# Load the mushroom data set from the local Excel workbook.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook on another machine.
dataset = pd.read_excel("C:/Users/Latitude/Desktop/Mushroom Classification/Mushroom.xlsx")
# Rotate the columns so that the first one moves to the end: the later cells
# read the features with iloc[:, :-1] and the target with iloc[:, -1].
# The positions are derived from the frame's actual width instead of a
# hard-coded 21, so the cell keeps working if the column count changes.
order = list(range(1, len(dataset.columns)))
neworder = order + [0]
dataset = dataset[dataset.columns[neworder]]
len(dataset.columns)

21

## Dealing with NAs

Compute the percentage of missing values per column, then drop every column that contains at least one missing value

In [3]:
# Share of missing entries per column, expressed as a percentage of all rows.
row_count = len(dataset)
null_counts = dataset.isnull().sum()
percent_missing = null_counts * 100 / row_count
print(percent_missing)

# Keep only the columns that have no missing entry at all, then report how
# many columns are left.
dataset = dataset.dropna(axis="columns")
dataset.shape[1]

cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-root               84.393064
stem-surface            62.427746
stem-color               0.000000
veil-type               94.797688
veil-color              87.861272
has-ring                 0.000000
ring-type                4.046243
spore-print-color       89.595376
habitat                  0.000000
season                   0.000000
class                    0.000000
dtype: float64


12

Check whether the columns were eliminated

In [4]:
# Recompute the per-column missing percentage on the reduced frame; every
# remaining column is expected to report 0.
percent_missing = dataset.isnull().sum().mul(100).div(len(dataset))
print(percent_missing)

cap-diameter            0.0
cap-shape               0.0
cap-color               0.0
does-bruise-or-bleed    0.0
gill-color              0.0
stem-height             0.0
stem-width              0.0
stem-color              0.0
has-ring                0.0
habitat                 0.0
season                  0.0
class                   0.0
dtype: float64


## Encoding Categorical Data

Select the columns that contain categorical data

In [5]:
# Every column except the last is a feature; the last column is the target.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [6]:
# Positions (within X) of the columns treated as categorical: 1-4 and 7-10.
cols1 = list(range(1, 5))
cols2 = list(range(7, 11))
cols = [*cols1, *cols2]

# Keep just those columns so they can be inspected on their own.
Xnew = X[:, cols]
print(cols)

[1, 2, 3, 4, 7, 8, 9, 10]


In [7]:
# Preview the first ten rows of the categorical-only slice.
first_rows = Xnew[:10]
print(first_rows)

[['x' 'o' 'f' 'w' 'w' 't' 'd' 'w']
 ['x' 'o' 'f' 'w' 'w' 't' 'd' 'u']
 ['x' 'o' 'f' 'w' 'w' 't' 'd' 'w']
 ['f' 'e' 'f' 'w' 'w' 't' 'd' 'w']
 ['x' 'o' 'f' 'w' 'w' 't' 'd' 'w']
 ['x' 'o' 'f' 'w' 'w' 't' 'd' 'u']
 ['f' 'o' 'f' 'w' 'w' 't' 'd' 'w']
 ['x' 'e' 'f' 'w' 'w' 't' 'd' 'u']
 ['f' 'o' 'f' 'w' 'w' 't' 'd' 'a']
 ['f' 'e' 'f' 'w' 'w' 't' 'd' 'w']]


Encoding Categorical Variables

In [8]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the target labels, then replace the labels with their
# integer codes.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns and pass the remaining columns
# through unchanged.
# The column positions come from `cols` (built in the earlier cell) rather
# than a second copy of the same literal index list, so the inspected columns
# and the encoded columns cannot drift apart.
# NOTE(review): X is overwritten here, so re-running this cell without first
# rebuilding X from `dataset` would encode the already-encoded matrix.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), cols)], remainder='passthrough')
X = ct.fit_transform(X)

In [10]:
# Show the first ten rows of the encoded feature matrix.
head_rows = X[:10]
print(head_rows)

  (0, 6)	1.0
  (0, 13)	1.0
  (0, 19)	1.0
  (0, 31)	1.0
  (0, 44)	1.0
  (0, 47)	1.0
  (0, 48)	1.0
  (0, 59)	1.0
  (0, 60)	15.26
  (0, 61)	16.95
  (0, 62)	17.09
  (1, 6)	1.0
  (1, 13)	1.0
  (1, 19)	1.0
  (1, 31)	1.0
  (1, 44)	1.0
  (1, 47)	1.0
  (1, 48)	1.0
  (1, 58)	1.0
  (1, 60)	16.6
  (1, 61)	17.99
  (1, 62)	18.19
  (2, 6)	1.0
  (2, 13)	1.0
  (2, 19)	1.0
  :	:
  (7, 60)	14.86
  (7, 61)	17.03
  (7, 62)	17.44
  (8, 2)	1.0
  (8, 13)	1.0
  (8, 19)	1.0
  (8, 31)	1.0
  (8, 44)	1.0
  (8, 47)	1.0
  (8, 48)	1.0
  (8, 56)	1.0
  (8, 60)	12.85
  (8, 61)	17.27
  (8, 62)	18.69
  (9, 2)	1.0
  (9, 8)	1.0
  (9, 19)	1.0
  (9, 31)	1.0
  (9, 44)	1.0
  (9, 47)	1.0
  (9, 48)	1.0
  (9, 59)	1.0
  (9, 60)	13.55
  (9, 61)	16.04
  (9, 62)	16.88


In [11]:
# Display the label-encoded target vector.
print(y)

[1 1 1 ... 1 1 1]


## Splitting the dataset into the Training set and Test set

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler
# with_mean=False scales each column without centering it
# (presumably because the encoded matrix is sparse - confirm).
# The scaling is learned from the training rows only and then applied to
# both splits.
sc = StandardScaler(with_mean=False).fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## FITTING AND EVALUATING THE MODEL

## Training the Kernel SVM model on the Training set

In [14]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier with a fixed seed.
svc_options = {'kernel': 'rbf', 'random_state': 0}
classifier = SVC(**svc_options)
classifier.fit(X_train, y_train)  # took 2 minutes 34 seconds on the author's run

## Predicting the Test set results

In [15]:
# Predict the held-out rows, then print the predictions (left column) next to
# the true labels (right column).
y_pred = classifier.predict(X_test)
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

[[1 0]
 [0 1]
 [1 1]
 ...
 [0 0]
 [0 0]
 [1 1]]


## Making the Confusion Matrix

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true versus predicted labels on the test split,
# followed by the overall accuracy.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)  # 96.04%

[[6349  330]
 [ 274 8315]]


0.9604401362326435

## Applying K-Fold Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score
# Tip: passing n_jobs=-1 would spread the work over all CPUs.
# 10-fold cross-validation on the training split (cv is the number of folds).
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
# Took about 24 minutes on the author's run.

In [18]:
# Average accuracy over all cross-validation folds.
np.mean(accuracies)  # 96.04%

0.9603721132187344

In [19]:
# Standard deviation of the fold accuracies: a measure of how much the score varies from fold to fold.
accuracies.std() # ~0.0027, i.e. about 0.27 percentage points of spread (not <0.01%). Low spread across folds; together with the ~96% mean accuracy this suggests low variance and low bias.

0.00270888542589281