### Binary classification of mushrooms: poisonous or edible
#### By: Adwaith Moothezhath
#### Using the Support Vector Machine algorithm (supervised learning)

In [44]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler #standardize data
from sklearn.model_selection import train_test_split #spli to train and test
from sklearn.preprocessing import LabelEncoder #map label into number
from sklearn import svm #support vector machine
from sklearn.metrics import accuracy_score 

In [45]:
# Read the mushroom CSV file into a pandas DataFrame.
mushroom_dataset = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [46]:
# Display the first 5 rows of the dataset.
mushroom_dataset.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [48]:
# Count how many rows fall into each value of the 'class' column:
#   e = edible
#   p = poisonous
mushroom_dataset.loc[:, 'class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [69]:
# Replace the values of every column of mushroom_dataset (features and the
# 'class' label alike) with integer codes produced by LabelEncoder.
# `mappings` collects one {integer code: original label} dictionary per
# column, in column order, so the codes can be translated back later.
# NOTE(review): the columns are overwritten in place, so running this cell a
# second time re-encodes the already-numeric columns and replaces `mappings`
# with number -> number entries; reload the CSV before re-running.
mappings = []

encoder = LabelEncoder()

for column_name in mushroom_dataset.columns:
    # fit_transform refits the shared encoder on this column and returns its integer codes
    mushroom_dataset[column_name] = encoder.fit_transform(mushroom_dataset[column_name])
    # classes_ belongs to the most recent fit; position i is the label encoded as i
    mappings_dict = dict(enumerate(encoder.classes_))
    mappings.append(mappings_dict)

In [70]:
# Separate the labels from the features:
#   Y = the encoded 'class' column
#   X = every other column of the encoded dataset
Y = mushroom_dataset['class']
X = mushroom_dataset.drop(columns='class')

In [71]:
# Show the feature columns after label encoding.
print(X)

      cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0             5            2          4        1     6                1   
1             5            2          9        1     0                1   
2             0            2          8        1     3                1   
3             5            3          8        1     6                1   
4             5            2          3        0     5                1   
...         ...          ...        ...      ...   ...              ...   
8119          3            2          4        0     5                0   
8120          5            2          4        0     5                0   
8121          2            2          4        0     5                0   
8122          3            3          4        0     8                1   
8123          5            2          4        0     5                0   

      gill-spacing  gill-size  gill-color  stalk-shape  ...  \
0                0          1       

In [52]:
# Show the encoded 'class' labels.
print(Y)

0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int32


In [53]:
# Create the StandardScaler that is later fitted on X and used to standardize it.
scaler = StandardScaler()

In [54]:
# Fit the StandardScaler on the feature columns in X.
# NOTE(review): at this point X is the full dataset (the train/test split
# happens later), so rows that end up in the test set also contribute to the
# fitted scaling statistics — consider fitting on the training rows only.
scaler.fit(X)

StandardScaler()

In [55]:
# Standardize the features in X with the fitted scaler.
standardized_data = scaler.transform(X)

In [56]:
# Show the standardized feature values.
print(standardized_data)

[[ 1.02971224  0.14012794 -0.19824983 ... -0.67019486 -0.5143892
   2.03002809]
 [ 1.02971224  0.14012794  1.76587407 ... -0.2504706  -1.31310821
  -0.29572966]
 [-2.08704716  0.14012794  1.37304929 ... -0.2504706  -1.31310821
   0.86714922]
 ...
 [-0.8403434   0.14012794 -0.19824983 ... -1.50964337 -2.11182722
   0.28570978]
 [-0.21699152  0.95327039 -0.19824983 ...  1.42842641  0.28432981
   0.28570978]
 [ 1.02971224  0.14012794 -0.19824983 ...  0.16925365 -2.11182722
   0.28570978]]


In [57]:
# From here on X is the standardized feature data; Y is the encoded 'class' column.
X, Y = standardized_data, mushroom_dataset['class']

In [67]:
# Show the standardized features, then the labels, each starting on its own line.
print(X, Y, sep='\n')

[[ 1.02971224  0.14012794 -0.19824983 ... -0.67019486 -0.5143892
   2.03002809]
 [ 1.02971224  0.14012794  1.76587407 ... -0.2504706  -1.31310821
  -0.29572966]
 [-2.08704716  0.14012794  1.37304929 ... -0.2504706  -1.31310821
   0.86714922]
 ...
 [-0.8403434   0.14012794 -0.19824983 ... -1.50964337 -2.11182722
   0.28570978]
 [-0.21699152  0.95327039 -0.19824983 ...  1.42842641  0.28432981
   0.28570978]
 [ 1.02971224  0.14012794 -0.19824983 ...  0.16925365 -2.11182722
   0.28570978]]
0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int32


In [59]:
# Split the data into a training part and a test part; the model is fitted on
# the training part only and never sees the test part.
# Y_train / Y_test hold the labels for the rows in X_train / X_test.
#   test_size=0.2   -> 20% of the rows go to the test set, 80% to training
#   stratify=Y      -> keep a similar proportion of p and e in both parts,
#                      so one class does not end up concentrated in one part
#   random_state=2  -> fixes the random shuffling so the split is repeatable
# NOTE(review): X was standardized with a scaler fitted on the whole dataset
# before this split, so the test rows influenced the scaling — consider
# splitting first and fitting the scaler on X_train only.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [60]:
# Compare the shapes of the full data, the training part, and the test part.
print(*(part.shape for part in (X, X_train, X_test)))

(8124, 22) (6499, 22) (1625, 22)


In [61]:
# Create a support vector machine classifier with a linear kernel
# (this only constructs the model; it is trained in a later cell).
classifier = svm.SVC(kernel='linear')

In [62]:
# Train the support vector machine classifier on the training
# features (X_train) and their labels (Y_train).
classifier.fit(X_train, Y_train)

SVC(kernel='linear')

In [63]:
# Accuracy score on the training data.
# Predict a label for every row of X_train with the fitted classifier.
X_train_prediction = classifier.predict(X_train)
# Compare the predictions against the original labels. accuracy_score is
# documented as accuracy_score(y_true, y_pred), so the true labels go first
# and the predictions second.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [64]:
# Report the accuracy measured on the training data.
print('Accuracy score of training data: ', training_data_accuracy)

Accuracy score of training data:  0.988767502692722


In [65]:
# Accuracy score on the held-out test data.
# Predict a label for every row of X_test with the fitted classifier.
X_test_prediction = classifier.predict(X_test)
# Compare the predictions against the original labels. accuracy_score is
# documented as accuracy_score(y_true, y_pred), so the true labels go first
# and the predictions second.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [66]:
# Report the accuracy measured on the test data.
print('Accuracy score of test data: ', test_data_accuracy)

Accuracy score of test data:  0.9846153846153847
