# Mushroom Classification : Edible or poisonous

In [1]:
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style(style = "dark")

In [3]:
df = pd.read_csv("mushrooms.csv")
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
df.shape

(8124, 23)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

#### There are no Null values, and all features appear to be categorical

In [6]:
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


Column "class" is the target class, and has two values: 

        p = poisonous 
        e = edible

In [7]:
plt.figure()
sns.countplot(df["class"], palette = 'viridis')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fe5f171c910>

#### We have almost equal class instances

## DATA EXPLORATION 

In [8]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [9]:
def bar_plot(column, data):
    plt.figure()
    sns.countplot(x = column, hue = 'class', data = data, palette = 'viridis')
    print(df.groupby([column, "class"])["class"].count())

## Feature 1: cap-shape

In [10]:
bar_plot("cap-shape", df)

<IPython.core.display.Javascript object>

cap-shape  class
b          e         404
           p          48
c          p           4
f          e        1596
           p        1556
k          e         228
           p         600
s          e          32
x          e        1948
           p        1708
Name: class, dtype: int64


### Observations: 

        In category s, we have only edible mushrooms
        In category c, we have only poisonous mushrooms
        
        In category b, we have a good amount of mushrooms which are edible, but small amt of poisonous
        mushrooms are present
        In category k, most of the mushrooms are poisonous while a small percent are edible
        
        In categories x and f, the division is approx. 50 - 50
        

## Feature 2: cap-surface

In [11]:
bar_plot("cap-surface", df)

<IPython.core.display.Javascript object>

cap-surface  class
f            e        1560
             p         760
g            p           4
s            e        1144
             p        1412
y            e        1504
             p        1740
Name: class, dtype: int64


### Observations:

    Category f has mostly edible mushrooms and moderate % of poisonous ones
    Category g has all poisonous mushrooms
    
    Categories s and y are almost equal, but no of poisonous mushrooms is still a little bit higher in both 
    cases
    

## Feature 3: cap-color

In [12]:
bar_plot('cap-color', df)

<IPython.core.display.Javascript object>

cap-color  class
b          e          48
           p         120
c          e          32
           p          12
e          e         624
           p         876
g          e        1032
           p         808
n          e        1264
           p        1020
p          e          56
           p          88
r          e          16
u          e          16
w          e         720
           p         320
y          e         400
           p         672
Name: class, dtype: int64


## Feature 4: cap-color

In [13]:
bar_plot("cap-color", df)

<IPython.core.display.Javascript object>

cap-color  class
b          e          48
           p         120
c          e          32
           p          12
e          e         624
           p         876
g          e        1032
           p         808
n          e        1264
           p        1020
p          e          56
           p          88
r          e          16
u          e          16
w          e         720
           p         320
y          e         400
           p         672
Name: class, dtype: int64


## Feature 5: bruises

In [14]:
bar_plot("bruises", df)

<IPython.core.display.Javascript object>

bruises  class
f        e        1456
         p        3292
t        e        2752
         p         624
Name: class, dtype: int64


## Feature 6: odor

In [15]:
bar_plot("odor", df)

<IPython.core.display.Javascript object>

odor  class
a     e         400
c     p         192
f     p        2160
l     e         400
m     p          36
n     e        3408
      p         120
p     p         256
s     p         576
y     p         576
Name: class, dtype: int64


### Observations:

    This feature clearly distinguishes the poisonous and edible mushrooms, an important feature

## Feature 7: gill-attachment

In [16]:
bar_plot("gill-attachment", df)

<IPython.core.display.Javascript object>

gill-attachment  class
a                e         192
                 p          18
f                e        4016
                 p        3898
Name: class, dtype: int64


### Observations:
    
    gill-attachment category a will be helpful in predicting edible mushrooms
    

## Feature 8: gill-spacing

In [17]:
bar_plot("gill-spacing", df)

<IPython.core.display.Javascript object>

gill-spacing  class
c             e        3008
              p        3804
w             e        1200
              p         112
Name: class, dtype: int64


### Observations:

    class w can be helpful in predicting edible mushrooms

## Feature 9: gill-size

In [18]:
bar_plot("gill-size", df)

<IPython.core.display.Javascript object>

gill-size  class
b          e        3920
           p        1692
n          e         288
           p        2224
Name: class, dtype: int64


### Observations:

    class n has high number of poisonous mushrooms, can help in predicting posionous mushrooms

## Feature 10: gill-color

In [19]:
bar_plot("gill-color", df)

<IPython.core.display.Javascript object>

gill-color  class
b           p        1728
e           e          96
g           e         248
            p         504
h           e         204
            p         528
k           e         344
            p          64
n           e         936
            p         112
o           e          64
p           e         852
            p         640
r           p          24
u           e         444
            p          48
w           e         956
            p         246
y           e          64
            p          22
Name: class, dtype: int64


### Observations:

    Categories b, r have only poisonous mushrooms, while category e has only edible ones

## Feature 11: stalk-shape

In [20]:
bar_plot("stalk-shape", df)

<IPython.core.display.Javascript object>

stalk-shape  class
e            e        1616
             p        1900
t            e        2592
             p        2016
Name: class, dtype: int64


## Feature 12: stalk -root

In [21]:
bar_plot("stalk-root", df)

<IPython.core.display.Javascript object>

stalk-root  class
?           e         720
            p        1760
b           e        1920
            p        1856
c           e         512
            p          44
e           e         864
            p         256
r           e         192
Name: class, dtype: int64


In [22]:
df["stalk-root"].unique()

array(['e', 'c', 'b', 'r', '?'], dtype=object)

In [23]:
df["stalk-root"].tail()

8119    ?
8120    ?
8121    ?
8122    ?
8123    ?
Name: stalk-root, dtype: object

there seems to be some problem in stalk-root categories

    The question mark represents missing or unknown, we will keep the column as it is

## Feature 13: stalk-surface-above-ring

In [24]:
bar_plot("stalk-surface-above-ring", df)

<IPython.core.display.Javascript object>

stalk-surface-above-ring  class
f                         e         408
                          p         144
k                         e         144
                          p        2228
s                         e        3640
                          p        1536
y                         e          16
                          p           8
Name: class, dtype: int64


### Observations:

    category k and s can be helpful in classifying poisonous and edible mushrooms

## Feature 14 - stalk-surface-below-ring

In [25]:
bar_plot("stalk-surface-below-ring", df)

<IPython.core.display.Javascript object>

stalk-surface-below-ring  class
f                         e         456
                          p         144
k                         e         144
                          p        2160
s                         e        3400
                          p        1536
y                         e         208
                          p          76
Name: class, dtype: int64


### Observations:

    categories k and f can help in classifying the mushrooms

## Feature 15: stalk-color-above-ring

In [26]:
bar_plot("stalk-color-above-ring", df)

<IPython.core.display.Javascript object>

stalk-color-above-ring  class
b                       p         432
c                       p          36
e                       e          96
g                       e         576
n                       e          16
                        p         432
o                       e         192
p                       e         576
                        p        1296
w                       e        2752
                        p        1712
y                       p           8
Name: class, dtype: int64


### Observations:

    categories g e and o have only edible mushrooms,
    categories b c and n have mostly poisonous mushrooms

## Feature 16: stalk-color-below-ring

In [27]:
bar_plot("stalk-color-below-ring", df)

<IPython.core.display.Javascript object>

stalk-color-below-ring  class
b                       p         432
c                       p          36
e                       e          96
g                       e         576
n                       e          64
                        p         448
o                       e         192
p                       e         576
                        p        1296
w                       e        2704
                        p        1680
y                       p          24
Name: class, dtype: int64


### Observations: 

    categories b, c, n and y have only poisonous mushrooms
    categories e, g, o have only edible mushrooms

## Feature 17: veil-type

In [28]:
bar_plot("veil-type", df)

<IPython.core.display.Javascript object>

veil-type  class
p          e        4208
           p        3916
Name: class, dtype: int64


In [29]:
df['veil-type'].unique()

array(['p'], dtype=object)

In [30]:
df["veil-type"]

0       p
1       p
2       p
3       p
4       p
       ..
8119    p
8120    p
8121    p
8122    p
8123    p
Name: veil-type, Length: 8124, dtype: object

Since this column contains only 1 unique value, it can be dropped since it wont give any valueable input to the model

In [31]:
df.drop("veil-type", axis = 1, inplace = True)

## Feature 18: Veil-color

In [32]:
bar_plot("veil-color", df)

<IPython.core.display.Javascript object>

veil-color  class
n           e          96
o           e          96
w           e        4016
            p        3908
y           p           8
Name: class, dtype: int64


### Observations:
    
    categories n, o has only edible mushrooms, while category y has only poisonous ones

## Feature 19: ring-number

In [33]:
bar_plot("ring-number", df)

<IPython.core.display.Javascript object>

ring-number  class
n            p          36
o            e        3680
             p        3808
t            e         528
             p          72
Name: class, dtype: int64


## Feature 20: ring-type

In [34]:
bar_plot("ring-type", df)

  plt.figure()


<IPython.core.display.Javascript object>

ring-type  class
e          e        1008
           p        1768
f          e          48
l          p        1296
n          p          36
p          e        3152
           p         816
Name: class, dtype: int64


## Feature 21: spore-print-color

In [35]:
bar_plot("spore-print-color", df)

  plt.figure()


<IPython.core.display.Javascript object>

spore-print-color  class
b                  e          48
h                  e          48
                   p        1584
k                  e        1648
                   p         224
n                  e        1744
                   p         224
o                  e          48
r                  p          72
u                  e          48
w                  e         576
                   p        1812
y                  e          48
Name: class, dtype: int64


## Feature 22: population

In [36]:
bar_plot("population", df)

  plt.figure()


<IPython.core.display.Javascript object>

population  class
a           e         384
c           e         288
            p          52
n           e         400
s           e         880
            p         368
v           e        1192
            p        2848
y           e        1064
            p         648
Name: class, dtype: int64


## Feature 23: Habitat

In [37]:
bar_plot("habitat", df)

  plt.figure()


<IPython.core.display.Javascript object>

habitat  class
d        e        1880
         p        1268
g        e        1408
         p         740
l        e         240
         p         592
m        e         256
         p          36
p        e         136
         p        1008
u        e          96
         p         272
w        e         192
Name: class, dtype: int64


## We have 22 feature columns, with multiple categories in each feature.
## If we do one-hot-encoding, the feature space will be much larger than before

In [38]:
df.shape

(8124, 22)

In [39]:
y = df['class']
df = df.drop('class', axis = 1)

In [40]:
def encode(row):
    if row == 'e':
        row = 1
    else:
        row = 0
    return row
y = y.apply(encode)
y.head()

0    0
1    1
2    1
3    0
4    1
Name: class, dtype: int64

## Data Preprocessing

In [41]:

def dummies(column):
    global df
    df1 = pd.get_dummies(df[column], drop_first = True, prefix = i)
    df  = pd.concat([df1,df], axis = 1)
    df.drop(column, axis = 1, inplace = True)

for i in df.columns:
    dummies(i)

df.shape

(8124, 95)

In [42]:
df.columns

Index(['habitat_g', 'habitat_l', 'habitat_m', 'habitat_p', 'habitat_u',
       'habitat_w', 'population_c', 'population_n', 'population_s',
       'population_v', 'population_y', 'spore-print-color_h',
       'spore-print-color_k', 'spore-print-color_n', 'spore-print-color_o',
       'spore-print-color_r', 'spore-print-color_u', 'spore-print-color_w',
       'spore-print-color_y', 'ring-type_f', 'ring-type_l', 'ring-type_n',
       'ring-type_p', 'ring-number_o', 'ring-number_t', 'veil-color_o',
       'veil-color_w', 'veil-color_y', 'stalk-color-below-ring_c',
       'stalk-color-below-ring_e', 'stalk-color-below-ring_g',
       'stalk-color-below-ring_n', 'stalk-color-below-ring_o',
       'stalk-color-below-ring_p', 'stalk-color-below-ring_w',
       'stalk-color-below-ring_y', 'stalk-color-above-ring_c',
       'stalk-color-above-ring_e', 'stalk-color-above-ring_g',
       'stalk-color-above-ring_n', 'stalk-color-above-ring_o',
       'stalk-color-above-ring_p', 'stalk-color-above-

Multiple columns are named similar, will this affect  ML model??

So, after reading pandas documentation, we have an option to give prefix to the column names:


In [43]:
df.head()

Unnamed: 0,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,population_c,population_n,population_s,population_v,...,cap-color_w,cap-color_y,cap-surface_g,cap-surface_s,cap-surface_y,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,1,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1
2,0,0,1,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


#### No need for normalizing or scaling, proceeding to model building:

## Model building

In [44]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df, y)

In [45]:
#Decision Tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier().fit(X_train, y_train)
y_predict = dt_clf.predict(X_test)
accuracy_score(y_test, y_predict)

1.0

In [46]:
cv = cross_val_score(dt_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[1. 1. 1. 1. 1.]
1.0


Is this even possible??

In [47]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
max_acc = 0
k = 0
for i in range(1,50,2):
    knn_clf = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    y_predict = knn_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)
    if accuracy>max_acc:
        max_acc = accuracy
        k = i
    print("Accuracy score of k = {} is {}".format(i, accuracy))
print("Max accuracy score is {} for k = {}".format(max_acc, k))

Accuracy score of k = 1 is 1.0
Accuracy score of k = 3 is 1.0
Accuracy score of k = 5 is 1.0
Accuracy score of k = 7 is 1.0
Accuracy score of k = 9 is 1.0
Accuracy score of k = 11 is 1.0
Accuracy score of k = 13 is 0.999015263417036
Accuracy score of k = 15 is 0.999015263417036
Accuracy score of k = 17 is 0.999015263417036
Accuracy score of k = 19 is 0.999015263417036
Accuracy score of k = 21 is 0.999015263417036
Accuracy score of k = 23 is 0.999015263417036
Accuracy score of k = 25 is 0.999015263417036
Accuracy score of k = 27 is 0.999015263417036
Accuracy score of k = 29 is 0.999015263417036
Accuracy score of k = 31 is 0.999015263417036
Accuracy score of k = 33 is 0.999015263417036
Accuracy score of k = 35 is 0.999015263417036
Accuracy score of k = 37 is 0.999015263417036
Accuracy score of k = 39 is 0.999015263417036
Accuracy score of k = 41 is 0.9980305268340719
Accuracy score of k = 43 is 0.9980305268340719
Accuracy score of k = 45 is 0.9980305268340719
Accuracy score of k = 47 is 

In [48]:
#Logistic regression
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(max_iter = 1000).fit(X_train,y_train)
y_predict = lr_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.999507631708518


In [49]:
cv = cross_val_score(lr_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.99753897 1.         1.         0.99917898 0.99917898]
0.9991793860482158


In [50]:
#SVC
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'linear').fit(X_train,y_train)
y_predict = svc_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

1.0


In [51]:
cv = cross_val_score(svc_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[1. 1. 1. 1. 1.]
1.0


In [52]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier().fit(X_train, y_train)
y_predict = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

1.0


In [53]:
cv = cross_val_score(rf_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[1. 1. 1. 1. 1.]
1.0
