# Mushrooms

Goal: Determine whether a mushroom is edible or poisonous.

## Load in the data

In [55]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

In [2]:
# Load the raw mushroom table; the relative path is resolved against the kernel's working directory.
MUSHROOM_CSV = 'mushrooms.csv'
dataset = pd.read_csv(MUSHROOM_CSV)

## EDA

In [3]:
# Peek at the first five rows to see the raw single-letter category codes.
dataset.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Column names, non-null counts and dtypes of the frame before any encoding.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
# Encode the target as an integer flag: 1 = edible ('e'), 0 = anything else (poisonous, 'p').
# Guarded so that re-running this cell is a no-op: once the column is numeric,
# comparing it against 'e' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(dataset['class']):
    dataset['class'] = (dataset['class'] == 'e').astype('int32')

In [6]:
# Edible share (mean of the 0/1 class flag) and row count for each cap-shape value.
dataset.groupby('cap-shape')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
cap-shape,Unnamed: 1_level_2,Unnamed: 2_level_2
b,0.893805,452
c,0.0,4
f,0.506345,3152
k,0.275362,828
s,1.0,32
x,0.532823,3656


cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

- convex & flat are inconclusive: each has roughly equal numbers of edible and poisonous mushrooms.

- knobbed has 0.27 which may suggest it may favor poisonous.

- bell has 0.89 which may suggest it may favor edible.

- sunken & conical are 1.0 and 0.0 but have sample sizes of only 32 and 4. That may not be enough information to confidently conclude that sunken caps are always edible or that conical caps are always poisonous.

In [7]:
# Edible share (mean of the 0/1 class flag) and row count for each cap-surface value.
dataset.groupby('cap-surface')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
cap-surface,Unnamed: 1_level_2,Unnamed: 2_level_2
f,0.672414,2320
g,0.0,4
s,0.447574,2556
y,0.463625,3244


cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

- scaly & smooth are inconclusive (both close to 0.5).

- fibrous may be more edible but with 0.67, it's not very high.

- grooves only has sample size 4 so it's too small to say anything.

In [8]:
# Edible share (mean of the 0/1 class flag) and row count for each cap-color value.
dataset.groupby('cap-color')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
cap-color,Unnamed: 1_level_2,Unnamed: 2_level_2
b,0.285714,168
c,0.727273,44
e,0.416,1500
g,0.56087,1840
n,0.553415,2284
p,0.388889,144
r,1.0,16
u,1.0,16
w,0.692308,1040
y,0.373134,1072


cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

- Hard to say with cap color. 

In [9]:
# Edible share (mean of the 0/1 class flag) and row count for each bruises value.
dataset.groupby('bruises')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
bruises,Unnamed: 1_level_2,Unnamed: 2_level_2
f,0.306655,4748
t,0.815166,3376


bruises: bruises=t,no=f

- Mushrooms that bruise seem more likely to be edible (0.82) than those that do not (0.31).

In [10]:
# Edible share (mean of the 0/1 class flag) and row count for each odor value.
dataset.groupby('odor')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
odor,Unnamed: 1_level_2,Unnamed: 2_level_2
a,1.0,400
c,0.0,192
f,0.0,2160
l,1.0,400
m,0.0,36
n,0.965986,3528
p,0.0,256
s,0.0,576
y,0.0,576


odor: almond=a,anise=l,creosote(aka "smoky")=c, fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

- Most good or neutral smells are edible (almond, anise, none)

- Most bad smells are poisonous (creosote, fishy, foul, musty, pungent, spicy)

In [11]:
# Edible share (mean of the 0/1 class flag) and row count for each gill-attachment value.
dataset.groupby('gill-attachment')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
gill-attachment,Unnamed: 1_level_2,Unnamed: 2_level_2
a,0.914286,210
f,0.507455,7914


gill-attachment: attached=a,descending=d,free=f,notched=n

- **There are no descending or notched.**

- Most gill attached are edible but not large sample size.

- Free gill inconclusive.

In [12]:
# Edible share (mean of the 0/1 class flag) and row count for each gill-spacing value.
dataset.groupby('gill-spacing')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
gill-spacing,Unnamed: 1_level_2,Unnamed: 2_level_2
c,0.441574,6812
w,0.914634,1312


gill-spacing: close=c,crowded=w,distant=d

- **No distant gill spacing.**

- Most crowded are edible.

In [13]:
# Edible share (mean of the 0/1 class flag) and row count for each gill-size value.
dataset.groupby('gill-size')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
gill-size,Unnamed: 1_level_2,Unnamed: 2_level_2
b,0.698503,5612
n,0.11465,2512


gill-size: broad=b,narrow=n

- Most narrow are poisonous.

In [14]:
# Edible share (mean of the 0/1 class flag) and row count for each gill-color value.
dataset.groupby('gill-color')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
gill-color,Unnamed: 1_level_2,Unnamed: 2_level_2
b,0.0,1728
e,1.0,96
g,0.329787,752
h,0.278689,732
k,0.843137,408
n,0.89313,1048
o,1.0,64
p,0.571046,1492
r,0.0,24
u,0.902439,492


gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

- Buff gill color is a good sign of poisonous.

- Gray or chocolate may be poisonous.

- Brown may be edible.

In [15]:
# Edible share (mean of the 0/1 class flag) and row count for each stalk-shape value.
dataset.groupby('stalk-shape')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
stalk-shape,Unnamed: 1_level_2,Unnamed: 2_level_2
e,0.459613,3516
t,0.5625,4608


stalk-shape: enlarging=e,tapering=t

- Both are too close to make anything conclusive.

In [16]:
# Edible share (mean of the 0/1 class flag) and row count for each stalk-root value ('?' included).
dataset.groupby('stalk-root')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
stalk-root,Unnamed: 1_level_2,Unnamed: 2_level_2
?,0.290323,2480
b,0.508475,3776
c,0.920863,556
e,0.771429,1120
r,1.0,192


stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

- **Does "missing=?" mean missing value (NA) or it does not have a stalk-root?**

- Club may be edible.

In [17]:
# Edible share (mean of the 0/1 class flag) and row count for each stalk-surface-above-ring value.
dataset.groupby('stalk-surface-above-ring')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
stalk-surface-above-ring,Unnamed: 1_level_2,Unnamed: 2_level_2
f,0.73913,552
k,0.060708,2372
s,0.703246,5176
y,0.666667,24


stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

- Silky may be a good sign it is poisonous.

In [18]:
# Edible share (mean of the 0/1 class flag) and row count for each stalk-surface-below-ring value.
dataset.groupby('stalk-surface-below-ring')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
stalk-surface-below-ring,Unnamed: 1_level_2,Unnamed: 2_level_2
f,0.76,600
k,0.0625,2304
s,0.688817,4936
y,0.732394,284


stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

- Silky may be a good sign it is poisonous. Same as stalk-surface-above-ring.

In [19]:
# Edible share (mean of the 0/1 class flag) and row count for each stalk-color-above-ring value.
dataset.groupby('stalk-color-above-ring')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
stalk-color-above-ring,Unnamed: 1_level_2,Unnamed: 2_level_2
b,0.0,432
c,0.0,36
e,1.0,96
g,1.0,576
n,0.035714,448
o,1.0,192
p,0.307692,1872
w,0.616487,4464
y,0.0,8


stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

- pink may be a good sign of poisonous.

In [20]:
# Edible share (mean of the 0/1 class flag) and row count for each stalk-color-below-ring value.
dataset.groupby('stalk-color-below-ring')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
stalk-color-below-ring,Unnamed: 1_level_2,Unnamed: 2_level_2
b,0.0,432
c,0.0,36
e,1.0,96
g,1.0,576
n,0.125,512
o,1.0,192
p,0.307692,1872
w,0.616788,4384
y,0.0,24


stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

- pink may be a good sign of poisonous.

In [21]:
# Edible share (mean of the 0/1 class flag) and row count for each veil-type value.
dataset.groupby('veil-type')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
veil-type,Unnamed: 1_level_2,Unnamed: 2_level_2
p,0.517971,8124


veil-type: partial=p,universal=u

- No universal.

- Inconclusive on just partial.

In [22]:
# Edible share (mean of the 0/1 class flag) and row count for each veil-color value.
dataset.groupby('veil-color')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
veil-color,Unnamed: 1_level_2,Unnamed: 2_level_2
n,1.0,96
o,1.0,96
w,0.506815,7924
y,0.0,8


veil-color: brown=n,orange=o,white=w,yellow=y

- Can't say much about veil color: almost every sample is white.

In [23]:
# Edible share (mean of the 0/1 class flag) and row count for each ring-number value.
dataset.groupby('ring-number')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
ring-number,Unnamed: 1_level_2,Unnamed: 2_level_2
n,0.0,36
o,0.491453,7488
t,0.88,600


ring-number: none=n,one=o,two=t

- two rings are mostly edible (0.88 over 600 samples); no ring is always poisonous here, but with only 36 samples.

In [24]:
# Edible share (mean of the 0/1 class flag) and row count for each ring-type value.
dataset.groupby('ring-type')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
ring-type,Unnamed: 1_level_2,Unnamed: 2_level_2
e,0.363112,2776
f,1.0,48
l,0.0,1296
n,0.0,36
p,0.794355,3968


ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

- large ring may be a good sign of poisonous.

- **No cobwebby, sheathing, zone.**

In [25]:
# Edible share (mean of the 0/1 class flag) and row count for each spore-print-color value.
dataset.groupby('spore-print-color')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
spore-print-color,Unnamed: 1_level_2,Unnamed: 2_level_2
b,1.0,48
h,0.029412,1632
k,0.880342,1872
n,0.886179,1968
o,1.0,48
r,0.0,72
u,1.0,48
w,0.241206,2388
y,1.0,48


spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

- chocolate may be a good sign of poisonous.

- white may be poisonous.

- black & brown may be a good sign of edible.

In [26]:
# Edible share (mean of the 0/1 class flag) and row count for each population value.
dataset.groupby('population')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
population,Unnamed: 1_level_2,Unnamed: 2_level_2
a,1.0,384
c,0.847059,340
n,1.0,400
s,0.705128,1248
v,0.29505,4040
y,0.621495,1712


population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

- **Describes the size of mushroom population.**

- All or most abundant, clustered, numerous are edible.

In [27]:
# Edible share (mean of the 0/1 class flag) and row count for each habitat value.
dataset.groupby('habitat')[['class']].agg(['mean', 'count'])

Unnamed: 0_level_0,class,class
Unnamed: 0_level_1,mean,count
habitat,Unnamed: 1_level_2,Unnamed: 2_level_2
d,0.597205,3148
g,0.655493,2148
l,0.288462,832
m,0.876712,292
p,0.118881,1144
u,0.26087,368
w,1.0,192


habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

- Most on paths are poisonous.

### Feature Summary
***
- Odor:
    - Most good or neutral smells are edible (almond, anise, none)
    - Most bad smells are poisonous (creosote, fishy, foul, musty, pungent, spicy)

- Gill-spacing:
    - Most crowded are edible.

- Gill-size:
    - Most narrow are poisonous.

- Gill-color:
    - Buff gill color is a good sign of poisonous.
    - Gray or chocolate may be poisonous.
    - Brown may be edible.

- Stalk-surface-above-ring:
    - Silky may be a good sign it is poisonous.

- Stalk-surface-below-ring:
    - Silky may be a good sign it is poisonous.

- Ring-number:
    - Two rings are mostly edible (0.88).

- Ring-type:
    - Large may be a good sign of poisonous.

- spore-print-color:
    - Chocolate may be a good sign of poisonous.
    - White may be poisonous.
    - Black & brown may be a good sign of edible.

- Population:
    - All or most abundant, clustered, numerous are edible.

- Habitat:
    - Most on paths are poisonous.

In [28]:
# Re-inspect the frame: `class` should now be int32 after the target encoding earlier in the notebook.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   int32 
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

## Transforming into X, y dataset

In [39]:
# Target vector: the 0/1 edible flag.
y = dataset['class']

# Every remaining column is a categorical feature.
dataset_cat = list(dataset.drop('class', axis=1))

# One-hot encode all features. handle_unknown='ignore' makes the fitted encoder
# emit an all-zero group instead of raising when it later sees a category that is
# absent from this CSV (the EDA shows several documented categories never occur,
# e.g. veil-type 'u' or gill-spacing 'd').
# NOTE(review): the encoder is fitted on the full dataset before the train/test
# split; for plain one-hot encoding this only shares the category vocabulary, but
# consider fitting it inside a Pipeline on the training rows only.
X_pipeline = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), dataset_cat)
])

X_prepared = X_pipeline.fit_transform(dataset)

In [41]:
# Read the shape straight from the matrix; converting to a dense array first
# allocates the full rows x columns block only to throw it away.
X_prepared.shape

(8124, 117)

## Split dataset into train, test sets

In [42]:
# Hold out 40% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_prepared,
    y,
    test_size=0.4,
    random_state=42,
)

In [43]:
# Display the training feature matrix (shown as a summary of its size and storage format).
X_train

<4874x117 sparse matrix of type '<class 'numpy.float64'>'
	with 107228 stored elements in Compressed Sparse Row format>

In [59]:
# Linear classifier trained with stochastic gradient descent; seeded for repeatable results.
sgd_clf = SGDClassifier(random_state=42)
# Fit on the training split only; the test split stays untouched until the final check.
sgd_clf.fit(X=X_train, y=y_train)

SGDClassifier(random_state=42)

In [60]:
# Out-of-fold predictions for every training row using 3-fold cross-validation.
y_train_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train,
    cv=3,
)

In [61]:
# F1 of the cross-validated predictions against the training labels.
f1_score(y_true=y_train, y_pred=y_train_pred)

0.9988156336360048

In [62]:
# Final check: predict the held-out rows with the fitted classifier and score them.
y_test_pred = sgd_clf.predict(X_test)
f1_score(y_true=y_test, y_pred=y_test_pred)

1.0