In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

%matplotlib inline

## Import the data

In [2]:
# Read the mushroom CSV into a DataFrame, then display its (rows, columns) shape.
mushrooms_df = pd.read_csv(filepath_or_buffer='data/mushrooms.csv')
mushrooms_df.shape

(8124, 23)

We see that there are 8124 samples and 23 columns: 22 features plus one target variable, `class`.

In [3]:
# Preview the first five records
mushrooms_df.head(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


As it stands, we need to do a lot of mapping to make the letter codes understandable. The key to the letter codes is in the Kaggle competition description.

In [4]:
# Flip the five-row preview so that every column is listed as its own row
mushrooms_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
class,p,e,e,p,e
cap-shape,x,x,b,x,x
cap-surface,s,s,s,y,s
cap-color,n,y,w,w,g
bruises,t,t,t,t,f
odor,p,a,l,p,n
gill-attachment,f,f,f,f,f
gill-spacing,c,c,c,c,w
gill-size,n,b,b,n,b
gill-color,k,k,n,n,k


The `class` column is our target. We need to encode it so that edible species are 1 and poisonous species are 0.

In [5]:
# Encode the target numerically: edible ('e') -> 1, poisonous ('p') -> 0
class_map = dict(e=1, p=0)
mushrooms_df['class'] = mushrooms_df['class'].map(class_map)

In [6]:
# Decode the single-letter cap-shape codes into readable names
capshape_map = dict(
    b='bell',
    c='conical',
    x='convex',
    f='flat',
    k='knobbed',
    s='sunken',
)
mushrooms_df['cap-shape'] = mushrooms_df['cap-shape'].map(capshape_map)

In [7]:
# Decode the single-letter cap-surface codes into readable names
capsurface_map = dict(f='fibrous', g='grooves', y='scaly', s='smooth')
mushrooms_df['cap-surface'] = mushrooms_df['cap-surface'].map(capsurface_map)

In [8]:
# Decode the single-letter cap-color codes into readable colour names
capcolor_map = {
    'n': 'brown',
    'b': 'buff',
    'c': 'cinnamon',
    'g': 'gray',
    'r': 'green',
    'p': 'pink',
    'u': 'purple',
    'e': 'red',
    'w': 'white',
    'y': 'yellow',
}
mushrooms_df['cap-color'] = mushrooms_df['cap-color'].map(capcolor_map)

In [9]:
# Decode the bruises flag: 't' -> 'bruises', 'f' -> 'no'.
# Bracket indexing is used (rather than attribute-style `mushrooms_df.bruises = ...`)
# to match the other mapping cells; pandas recommends it for column assignment.
bruises_map = {'t': 'bruises', 'f': 'no'}
mushrooms_df['bruises'] = mushrooms_df['bruises'].map(bruises_map)

In [10]:
# Decode the single-letter odor codes into readable names.
# Bracket indexing is used (rather than attribute-style `mushrooms_df.odor = ...`)
# to match the other mapping cells; pandas recommends it for column assignment.
odor_map = {
    'a': 'almond',
    'l': 'anise',
    'c': 'creosote',
    'y': 'fishy',
    'f': 'foul',
    'm': 'musty',
    'n': 'none',
    'p': 'pungent',
    's': 'spicy',
}
mushrooms_df['odor'] = mushrooms_df['odor'].map(odor_map)

In [11]:
# Decode the single-letter gill-attachment codes into readable names
gillattachment_map = dict(a='attached', d='descending', f='free', n='notched')
mushrooms_df['gill-attachment'] = mushrooms_df['gill-attachment'].map(gillattachment_map)

In [12]:
# Decode the single-letter gill-spacing codes into readable names
gillspacing_map = dict(c='close', w='crowded', d='distant')
mushrooms_df['gill-spacing'] = mushrooms_df['gill-spacing'].map(gillspacing_map)

In [13]:
# Decode the single-letter gill-size codes into readable names
gillsize_map = dict(b='broad', n='narrow')
mushrooms_df['gill-size'] = mushrooms_df['gill-size'].map(gillsize_map)

In [14]:
# Decode the single-letter gill-color codes into readable colour names
gillcolor_map = {
    'k': 'black',
    'n': 'brown',
    'b': 'buff',
    'h': 'chocolate',
    'g': 'gray',
    'r': 'green',
    'o': 'orange',
    'p': 'pink',
    'u': 'purple',
    'e': 'red',
    'w': 'white',
    'y': 'yellow',
}
mushrooms_df['gill-color'] = mushrooms_df['gill-color'].map(gillcolor_map)

In [15]:
# Decode the single-letter stalk-shape codes into readable names
stalkshape_map = dict(e='enlarging', t='tapering')
mushrooms_df['stalk-shape'] = mushrooms_df['stalk-shape'].map(stalkshape_map)

In [16]:
# Decode the stalk-root codes; the '?' placeholder is spelled out as 'missing'
stalkroot_map = {
    'b': 'bulbous',
    'c': 'club',
    'u': 'cup',
    'e': 'equal',
    'z': 'rhizomorphs',
    'r': 'rooted',
    '?': 'missing',
}
mushrooms_df['stalk-root'] = mushrooms_df['stalk-root'].map(stalkroot_map)

In [17]:
# Decode the single-letter stalk-surface-above-ring codes into readable names
stalksurfaceabovering_map = dict(f='fibrous', y='scaly', k='silky', s='smooth')
mushrooms_df['stalk-surface-above-ring'] = (
    mushrooms_df['stalk-surface-above-ring'].map(stalksurfaceabovering_map)
)

In [18]:
# Decode the single-letter stalk-surface-below-ring codes into readable names
stalksurfacebelowring_map = dict(f='fibrous', y='scaly', k='silky', s='smooth')
mushrooms_df['stalk-surface-below-ring'] = (
    mushrooms_df['stalk-surface-below-ring'].map(stalksurfacebelowring_map)
)


In [19]:
# Decode the single-letter stalk-color-above-ring codes into readable colour names
stalkcolorabovering_map = {
    'n': 'brown',
    'b': 'buff',
    'c': 'cinnamon',
    'g': 'gray',
    'o': 'orange',
    'p': 'pink',
    'e': 'red',
    'w': 'white',
    'y': 'yellow',
}
mushrooms_df['stalk-color-above-ring'] = (
    mushrooms_df['stalk-color-above-ring'].map(stalkcolorabovering_map)
)

In [20]:
# Decode the single-letter stalk-color-below-ring codes into readable colour names
stalkcolorbelowring_map = {
    'n': 'brown',
    'b': 'buff',
    'c': 'cinnamon',
    'g': 'gray',
    'o': 'orange',
    'p': 'pink',
    'e': 'red',
    'w': 'white',
    'y': 'yellow',
}
mushrooms_df['stalk-color-below-ring'] = (
    mushrooms_df['stalk-color-below-ring'].map(stalkcolorbelowring_map)
)

In [21]:
# Decode the single-letter veil-type codes into readable names
veiltype_map = dict(p='partial', u='universal')
mushrooms_df['veil-type'] = mushrooms_df['veil-type'].map(veiltype_map)

In [22]:
# Decode the single-letter veil-color codes into readable colour names
veilcolor_map = dict(n='brown', o='orange', w='white', y='yellow')
mushrooms_df['veil-color'] = mushrooms_df['veil-color'].map(veilcolor_map)

In [23]:
# Decode the single-letter ring-number codes into readable names
ringnumber_map = dict(n='none', o='one', t='two')
mushrooms_df['ring-number'] = mushrooms_df['ring-number'].map(ringnumber_map)

In [24]:
# Decode the single-letter ring-type codes into readable names
ringtype_map = {
    'c': 'cobwebby',
    'e': 'evanescent',
    'f': 'flaring',
    'l': 'large',
    'n': 'none',
    'p': 'pendant',
    's': 'sheathing',
    'z': 'zone',
}
mushrooms_df['ring-type'] = mushrooms_df['ring-type'].map(ringtype_map)

In [25]:
# Decode the single-letter spore-print-color codes into readable colour names
sporeprintcolor_map = {
    'k': 'black',
    'n': 'brown',
    'b': 'buff',
    'h': 'chocolate',
    'r': 'green',
    'o': 'orange',
    'u': 'purple',
    'w': 'white',
    'y': 'yellow',
}
mushrooms_df['spore-print-color'] = (
    mushrooms_df['spore-print-color'].map(sporeprintcolor_map)
)

In [26]:
# Decode the single-letter population codes into readable names.
# Bracket indexing is used (rather than attribute-style `mushrooms_df.population = ...`)
# to match the other mapping cells; pandas recommends it for column assignment.
population_map = {
    'a': 'abundant',
    'c': 'clustered',
    'n': 'numerous',
    's': 'scattered',
    'v': 'several',
    'y': 'solitary',
}
mushrooms_df['population'] = mushrooms_df['population'].map(population_map)

In [27]:
# Decode the single-letter habitat codes into readable names.
# Bracket indexing is used (rather than attribute-style `mushrooms_df.habitat = ...`)
# to match the other mapping cells; pandas recommends it for column assignment.
habitat_map = {
    'g': 'grasses',
    'l': 'leaves',
    'm': 'meadows',
    'p': 'paths',
    'u': 'urban',
    'w': 'waste',
    'd': 'woods',
}
mushrooms_df['habitat'] = mushrooms_df['habitat'].map(habitat_map)

In [28]:
# Re-inspect the first five rows (transposed) now that the codes have been mapped
mushrooms_df.head(5).transpose()


Unnamed: 0,0,1,2,3,4
class,0,1,1,0,1
cap-shape,convex,convex,bell,convex,convex
cap-surface,smooth,smooth,smooth,scaly,smooth
cap-color,brown,yellow,white,white,gray
bruises,bruises,bruises,bruises,bruises,no
odor,pungent,almond,anise,pungent,none
gill-attachment,free,free,free,free,free
gill-spacing,close,close,close,close,crowded
gill-size,narrow,broad,broad,narrow,broad
gill-color,black,black,brown,brown,black


## Exploratory data analysis (EDA)

First, the test set has to be separated from the data set

In [29]:
# Hold out 20% of the rows as the test set.
# random_state is fixed so the split is identical on every Restart & Run All;
# without it the rows (and any class counts derived from them) change per run.
full_train_df, test_df = train_test_split(
    mushrooms_df,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Tally how many training rows fall in each target class
full_train_df.loc[:, 'class'].value_counts()

1    3397
0    3102
Name: class, dtype: int64

From the result above, the classes are roughly balanced: the training split has 3397 edible samples (class 1) and 3102 poisonous samples (class 0).

In [31]:
# Print a summary of the training split: per-column non-null counts and dtypes
full_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6499 entries, 7360 to 5151
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     6499 non-null   int64 
 1   cap-shape                 6499 non-null   object
 2   cap-surface               6499 non-null   object
 3   cap-color                 6499 non-null   object
 4   bruises                   6499 non-null   object
 5   odor                      6499 non-null   object
 6   gill-attachment           6499 non-null   object
 7   gill-spacing              6499 non-null   object
 8   gill-size                 6499 non-null   object
 9   gill-color                6499 non-null   object
 10  stalk-shape               6499 non-null   object
 11  stalk-root                6499 non-null   object
 12  stalk-surface-above-ring  6499 non-null   object
 13  stalk-surface-below-ring  6499 non-null   object
 14  stalk-color-above-rin

We can see that all the predictors are categorical variables. Also, there are no null values in the train data set. 

In [33]:
# Count missing entries per column in the held-out test split
test_df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

There are also no null values in the test data set. 

Let's examine the charts for select features. 