In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
df = pd.read_csv("./mushrooms_sample.csv")
df.head()
# keep a copy of the dataset
df_original = df

#### Encoding error with bruises col, will rename to just bruises  

In [35]:
df.rename(columns = {'bruises%3F':'bruises'}, inplace = True)

#### Identify any missing values

In [36]:
df.isna().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  1230
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
class                          0
dtype: int64

#### For readability will convert all labels to full words as seen in EDA

In [37]:
df['cap-shape'] = df['cap-shape'].replace({'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 'k': 'knobbed', 
                                           's': 'sunken'})
df['cap-surface'] = df['cap-surface'].replace({'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'})
df['cap-color'] = df['cap-color'].replace({'n': 'brown', 'b': 'buff', 'c': 'cinammon', 'g': 'gray', 'r': 'green', 
                                           'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'})
df['bruises'] = df['bruises'].replace({'t': 'true', 'f': 'false'})
df['odor'] = df['odor'].replace({'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul', 'm': 'musty', 
                                'n': 'none', 'p': 'pungent', 's': 'spicy'})
df['gill-attachment'] = df['gill-attachment'].replace({'a': 'attached', 'd': 'descending', 'f': 'free', 'n': 'notched'})
df['gill-spacing'] = df['gill-spacing'].replace({'c': 'close', 'w': 'crowded', 'd': 'distant'})
df['gill-size'] = df['gill-size'].replace({'b': 'broad', 'n': 'narrow'})
df['gill-color'] = df['gill-color'].replace({'n': 'brown', 'k': 'black', 'b': 'buff', 'h': 'chocolate', 'g': 'gray', 
                                             'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red', 
                                             'w': 'white', 'y': 'yellow'})
df['stalk-shape'] = df['stalk-shape'].replace({'e': 'enlarging', 't': 'tapering'})
df['stalk-root'] = df['stalk-root'].replace({'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal', 'z': 'rhizomorphs', 
                                             'r': 'rooted'})
df['stalk-surface-above-ring'] = df['stalk-surface-above-ring'].replace({'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 
                                                                         's': 'smooth'})
df['stalk-surface-below-ring'] = df['stalk-surface-below-ring'].replace({'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 
                                                                         's': 'smooth'})
df['stalk-color-above-ring'] = df['stalk-color-above-ring'].replace({'n': 'brown', 'b': 'buff', 'c': 'cinammon', 
                                                                     'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 
                                                                     'w': 'white', 'y': 'yellow'})
df['stalk-color-below-ring'] = df['stalk-color-below-ring'].replace({'n': 'brown', 'b': 'buff', 'c': 'cinammon', 
                                                                     'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 
                                                                     'w': 'white', 'y': 'yellow'})
df['veil-type'] = df['veil-type'].replace({'p': 'partial', 'u': 'universal'})
df['veil-color'] = df['veil-color'].replace({'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'})
df['ring-number'] = df['ring-number'].replace({'n': 'none', 'o': 'one', 't': 'two'})
df['ring-type'] = df['ring-type'].replace({'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large', 'n': 'none',
                                           'p': 'pendant', 's': 'sheating', 'z': 'zone'})
df['spore-print-color'] = df['spore-print-color'].replace({'n': 'brown', 'k': 'black', 'b': 'buff', 'h': 'chocolate', 
                                                           'r': 'green', 'o': 'orange', 'u': 'purple', 'w': 'white', 
                                                           'y': 'yellow'})
df['population'] = df['population'].replace({'a': 'abundant', 'c': 'clustered', 'n': 'numerous', 's': 'scattered', 
                                             'v': 'several', 'y': 'solitary'})
df['habitat'] = df['habitat'].replace({'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'u': 'urban', 
                                       'w': 'waste', 'd': 'woods'})
df['class'] = df['class'].replace({'e': 'edible', 'p': 'poisonous'})

#### This gives me more readable labels for the next stages of data preparation

#### "Ring-number" can be converted to numbers 0 for none, 1 for one, 2 for two there is an order to the number of rings 

In [39]:
# Create a mapper

number_mapper = {
    "none": 0,
    "one": 1,
    "two": 2
}

df['ring-number'] = df['ring-number'].replace(number_mapper)

df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,convex,fibrous,red,True,none,free,close,broad,purple,tapering,...,pink,gray,partial,white,1,pendant,black,several,woods,edible
1,flat,scaly,gray,True,none,free,close,broad,purple,tapering,...,white,gray,partial,white,1,pendant,black,several,woods,edible
2,convex,smooth,pink,True,none,free,close,broad,white,enlarging,...,white,red,partial,white,2,evanescent,white,clustered,waste,edible
3,bell,smooth,white,False,none,free,crowded,broad,gray,enlarging,...,white,white,partial,white,2,pendant,white,numerous,grasses,edible
4,flat,scaly,red,True,none,free,close,broad,white,tapering,...,pink,pink,partial,white,1,pendant,brown,several,woods,edible


### Dummy One-Hot Encode all other Categorical attributes

#### One hot encode with NaN stalk-root due to missing values

In [40]:
df = pd.get_dummies(df, columns=["stalk-root",], 
                        dummy_na=True, 
                        drop_first=True)
df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,class,stalk-root_club,stalk-root_equal,stalk-root_rooted,stalk-root_nan
0,convex,fibrous,red,True,none,free,close,broad,purple,tapering,...,1,pendant,black,several,woods,edible,0,0,0,0
1,flat,scaly,gray,True,none,free,close,broad,purple,tapering,...,1,pendant,black,several,woods,edible,0,0,0,0
2,convex,smooth,pink,True,none,free,close,broad,white,enlarging,...,2,evanescent,white,clustered,waste,edible,0,0,0,1
3,bell,smooth,white,False,none,free,crowded,broad,gray,enlarging,...,2,pendant,white,numerous,grasses,edible,0,0,0,1
4,flat,scaly,red,True,none,free,close,broad,white,tapering,...,1,pendant,brown,several,woods,edible,0,0,0,0


#### One hot encode all other cols

In [41]:
cols = df.columns.values
# remove whats already been encoded
# cols  = np.setdiff1d(cols, ["cap-color","gill-color", "stalk-color-above-ring",
#                             "stalk-color-below-ring","veil-color","ring-number",
#                            "cap-surface","stalk-surface-above-ring","stalk-surface-below-ring"])
cols  = np.setdiff1d(cols, ["ring-number","stalk-root"])


In [42]:
df = pd.get_dummies(df, columns=cols, 
                        drop_first=True)

In [43]:
cols = df.columns.values
cols

array(['ring-number', 'bruises_true', 'cap-color_buff',
       'cap-color_cinammon', 'cap-color_gray', 'cap-color_green',
       'cap-color_pink', 'cap-color_purple', 'cap-color_red',
       'cap-color_white', 'cap-color_yellow', 'cap-shape_conical',
       'cap-shape_convex', 'cap-shape_flat', 'cap-shape_knobbed',
       'cap-shape_sunken', 'cap-surface_grooves', 'cap-surface_scaly',
       'cap-surface_smooth', 'class_poisonous', 'gill-attachment_free',
       'gill-color_brown', 'gill-color_buff', 'gill-color_chocolate',
       'gill-color_gray', 'gill-color_green', 'gill-color_orange',
       'gill-color_pink', 'gill-color_purple', 'gill-color_red',
       'gill-color_white', 'gill-color_yellow', 'gill-size_narrow',
       'gill-spacing_crowded', 'habitat_leaves', 'habitat_meadows',
       'habitat_paths', 'habitat_urban', 'habitat_waste', 'habitat_woods',
       'odor_anise', 'odor_creosote', 'odor_fishy', 'odor_foul',
       'odor_musty', 'odor_none', 'odor_pungent', 'odor_spicy'

In [44]:
df.isna().sum()

ring-number                        0
bruises_true                       0
cap-color_buff                     0
cap-color_cinammon                 0
cap-color_gray                     0
                                  ..
stalk-surface-below-ring_silky     0
stalk-surface-below-ring_smooth    0
veil-color_orange                  0
veil-color_white                   0
veil-color_yellow                  0
Length: 95, dtype: int64

#### No Missing Values

In [45]:
df.head()

Unnamed: 0,ring-number,bruises_true,cap-color_buff,cap-color_cinammon,cap-color_gray,cap-color_green,cap-color_pink,cap-color_purple,cap-color_red,cap-color_white,...,stalk-shape_tapering,stalk-surface-above-ring_scaly,stalk-surface-above-ring_silky,stalk-surface-above-ring_smooth,stalk-surface-below-ring_scaly,stalk-surface-below-ring_silky,stalk-surface-below-ring_smooth,veil-color_orange,veil-color_white,veil-color_yellow
0,1,1,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0
1,1,1,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0
2,2,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0
3,2,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,1,0
4,1,1,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0


In [47]:
df.to_csv("prepared_mushrooms_sample.csv")