In [1]:
import pandas as pd
import numpy as np


In [2]:
# The file is read with header=None, so the columns get integer labels
# (0, 1, 2, ...) until they are renamed further down.
df = pd.read_csv(
    filepath_or_buffer="agaricus-lepiota.csv",
    header=None,
)


In [3]:
# Preview the first ten raw rows.
df.head(n=10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [4]:
# Replace the "?" placeholders with proper missing values (NaN).
# One DataFrame.replace call is used instead of the chained assignment
# `df[s][mask] = ...`, which pandas may apply to a temporary copy (it has no
# effect under copy-on-write). `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0.
df = df.replace("?", np.nan)


In [5]:
# Number of missing values in each column.
df.isna().sum(axis=0)


0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11    2480
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
dtype: int64

In [6]:
# Descriptive names for the integer column labels 0..22, listed in file order.
# The frame is reassigned instead of renamed with inplace=True, so the cell
# does not mutate an object that earlier cells have already displayed.
df = df.rename(columns=dict(enumerate([
    "poisonous",
    "cap_shape",
    "cap_surface",
    "cap_color",
    "bruises",
    "odor",
    "gill_attachment",
    "gill_spacing",
    "gill_size",
    "gill_color",
    "stalk_shape",
    "stalk_root",
    "stalk_surface_above_ring",
    "stalk_surface_below_ring",
    "stalk_color_above_ring",
    "stalk_color_below_ring",
    "veil_type",
    "veil_color",
    "ring_number",
    "ring_type",
    "spore_print_color",
    "population",
    "habitat",
])))


In [7]:
def generate_mapping(column_name, mapping_string, frame=None):
    """Translate the coded values of one column into readable labels.

    Parameters
    ----------
    column_name : str
        Name of the column to translate.
    mapping_string : str
        Comma-separated ``label=code`` pairs, e.g. ``"bell=b,conical=c"``.
        Labels are kept as strings, so ``"0=e,1=p"`` produces ``"0"``/``"1"``.
    frame : pandas.DataFrame, optional
        Frame whose column is overwritten. Defaults to the notebook-level
        ``df`` so existing two-argument calls keep working.

    Returns
    -------
    None
        The column is overwritten on the frame.

    Raises
    ------
    ValueError
        If an item of ``mapping_string`` does not split into exactly one
        label and one code around ``=``.

    Notes
    -----
    Values that already equal one of the labels are kept unchanged, so
    re-running the cell does not wipe a column that was translated before.
    Any other value without a code entry becomes NaN and a warning is issued.
    """
    import warnings

    target = df if frame is None else frame

    # "label=code,label=code" -> {label: code} -> {code: label}
    label_to_code = dict(item.split("=") for item in mapping_string.split(","))
    code_to_label = {code: label for label, code in label_to_code.items()}

    # Labels map to themselves (idempotent re-runs); real codes take priority.
    lookup = {label: label for label in code_to_label.values()}
    lookup.update(code_to_label)

    original = target[column_name]
    translated = original.map(lookup)

    # Values that were present before but have no entry in the lookup.
    unknown = original[original.notna() & translated.isna()]
    if not unknown.empty:
        warnings.warn(
            f"{column_name}: no mapping for values "
            f"{sorted(unknown.unique(), key=str)}; they were set to NaN",
            stacklevel=2,
        )

    target[column_name] = translated


In [8]:
# One (column, "label=code,..." table) entry per column, applied in order.
column_code_tables = [
    ("poisonous", "0=e,1=p"),
    ("cap_shape", "bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s"),
    ("cap_surface", "fibrous=f,grooves=g,scaly=y,smooth=s"),
    ("cap_color", "brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y"),
    ("bruises", "1=t,0=f"),
    ("odor", "almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s"),
    ("gill_attachment", "attached=a,descending=d,free=f,notched=n"),
    ("gill_spacing", "close=c,crowded=w,distant=d"),
    ("gill_size", "broad=b,narrow=n"),
    ("gill_color", "black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y"),
    ("stalk_shape", "enlarging=e,tapering=t"),
    ("stalk_root", "bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r"),
    ("stalk_surface_above_ring", "fibrous=f,scaly=y,silky=k,smooth=s"),
    ("stalk_surface_below_ring", "fibrous=f,scaly=y,silky=k,smooth=s"),
    ("stalk_color_above_ring", "brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y"),
    ("stalk_color_below_ring", "brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y"),
    ("veil_type", "partial=p,universal=u"),
    ("veil_color", "brown=n,orange=o,white=w,yellow=y"),
    ("ring_number", "none=n,one=o,two=t"),
    ("ring_type", "cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z"),
    ("spore_print_color", "black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y"),
    ("population", "abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y"),
    ("habitat", "grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d"),
]

for column, code_table in column_code_tables:
    generate_mapping(column, code_table)


In [10]:
# Write the translated table to disk. The row index is written as well,
# because to_csv includes it unless index=False is passed.
df.to_csv(
    path_or_buf="mushrooms.csv",
    sep=",",
    header=True,
)


In [12]:
# Preview the first ten rows after the codes were translated.
df.head(n=10)


Unnamed: 0,poisonous,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,convex,smooth,brown,1,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,0,convex,smooth,yellow,1,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,0,bell,smooth,white,1,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,1,convex,scaly,white,1,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,0,convex,smooth,gray,0,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses
5,0,convex,scaly,yellow,1,almond,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,black,numerous,grasses
6,0,bell,smooth,white,1,almond,free,close,broad,gray,...,smooth,white,white,partial,white,one,pendant,black,numerous,meadows
7,0,bell,scaly,white,1,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,scattered,meadows
8,1,convex,scaly,white,1,pungent,free,close,narrow,pink,...,smooth,white,white,partial,white,one,pendant,black,several,grasses
9,0,bell,smooth,yellow,1,almond,free,close,broad,gray,...,smooth,white,white,partial,white,one,pendant,black,scattered,meadows


In [14]:
# Per-column summary (count / unique / top / freq in the output below).
df.describe(include=None)


Unnamed: 0,poisonous,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,0,convex,scaly,brown,0,none,free,close,broad,buff,...,smooth,white,white,partial,white,one,pendant,white,several,woods
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148
