In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

<h2>Mushroom Classification Dataset - All Categorical Features</h2>
<h4>Hands-on: Classification with AWS Machine Learning Service</h4>
Input Features: 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'<br>
Target Feature: 'class_edible'<br>
Objective: Predict whether a mushroom is edible or poisonous (the 'class_edible' target) from the input features listed above<br>
<h4>Data source: https://archive.ics.uci.edu/ml/datasets/mushroom</h4>

In [None]:
# Output column order: the target label first, followed by the 22 input features.
columns = ['class_edible'] + [
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

In [None]:
# Load the mushroom data; with pandas' defaults the first row of the file
# supplies the column names.
raw_csv_path = 'mushroom_data_all.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Number of rows per value of the target column (class balance check).
df['class_edible'].value_counts()

In [None]:
# Preview the first rows of the data as loaded.
df.head()

In [None]:
# One LabelEncoder per column, created lazily: looking up a key that is not yet
# in the defaultdict instantiates a fresh encoder and stores it under that key.
# Approach taken from:
# https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
from collections import defaultdict
d = defaultdict(preprocessing.LabelEncoder)

In [None]:
def _encode_column(column):
    """Fit the encoder stored in `d` under this column's name and return the encoded values."""
    encoder = d[column.name]
    return encoder.fit_transform(column)


# Replace every column with its label-encoded values; the fitted encoders
# remain available in `d`, keyed by column name.
df = df.apply(_encode_column)

In [None]:
# Preview the first rows again to inspect the encoded values.
df.head()

In [None]:
# Keys under which encoders have been stored in `d`.
d.keys()

In [None]:
# For each stored encoder, print its key and the category values it learned
# (the `classes_` attribute of the fitted encoder).
for column_name, encoder in d.items():
    print(column_name, encoder.classes_)

In [None]:
# Row counts per target value, now shown on the encoded column.
df['class_edible'].value_counts()

In [None]:
# Save the full encoded dataset; the header row is kept and the index is not written.
encoded_csv_path = 'mushroom_encoded_all.csv'
df.to_csv(encoded_csv_path, index=False)

## Training and Validation Set
### Target Variable as first column followed by input features:
'class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'
### Training and validation files do not have a header row; the column names are written separately to mushroom_train_column_list.txt

In [None]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset so the split does not depend on the file's row order.
# The seed is fixed so a fresh run reproduces the same shuffle.
np.random.seed(5)
# DataFrame.iloc selects by position, so shuffle row positions rather than
# index labels (the two only coincide for the default 0..n-1 index).
row_positions = list(range(len(df)))
np.random.shuffle(row_positions)
df = df.iloc[row_positions]

In [None]:
# Split sizes: the first 70% of the rows go to training and every remaining
# row goes to validation.
rows = df.shape[0]
train = int(.7 * rows)
# Derive the validation count from the remainder. int(.3 * rows) truncates
# independently of `train`, so the two counts could add up to less than `rows`
# and disagree with the rows actually selected by df[train:].
test = rows - train

In [None]:
# Display total row count, training row count, and the validation row count.
rows, train, test

In [None]:
# Write Training Set: the first `train` rows, in `columns` order (target
# first), with no header row and no index column.
# index_label is not passed: it only names the index column and therefore has
# no effect when index=False.
df.iloc[:train].to_csv('mushroom_train.csv',
                       index=False, header=False,
                       columns=columns)

In [None]:
# Write Validation Set: every row from position `train` onward, in `columns`
# order (target first), with no header row and no index column.
# index_label is not passed: it only names the index column and therefore has
# no effect when index=False.
df.iloc[train:].to_csv('mushroom_validation.csv',
                       index=False, header=False,
                       columns=columns)

In [None]:
# Write Column List: one comma-separated line with the names in `columns`,
# in the same order used for the header-less train/validation files.
column_list_line = ','.join(columns)
with open('mushroom_train_column_list.txt', 'w') as column_list_file:
    column_list_file.write(column_list_line)