# Decision Trees (to determine if mushrooms are edible or not edible)

In [1]:
# importing relevant python modules, libraries 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer

# define tree model
model = tree.DecisionTreeClassifier(criterion = 'gini')
# model = tree.DecisionTreeClassifier(criterion = 'entropy')

In [2]:
# reading in raw dataset for mushrooms analysis
# dataset = pd.read_csv('data/mushrooms.csv')
dataset = pd.read_csv('mushrooms_decision_tree.csv')
column_names = list(dataset.columns.values)
dataset = np.array(dataset)

In [3]:
# pre-processing of raw dataset
size_shape = dataset.shape
print(size_shape)
no_columns = size_shape[1]
print(no_columns)
test_classes = {}
extract_class_column = dataset[:,0]

for i in range(len(extract_class_column)):
    if extract_class_column[i] in test_classes.keys():
        test_classes[extract_class_column[i]] += 1
    else:
        test_classes[extract_class_column[i]] = 1

test_classes_names = [item for item in test_classes.keys()]

for i in range (len(dataset)):
    for j in range (no_columns):
        if dataset[i][j] == '?':
            dataset[i][j] = 'N'

for i in range (len(dataset)):
    for j in range (1, no_columns):
        dataset[i][j] = ord(dataset[i][j])
            
new_table = pd.DataFrame(dataset, columns=column_names)
new_table.to_csv('mushrooms_new.csv')
new_table.head()

(8124, 23)
23


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [4]:
# splitting of dataset into training and validation data
x_train, x_test = train_test_split(new_table.iloc[:,1:], test_size = 0.20)
y_train, y_test = train_test_split(new_table.iloc[:,0], test_size = 0.20)

In [5]:
# model training
trained_model = model.fit(x_train,y_train)
pred_result_train = trained_model.predict(x_train)
y_train = list(y_train)

counts = 0
for i in range(len(pred_result_train)):
    if str(pred_result_train[i]) == str(y_train[i]):
        counts += 1
accuracy = (counts/len(pred_result_train)) * 100
accuracy = round(accuracy,1)

print('The accuracy of the trained model from the training step is ' + str(accuracy) + '% .')

The accuracy of the trained model from the training step is 100.0% .


In [6]:
# model validation
test_results = trained_model.predict(x_test)
y_test = list(y_test)
counts = 0
for i in range(len(test_results)):
    if str(test_results[i]) == str(y_test[i]):
        counts += 1
accuracy = (counts/len(test_results)) * 100
accuracy = round(accuracy,1)

print('The accuracy of the trained model from the validation step is ' + str(accuracy) + '% .')

The accuracy of the trained model from the validation step is 51.0% .


In [7]:
import graphviz
import pydot
from graphviz import Graph
dot_data = tree.export_graphviz(trained_model, out_file=None,
                                feature_names=x_test.columns,
                               class_names=test_classes_names,
                               filled=True, rounded=True,
                               special_characters=True)
graph = graphviz.Source(dot_data)
graph.render()

'Source.gv.pdf'