# Pre-requisites
_Libraries and objects used, together with the imported dataset._

## Libraries used

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns

## Objects used

In [2]:
# Object for importing data
from data import Mushrooms_dataset

# Object for giving a label to a given row, and for defining attributes as "continuous" or "categorical"
from data_classification import Classification

# Object for splitting data into train and test set
from train_test_split import TrainTestSplit

# Object for calculating the gini function
from gini import Gini

# Object for the tree's nodes
from tree_node import TreeNode

# Object for the tree algorithm
from tree_algorithm import TreePredictor

# Object for the accuracy of the tree (uses as input the test_set)
from accuracy_algorithm import TreeAccuracy

## Import dataset

In [3]:
# Name of the directory that contains the dataset files
path = "Datasets"

In [4]:
# Load the mushrooms data from `path` and keep the resulting dataset in a variable
df_mushrooms = (
    Mushrooms_dataset(path)
    .dataset
)

# EDA
_Some exploratory data analysis to understand the data that is going to be used for the construction of the tree predictor._

In [5]:
# Keep an independent snapshot of the raw data for the before/after comparison
# made at the end of the EDA. A plain `original_data = df_mushrooms` only aliases
# the same object, so any in-place edit of df_mushrooms would silently change
# the "original" as well.
original_data = df_mushrooms.copy()

# See the first 5 rows of the dataset
df_mushrooms.head(5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [6]:
# Check the type and the non-null count of each attribute.
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line to the output.
print("There are some attributes with missing values:")
df_mushrooms.info()

There are some attributes with missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3

The continuous attributes have no null values, while some categorical columns do have null values.

In [7]:
# Share of missing values per column, expressed as a percentage with 2 decimals
print("Taking into account these percentages, I'll exclude those attributes with more than 80% of null values:")
print(
    df_mushrooms.isna()
    .sum()
    .div(len(df_mushrooms))
    .mul(100)
    .round(2)
)

Taking into account these percentages, I'll exclude those attributes with more than 80% of null values:
class                    0.00
cap-diameter             0.00
cap-shape                0.00
cap-surface             23.12
cap-color                0.00
does-bruise-or-bleed     0.00
gill-attachment         16.18
gill-spacing            41.04
gill-color               0.00
stem-height              0.00
stem-width               0.00
stem-root               84.39
stem-surface            62.43
stem-color               0.00
veil-type               94.80
veil-color              87.86
has-ring                 0.00
ring-type                4.05
spore-print-color       89.60
habitat                  0.00
season                   0.00
dtype: float64


In [8]:
# Columns whose share of missing values is at or above the 80% threshold.
# The comparison uses the exact fraction of nulls: rounding to 2 decimals first
# could lift a value such as 79.996% to 80.00% and drop a column that is
# actually below the threshold.
NULL_THRESHOLD = 0.80
columns_to_drop = df_mushrooms.columns[df_mushrooms.isna().mean() >= NULL_THRESHOLD]
columns_to_drop

Index(['stem-root', 'veil-type', 'veil-color', 'spore-print-color'], dtype='object')

In [9]:
# Remove the sparse columns collected in columns_to_drop (missing share >= 80%)
df_mushrooms = df_mushrooms.drop(labels=columns_to_drop, axis="columns")
df_mushrooms.head(n=5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,17.09,y,w,t,g,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,18.19,y,w,t,g,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,17.74,y,w,t,g,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,15.98,y,w,t,p,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,17.2,y,w,t,p,d,w


In [10]:
# Percentage of missing values in each row, to decide whether any row is too sparse to keep
row_null = round((df_mushrooms.isna().sum(axis=1) / df_mushrooms.shape[1])*100,2)
print(f"The maximum percentage of null values for a single row is: {row_null.max()}%.")
# The imputation that follows fills nulls with the mode of each column (only
# object-typed columns have nulls), so the message says "mode", not "median".
print("With this value, I won't drop any row and replace null values with the mode of the column.")

The maximum percentage of null values for a single row is: 23.53%.
With this value, I won't drop any row and replace null values with median.


In [11]:
# Fill the missing values of every object-typed column with that column's most
# frequent value; numerical attributes are skipped (they have no null values).
# NOTE(review): the mode is taken over the full dataset, before the train/test
# split done later in the notebook, so test rows influence the imputed values.
for col, col_dtype in df_mushrooms.dtypes.items():
    if col_dtype != 'object':
        continue
    most_frequent = df_mushrooms[col].mode()[0]
    df_mushrooms.loc[:, col] = df_mushrooms[col].fillna(most_frequent)

df_mushrooms.head(n=5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,g,o,f,e,c,w,16.95,17.09,y,w,t,g,d,w
1,p,16.6,x,g,o,f,e,c,w,17.99,18.19,y,w,t,g,d,u
2,p,14.07,x,g,o,f,e,c,w,17.8,17.74,y,w,t,g,d,w
3,p,14.17,f,h,e,f,e,c,w,15.77,15.98,y,w,t,p,d,w
4,p,14.64,x,h,o,f,e,c,w,16.53,17.2,y,w,t,p,d,w


In [12]:
# Last check: count the rows that exactly repeat an earlier row
df_mushrooms.duplicated(keep='first').sum()

146

In [13]:
# Drop the repeated rows (keeping the first occurrence) to reduce the risk of overfitting
df_mushrooms = df_mushrooms.drop_duplicates(keep='first')

# Summarise how much smaller the cleaned dataset is than the original one
rows_removed = len(original_data) - len(df_mushrooms)
cols_removed = original_data.shape[1] - df_mushrooms.shape[1]
print(f"Finally, the final dataset has {rows_removed} less rows and {cols_removed} less columns than the original dataset.")

Finally, the final dataset has 146 less rows and 4 less columns than the original dataset.


# Tree Predictor
_Implementation of the tree predictor._

## Train-Test Split

In [14]:
# Split the cleaned dataset into a training set and a test set.
# The split is 70/30 (train/test); seed = 0 keeps the result the same each time the notebook is run.
train_set, test_set = TrainTestSplit(
    data=df_mushrooms,
    test_size=0.3,
    seed=0,
).split()

In [15]:
# Report the number of samples in the full dataset and in each split
for label, frame in (
    ('Original dataset', df_mushrooms),
    ('Train set', train_set),
    ('Test set', test_set),
):
    print(f'{label}: {len(frame)}')

Original dataset: 60923
Train set: 42646
Test set: 18277


In [16]:
# Show the first 5 rows of the train set
train_set.head(n=5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,g,o,f,e,c,w,16.95,17.09,y,w,t,g,d,w
2,p,14.07,x,g,o,f,e,c,w,17.8,17.74,y,w,t,g,d,w
3,p,14.17,f,h,e,f,e,c,w,15.77,15.98,y,w,t,p,d,w
5,p,15.34,x,g,o,f,e,c,w,17.84,18.79,y,w,t,p,d,u
6,p,14.85,f,h,o,f,e,c,w,17.71,16.89,y,w,t,g,d,w


In [17]:
# Show the first 5 rows of the test set
test_set.head(n=5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
55342,e,5.06,o,l,n,f,f,f,f,7.04,57.99,s,n,f,f,d,a
25249,e,17.94,s,t,k,f,a,d,k,6.92,25.64,s,w,f,f,d,w
49675,e,6.4,x,d,y,t,p,c,y,7.34,13.07,s,y,f,f,d,u
58489,e,48.63,o,y,y,f,p,c,y,5.35,17.66,k,k,f,f,d,u
27564,p,7.07,x,d,e,t,d,c,n,7.74,10.31,s,n,f,f,d,a


## Tree with min_samples = 300 and max_depth = 5

In [18]:
# Build the tree from the train set (this takes approx. 1400 seconds)
tree = TreePredictor(
    data=train_set,
    counter=0,
    min_samples=300,
    max_depth=5,
).tree_predictor()

In [19]:
# Print the tree built from the train set in its raw (single-line) form
print(tree)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': [{'stem-width <= 3.01 (gini: 0.2, n: 802)': [{'cap-shape = b (gini: 0.02, n: 642)': ['e', 'p']}, 'e']}, {'cap-surface = l (gini: 0.26, n: 2204)': ['p', {'stem-surface = h (gini: 0.1, n: 1955)': ['p', 'e']}]}]}, {'stem-color = w (gini: 0.37, n: 15645)': [{'has-ring = t (gini: 0.48, n: 4201)': [{'gill-attachment = x (gini: 0.1, n: 743)': ['e', 'p']}, {'cap-surface = i (gini: 0.47, n: 3458)': ['p', 'e']}]}, {'cap-shape = c (gini: 0.29, n: 11444)': [{'stem-width <= 1.88 (gini: 0.45, n: 500)': ['e', 'p']}, {'ring-type = r (gini: 0.26, n: 10944)': ['e', 'p']}]}]}]}, {'stem-surface = g (gini: 0.49, n: 23995)': ['p', {'cap-color = r (gini: 0.49, n: 23152)': [{'cap-surface = s (gini: 0.12, n: 938)': ['p', {'gill-color = r (gini: 0.02, n: 793)': ['e', 'p']}]}, {'ring-type = z (gini: 0.48, n: 22214)': ['p', 'e']}]}]}]}


In [20]:
# Pretty-print the same tree so that its nesting is easier to read
pprint(tree)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': [{'stem-width <= 3.01 (gini: 0.2, n: 802)': [{'cap-shape = b (gini: 0.02, n: 642)': ['e',
                                                                                                                                                                                                                              'p']},
                                                                                                                                                                                      'e']},
                                                                                                                                          {'cap-surface = l (gini: 0.26, n: 2204)': ['p',
                                                                                                                                                                         

### Accuracy

In [21]:
# Set up the accuracy evaluation of `tree` on the test set
accuracy_test = TreeAccuracy(test_set, tree)

In [22]:
# Compute the accuracy of `tree`
accuracy = accuracy_test.tree_accuracy()

In [23]:
# Display the accuracy obtained with min_samples = 300 and max_depth = 5
accuracy

0.7319581988291295

## Tree with min_samples = 300 and max_depth = 3

In [24]:
# Build a shallower tree from the train set (max_depth = 3, min_samples = 300)
tree_2 = TreePredictor(
    data=train_set,
    counter=0,
    min_samples=300,
    max_depth=3,
).tree_predictor()

In [25]:
# Print the second tree in its raw (single-line) form
print(tree_2)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': ['p', 'e']}, 'p']}, {'stem-surface = g (gini: 0.49, n: 23995)': ['p', {'cap-color = r (gini: 0.49, n: 23152)': ['p', 'e']}]}]}


In [26]:
# Pretty-print the second tree so that its nesting is easier to read
pprint(tree_2)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': ['p',
                                                                                                                                          'e']},
                                                                                            'p']},
                                               {'stem-surface = g (gini: 0.49, n: 23995)': ['p',
                                                                                            {'cap-color = r (gini: 0.49, n: 23152)': ['p',
                                                                                                                                      'e']}]}]}


### Accuracy

In [27]:
# Set up the accuracy evaluation of `tree_2` on the test set
accuracy_test_2 = TreeAccuracy(test_set, tree_2)

In [28]:
# Compute the accuracy of `tree_2`
accuracy_2 = accuracy_test_2.tree_accuracy()

In [29]:
# Display the accuracy obtained with min_samples = 300 and max_depth = 3
accuracy_2

0.6785577501778192

## Tree with min_samples = 500 and max_depth = 5

In [30]:
# Build a tree from the train set with a larger min_samples (500) and max_depth = 5
tree_3 = TreePredictor(
    data=train_set,
    counter=0,
    min_samples=500,
    max_depth=5,
).tree_predictor()

In [31]:
# Print the third tree in its raw (single-line) form
print(tree_3)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': [{'stem-width <= 3.01 (gini: 0.2, n: 802)': [{'cap-shape = b (gini: 0.02, n: 642)': ['e', 'p']}, 'e']}, {'cap-surface = l (gini: 0.26, n: 2204)': ['p', {'stem-surface = h (gini: 0.1, n: 1955)': ['p', 'e']}]}]}, {'stem-color = w (gini: 0.37, n: 15645)': [{'has-ring = t (gini: 0.48, n: 4201)': [{'gill-attachment = x (gini: 0.1, n: 743)': ['e', 'p']}, {'cap-surface = i (gini: 0.47, n: 3458)': ['p', 'e']}]}, {'cap-shape = c (gini: 0.29, n: 11444)': [{'stem-width <= 1.88 (gini: 0.45, n: 500)': ['e', 'p']}, {'ring-type = r (gini: 0.26, n: 10944)': ['e', 'p']}]}]}]}, {'stem-surface = g (gini: 0.49, n: 23995)': ['p', {'cap-color = r (gini: 0.49, n: 23152)': [{'cap-surface = s (gini: 0.12, n: 938)': ['p', {'gill-color = r (gini: 0.02, n: 793)': ['e', 'p']}]}, {'ring-type = z (gini: 0.48, n: 22214)': ['p', 'e']}]}]}]}


In [32]:
# Pretty-print the third tree so that its nesting is easier to read
pprint(tree_3)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': [{'stem-width <= 3.01 (gini: 0.2, n: 802)': [{'cap-shape = b (gini: 0.02, n: 642)': ['e',
                                                                                                                                                                                                                              'p']},
                                                                                                                                                                                      'e']},
                                                                                                                                          {'cap-surface = l (gini: 0.26, n: 2204)': ['p',
                                                                                                                                                                         

### Accuracy

In [33]:
# Set up the accuracy evaluation of `tree_3` on the test set
accuracy_test_3 = TreeAccuracy(test_set, tree_3)

In [34]:
# Compute the accuracy of `tree_3`
accuracy_3 = accuracy_test_3.tree_accuracy()

In [35]:
# Display the accuracy obtained with min_samples = 500 and max_depth = 5
accuracy_3

0.7319581988291295

## Tree with min_samples = 500 and max_depth = 3

In [36]:
# Build a tree from the train set with min_samples = 500 and max_depth = 3
tree_4 = TreePredictor(
    data=train_set,
    counter=0,
    min_samples=500,
    max_depth=3,
).tree_predictor()

In [37]:
# Print the fourth tree in its raw (single-line) form
print(tree_4)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': ['p', 'e']}, 'p']}, {'stem-surface = g (gini: 0.49, n: 23995)': ['p', {'cap-color = r (gini: 0.49, n: 23152)': ['p', 'e']}]}]}


In [38]:
# Pretty-print the fourth tree so that its nesting is easier to read
pprint(tree_4)

{'stem-width <= 8.55 (gini: 0.49, n: 42646)': [{'gill-spacing = d (gini: 0.42, n: 18651)': [{'stem-height <= 3.61 (gini: 0.4, n: 3006)': ['p',
                                                                                                                                          'e']},
                                                                                            'p']},
                                               {'stem-surface = g (gini: 0.49, n: 23995)': ['p',
                                                                                            {'cap-color = r (gini: 0.49, n: 23152)': ['p',
                                                                                                                                      'e']}]}]}


### Accuracy

In [39]:
# Set up the accuracy evaluation of `tree_4` on the test set
accuracy_test_4 = TreeAccuracy(test_set, tree_4)

In [40]:
# Compute the accuracy of `tree_4`
accuracy_4 = accuracy_test_4.tree_accuracy()

In [41]:
# Display the accuracy obtained with min_samples = 500 and max_depth = 3
accuracy_4

0.6785577501778192