# Pre-requisites
_Libraries and objects used, together with the imported dataset._

## Libraries used

In [1]:
import numpy as np
import pandas as pd

## Objects used

In [2]:
# Object for importing data
from data import Mushrooms_dataset

# Object for encoding categorical values and target
from encoding import Encoding

# Object for splitting data into train and test set
from train_test_split import split

# Object for calculating the information gain criterion
from criterions import information_gain

# Object for Randomized Search CV
from randomized_search_cv import RandomizedSearchCV

# Object for the tree's nodes
from tree_node import TreeNode

# Object for the tree algorithm
from tree_algorithm import DecisionTree

# Object for evaluating the final model
from model_evaluation import ModelEvaluator

## Import dataset

In [3]:
# Relative folder in which the dataset is stored
path = "Datasets"

In [4]:
# Load the data through the project's dataset wrapper and keep its DataFrame
mushrooms_source = Mushrooms_dataset(path)
df_mushrooms = mushrooms_source.dataset

# EDA
_Some exploratory data analysis to understand the data that is going to be used for the construction of the tree predictor._

In [5]:
# Keep an independent snapshot of the raw data for the before/after comparison
# made once the cleaning is done. `.copy()` (instead of a plain alias) guarantees
# that no later in-place change to `df_mushrooms` can leak into the snapshot.
original_data = df_mushrooms.copy()

# Preview the first 5 rows of the dataset
df_mushrooms.head(5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [6]:
# Check the type and the non-null count of each attribute.
# `DataFrame.info()` writes its report to stdout and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line to the output.
print("There are some attributes with missing values:")
df_mushrooms.info()

There are some attributes with missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3

The continuous attributes have no null values, while some categorical columns do have null values.

In [7]:
# Share of missing values per column, expressed as a percentage (2 decimals)
null_counts = df_mushrooms.isna().sum()
null_percentages = round(null_counts / len(df_mushrooms) * 100, 2)

print("Taking into account these percentages, I'll exclude those attributes with more than 80% of null values:")
print(null_percentages)

Taking into account these percentages, I'll exclude those attributes with more than 80% of null values:
class                    0.00
cap-diameter             0.00
cap-shape                0.00
cap-surface             23.12
cap-color                0.00
does-bruise-or-bleed     0.00
gill-attachment         16.18
gill-spacing            41.04
gill-color               0.00
stem-height              0.00
stem-width               0.00
stem-root               84.39
stem-surface            62.43
stem-color               0.00
veil-type               94.80
veil-color              87.86
has-ring                 0.00
ring-type                4.05
spore-print-color       89.60
habitat                  0.00
season                   0.00
dtype: float64


In [8]:
# Columns whose share of missing values reaches the drop threshold.
# The threshold is named instead of being buried in the expression, and the
# comparison uses the unrounded percentage so that a value such as 79.996%
# is not rounded up to 80.00 and dropped by accident.
NULL_PCT_THRESHOLD = 80  # drop columns with at least this % of null values

null_pct_by_column = df_mushrooms.isna().sum() / len(df_mushrooms) * 100
columns_to_drop = df_mushrooms.columns[null_pct_by_column >= NULL_PCT_THRESHOLD]
columns_to_drop

Index(['stem-root', 'veil-type', 'veil-color', 'spore-print-color'], dtype='object')

In [9]:
# Remove the sparse columns collected in `columns_to_drop` and preview the result
df_mushrooms = df_mushrooms.drop(columns_to_drop, axis=1)
df_mushrooms.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,17.09,y,w,t,g,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,18.19,y,w,t,g,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,17.74,y,w,t,g,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,15.98,y,w,t,p,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,17.2,y,w,t,p,d,w


In [10]:
# Percentage of missing values in each row (relative to the number of columns)
nulls_per_row = df_mushrooms.isna().sum(axis=1)
n_columns = df_mushrooms.shape[1]
row_null = (nulls_per_row / n_columns * 100).round(2)

print(f"The maximum percentage of null values for a single row is: {row_null.max()}%.")
print("With this value, I won't drop any row and replace null values with mode.")

The maximum percentage of null values for a single row is: 23.53%.
With this value, I won't drop any row and replace null values with mode.


In [11]:
# Count the rows that are exact repeats of an earlier row
duplicate_mask = df_mushrooms.duplicated()
duplicate_mask.sum()

146

In [12]:
# Remove the exact duplicate rows so repeated samples do not bias the model
df_mushrooms = df_mushrooms.drop_duplicates()

# Summarise how much the cleaning shrank the data compared with the original snapshot
removed_rows = len(original_data) - len(df_mushrooms)
removed_columns = original_data.shape[1] - df_mushrooms.shape[1]
print(f"Finally, the final dataset has {removed_rows} less rows and {removed_columns} less columns than the original dataset.")

Finally, the final dataset has 146 less rows and 4 less columns than the original dataset.


# Tree Predictor
_Implementation of the tree predictor._

## Train-Test Split

To keep variable names short, from here on the features are called `x` and the target is called `y`.

In [13]:
# Separate the label column ('class') from the predictors
target = df_mushrooms['class']
features = df_mushrooms.drop(columns='class')

In [14]:
# Hold out 20% of the samples as a test set (80/20 train/test).
# The fixed seed (0) is there to keep the same result each time the notebook is run.
split_parts = split(features, target, test_size=0.2, seed=0)
x_train, x_test, y_train, y_test = split_parts

In [15]:
# Replace missing categorical values with the mode.
# The modes are learned on the TRAINING data only and then applied to both sets:
# imputing the test set with its own statistics would leak test information into
# the preprocessing and could fill train and test with different values.
categorical_cols = [col for col in x_train.columns if x_train[col].dtype == 'object']

train_modes = {}
for col in categorical_cols:
    col_mode = x_train[col].mode()
    if not col_mode.empty:  # an all-NaN column has no mode; leave it untouched
        train_modes[col] = col_mode[0]

# fillna(dict) fills each listed column with its own value and returns a NEW frame,
# so nothing is written into a possible slice of `features`.
x_train = x_train.fillna(train_modes)
x_test = x_test.fillna(train_modes)

# Same rule for the target: a single mode, taken from the training labels.
if y_train.dtype == 'object':
    target_mode = y_train.mode()
    if not target_mode.empty:
        y_train = y_train.fillna(target_mode[0])
        y_test = y_test.fillna(target_mode[0])

In [16]:
# Encode the categorical features (one-hot) and the target for further usage.
# One Encoding instance is shared by all four calls, and the fit=True calls on the
# training data run before the fit=False calls on the test data; keep that order.
# NOTE(review): presumably fit=True learns the category/label layout and fit=False
# reuses it — confirm in encoding.py.
encoder = Encoding()

x_train = encoder.one_hot_encoding(x_train, fit=True)
# target_encoding returns two values; the second is kept as `target_mapping`
# (presumably the label <-> code correspondence — TODO confirm).
y_train, target_mapping = encoder.target_encoding(y_train, fit=True)

x_test = encoder.one_hot_encoding(x_test, fit=False)
# For the test labels the second returned value is discarded.
y_test, _ = encoder.target_encoding(y_test, fit=False)

In [17]:
# Sanity check: sample count of the cleaned dataset and of each partition
n_total, n_train, n_test = len(df_mushrooms), len(x_train), len(x_test)
print(f'Original dataset: {n_total}')
print(f'Train set: {n_train}')
print(f'Test set: {n_test}')

Original dataset: 60923
Train set: 48738
Test set: 12185


# Run Tree

In [18]:
# Hyperparameter search space handed to the randomized search
param_grid = dict(
    max_depth=[10, 25, 40],
    min_samples=[5, 15, 20],
    criterion=['gini', 'entropy', 'misclassification_error'],
)

In [19]:
# Convert the training data to NumPy arrays for compatibility with the tree code.
x_train_np = x_train.to_numpy()
# np.asarray leaves an ndarray untouched and converts anything else (e.g. a pandas
# Series), so the labels really are a NumPy array, as the features are.
y_train_np = np.asarray(y_train)

In [20]:
# Randomized search over `param_grid` on the training arrays.
# The search settings are grouped in one place; their exact semantics are defined
# in randomized_search_cv.py.
search_settings = dict(n_iter=10, cv=5, seed=0, n_jobs=-1)
search = RandomizedSearchCV(param_grid, x_train_np, y_train_np, **search_settings)

In [21]:
# Run the search; fit() returns a pair unpacked as (best_params, best_score)
search_outcome = search.fit()
best_params, best_score = search_outcome

In [22]:
# Show the hyperparameter combination returned by the search
best_params_label = "Best parameters: "
print(best_params_label, best_params)

Best parameters:  {'max_depth': 25, 'min_samples': 5, 'criterion': 'gini'}


In [23]:
# Show the best score returned by the search, rounded to 4 decimals
best_score_rounded = round(best_score, 4)
print("Best cv score: ", best_score_rounded)

Best cv score:  0.9959


In [24]:
# Build the final tree from the best hyperparameters returned by the search
# (the `best_params` dict is unpacked into DecisionTree keyword arguments)
final_tree = DecisionTree(**best_params)

# Train the tree with the best parameters on the full training data
# (the same NumPy arrays that were handed to the search)
final_tree.fit(x_train_np, y_train_np)

In [25]:
# Predict the labels of the held-out test set
x_test_np = x_test.to_numpy()
predicted_y = final_tree.predict(x_test_np)

In [26]:
# Score the test predictions against the true test labels
test_metric_eval = ModelEvaluator(y_test, predicted_y)
test_metrics = test_metric_eval.evaluate()
print(f"Test performance: {test_metrics}")

Test performance: {'accuracy': 0.9971276159212146, 'precision': 0.9972992437882607, 'recall': 0.9964022306170175, 'f1_score': 0.9968505354089805, 'confusion_matrix': {'tp': 5539, 'tn': 6611, 'fp': 15, 'fn': 20}}


In [27]:
# Loss of the final tree on the test set (printed further down as zero-one loss)
test_features_np = x_test.to_numpy()
test_loss = final_tree.evaluate(test_features_np, y_test)

In [28]:
# Finally, compute the training performance.
# The training frame is converted to NumPy once and the array is shared by both
# calls, instead of running x_train.to_numpy() twice on the same data.
train_features_np = x_train.to_numpy()
predicted_train = final_tree.predict(train_features_np)
train_loss = final_tree.evaluate(train_features_np, y_train)

In [29]:
# Report both losses as percentages with two decimals
train_loss_pct = train_loss * 100
test_loss_pct = test_loss * 100
print(f"Zero-One Loss for training set: {train_loss_pct:.2f}%")
print(f"Zero-One Loss for test set: {test_loss_pct:.2f}%")

Zero-One Loss for training set: 0.14%
Zero-One Loss for test set: 0.29%


# Testing Tree

In [30]:
# Describe the new mushroom with its RAW attribute values and let the encoder that
# was fitted on the training set build the one-hot vector. A hand-typed 0/1 vector
# is tied to an assumed number and order of encoded columns that nothing checks,
# so it can silently feed the tree a different mushroom than the one intended.
raw_mushroom = {
    'cap-diameter': 5.2,
    'cap-shape': 'x',
    'cap-surface': 'y',
    'cap-color': 'w',
    'does-bruise-or-bleed': 't',
    'gill-attachment': 'f',
    'gill-spacing': 'c',
    'gill-color': 'k',
    'stem-height': 10.1,
    'stem-width': 1.2,
    'stem-surface': 's',
    'stem-color': 'g',
    'has-ring': 't',
    'ring-type': 'p',
    'habitat': 'w',
    'season': 's',
}

# Same column order as the training features; a missing attribute raises a KeyError here
new_mushroom_df = pd.DataFrame([raw_mushroom])[list(features.columns)]

# Same call that was used to encode x_test (fit=False reuses the existing encoder)
new_mushroom = encoder.one_hot_encoding(new_mushroom_df, fit=False).to_numpy()

prediction = final_tree.predict(new_mushroom)

# NOTE(review): this assumes the target encoder assigned code 1 to 'p' (poisonous).
# Confirm it against `target_mapping` before trusting the message below.
if prediction[0] == 1:
    print("The mushroom is POISONOUS, DON'T eat it.")
else:
    print("The mushroom is EDIBLE, you can eat it.")

The mushroom is EDIBLE, you can eat it.
