In [25]:
import pandas as pd
import numpy as np

from data import Mushrooms_dataset
from train_test_split import Train_test_split
from gini import Gini

## EDA

In [2]:
# Import primary_data dataset
# Folder name handed to the loader (presumably resolved relative to the
# notebook's working directory — confirm in data.py).
path = 'Datasets'

# Mushrooms_dataset is a project-local class imported from data.py.
data = Mushrooms_dataset(path)

# Keep the loaded table under a descriptive name for the cells below.
df_mushrooms = data.dataset

In [3]:
# Preview the first five rows of the full dataset.
df_mushrooms.head(5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,na,w,16.95,...,s,y,w,u,w,t,g,na,d,w
1,p,16.6,x,g,o,f,e,na,w,17.99,...,s,y,w,u,w,t,g,na,d,u
2,p,14.07,x,g,o,f,e,na,w,17.8,...,s,y,w,u,w,t,g,na,d,w
3,p,14.17,f,h,e,f,e,na,w,15.77,...,s,y,w,u,w,t,p,na,d,w
4,p,14.64,x,h,o,f,e,na,w,16.53,...,s,y,w,u,w,t,p,na,d,w


In [4]:
# Split the dataset into train_set and test_set.
# test_size and seed are forwarded to the project-local Train_test_split helper
# (presumably a 30% hold-out with a fixed seed for reproducibility — confirm in train_test_split.py).
train_set, test_set = Train_test_split(data = df_mushrooms, test_size = 0.3, seed = 0).split()

In [5]:
# Preview the first five rows of the training split.
train_set.head(5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,na,w,16.95,...,s,y,w,u,w,t,g,na,d,w
2,p,14.07,x,g,o,f,e,na,w,17.8,...,s,y,w,u,w,t,g,na,d,w
3,p,14.17,f,h,e,f,e,na,w,15.77,...,s,y,w,u,w,t,p,na,d,w
5,p,15.34,x,g,o,f,e,na,w,17.84,...,s,y,w,u,w,t,p,na,d,u
6,p,14.85,f,h,o,f,e,na,w,17.71,...,s,y,w,u,w,t,g,na,d,w


In [6]:
# Preview the first five rows of the test split.
test_set.head(5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
55340,e,4.71,o,l,n,f,f,f,f,8.02,...,na,na,n,na,na,f,f,g,d,u
25247,e,15.24,x,t,n,f,a,d,k,8.43,...,na,na,w,na,na,f,f,w,d,w
49673,e,5.24,x,d,n,t,p,na,y,6.46,...,na,na,y,na,na,f,f,na,d,a
58343,e,52.81,o,y,y,f,p,na,y,7.84,...,na,k,k,na,na,f,f,na,d,u
27562,p,5.05,s,d,e,t,d,c,w,5.89,...,na,na,n,na,na,f,f,na,d,a


In [7]:
# Report how many poisonous ('p') and edible ('e') rows ended up in each split.
for split_name, split_df in (("train_set", train_set), ("test_set", test_set)):
    print(f"{split_name}:")
    for class_label in ("p", "e"):
        class_rows = split_df['class'][split_df['class'] == class_label]
        print(f"{class_label}: {class_rows.count()}")

train_set:
p: 23715
e: 19033
test_set:
p: 10173
e: 8148


## Functions

### Leaf flag

In [9]:
def is_leaf(data):
    """Tell whether a node is pure, i.e. all of its rows share one class label.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Two-dimensional table whose first column (position 0) holds the class
        label. Both container types are accepted so the function can be called
        the same way as the other helpers in this notebook, which take
        DataFrames, while existing ``.values`` callers keep working.

    Returns
    -------
    bool
        True when exactly one distinct label is present, False otherwise
        (including for a table with no rows, which has no label at all).
    """
    if isinstance(data, pd.DataFrame):
        label_column = data.iloc[:, 0].to_numpy()
    else:
        label_column = np.asarray(data)[:, 0]

    return len(np.unique(label_column)) == 1

In [10]:
# The full training set contains both classes, so this is expected to be False.
is_leaf(train_set.values)

False

In [11]:
# Restricting the rows to class 'p' leaves a single label, so this is expected to be True.
is_leaf(train_set[train_set['class'] == 'p'].values)

True

### Classification

In [35]:
def classify_data(data):
    """Return the majority class label of a node.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Two-dimensional table whose first column (position 0) holds the class
        label. NumPy arrays are accepted as well as DataFrames because the
        notebook calls this function both ways.

    Returns
    -------
    object
        The most frequent label. On a tie the label that sorts first wins,
        because ``np.unique`` returns sorted labels and ``argmax`` picks the
        first maximum.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there is no label to return).
    """
    if isinstance(data, pd.DataFrame):
        label_column = data.iloc[:, 0].to_numpy()
    else:
        label_column = np.asarray(data)[:, 0]

    if label_column.size == 0:
        raise ValueError("Cannot classify an empty node: data has no rows.")

    unique_classes, count_unique_classes = np.unique(label_column, return_counts=True)

    return unique_classes[count_unique_classes.argmax()]

In [36]:
# Majority class of the whole training set ('p' has more rows than 'e').
classify_data(train_set)

'p'

In [14]:
# Sanity check on an edible-only subset. The DataFrame is passed directly
# (no .values) so the call matches the DataFrame-based call on train_set.
classify_data(train_set[train_set['class'] == 'e'])

'e'

### Split

In [15]:
# Column at position 1 (the first feature after the class label), taken as an
# example of the values a split would be evaluated on.
split_column_values = train_set.iloc[:, 1]
split_column_values

0        15.26
2        14.07
3        14.17
5        15.34
6        14.85
         ...  
61064     1.18
61065     1.27
61066     1.27
61067     1.24
61068     1.17
Name: cap-diameter, Length: 42748, dtype: float64

In [16]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` around a threshold on one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to partition.
    split_column : int
        Positional index of the column the threshold is applied to.
    split_value : scalar
        Threshold compared against the column's values.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_below, data_above)``: rows whose value is ``<= split_value``
        and rows whose value is ``> split_value``, in that order.
    """
    column = data.iloc[:, split_column]

    # Two independent comparisons (not a negated mask), exactly as before.
    is_below = column <= split_value
    is_above = column > split_value

    return data[is_below], data[is_above]

### Gini impurity (simplified function for binary nodes)

In [17]:
# Class labels are the first column of the training set.
label_column = train_set.iloc[:, 0]
    
# With return_counts=True np.unique also returns how often each (sorted) label occurs.
_, counts = np.unique(label_column, return_counts=True)

counts

array([19033, 23715], dtype=int64)

In [18]:
# Class labels are the first column of the training set.
label_column = train_set.iloc[:, 0]
    
_, counts = np.unique(label_column, return_counts=True)

# Turn the per-class counts into class proportions (they sum to 1).
p = counts/counts.sum()

p

array([0.4452372, 0.5547628])

In [19]:
# Gini impurity from the class proportions p: 1 - sum_k p_k^2.
1 - np.sum(p ** 2)

0.49400207236598814

In [20]:
def gini_impurity(data):
    """Gini impurity of a node in a two-class problem.

    Computed as ``1 - sum_k p_k**2`` where ``p_k`` is the share of rows that
    carry class ``k``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of the node; the first column (position 0) holds the class label.

    Returns
    -------
    float
        0.0 for a pure node (a single class present) up to 0.5 for an even
        split between the two classes.

    Raises
    ------
    ValueError
        If ``data`` has no rows, or if more than two classes are present.
    """
    label_column = data.iloc[:, 0]

    _, counts = np.unique(label_column, return_counts=True)

    if len(counts) == 0:
        raise ValueError("Cannot compute the Gini impurity of an empty node.")

    # A pure node (one class) is valid and must score 0: child nodes become
    # pure as the tree grows. Only more than two classes is rejected.
    if len(counts) > 2:
        raise ValueError("Tree for binary nodes, so only 2 classes are accepted.")

    p = counts / counts.sum()

    return 1 - np.sum(p ** 2)

In [21]:
# Should reproduce the value computed step by step in the cells above.
gini_impurity(train_set)

0.49400207236598814

### Attribute types

In [22]:
# Number of distinct values per column, to see which attributes have few values and which have many.
train_set.nunique()

class                      2
cap-diameter            2380
cap-shape                  7
cap-surface               12
cap-color                 12
does-bruise-or-bleed       2
gill-attachment            8
gill-spacing               4
gill-color                12
stem-height             2067
stem-width              4286
stem-root                  6
stem-surface               9
stem-color                13
veil-type                  2
veil-color                 7
has-ring                   2
ring-type                  9
spore-print-color          8
habitat                    8
season                     4
dtype: int64

In [23]:
def attribute_type(data, threshold=20, label_column="class"):
    """Label every feature column as "categorical" or "continuous".

    A column is categorical when it contains at least one string value, or
    when it has at most ``threshold`` distinct values; otherwise it is
    continuous.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are inspected.
    threshold : int, default 20
        Largest number of distinct values a non-text column may have and
        still be treated as categorical.
    label_column : str, default "class"
        Name of the target column; it is skipped and gets no entry.

    Returns
    -------
    list of str
        One entry per feature column, in column order.
    """
    attribute_types = []

    for attribute in data.columns:
        if attribute == label_column:
            continue

        unique_values = data[attribute].unique()

        # Look at every distinct value, not just the first: the first may be
        # a missing value in a text column, and an empty column has none.
        has_text = any(isinstance(value, str) for value in unique_values)

        if has_text or len(unique_values) <= threshold:
            attribute_types.append("categorical")
        else:
            attribute_types.append("continuous")

    return attribute_types

In [24]:
# One entry per feature column (the class column is skipped), in column order.
attribute_type(train_set)

['continuous',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'continuous',
 'continuous',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical']