# ID3 Classifier

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Import model made from scratch
from Models.tree import ID3Classifier

## Import Dataset
Using mushroom classification dataset for demonstration.  
Class: `p` for poisonous, `e` for edible.

In [2]:
mushroom = pd.read_csv("../data/mushrooms.csv")
mushroom.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
for col in mushroom.columns:
    print(col, mushroom[col].unique())

class ['p' 'e']
cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r' '?']
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-type ['p']
veil-color ['w' 'n' 'o' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'f' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'w' 'l']


# Split Training and Validation Set

In [5]:
from sklearn.model_selection import train_test_split

X = mushroom.drop('class', axis=1)
y = mushroom['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Encoding

In [6]:
# Use ordinal encoder for simplicity
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train = pd.DataFrame(encoder.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(encoder.transform(X_test), columns=X_test.columns)

In [7]:
X_train

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,2.0,3.0,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,...,1.0,6.0,0.0,0.0,2.0,1.0,2.0,1.0,5.0,1.0
1,5.0,2.0,5.0,1.0,5.0,1.0,0.0,0.0,1.0,0.0,...,2.0,2.0,7.0,0.0,2.0,2.0,0.0,7.0,1.0,6.0
2,0.0,2.0,3.0,0.0,5.0,1.0,1.0,0.0,10.0,0.0,...,2.0,7.0,7.0,0.0,2.0,2.0,4.0,7.0,3.0,1.0
3,2.0,2.0,4.0,0.0,7.0,1.0,0.0,1.0,0.0,1.0,...,1.0,6.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,0.0
4,3.0,3.0,4.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,...,1.0,6.0,6.0,0.0,2.0,1.0,0.0,7.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,5.0,2.0,4.0,0.0,7.0,1.0,0.0,1.0,0.0,1.0,...,2.0,6.0,6.0,0.0,2.0,1.0,0.0,7.0,4.0,0.0
6495,5.0,3.0,4.0,0.0,5.0,1.0,1.0,1.0,10.0,0.0,...,0.0,7.0,4.0,0.0,2.0,1.0,0.0,7.0,4.0,2.0
6496,2.0,0.0,8.0,0.0,5.0,1.0,1.0,0.0,7.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,2.0,0.0,1.0
6497,5.0,0.0,3.0,1.0,5.0,1.0,0.0,0.0,10.0,1.0,...,2.0,3.0,6.0,0.0,2.0,1.0,4.0,3.0,5.0,0.0


In [8]:
X_test

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5.0,3.0,3.0,0.0,2.0,1.0,0.0,0.0,7.0,0.0,...,1.0,6.0,4.0,0.0,2.0,1.0,2.0,1.0,5.0,0.0
1,5.0,2.0,8.0,0.0,1.0,1.0,0.0,1.0,9.0,0.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,3.0,0.0
2,5.0,0.0,8.0,0.0,5.0,1.0,1.0,0.0,7.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,3.0,3.0,1.0
3,2.0,2.0,2.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,4.0
4,5.0,0.0,9.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,...,1.0,6.0,4.0,0.0,2.0,1.0,2.0,1.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,3.0,2.0,4.0,0.0,7.0,1.0,0.0,1.0,0.0,1.0,...,2.0,6.0,7.0,0.0,2.0,1.0,0.0,7.0,4.0,2.0
1621,2.0,0.0,3.0,1.0,5.0,1.0,0.0,0.0,10.0,1.0,...,2.0,7.0,6.0,0.0,2.0,1.0,4.0,3.0,4.0,0.0
1622,5.0,0.0,2.0,1.0,5.0,1.0,0.0,0.0,5.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,5.0,0.0
1623,2.0,2.0,4.0,0.0,5.0,1.0,1.0,0.0,7.0,1.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,2.0,3.0,1.0


# Evaluation

In [9]:
id3 = ID3Classifier()
id3.fit(X_train, y_train, verbose=True)

Splitting on feature gill-color with threshold 0.5

Splitting on feature spore-print-color with threshold 1.5

Splitting on feature odor with threshold 3.5

Splitting on feature gill-size with threshold 0.5

Splitting on feature spore-print-color with threshold 4.5

Splitting on feature stalk-root with threshold 0.5

Splitting on feature habitat with threshold 3.5

Splitting on feature stalk-surface-above-ring with threshold 2.5

Splitting on feature odor with threshold 5.5

Splitting on feature odor with threshold 2.0

Splitting on feature bruises with threshold 0.5

Splitting on feature stalk-surface-below-ring with threshold 2.5

Splitting on feature population with threshold 2.5



In [10]:
id3.print_tree()

if feature gill-color <= 0.5:
  return p
else:
  if feature spore-print-color <= 1.5:
    if feature odor <= 3.5:
      return p
    else:
      return e
  else:
    if feature gill-size <= 0.5:
      if feature spore-print-color <= 4.5:
        return e
      else:
        if feature stalk-root <= 0.5:
          return e
        else:
          if feature habitat <= 3.5:
            if feature stalk-surface-above-ring <= 2.5:
              return p
            else:
              return e
          else:
            return e
    else:
      if feature odor <= 5.5:
        if feature odor <= 2.0:
          if feature bruises <= 0.5:
            return p
          else:
            return e
        else:
          if feature stalk-surface-below-ring <= 2.5:
            if feature population <= 2.5:
              return p
            else:
              return e
          else:
            return p
      else:
        return p


In [11]:
y_pred = id3.predict(X_test.values)

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

le = LabelEncoder()
y_test = le.fit_transform(y_test)
y_pred = le.transform(y_pred)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       842
           1       1.00      1.00      1.00       783

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [13]:
# Test with sklearn's DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
y_pred = le.transform(y_pred)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       842
           1       1.00      1.00      1.00       783

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



# External Dataset Evaluation
`UNCOMMENT TO TRY YOUR OWN DATA`

In [14]:
# DATA = "ENTER PATH TO DATA HERE"
# TARGET = "ENTER TARGET COLUMN NAME HERE"

# df = pd.read_csv(DATA)
# X = df.drop(TARGET, axis=1)
# y = df[TARGET]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# X_train = pd.DataFrame(encoder.fit_transform(X_train), columns=X_train.columns)
# X_test = pd.DataFrame(encoder.transform(X_test), columns=X_test.columns)

# id3 = ID3Classifier()
# id3.fit(X_train, y_train)

# y_pred = id3.predict(X_test.values)

# # If the target column is not numeric, use LabelEncoder to convert it to numeric
# # Otherwise, skip this step
# le = LabelEncoder()
# y_test = le.fit_transform(y_test)
# y_pred = le.transform(y_pred)

# print(classification_report(y_test, y_pred))