In [1]:
import classes as cl
import pandas as pd
import numpy as np

### Data Loading

In [2]:
# Read the semicolon-delimited mushroom data file into a DataFrame.
df = pd.read_csv("secondary_data.csv", delimiter=';')

### Data Exploration and Preprocessing

In [3]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w


In [4]:
# Count the missing entries in every column.
df.isna().sum()

class                       0
cap-diameter                0
cap-shape                   0
cap-surface             14120
cap-color                   0
does-bruise-or-bleed        0
gill-attachment          9884
gill-spacing            25063
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               51538
stem-surface            38124
stem-color                  0
veil-type               57892
veil-color              53656
has-ring                    0
ring-type                2471
spore-print-color       54715
habitat                     0
season                      0
dtype: int64

In [5]:
# Keep only the columns that are at least 80% populated, i.e. drop every
# column in which more than 20% of the values are missing.
# `thresh` is the minimum number of non-NA entries a column needs to be kept.
MIN_NON_NULL_FRACTION = 0.8
# dropna documents `thresh` as an int; rounding the fractional count up gives
# exactly the same cut-off as comparing integer counts against the float.
min_non_null = int(np.ceil(len(df) * MIN_NON_NULL_FRACTION))
df = df.dropna(thresh=min_non_null, axis=1)

In [6]:
# Re-count the missing entries per column now that sparse columns are gone.
df.isna().sum()

class                      0
cap-diameter               0
cap-shape                  0
cap-color                  0
does-bruise-or-bleed       0
gill-attachment         9884
gill-color                 0
stem-height                0
stem-width                 0
stem-color                 0
has-ring                   0
ring-type               2471
habitat                    0
season                     0
dtype: int64

In [7]:
# Discard every row that still contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [8]:
# Number of distinct (non-missing) values in each column.
df.nunique(axis=0, dropna=True)

class                      2
cap-diameter            2541
cap-shape                  7
cap-color                 12
does-bruise-or-bleed       2
gill-attachment            7
gill-color                12
stem-height             1928
stem-width              4605
stem-color                13
has-ring                   2
ring-type                  7
habitat                    8
season                     4
dtype: int64

In [9]:
# Show each column's dtype to tell the numeric (float64) features apart from
# the categorical (object) ones.
df.dtypes

class                    object
cap-diameter            float64
cap-shape                object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-color               object
stem-height             float64
stem-width              float64
stem-color               object
has-ring                 object
ring-type                object
habitat                  object
season                   object
dtype: object

In [10]:
# Separate the target column from the feature columns.
y = df['class']
X = df.drop(columns='class')

In [11]:
# Preprocess the data (custom encoding for features and target).
# NOTE(review): both helpers come from the project-local `classes` module; their
# exact return types are not visible in this notebook — confirm against that module.
X_encoded = cl.one_hot_encoding(X)  # One-hot encoding for features
y_encoded, label_map = cl.custom_label_encoding(y)  # Label encoding for target; label_map presumably records the label-to-code mapping — verify

### Implementing the Binary Tree Classifier

In [12]:
# Split the encoded data into training and test sets (test_size=0.2, seed 42).
X_train, X_test, y_train, y_test = cl.custom_train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

### Hyperparameter Tuning

In [13]:
# Decision tree with default settings; it is passed to the hyperparameter search below.
tree = cl.DecisionTree()

In [14]:
# Hyperparameter search space; each key has to match a parameter name the
# tree accepts.
param_grid = dict(
    max_depth=[10, 20, 30, 40, 50],
    min_samples_split=[5, 10, 15, 20],
    criterion=['gini', 'entropy', 'misclassification_error'],
)

In [15]:
# Keep a 20% subsample of the training data for tuning; the other 80%
# returned by the split is discarded.
X_train_sample, _, y_train_sample, _ = cl.custom_train_test_split(
    X_train,
    y_train,
    test_size=0.8,
    random_state=42,
)

In [16]:
# Convert the sampled feature frame to a NumPy array for compatibility with
# the search; the sampled labels are passed through unchanged.
X_train_np = X_train_sample.to_numpy()
y_train_np = y_train_sample

# Randomized search over param_grid: n_iter=10, cv=5, seed 42, n_jobs=-1.
best_params, best_score = cl.randomized_search_cv(
    tree,
    param_grid,
    X_train_np,
    y_train_np,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Output the best parameters and score
print("Best parameters found: ", best_params)
print("Best cross-validation score: ", best_score)

Best parameters found:  {'max_depth': 40, 'min_samples_split': 10, 'criterion': 'entropy'}
Best cross-validation score:  0.9554140127388535


In [17]:
# Set the best hyperparameters to the tree.
# NOTE(review): this relies on set_params returning the estimator (fit is called
# on its return value); if it returns the same instance, best_tree and tree are
# one object — confirm in classes.DecisionTree.
best_tree = tree.set_params(**best_params)
# Convert to numpy arrays. This rebinds X_train_np / y_train_np from the tuning
# subsample to the full training set.
X_train_np = X_train.to_numpy()
y_train_np = y_train
# Train the tree with the best parameters on the full training data
best_tree.fit(X_train_np, y_train_np)

In [18]:
# Predict labels for the held-out test features.
X_test_np = X_test.to_numpy()
y_pred = best_tree.predict(X_test_np)

In [19]:
# Compute evaluation metrics for the test-set predictions and print them.
# The printed result is a dict with accuracy, precision, recall, f1_score and
# a confusion_matrix of tp/tn/fp/fn counts.
test_metrics = cl.evaluate_model(y_test, y_pred)
print(f"Test Metrics: {test_metrics}")

Test Metrics: {'accuracy': np.float64(0.9873649887915223), 'precision': np.float64(0.9897674418604652), 'recall': np.float64(0.9871961402857673), 'f1_score': np.float64(0.9884801189149015), 'confusion_matrix': {'tp': np.int64(5320), 'tn': np.int64(4370), 'fp': np.int64(55), 'fn': np.int64(69)}}


In [20]:
# Training-set performance (train_predictions is not used further in this cell).
X_train_arr = X_train.to_numpy()
train_predictions = best_tree.predict(X_train_arr)
train_loss = best_tree.evaluate(X_train_arr, y_train)

# Test-set performance
X_test_arr = X_test.to_numpy()
test_loss = best_tree.evaluate(X_test_arr, y_test)

# Report both losses as percentages.
print(f"Training Zero-One Loss: {train_loss * 100:.2f}%")
print(f"Test Zero-One Loss: {test_loss * 100:.2f}%")

Training Zero-One Loss: 0.75%
Test Zero-One Loss: 1.26%
