In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    RocCurveDisplay)

import matplotlib.pyplot as plt

#Load the compact dataset
df = pd.read_parquet("mushroom_compact.parquet", engine="fastparquet")

#Columns excluded from modelling; everything else is kept in df_model
cols_to_drop = [
    "cap-surface",
    "gill-attachment",
    "gill-spacing",
    "stem-root",
    "stem-surface",
    "veil-type",
    "veil-color",
    "spore-print-color",
]
df_model = df.drop(columns=cols_to_drop).copy()

#Feature lists
numeric_features = ["cap-diameter", "stem-height", "stem-width"]

categorical_features = [
    "stem-color",
    "cap-color",
    "ring-type",
    "gill-color",
    "habitat",
    "cap-shape",
    "season",
    "has-ring",
    "does-bruise-or-bleed",
]

#Target: "e" -> 1, "p" -> 0
#(presumably edible / poisonous -- confirm against the dataset documentation)
df_model["class_binary"] = df_model["class"].map({"e": 1, "p": 0})
target_col = "class_binary"

#Series.map yields NaN for any label that is not a key of the mapping, so
#check here instead of letting a missing target reach the split and the fit.
assert df_model[target_col].notna().all(), (
    "'class' contains missing values or labels other than 'e'/'p'; "
    "class_binary could not be derived for every row"
)

In [7]:
# Columns available for modelling, shown as a plain Python list
list(df_model.columns)

['class',
 'cap-diameter',
 'cap-shape',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-color',
 'stem-height',
 'stem-width',
 'stem-color',
 'has-ring',
 'ring-type',
 'habitat',
 'season',
 'class_binary']

In [8]:
#train test split
# Design matrix: numeric columns first, then the categorical ones.
X = df_model.loc[:, [*numeric_features, *categorical_features]]
y = df_model[target_col]

# 80/20 hold-out split. stratify=y keeps the class proportions the same in
# both parts; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shapes of the two feature matrices, shown as the cell output
X_train.shape, X_test.shape


((5378492, 12), (1344624, 12))

In [9]:
#preprocessing for decision tree

# Numeric columns are forwarded unchanged: no scaling is needed for a tree.
numeric_transformer = "passthrough"

# Each categorical column becomes integer codes; a category that was not
# present when the encoder was fitted is encoded as -1 instead of raising.
categorical_transformer = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Route each feature list to its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)




In [10]:
#pipeline (regularised tree)
# NOTE(review): `dt_pipeline` is assigned again in the next cell with
# max_depth=None, so on a top-to-bottom run this depth-capped configuration
# is never the one that gets fitted. Keep only the variant you intend to use,
# or give the two pipelines distinct names.
dt_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "tree",
            DecisionTreeClassifier(
                random_state=42,
                min_samples_leaf=50,  # light regularisation; also speeds up training
                max_depth=20,  # depth cap to limit overfitting and training time
            ),
        ),
    ]
)

In [11]:
#decision tree pipeline
# DecisionTreeClassifier is already imported in the imports cell at the top,
# so it is not re-imported here.
#
# This assignment replaces any earlier `dt_pipeline`; it is the configuration
# that the fit cell trains. With max_depth=None the tree is grown until the
# leaves are pure or hold fewer than min_samples_split samples.
dt_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("tree", DecisionTreeClassifier(max_depth=None, random_state=42)),
    ]
)


In [12]:
#fit the model
# Fits the preprocessing step and then the tree on the training split only;
# X_test / y_test are not touched here and are used by the prediction cell.
dt_pipeline.fit(X_train, y_train)


In [13]:
#predictions
# Score the held-out split with the fitted pipeline.
y_pred_dt = dt_pipeline.predict(X_test)

# Accuracy first, then the confusion matrix (rows = true class, columns =
# predicted class) and the per-class precision / recall / F1 report.
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(
    confusion_matrix(y_test, y_pred_dt),
    classification_report(y_test, y_pred_dt),
    sep="\n",
)


Decision Tree Accuracy: 0.9969731315222694
[[734004   2033]
 [  2037 606550]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    736037
           1       1.00      1.00      1.00    608587

    accuracy                           1.00   1344624
   macro avg       1.00      1.00      1.00   1344624
weighted avg       1.00      1.00      1.00   1344624




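The classification report confirms near-perfect precision, recall, and F1-scores for both classes. The 1.00 values are rounded to two decimals: the confusion matrix above still shows 2,033 misclassified rows for class 0 and 2,037 for class 1 in the test set.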

          precision    recall  f1-score   support

       0       1.00      1.00      1.00    736037
       1       1.00      1.00      1.00    608587

accuracy                           1.00   1344624


