In [2]:
cd ../..

c:\Users\derri\VSCode\CS573MushroomProject


In [3]:
%run models/tree_impl.py

Training Accuracy: 100.00%
Test Accuracy: 100.00%


In [4]:
from utils import text_utils

In [5]:
from sklearn.model_selection import train_test_split

# 1) Do your reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42,     stratify=y

)

# 2) Instantiate your custom tree (no args needed)
from models.tree_impl import DecisionTreeClassifierCustom, print_tree
dt = DecisionTreeClassifierCustom()

# 3) Fit on the training slice
dt.fit(X_train, y_train, feature_names)
print("Decision Tree Structure:")
# print_tree(dt.tree, dt.original_feature_names)

# 4) Evaluate on test
print("Test accuracy:", (dt.predict(X_test) == y_test).mean())


Training Accuracy: 100.00%
Test Accuracy: 100.00%
Decision Tree Structure:
Test accuracy: 1.0


In [6]:
# ───────────────────────────────────────────────────────────────────────────────
# Cell 1: One‑Hot Encode Train/Test Splits for Manhattan distance
# ───────────────────────────────────────────────────────────────────────────────
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1) Reload raw data (strings) so we can one‑hot encode
df_raw = pd.read_csv("data/mushroom_dataset.csv")

# 2) Create the same train/test split (stratified) on labels
#    Using the y you already have from tree_impl, ensure it's the same order:
X_all = df_raw.drop(columns="class")
y_all = df_raw["class"]
le = LabelEncoder().fit(df_raw["class"])  # fits on all classes once
y_all = le.transform(y_all)
X_tr_raw, X_te_raw, y_train, y_test = train_test_split(
    X_all, y_all,
    test_size=0.3,
    random_state=42,
    stratify=y_all
)

# 3) One‑hot encode each split so distances are 0/1
X_tr_oh = pd.get_dummies(X_tr_raw, columns=X_tr_raw.columns, drop_first=False)
X_te_oh = pd.get_dummies(X_te_raw, columns=X_te_raw.columns, drop_first=False)

# 4) Align columns (in case some rare category only appears in test)
all_cols = sorted(set(X_tr_oh.columns).union(X_te_oh.columns))
X_tr_oh = X_tr_oh.reindex(columns=all_cols, fill_value=0)
X_te_oh = X_te_oh.reindex(columns=all_cols, fill_value=0)

# 5) Grab numpy arrays & feature names
X_train_oh = X_tr_oh.values
X_test_oh  = X_te_oh.values
feature_names_oh = all_cols


In [7]:
# ───────────────────────────────────────────────────────────────────────────────
# Cell X: One‐hot + filtered LIME summary for Decision Tree
# ───────────────────────────────────────────────────────────────────────────────

import numpy as np
import pandas as pd

from sklearn_extra.cluster import KMedoids
from lime.lime_tabular      import LimeTabularExplainer
from collections            import Counter, defaultdict

# — your pre‑existing variables —
# X_train_oh      : np.ndarray, one‐hot encoded training data (n_train × d)
# X_test_oh       : np.ndarray, one‐hot encoded test data  (n_test  × d)
# feature_names_oh: list of length d, the one‐hot column names
# tree_classifier : your fitted DecisionTreeClassifierCustom
# y_train         : train labels (0/1)
# (make sure you’ve run the cells that create all of the above)

# 1) Cluster medoids on the one‐hot test set
kmed   = KMedoids(n_clusters=40, metric="manhattan", random_state=0)
kmed.fit(X_test_oh)
rep_idx = kmed.medoid_indices_

# 2) Define background & medoids
bg = X_train_oh            # LIME’s background
md = X_test_oh[rep_idx]    # the 40 held‐out medoids

# 3) Mark every one‐hot column as categorical
cat_feats = list(range(bg.shape[1]))

# 4) A wrapper that maps one‐hot → label vector → tree predict_proba
def dt_proba_oh(arr_oh):
    arr_oh = np.atleast_2d(arr_oh)
    # decode one‐hot to label‐encoded integers
    from models.tree_impl import label_encoders, feature_names
    X_le = []
    for row in arr_oh:
        decoded = []
        idx = 0
        for feat in feature_names:
            le = label_encoders[feat]
            n_cats = len(le.classes_)
            block = row[idx:idx+n_cats]
            decoded.append(int(np.argmax(block)))
            idx += n_cats
        X_le.append(decoded)
    X_le = np.array(X_le)
    # predict and build a 2‐col proba array
    preds = tree_classifier.predict(X_le)
    proba = np.zeros((len(preds), 2))
    for i,p in enumerate(preds):
        proba[i, int(p)] = 1
    return proba

# 5) Run LIME on each medoid × 3 seeds, filter out dummy=0  
agg_freq, agg_wt = Counter(), defaultdict(float)
seeds = [0,1,2]
for seed in seeds:
    explainer = LimeTabularExplainer(
        training_data         = bg,
        feature_names         = feature_names_oh,
        class_names           = ["edible","poisonous"],
        categorical_features  = cat_feats,
        discretize_continuous = False,
        random_state          = seed
    )
    for row in md:
        exp = explainer.explain_instance(
            row,
            dt_proba_oh,
            num_features=bg.shape[1]   # ask for all, so we can filter safely
        )
        raw = exp.as_list(label=np.argmax(dt_proba_oh(row.reshape(1,-1))[0]))
        count = 0
        for feat_val, wt in raw:
            feat = feat_val.split("=",1)[0]        # strip “=0” or “=1”
            idx  = feature_names_oh.index(feat)
            if row[idx] == 1:                      # only keep actual 1’s
                agg_freq[feat] += 1
                agg_wt[feat]   += wt
                count += 1
                if count == 5:                     # top 5 only
                    break

# 6) Build and print the summary DataFrame
k_total = len(md) * len(seeds)  # 40 × 3 = 120 runs
rows = []
for feat, freq in agg_freq.items():
    rows.append({
        "feature":            feat,
        "pct_in_top5":       100 * freq      / k_total,
        "mean_signed_weight": agg_wt[feat]   / freq
    })
summary_df = pd.DataFrame(rows).sort_values(
    ["pct_in_top5","mean_signed_weight"],
    ascending=False
).reset_index(drop=True)

print(summary_df)
text_utils.ensure_directory_exists("eval/lime_results")
summary_df.to_csv("eval/lime_results/lime_dt_summary_filtered.csv", index=False)


MemoryError: Unable to allocate 4.46 MiB for an array with shape (5000, 117) and data type float64

In [6]:
print(summary_df)


                       feature  pct_in_top5  mean_signed_weight
0            gill-attachment_f   100.000000            0.583016
1                 veil-color_w    59.166667           -0.000653
2                ring-number_o    36.666667            0.000257
3                 gill-color_n    24.166667            0.015249
4                 gill-color_b    15.000000            0.018897
..                         ...          ...                 ...
61                stalk-root_e     0.833333            0.004677
62  stalk-surface-above-ring_f     0.833333           -0.004816
63                population_s     0.833333           -0.005291
64                 gill-size_n     0.833333           -0.005469
65                   habitat_m     0.833333           -0.005721

[66 rows x 3 columns]
