In [1]:
%run tree_impl.py



Original Label Encoded Data Sample:
   class  cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0      1          5            2          4        1     6                1   
1      0          5            2          9        1     0                1   
2      0          0            2          8        1     3                1   
3      1          5            3          8        1     6                1   
4      0          5            2          3        0     5                1   

   gill-spacing  gill-size  gill-color  ...  stalk-surface-below-ring  \
0             0          1           4  ...                         2   
1             0          0           4  ...                         2   
2             0          0           5  ...                         2   
3             0          1           5  ...                         2   
4             1          0           4  ...                         2   

   stalk-color-above-ring  stalk-color-below-ring 

In [2]:
from sklearn.model_selection import train_test_split

# 1) Do your reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42,     stratify=y

)

# 2) Instantiate your custom tree (no args needed)
from tree_impl import DecisionTreeClassifierCustom, print_tree
dt = DecisionTreeClassifierCustom()

# 3) Fit on the training slice
dt.fit(X_train, y_train, feature_names)
print("Decision Tree Structure:")
print_tree(dt.tree, dt.original_feature_names)

# 4) Evaluate on test
print("Test accuracy:", (dt.predict(X_test) == y_test).mean())


Original Label Encoded Data Sample:
   class  cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0      1          5            2          4        1     6                1   
1      0          5            2          9        1     0                1   
2      0          0            2          8        1     3                1   
3      1          5            3          8        1     6                1   
4      0          5            2          3        0     5                1   

   gill-spacing  gill-size  gill-color  ...  stalk-surface-below-ring  \
0             0          1           4  ...                         2   
1             0          0           4  ...                         2   
2             0          0           5  ...                         2   
3             0          1           5  ...                         2   
4             1          0           4  ...                         2   

   stalk-color-above-ring  stalk-color-below-ring 

In [3]:
# ───────────────────────────────────────────────────────────────────────────────
# Cell 1: One‑Hot Encode Train/Test Splits for Manhattan distance
# ───────────────────────────────────────────────────────────────────────────────
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1) Reload raw data (strings) so we can one‑hot encode
df_raw = pd.read_csv("mushroom_dataset.csv")

# 2) Create the same train/test split (stratified) on labels
#    Using the y you already have from tree_impl, ensure it's the same order:
X_all = df_raw.drop(columns="class")
y_all = df_raw["class"]
le = LabelEncoder().fit(df_raw["class"])  # fits on all classes once
y_all = le.transform(y_all)
X_tr_raw, X_te_raw, y_train, y_test = train_test_split(
    X_all, y_all,
    test_size=0.3,
    random_state=42,
    stratify=y_all
)

# 3) One‑hot encode each split so distances are 0/1
X_tr_oh = pd.get_dummies(X_tr_raw, columns=X_tr_raw.columns, drop_first=False)
X_te_oh = pd.get_dummies(X_te_raw, columns=X_te_raw.columns, drop_first=False)

# 4) Align columns (in case some rare category only appears in test)
all_cols = sorted(set(X_tr_oh.columns).union(X_te_oh.columns))
X_tr_oh = X_tr_oh.reindex(columns=all_cols, fill_value=0)
X_te_oh = X_te_oh.reindex(columns=all_cols, fill_value=0)

# 5) Grab numpy arrays & feature names
X_train_oh = X_tr_oh.values
X_test_oh  = X_te_oh.values
feature_names_oh = all_cols


In [4]:
from tree_impl import label_encoders, feature_names
import numpy as np
# Build a map from each original feature to its one‑hot column indices and category names:
oh_indices = {}
for col in feature_names:
    # one‑hot columns are named "col_category"
    matches = [i for i, nm in enumerate(feature_names_oh) if nm.startswith(f"{col}_")]
    # also grab the category string for each column:
    cats = [nm.split("_", 1)[1] for i, nm in enumerate(feature_names_oh) if nm.startswith(f"{col}_")]
    oh_indices[col] = list(zip(matches, cats))

def decode_one_hot_row(row_oh: np.ndarray) -> np.ndarray:
    """Turn a single one‑hot row into the integer‑encoded vector your tree expects."""
    decoded = []
    for col in feature_names:
        idxs_cats = oh_indices[col]
        # find which dummy is '1'
        for idx, cat in idxs_cats:
            if row_oh[idx] == 1:
                # encode that category back into the integer
                decoded.append(label_encoders[col].transform([cat])[0])
                break
        else:
            # if none ==1, maybe all zeros? default to most common
            decoded.append(label_encoders[col].transform([idxs_cats[0][1]])[0])
    return np.array(decoded)

In [6]:
# ───────────────────────────────────────────────────────────────────────────────
# Cell 3 (revised): DT + k‑medoids + LIME SUMMARY exactly like NBC
# ───────────────────────────────────────────────────────────────────────────────
from sklearn_extra.cluster import KMedoids

# 1) cluster on the **test** split, not train
kmed    = KMedoids(n_clusters=40, metric="manhattan", random_state=0)
kmed.fit(X_test_oh)                             # <— test here
rep_idx = kmed.medoid_indices_

# 2) background stays the **training** one‑hot
bg_data = X_train_oh

# 3) same wrapper
def dt_predict_proba(arr_oh):
    arr_oh = np.atleast_2d(arr_oh)
    arr_le = np.vstack([decode_one_hot_row(r) for r in arr_oh])
    preds  = dt.predict(arr_le)
    classes= np.unique(y_train)
    proba  = np.zeros((arr_oh.shape[0], classes.size))
    for i,p in enumerate(preds):
        proba[i, np.where(classes==p)[0][0]] = 1
    return proba



In [11]:
from collections import Counter, defaultdict

from lime.lime_tabular        import LimeTabularExplainer

# 4) aggregate exactly as before
agg_freq, agg_wt = Counter(), defaultdict(float)
for seed in (0,1,2):
    explainer = LimeTabularExplainer(
        training_data        = bg_data,
        feature_names        = feature_names_oh,
        class_names          = ["edible","poisonous"],
        mode                 = "classification",
        categorical_features=list(range(bg_data.shape[1])),
        discretize_continuous= False,
        random_state         = seed
    )
    for idx in rep_idx:
        exp = explainer.explain_instance(
            X_test_oh[idx],    # explain each **test** medoid
            dt_predict_proba,
            num_features=6
        )
        for feat, wt in exp.as_list(label=1):
            agg_freq[feat]   += 1
            agg_wt[feat]     += wt


In [12]:
import text_utils

# 5) build and save summary exactly as before
k_total   = len(rep_idx)*3
rows = [{"feature":f,
         "pct_in_top5":100*agg_freq[f]/k_total,
         "mean_signed_weight":agg_wt[f]/agg_freq[f]}
        for f in agg_freq]
summary_df = pd.DataFrame(rows).sort_values(
    ["pct_in_top5","mean_signed_weight"], ascending=False
)
text_utils.ensure_directory_exists("eval/lime_results")
summary_df.to_csv("eval/lime_results/lime_dt_summary1.csv",index=False)



print(summary_df.tail(10))

     feature  pct_in_top5  mean_signed_weight
5   odor_p=0    85.000000           -0.323165
0   odor_f=0    80.000000           -0.843280
6   odor_f=1    20.000000            0.843044
9   odor_n=0     9.166667            0.096269
11  odor_s=1     7.500000            0.326841
8   odor_y=1     2.500000            0.325831
12  odor_n=1     2.500000           -0.093546
10  odor_a=1     2.500000           -0.349308
7   odor_p=1     1.666667            0.324295
13  odor_m=0     1.666667           -0.717493


In [1]:
print(summary_df)


NameError: name 'summary_df' is not defined