In [8]:
import os, importlib.util
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lime.lime_tabular import LimeTabularExplainer

# 1) Import nbc.py from the current working directory by its file path
nbc_path = os.path.join(os.getcwd(), "nbc.py")
spec = importlib.util.spec_from_file_location("nbc_mod", nbc_path)
nbc_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(nbc_mod)  # executes nbc.py's top-level code into nbc_mod

# 2) Read the raw CSV and run it through the module's own preprocessing,
#    which hands back the features, the labels and the per-column encoders
df = pd.read_csv("mushroom_dataset.csv")
X, y, encoders = nbc_mod.preprocess_data(df)

# 3) Hold out 30% of the rows for testing (fixed seed => repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 4) Fit the custom NBC on the training portion
nbc = nbc_mod.NBC()
nbc.train(X_train, y_train)

# 5) Build a predict_proba wrapper
def predict_proba_custom(arr, model=None, columns=None):
    """
    Class-probability function for LIME, backed by the custom NBC.

    For every row, the score of class c is
        P(c) * prod_f P(feature_f = value_f | c)
    read from the trained model's lookup tables and then normalised across
    classes. The product is accumulated as a sum of logarithms so that many
    small factors cannot underflow to 0 and turn the normalisation into 0/0.

    arr:     array-like of shape (n_samples, n_features); a single 1-D row
             is accepted as well. Values must be usable as keys of
             model.per_class_feature_probs[c][feature].
    model:   object exposing output_classes, output_class_probs and
             per_class_feature_probs; defaults to the notebook-level `nbc`.
    columns: feature names in the column order of `arr`; defaults to
             X_train.columns.
    returns: float array of shape (n_samples, n_classes) whose columns follow
             model.output_classes and whose rows each sum to 1. A row for
             which every class has zero likelihood gets a uniform
             distribution instead of NaN.
    raises:  ValueError if the column count of `arr` does not match
             `columns`; a lookup error (KeyError for dict tables) if a value
             is absent from the model's probability tables.
    """
    if model is None:
        model = nbc
    if columns is None:
        columns = X_train.columns

    col_names = list(columns)
    classes = list(model.output_classes)

    rows = np.atleast_2d(np.asarray(arr))
    if rows.shape[1] != len(col_names):
        raise ValueError(
            f"expected {len(col_names)} features per row, got {rows.shape[1]}"
        )

    # log_scores[i, j] = log P(class_j) + sum_f log P(feature_f = value | class_j)
    log_scores = np.empty((rows.shape[0], len(classes)), dtype=float)
    with np.errstate(divide="ignore"):  # log(0) -> -inf is the intended result
        for j, c in enumerate(classes):
            log_prior = np.log(model.output_class_probs[c])
            tables = model.per_class_feature_probs[c]
            for i, row in enumerate(rows):
                log_scores[i, j] = log_prior + sum(
                    np.log(tables[ft][val]) for ft, val in zip(col_names, row)
                )

    # Normalise: shift each row by its maximum before exponentiating so the
    # largest term becomes exp(0) = 1 and the row total is never zero.
    row_max = log_scores.max(axis=1, keepdims=True)
    all_zero = np.isneginf(row_max)  # every class had zero likelihood
    weights = np.exp(log_scores - np.where(all_zero, 0.0, row_max))
    totals = weights.sum(axis=1, keepdims=True)
    return np.where(
        all_zero,
        1.0 / len(classes),
        weights / np.where(all_zero, 1.0, totals),
    )

# Quick sanity check: fraction of held-out rows the custom NBC labels correctly
print("Custom NBC test accuracy:",
      np.mean(nbc.predict(X_test) == y_test))

# 6) Set up LIME with categorical metadata
# Every column is declared categorical. categorical_names maps a column index
# to that column's encoder classes, so explanations are printed with the
# original category labels (e.g. "odor=n").
feature_names        = X_train.columns.tolist()
categorical_features = list(range(len(feature_names)))
categorical_names    = {
    i: encoders[col].classes_.tolist()
    for i, col in enumerate(feature_names)
}

explainer = LimeTabularExplainer(
    training_data         = X_train.values,
    feature_names         = feature_names,
    class_names           = [str(c) for c in nbc.output_classes],
    mode                  = "classification",
    categorical_features  = categorical_features,
    categorical_names     = categorical_names,
    discretize_continuous = False,
    # LIME explains by sampling random perturbations around the instance;
    # without a fixed seed the weights printed below change on every run.
    random_state          = 42,
)

# 7) Explain a test instance (e.g. index 0)
# explain_instance is left at its default labels=(1,) and as_list at its
# default label=1, so the weights are contributions towards class_names[1].
idx = 0
exp = explainer.explain_instance(
    data_row     = X_test.iloc[idx].values,
    predict_fn   = predict_proba_custom,
    num_features = 5
)

print(f"\nLIME explanation for custom NBC instance #{idx}:")
for feat, w in exp.as_list():
    print(f"  {feat}: {w:.4f}")

# Also write the interactive HTML version of the explanation to disk
exp.save_to_file(f"lime_custom_nbc_{idx}.html")
print(f"Saved HTML → lime_custom_nbc_{idx}.html")


Custom NBC test accuracy: 0.9458572600492207

LIME explanation for custom NBC instance #0:
  odor=n: -0.3845
  spore-print-color=n: -0.1185
  stalk-surface-above-ring=s: -0.1183
  gill-size=b: -0.1181
  bruises=f: 0.0808
Saved HTML → lime_custom_nbc_0.html


In [12]:
# Execute nbc.py as a standalone script so its own output (dataset loading
# message, shape and preview) is shown in this notebook.
%run nbc.py

Loading mushroom dataset...
Dataset shape: (8124, 23)
First 5 rows:
  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0           