In [1]:
import os, importlib.util
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lime.lime_tabular import LimeTabularExplainer

# 1) Locate the teammate's module (nbc.py in the current working directory)
#    and build an import spec for it under the name "nbc_mod".
nbc_source_path = os.path.join(os.getcwd(), "nbc.py")
spec = importlib.util.spec_from_file_location("nbc_mod", nbc_source_path)

In [2]:

# Create the module object described by `spec` and execute nbc.py's
# top-level code inside it, so its functions/classes hang off `nbc_mod`.
nbc_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(nbc_mod)  

# 2) Load & preprocess
# `preprocess_data` is defined in nbc.py (not shown here). Later cells index
# `encoders` by column name and read `.classes_` from each entry.
df = pd.read_csv("mushroom_dataset.csv")
X, y, encoders = nbc_mod.preprocess_data(df)


In [3]:

# 3) Train/test split
# 70/30 split with a fixed seed for reproducibility. No `stratify` is
# passed, so class balance in the two parts is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [4]:

# 4) Train the custom NBC
# `NBC` comes from the dynamically loaded nbc.py. After training, later
# cells read `output_classes`, `output_class_probs` and
# `per_class_feature_probs` from this object.
nbc = nbc_mod.NBC()
nbc.train(X_train, y_train)


In [5]:

# 5) Build a predict_proba wrapper
def predict_proba_custom(arr):
    """
    Class-probability wrapper around the trained custom NBC, shaped for LIME.

    For every row the class prior is multiplied by the per-feature
    conditional probabilities stored on ``nbc``; the resulting scores are
    normalised so each output row sums to 1.

    arr: array-like of shape (n_samples, n_features), or a single 1-D row.
         Columns must follow the order of ``X_train.columns``.
    returns: float array of shape (n_samples, n_classes), columns ordered
             like ``nbc.output_classes``
    raises: ValueError if the number of columns differs from ``X_train``
    """
    feature_names = list(X_train.columns)
    classes = list(nbc.output_classes)
    priors = [nbc.output_class_probs[c] for c in classes]

    # Used when every class score is zero (each class "ruled out" by some
    # feature value): fall back to the normalised priors instead of dividing
    # by zero; if the priors are degenerate too, use a uniform row.
    prior_total = sum(priors)
    if prior_total > 0:
        fallback = [p / prior_total for p in priors]
    else:
        fallback = [1.0 / max(len(classes), 1)] * len(classes)

    rows = np.asarray(arr)
    if rows.size == 0:
        return np.empty((0, len(classes)), dtype=float)
    rows = np.atleast_2d(rows)  # accept a single 1-D sample as one row
    if rows.ndim != 2 or rows.shape[1] != len(feature_names):
        raise ValueError(
            f"expected {len(feature_names)} feature columns, got array of shape {rows.shape}"
        )

    probs = []
    for row in rows:
        scores = []
        for c, prior in zip(classes, priors):
            likelihoods = nbc.per_class_feature_probs[c]
            p = prior
            for ft, val in zip(feature_names, row):
                p *= likelihoods[ft][val]
            scores.append(p)
        # normalize
        total = sum(scores)
        if total > 0:
            probs.append([s / total for s in scores])
        else:
            probs.append(list(fallback))
    return np.array(probs, dtype=float)


In [6]:

# Quick sanity check: fraction of test rows whose predicted label equals
# the true label.
label_matches = nbc.predict(X_test) == y_test
print("Custom NBC test accuracy:", np.mean(label_matches))


Custom NBC test accuracy: 0.9458572600492207


In [7]:

# 6) Describe the columns for LIME: every feature is treated as categorical
feature_names = X_train.columns.tolist()
categorical_features = list(range(len(feature_names)))

# column position -> list of category labels, taken from the fitted encoders
categorical_names = dict(
    enumerate(encoders[col].classes_.tolist() for col in feature_names)
)


In [8]:

# Class labels are shown as strings, in the NBC's own class order
class_labels = list(map(str, nbc.output_classes))

explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=feature_names,
    class_names=class_labels,
    categorical_features=categorical_features,
    categorical_names=categorical_names,
    discretize_continuous=False,
)


In [9]:

# 7) Explain a single test instance (positional row `idx` of X_test; 1 here)
# `num_features=5` limits the explanation to the five highest-ranked features.
idx = 1
exp = explainer.explain_instance(
    data_row     = X_test.iloc[idx].values,
    predict_fn   = predict_proba_custom,
    num_features = 5
)

In [10]:


# Output file name is derived once so the save call and the log line agree
html_path = f"lime_custom_nbc_{idx}.html"

print(f"\nLIME explanation for custom NBC instance #{idx}:")
for feat, w in exp.as_list():
    print(f"  {feat}: {w:.4f}")

exp.save_to_file(html_path)
print(f"Saved HTML → {html_path}")



LIME explanation for custom NBC instance #1:
  gill-color=b: 0.3109
  odor=y: 0.2485
  gill-size=n: 0.1262
  stalk-surface-above-ring=s: -0.1078
  population=v: 0.1017
Saved HTML → lime_custom_nbc_1.html


In [11]:
# ------------------------------------------------------------------
#  LIME-on-k–medoids  pipeline  (one‑hot version)
# ------------------------------------------------------------------
import pandas as pd, numpy as np, joblib, json
from sklearn.preprocessing import OneHotEncoder
from sklearn_extra.cluster import KMedoids          # pip install scikit-learn-extra
from lime.lime_tabular import LimeTabularExplainer
from collections import defaultdict, Counter
import sklearn
from sklearn.preprocessing import OneHotEncoder
from packaging import version


# ---------- 1. one-hot encode the original df ---------------------
df = pd.read_csv("mushroom_dataset.csv")
X_raw = df.drop(columns="class")
y_full = df["class"].map({"e": 0, "p": 1}).to_numpy()

# Pick the keyword that requests dense output for the installed scikit-learn:
# `sparse_output` from 1.2 onwards, `sparse` before that.
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    dense_keyword = "sparse_output"
else:
    dense_keyword = "sparse"
ohe = OneHotEncoder(handle_unknown="ignore", **{dense_keyword: False})

X_onehot = ohe.fit_transform(X_raw)
onehot_feature_names = ohe.get_feature_names_out(X_raw.columns)


In [12]:

# ---------- 2. train/test split of the one-hot matrix --------------
# Uses the same test_size and random_state as the earlier NBC split, but
# this one is stratified on the label while the earlier one is not, so the
# two splits are NOT guaranteed to contain the same rows.
# The row positions (idx_tr / idx_te) are split alongside the data so each
# part can be traced back to the original frame.
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
    X_onehot, y_full, np.arange(len(y_full)),
    test_size=0.3, random_state=42, stratify=y_full
)


In [13]:

# ---------------------------------------------------------------
# 1.  LOAD & SPLIT DATA  (label‑encoded + one‑hot)
# ---------------------------------------------------------------
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import Bunch
from nbc import NBC           # your custom Naive Bayes class

# Re-read the raw CSV and separate the feature columns from the label column
df = pd.read_csv("mushroom_dataset.csv")
X_raw = df.drop(columns="class")
y_raw = df["class"]


In [14]:

# --- label-encode every column (NBC needs this) ---
# One fitted LabelEncoder per feature column, keyed by column name
encoders = {col: LabelEncoder().fit(X_raw[col]) for col in X_raw.columns}

# Integer-coded copy of the features; X_raw itself is left untouched
X_le = X_raw.copy()
for col, le in encoders.items():
    X_le[col] = le.transform(X_raw[col])

# Labels stay in their original form
y = y_raw.values



In [15]:
# --- one-hot encode for distance / LIME ---
# Request dense output with whichever keyword the installed scikit-learn
# accepts (`sparse_output` from 1.2, `sparse` before), mirroring the guard
# used for the first encoder earlier in this notebook. `sklearn` and
# `version` are imported in that earlier cell.
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
else:
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
X_oh = ohe.fit_transform(X_raw)
onehot_names = ohe.get_feature_names_out(X_raw.columns)


In [16]:

# Split both feature representations and the labels in ONE call so that row i
# of the label-encoded part and row i of the one-hot part are the same sample.
(
    X_le_tr, X_le_te,
    X_oh_tr, X_oh_te,
    y_tr, y_te,
) = train_test_split(
    X_le, X_oh, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


In [17]:

# ---------------------------------------------------------------
# 2.  TRAIN NBC ON LABEL‑ENCODED TRAIN DATA
# ---------------------------------------------------------------
# Rebinds `nbc` (an earlier cell already trained one on a different split).
# Labels are passed as stored in the CSV; a later cell looks up classes
# 'e' and 'p' on this model.
nbc = NBC()
nbc.train(X_le_tr, y_tr)          # custom implementation



In [18]:
# ---------------------------------------------------------------
# 3.  PICK k MEDOIDS (or K‑means) ON ONE‑HOT TEST MATRIX
# ---------------------------------------------------------------
# k representative test rows are explained instead of the whole test set.
k = 40
# The try block covers both the import and the fit, so an ImportError raised
# at either point switches to the K-means fallback.
try:
    from sklearn_extra.cluster import KMedoids
    med = KMedoids(n_clusters=k, metric="manhattan", random_state=0).fit(X_oh_te)
    rep_idx_te = med.medoid_indices_            # indices within X_oh_te
except ImportError:
    # fallback: K‑means + closest point to each centroid
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=k, random_state=0, n_init='auto').fit(X_oh_te)
    from sklearn.metrics import pairwise_distances_argmin
    # one row index of X_oh_te per centroid
    # NOTE(review): two centroids could share the same nearest row, giving
    # duplicate representatives — confirm whether that matters downstream.
    rep_idx_te = pairwise_distances_argmin(km.cluster_centers_, X_oh_te)



In [19]:
# get corresponding rows in each representation
# `rep_idx_te` holds positions within the test split, so the one-hot array is
# indexed directly and the label-encoded frame is indexed with .iloc.
X_oh_reps = X_oh_te[rep_idx_te]
X_le_reps = X_le_te.iloc[rep_idx_te]



In [20]:
# ---------------------------------------------------------------
# 4.  LIME ON EACH REPRESENTATIVE POINT (3 seeds each)
# ---------------------------------------------------------------
from lime.lime_tabular import LimeTabularExplainer
from collections import Counter, defaultdict

# Unseeded explainer over the one-hot training matrix. This rebinds the
# `explainer` name created earlier for the label-encoded data; the
# aggregation loop further down builds seeded instances via make_explainer()
# and does not use this one.
# NOTE(review): no `categorical_features` is passed, so LIME presumably treats
# the one-hot columns as numeric when sampling — confirm this is intended.
explainer = LimeTabularExplainer(
    training_data=X_oh_tr,
    feature_names=onehot_names,
    class_names=["edible", "poisonous"],
    discretize_continuous=False
)
def make_explainer(seed):
    """
    Build a fresh LimeTabularExplainer over the one-hot training matrix.

    seed: value forwarded as LIME's ``random_state`` so the sampling done by
          the returned explainer is reproducible
    returns: a new LimeTabularExplainer using ``X_oh_tr``, the one-hot column
             names, and the class names ["edible", "poisonous"]
    """
    # NOTE(review): as with the module-level explainer, no categorical_features
    # are declared for the one-hot columns — confirm this is intended.
    options = dict(
        feature_names=onehot_names,
        class_names=["edible", "poisonous"],
        discretize_continuous=False,
        random_state=seed,
    )
    return LimeTabularExplainer(X_oh_tr, **options)



In [21]:
def model_proba(X_oh_batch):
    """
    Probability function handed to LIME for the one-hot pipeline.

    Each one-hot row is decoded back to the original categories, re-encoded
    with the per-column label encoders, and scored with the NBC's stored
    priors and per-feature conditional probabilities.

    Parameters
    ----------
    X_oh_batch : ndarray (n_samples, n_onehot_features)
        Rows in the one-hot feature space, as supplied by LIME.
        NOTE(review): the explainers in this notebook are built without
        ``categorical_features``, so perturbed rows may not be strict 0/1
        vectors — confirm that ``ohe.inverse_transform`` handles them as
        intended.

    Returns
    -------
    probs : ndarray (n_samples, 2)
        Columns: P(edible) , P(poisonous). Each row sums to 1. When both
        class scores are zero the normalised class priors are returned
        instead of dividing by zero.
    """
    feature_names = list(X_raw.columns)

    # --- 1. one-hot  ->  original string categories -------------
    # one column per ORIGINAL feature, not per one-hot column
    cat_matrix = ohe.inverse_transform(X_oh_batch)   # shape (n_samples, n_original_features)

    # --- 2. string categories  ->  label-encoded ints -----------
    #     vectorised column-by-column
    le_matrix = np.empty_like(cat_matrix, dtype=int)
    for j, col in enumerate(feature_names):
        le_matrix[:, j] = encoders[col].transform(cat_matrix[:, j])

    # --- 3. NBC probability for each row -----------------------
    # Only the two reported classes are scored. The previous per-row call to
    # nbc.predict_one() was dropped: its result was never used.
    prior_e = nbc.output_class_probs['e']
    prior_p = nbc.output_class_probs['p']
    cond_e = nbc.per_class_feature_probs['e']
    cond_p = nbc.per_class_feature_probs['p']

    # fallback for rows where both class scores are zero
    prior_total = prior_e + prior_p
    if prior_total > 0:
        fallback = (prior_e / prior_total, prior_p / prior_total)
    else:
        fallback = (0.5, 0.5)

    prob_e = []
    prob_p = []
    for row in le_matrix:
        score_e, score_p = prior_e, prior_p
        for ft, val in zip(feature_names, row):
            score_e *= cond_e[ft][val]
            score_p *= cond_p[ft][val]
        total = score_e + score_p
        if total > 0:
            prob_e.append(score_e / total)
            prob_p.append(score_p / total)
        else:
            prob_e.append(fallback[0])
            prob_p.append(fallback[1])

    return np.column_stack([prob_e, prob_p])


In [24]:
# Explain every representative row once per seed and accumulate, per feature,
# how often it appears in the top 5 and the sum of its signed weights.
seeds = (0, 1, 2)
agg_freq = Counter()
agg_weight = defaultdict(float)

for oh_row in X_oh_reps:
    for sd in seeds:
        seeded_explainer = make_explainer(sd)
        exp = seeded_explainer.explain_instance(oh_row, model_proba, num_features=5)
        top_features = exp.as_list()
        # one occurrence per listed feature
        agg_freq.update(feat for feat, _ in top_features)
        for feat, weight in top_features:
            agg_weight[feat] += weight









In [25]:


# ---------------------------------------------------------------
# 5.  SUMMARY CSV
# ---------------------------------------------------------------
# Single source of truth for the output file, so the confirmation message
# always names the file that was actually written.
summary_path = "lime_nbc_summary1.csv"

# one explanation per (representative row, seed) pair
k_total = k * len(seeds)
rows = [
    {
        "feature": f,
        # share of all explanations in which the feature made the top 5
        "pct_in_top5": 100 * agg_freq[f] / k_total,
        # average weight over the explanations where the feature appeared
        "mean_signed_weight": agg_weight[f] / agg_freq[f],
    }
    for f in agg_freq
]
summary_df = pd.DataFrame(rows).sort_values(
    ["pct_in_top5", "mean_signed_weight"], ascending=False
)
summary_df.to_csv(summary_path, index=False)
print(f"sweet it worked: wrote {summary_path} with {len(summary_df)} rows")


✓ wrote lime_nbc_summary.csv with 6 rows


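odor_f	100 %	+0.106	“Odor = foul” is in every explanation with a positive weight. LIME’s as_list() reports weights for class index 1 by default (here “poisonous”), so this feature raises the NBC’s predicted probability of poisonous, regardless of which class was predicted for the instance. It’s the model’s strongest universal cue toward poisonous.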
gill‑color_b	100 %	+0.088	Gill color b is “buff” in the standard UCI mushroom coding (brown is n). Having buff gills consistently pushes the probability toward poisonous.
ring‑type_l	100 %	+0.053	Large ring type also pushes the probability toward the explained class (poisonous).
odor_n	100 %	‑0.132	“Odor = none” appears everywhere but with a negative effect – it lowers the probability of the explained class (poisonous). In other words, the model uses odor = none as evidence against poison; in magnitude this is the largest average weight in the table.
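ring‑type_p	98 %	‑0.050	Pendant ring (ring‑type p is “pendant” in the standard UCI coding; “partial” is a veil‑type value) is almost everywhere and slightly decreases the probability of poisonous; perhaps NBC associates pendant rings with edible mushrooms.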
spore‑print‑color_h	1.7 %	+0.041	“Spore‑print = chocolate” (value h) barely shows up (only 2 runs out of 120) so it’s not a globally important cue, but when it does appear it nudges the probability up.