In [1]:
# Executes models/neural_nets.py inside this kernel; the text that follows this
# cell is what the script printed while it ran.
# NOTE(review): later cells use `torch` and `model` without importing or defining
# them, so they presumably come from this script's namespace — confirm against
# models/neural_nets.py.
%run models/neural_nets.py


[1,    30] loss: 2.04228
example:  predicted tensor([[0.4798, 0.5202],
        [0.4821, 0.5179],
        [0.4937, 0.5063],
        [0.4547, 0.5453],
        [0.4481, 0.5519]]) , actual tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.]]) 

------------------------------
[2,    30] loss: 1.96896
example:  predicted tensor([[0.5452, 0.4548],
        [0.4842, 0.5158],
        [0.4761, 0.5239],
        [0.4833, 0.5167],
        [0.5355, 0.4645]]) , actual tensor([[1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.]]) 

------------------------------
[3,    30] loss: 1.88677
example:  predicted tensor([[0.4438, 0.5562],
        [0.4223, 0.5777],
        [0.4437, 0.5563],
        [0.5243, 0.4757],
        [0.4294, 0.5706]]) , actual tensor([[0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.]]) 

------------------------------
[4,    30] loss: 1.76487
example:  predicted tensor([[0.3521, 0.6479],
  

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Read the raw mushroom table from disk ---------------------------------
df = pd.read_csv("data/mushroom_dataset.csv")

# --- Separate the "class" target column from the predictor columns ---------
y_raw = df["class"]
X_raw = df.drop(columns=["class"])

# --- Integer-encode the target; `le` stays fitted for later decoding --------
# (the original note records the mapping as 0='e', 1='p')
le = LabelEncoder()
le.fit(y_raw)
y = le.transform(y_raw)

# --- Expand every predictor into dummy columns, keeping all levels ----------
X_oh_df = pd.get_dummies(X_raw, drop_first=False)
X_oh = X_oh_df.values                       # plain numpy matrix
feature_names_oh = list(X_oh_df.columns)    # dummy-column names, in matrix order

# --- 70/30 split, stratified on the encoded target, fixed seed --------------
X_train_oh, X_test_oh, y_train, y_test = train_test_split(
    X_oh, y, test_size=0.30, random_state=42, stratify=y
)

# --- Sanity output: matrix shape, dummy count, rows per partition -----------
print("One‑hot shape:", X_oh.shape)
print("Features:", len(feature_names_oh))
print("Train/test sizes:", len(X_train_oh), len(X_test_oh))



One‑hot shape: (8124, 117)
Features: 117
Train/test sizes: 5686 2438


In [4]:
from sklearn_extra.cluster import KMedoids

# 6) Cluster the one-hot TEST rows into 40 groups (Manhattan distance, fixed
#    seed) and keep the medoid of each group as a representative sample.
kmed = KMedoids(n_clusters=40, metric="manhattan", random_state=0).fit(X_test_oh)

med_idx = kmed.medoid_indices_        # row positions inside X_test_oh
medoids_oh = X_test_oh[med_idx, :]    # the medoid rows themselves (one-hot)

print(f"Selected {len(med_idx)} medoids from test set")
print("Example medoid indices:", med_idx[:10])
print("Medoids shape:", medoids_oh.shape)  # expected: (40, n_features)


Selected 40 medoids from test set
Example medoid indices: [ 736  589 2233 1180    3 1549 1930 1764 2187  514]
Medoids shape: (40, 117)


In [5]:
def nn_proba_oh(arr_oh, net=None):
    """
    Probability wrapper around the PyTorch network, usable as LIME's predict_fn.

    Input:  arr_oh — (n_samples × d) one‑hot array-like; a single 1‑D row is
                     promoted to shape (1 × d)
            net    — optional network to query; defaults to the notebook-level
                     `model`
    Output: (n_samples × n_classes) numpy array of softmax probabilities; the
            notebook reads it as [[p(edible), p(poisonous)], …]

    The network is evaluated in eval mode under torch.no_grad(); if it was in
    training mode on entry, that mode is restored before returning.

    NOTE(review): the training log printed by models/neural_nets.py shows
    prediction rows that already sum to 1. If the network's forward pass already
    ends in a softmax, the softmax below is applied twice and flattens the
    probabilities — confirm against the model definition.
    NOTE(review): the column order (edible, poisonous) is assumed from the
    original docstring; verify it against how the network's targets were encoded.
    NOTE(review): the recorded run of the LIME cell failed with
    "IndexError: index out of range in self", the message an embedding lookup
    gives for an index outside its table. Presumably the network expects a
    different input encoding than this one‑hot matrix — confirm.
    """
    net = model if net is None else net

    # The network is fed an integer tensor, so cast once to int32 rather than
    # going through float32 first (same resulting dtype as the old `.int()` call).
    arr = np.atleast_2d(arr_oh).astype(np.int32)

    # Dropout / batch-norm layers behave differently in training mode, which
    # would make the explained predictions noisy; switch to eval for the call
    # and put the previous mode back afterwards.
    was_training = bool(getattr(net, "training", False))
    if was_training:
        net.eval()
    try:
        with torch.no_grad():
            logits = net(torch.from_numpy(arr))
            probs = torch.softmax(logits, dim=1).cpu().numpy()
    finally:
        if was_training:
            net.train()
    return probs


In [7]:
from lime.lime_tabular import LimeTabularExplainer

# Tabular LIME explainer over the one-hot training matrix. Every column is
# declared categorical and continuous discretisation is switched off.
explainer = LimeTabularExplainer(
    X_train_oh,
    feature_names=feature_names_oh,
    categorical_features=[*range(X_train_oh.shape[1])],
    class_names=["edible", "poisonous"],
    discretize_continuous=False,
    random_state=0,
)


In [9]:
from collections import Counter, defaultdict
import numpy as np
from lime.lime_tabular import LimeTabularExplainer

# 5) Run LIME on each medoid × 3 seeds, filter out dummy=0
#
# For every seed a fresh explainer is built, each medoid is explained for the
# class the network predicts, and the strongest active dummies (value == 1 in
# that medoid) are tallied:
#   agg_freq[feature] -> how many (seed, medoid) explanations kept the feature
#   agg_wt[feature]   -> running sum of the LIME weights for the feature
#
# NOTE(review): the recorded run of this cell ended with
# "IndexError: index out of range in self". That is presumably raised inside the
# network call made through nn_proba_oh, not by the aggregation logic — confirm
# that the network accepts this one-hot encoding.
agg_freq, agg_wt = Counter(), defaultdict(float)
seeds = [0, 1, 2]

TOP_K_ACTIVE = 5                       # active dummies kept per explanation
n_dummies = X_train_oh.shape[1]
all_columns = list(range(n_dummies))   # every one-hot column is categorical

for seed in seeds:
    # ← make a brand‐new explainer each time, with its own random_state
    explainer = LimeTabularExplainer(
        training_data=X_train_oh,
        feature_names=feature_names_oh,
        class_names=["edible", "poisonous"],
        categorical_features=all_columns,
        discretize_continuous=False,
        random_state=seed,
    )

    for row in medoids_oh:
        # class the network predicts for this medoid (plain int, used as a key)
        pred_cls = int(np.argmax(nn_proba_oh(row)))

        # Ask LIME to explain the predicted class explicitly. explain_instance
        # defaults to labels=(1,), so without this any medoid predicted as
        # class 0 has no explanation to read back and the lookup below raises.
        exp = explainer.explain_instance(
            row,
            nn_proba_oh,
            labels=(pred_cls,),
            num_features=n_dummies,   # one coefficient for every dummy
        )

        # as_map() gives {label: [(column index, weight), ...]} in the same
        # strongest-first order as as_list(), but with integer column indices,
        # so no "name=value" string parsing or list.index() lookup is needed.
        ranked = exp.as_map()[pred_cls]

        # keep only dummies that are switched on in this medoid, top 5
        kept = 0
        for col, wt in ranked:
            col = int(col)
            if row[col] != 1:
                continue
            feat = feature_names_oh[col]
            agg_freq[feat] += 1
            agg_wt[feat] += float(wt)
            kept += 1
            if kept == TOP_K_ACTIVE:
                break

# …then build your summary_df exactly as before…



IndexError: index out of range in self