In [1]:
import numpy as np
from collections import defaultdict, Counter

# --------------------------
# Training data (14 categorical records)
# --------------------------
features = ["age", "income", "student", "credit"]

data = [
    # age,     income,  student, credit,     buys_computer
    ("<=30",   "high",  "no",    "fair",     "no"),
    ("<=30",   "high",  "no",    "excellent","no"),
    ("31..40", "high",  "no",    "fair",     "yes"),
    (">40",    "medium","no",    "fair",     "yes"),
    (">40",    "low",   "yes",   "fair",     "yes"),
    (">40",    "low",   "yes",   "excellent","no"),
    ("31..40", "low",   "yes",   "excellent","yes"),
    ("<=30",   "medium","no",    "fair",     "no"),
    ("<=30",   "low",   "yes",   "fair",     "yes"),
    (">40",    "medium","yes",   "fair",     "yes"),
    ("<=30",   "medium","yes",   "excellent","yes"),
    ("31..40", "medium","no",    "excellent","yes"),
    ("31..40", "high",  "yes",   "fair",     "yes"),
    (">40",    "medium","no",    "excellent","no"),
]

# Each record is (*feature_values, label): everything but the last field is a
# feature, the last field is the class label.
X = [record[:-1] for record in data]      # feature tuples
y = [record[-1] for record in data]       # class labels

# Sample to classify:
x_query = ("<=30", "medium", "yes", "fair")

# --------------------------
# Frequency tables used by the smoothed Naive Bayes estimates
# --------------------------
# Per-class record counts (numerators of the priors) and the sorted class labels.
class_counts = Counter(y)
classes = sorted(class_counts)
n = len(y)

# Distinct values observed at each feature position; the size of each list is
# the number of categories that smoothing spreads its pseudo-counts over.
value_spaces = [
    sorted({record[pos] for record in X})
    for pos in range(len(features))
]

# Co-occurrence tallies keyed by (class, feature_idx, feature_value).
# Kept as a defaultdict(int) so combinations never seen in training read as 0.
counts = defaultdict(int, Counter(
    (label, pos, value)
    for record, label in zip(X, y)
    for pos, value in enumerate(record)
))

def posterior_logprob(x, alpha=1.0):
    """Return the unnormalised log posterior of every class for sample ``x``.

    Categorical Naive Bayes with additive smoothing. For each class ``c``::

        log P(c) + sum_j log( (count(c, j, x_j) + alpha) / (N_c + alpha * V_j) )

    where ``N_c`` is the number of training rows of class ``c`` and ``V_j`` is
    the number of distinct values recorded for feature ``j``. The scores are
    not normalised: exponentiate and divide by their sum to obtain
    probabilities.

    The function reads the module-level tables ``classes``, ``class_counts``,
    ``n``, ``value_spaces`` and ``counts``; it does not modify any of them.

    Parameters
    ----------
    x : sequence
        One categorical value per feature, in the same order as
        ``value_spaces``.
    alpha : float, default 1.0
        Additive smoothing strength (1.0 is Laplace smoothing).

    Returns
    -------
    dict
        Maps each class label in ``classes`` to its log score.

    Raises
    ------
    ValueError
        If ``x`` does not supply exactly one value per feature.
    """
    x = tuple(x)
    if len(x) != len(value_spaces):
        raise ValueError(
            f"expected {len(value_spaces)} feature values, got {len(x)}"
        )

    logps = {}
    for c in classes:
        n_c = class_counts[c]
        # log prior
        logp = np.log(n_c / n)
        # multiply likelihoods (add logs)
        for j, v in enumerate(x):
            Vj = len(value_spaces[j])                  # number of categories for feature j
            # .get() instead of indexing: indexing the defaultdict would insert
            # a zero entry for every unseen (class, feature, value) combination.
            num = counts.get((c, j, v), 0) + alpha     # smoothed count
            den = n_c + alpha * Vj                     # smoothed total
            logp += np.log(num / den)
        logps[c] = logp
    return logps

# --------------------------
# Score the query and report the posterior
# --------------------------
logps = posterior_logprob(x_query)

# Turn log scores into probabilities. Subtracting the largest log score before
# exponentiating keeps every exponent <= 0, so exp() cannot overflow.
maxlog = max(logps.values())
unnormalised = {label: np.exp(score - maxlog) for label, score in logps.items()}
Z = sum(unnormalised.values())
probs = {label: weight / Z for label, weight in unnormalised.items()}

# The prediction is the class with the highest posterior probability.
pred = max(probs, key=lambda label: probs[label])

print("From-scratch Naive Bayes (categorical):")
print("Posterior probabilities:", probs)
print("Prediction:", pred)


From-scratch Naive Bayes (categorical):
Posterior probabilities: {'no': np.float64(0.23217141029740956), 'yes': np.float64(0.7678285897025904)}
Prediction: yes


In [2]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB

# Reuse the dataset from the from-scratch cell: split every record into its
# feature values and its trailing class label.
X_cat = np.array([feature_values for *feature_values, _ in data], dtype=object)
y_cat = np.array([label for *_, label in data], dtype=object)

# CategoricalNB needs integer category codes, so map each string category of
# every feature column to a code first.
enc = OrdinalEncoder().fit(X_cat)
X_enc = enc.transform(X_cat)

# The class labels are passed as strings; only the features are encoded.
# alpha=1.0 is Laplace smoothing.
clf = CategoricalNB(alpha=1.0).fit(X_enc, y_cat)

# Encode the query with the same fitted encoder, then predict.
xq_enc = enc.transform(np.array([x_query], dtype=object))
pred_skl = clf.predict(xq_enc)[0]
proba_skl = {
    label: p
    for label, p in zip(clf.classes_, clf.predict_proba(xq_enc)[0])
}

print("\nscikit-learn CategoricalNB:")
print("Posterior probabilities:", proba_skl)
print("Prediction:", pred_skl)



scikit-learn CategoricalNB:
Posterior probabilities: {np.str_('no'): np.float64(0.2321714102974098), np.str_('yes'): np.float64(0.7678285897025902)}
Prediction: yes
