# Naive Bayes from scratch

Implementazione del classificatore Naive Bayes partendo da un toy dataset,
seguendo l'esempio teorico del libro *Python Machine Learning by Example*.

## Obiettivo

- capire il calcolo di prior, likelihood e posterior
- implementare il Laplace smoothing
- verificare il risultato numericamente


In [2]:
import numpy as np

# Toy training set: four samples (rows), three 0/1-valued features (columns).
X_train = np.array(
    [
        [0, 1, 1],
        [0, 0, 1],
        [0, 0, 0],
        [1, 1, 0],
    ]
)
# One class label per training row, listed in the same order as X_train.
Y_train = ['Y', 'N', 'Y', 'Y']
# A single test sample, kept 2-D (one row, three features).
X_test = np.array([[1, 1, 0]])

In [4]:
def get_label_indices(labels):
    """
    Collect, for every distinct label, the positions at which it occurs.
    @param labels: iterable of (hashable) class labels, one per sample.
    @return: defaultdict(list) mapping each label to the list of positions
    where it appears, in increasing order; labels are keyed in order of
    first appearance.
    """
    from collections import defaultdict

    positions_by_label = defaultdict(list)
    for position, tag in enumerate(labels):
        bucket = positions_by_label[tag]  # created on first sight of `tag`
        bucket.append(position)
    return positions_by_label

In [6]:
# Group the training-sample positions by class label; the result is kept in a
# module-level name because later cells pass it to the prior computation.
label_indices = get_label_indices(Y_train)
print('label_indices:\n', label_indices)

label_indices:
 defaultdict(<class 'list'>, {'Y': [0, 2, 3], 'N': [1]})


In [33]:
def get_prior(label_indices):
    """
    Estimate the prior probability of each class from its share of samples.
    @param label_indices: mapping from class label to the collection of
    sample indices belonging to that class.
    @return: dictionary mapping each class label to
    (number of samples in the class) / (total number of samples).
    """
    counts = {label: len(members) for label, members in label_indices.items()}
    n_samples = sum(counts.values())
    # True division turns the integer counts into float fractions.
    return {label: count / n_samples for label, count in counts.items()}

In [35]:
# Compute the per-class prior from the grouped training indices and show it.
prior = get_prior(label_indices)
print('Prior:', prior)

Prior: {'Y': 0.75, 'N': 0.25}


In [39]:
def get_likelihood(features, label_indices, smoothing=0):
    """
    Estimate per-class feature likelihoods from the training samples.
    @param features: 2-D matrix of features (rows are samples), indexable as
    features[indices, :].
    @param label_indices: mapping from class label to the row indices of the
    samples belonging to that class.
    @param smoothing: additive smoothing term added to every per-feature sum
    (0 disables smoothing).
    @return: dictionary mapping each class label to a vector with one entry
    per feature column: (column sum over the class rows + smoothing) /
    (number of class rows + 2 * smoothing).
    """
    likelihood = {}
    for label, indices in label_indices.items():
        # Column-wise sum over the rows of this class, plus the smoothing term.
        smoothed_sums = features[indices, :].sum(axis=0) + smoothing
        # NOTE(review): the factor 2 presumably reflects two possible values
        # per feature (0/1 data) -- confirm before using other encodings.
        denominator = len(indices) + 2 * smoothing
        likelihood[label] = smoothed_sums / denominator
    return likelihood