# Lab: Naive Bayes

## Part 4: Multinomial naive Bayes for multi-class classification

Extend the multinomial naive Bayes solution so that it can distinguish between more than two classes.

We'll work on a toy dataset again.

Name: Benjamin Fraeyman

### 1. Imports and data set creation

In [1]:
from __future__ import print_function
import numpy as np

In [2]:
# Class label of every training sentence, in the same order as `sentences`
class_vec = np.array([0, 1, 0, 1, 0, 1, 2, 2, 2])

# The sentences have different lengths, so the array has to be created with
# dtype=object: without it NumPy >= 1.24 raises a ValueError for ragged nested
# lists (older versions only emitted a deprecation warning). The result is a
# 1-D array whose elements are the individual word lists.
sentences = np.array([
    ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please', 'help'],
    ['maybe', 'stop', 'taking', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ['Kitten', 'is', 'so', 'fluffy', 'white', 'she', 'has', 'little', 'paws'],
    ['Cat', 'has', 'long', 'nails', 'on', 'his', 'paws'],
    ['Kitten', 'is', 'so', 'little', 'she', 'has', 'orange', 'fluffy', 'fur'],
], dtype=object)

In [3]:
# Build the vocabulary
# Flatten the data set (a sequence of word lists) into one flat list of words
all_words = [word for sentence in sentences for word in sentence]

# np.unique removes the duplicates and returns the remaining words sorted
vocab = np.unique(all_words)

In [4]:
#Encode the data set
def encode_multinomial(vocab,sentence):
    """Encode a sentence as a bag-of-words count vector over `vocab`.

    Parameters
    ----------
    vocab : numpy array or list of words
        The vocabulary; entry i of the result counts occurrences of vocab[i].
    sentence : iterable of words
        The words of the sentence to encode.

    Returns
    -------
    numpy.ndarray of float, length len(vocab)
        Number of times each vocabulary word occurs in `sentence`.
        Words that are not part of the vocabulary are ignored.
    """
    # Accept both numpy arrays and plain Python sequences as vocabulary
    vocab_list = vocab.tolist() if hasattr(vocab, "tolist") else list(vocab)

    # Build the word -> position table once so each lookup is O(1) instead of
    # scanning the whole vocabulary for every word. setdefault keeps the first
    # position of a duplicated word, matching what list.index() would return.
    word_position = {}
    for position, vocab_word in enumerate(vocab_list):
        word_position.setdefault(vocab_word, position)

    counts = np.zeros(len(vocab_list),)
    for word in sentence:
        position = word_position.get(word)
        if position is not None:
            counts[position] += 1
    return counts

# One count vector per sentence, stacked row by row into a single 2-D array
data_set = np.array([encode_multinomial(vocab, sentence) for sentence in sentences])
print(data_set)

[[0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

### 2. Prior calculation

In [5]:
# Calculate the priors P(C): the fraction of training sentences in each class
# Total number of sentences, kept as a float so the division below is a true
# division. The builtin float is used because the np.float alias was removed
# in NumPy 1.24.
N = float(len(sentences))
# Distinct class labels (sorted) together with the number of sentences per label
class_unique, class_sentence_count = np.unique(class_vec, return_counts=True)
print(len(class_unique))
# prior[i] belongs to class_unique[i]; for labels 0..K-1 that is prior[label]
prior = class_sentence_count / N

print(prior)

3
[0.33333333 0.33333333 0.33333333]


### 3. Likelihood calculation


In [6]:
# Estimate P(wt|C): the likelihood of every vocabulary word given a class
class_count = len(np.unique(class_vec))
word_count_class = []
total_word_count_class = []
words_likelihood = []

print(class_unique)
for sentence_class in range(class_count):
    # Rows of the encoded data set that belong to the current class
    class_rows = data_set[class_vec == sentence_class]
    # Per-word counts, plus one so that no word ends up with probability zero
    smoothed_counts = np.sum(class_rows, axis=0) + 1
    # Total number of word occurrences in the class (before smoothing)
    class_total = np.sum(class_rows)

    word_count_class.append(smoothed_counts)
    total_word_count_class.append(class_total)
    # Adding the vocabulary size to the denominator balances the +1 that was
    # added to each of the len(vocab) word counts
    words_likelihood.append(1. * smoothed_counts / (class_total + len(vocab)))

print(words_likelihood)

[0 1 2]
[array([0.01449275, 0.02898551, 0.01449275, 0.02898551, 0.01449275,
       0.02898551, 0.02898551, 0.02898551, 0.02898551, 0.01449275,
       0.01449275, 0.01449275, 0.01449275, 0.02898551, 0.04347826,
       0.04347826, 0.01449275, 0.02898551, 0.02898551, 0.02898551,
       0.01449275, 0.01449275, 0.02898551, 0.01449275, 0.02898551,
       0.05797101, 0.01449275, 0.01449275, 0.01449275, 0.01449275,
       0.01449275, 0.02898551, 0.01449275, 0.02898551, 0.01449275,
       0.01449275, 0.02898551, 0.02898551, 0.02898551, 0.01449275,
       0.01449275, 0.02898551, 0.01449275, 0.01449275]), array([0.01587302, 0.01587302, 0.01587302, 0.01587302, 0.03174603,
       0.01587302, 0.01587302, 0.04761905, 0.01587302, 0.01587302,
       0.03174603, 0.01587302, 0.03174603, 0.01587302, 0.01587302,
       0.03174603, 0.01587302, 0.01587302, 0.01587302, 0.01587302,
       0.01587302, 0.01587302, 0.01587302, 0.03174603, 0.01587302,
       0.01587302, 0.01587302, 0.01587302, 0.01587302, 0.031746

### 4. Classification

In [7]:
# Create a classification function that returns the label
def classify(sentence,vocab,words_likelihood,prior):
    """Return the index of the class with the highest log-posterior score.

    The sentence is encoded as a word-count vector over `vocab`. For each
    class the score is log(prior) plus the count-weighted sum of the log word
    likelihoods. The first class with the strictly highest score wins; if no
    score is greater than -inf, class 0 is returned.

    Parameters
    ----------
    sentence : iterable of words
        The sentence to classify.
    vocab : vocabulary passed on to encode_multinomial
    words_likelihood : sequence with one array of word likelihoods per class
    prior : sequence with one prior probability per class

    Returns
    -------
    int
        Position of the winning class in `words_likelihood`.
    """
    # Bag-of-words count vector of the new sentence
    word_counts = encode_multinomial(vocab, sentence)

    best_class = 0
    best_score = -float("inf")
    for candidate, likelihood in enumerate(words_likelihood):
        # Work in log space: the product of probabilities becomes a sum
        log_likelihood = np.sum(word_counts * np.log(likelihood))
        score = np.log(prior[candidate]) + log_likelihood
        # Strict comparison keeps the earliest class when scores are tied
        if score > best_score:
            best_class, best_score = candidate, score
    return best_class

### 5. Tests

In [8]:
# Classify this sentence and print the label
sentence1=['my','dog','is','cute','he','licks','me']
# Print explicitly: a bare expression is only echoed by an interactive
# notebook and is silently discarded when the code runs as a script
print(classify(sentence1, vocab, words_likelihood, prior))

0

In [9]:
# Classify this sentence and print the label
sentence2=['my','dog','is','stupid','and','worthless',"real"]
# Print explicitly: a bare expression is only echoed by an interactive
# notebook and is silently discarded when the code runs as a script
print(classify(sentence2, vocab, words_likelihood, prior))

1

In [10]:
# Classify this sentence and print the label
sentence3=['she','is','so','white','and','fluffy',"my","little","kitten"]
# Print explicitly: a bare expression is only echoed by an interactive
# notebook and is silently discarded when the code runs as a script
print(classify(sentence3, vocab, words_likelihood, prior))

2