In [259]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting

## `Simple Naive Bayes Classifier` 

`Naive Bayes` is a probabilistic machine learning algorithm used for classification tasks.<br>
 It is based on `Bayes` theorem,<br> which is a mathematical formula used to calculate the probability<br> 
 of a hypothesis (or event) given some observed evidence.

![](https://insightimi.files.wordpress.com/2020/04/unnamed-1.png)

### ` - Bayes Theorm `

P(A|B) = P(B|A) * P(A) / P(B)

where:

- P(A|B) is the probability of hypothesis A given the observed evidence B (also known as the posterior probability).<br>
- P(B|A) is the probability of observing evidence B given the hypothesis A (also known as the likelihood).<br>
- P(A) is the prior probability of hypothesis A before observing evidence B.<br>
- P(B) is the probability of observing evidence B.<br>

In [260]:
'''Bayes Theorem'''
# P(A|B) = P(B|A) * P(A) / P(B)
def bayes_theorm(P_A , P_B_given_A ,P_B):
    # P(B) = P(B|A) * P(A) + P(B|not A) * P(not A)
    return (P_B_given_A * P_A) / P_B


### `Naive Bayes`

In [261]:
'''Naive Bayes Classifier'''
def prior_prob(y_train, label):
    # P(y = label)
    total_examples = y_train.shape[0]
    class_examples = np.sum(y_train == label)
    return class_examples / float(total_examples)

def cond_prop(x_train , y_train , features_indecies
              , feature_names):
    # P(Xi = feature_value | y = label)
    cond_prop = {}
    # Classes
    classes = set(y_train)
    
    for cls in classes:
        print(cls)
        cls_show = (y_train == cls)
        print(cls_show)
        for feature in features_indecies:
            cond_prop[(feature , cls)] = x_train[cls_show , feature].mean()            
    return cond_prop

### `Example`

In [262]:
'''
Let's say we have a dataset of emails, where each email is labeled as either spam or not spam (ham).
We want to build a Naive Bayes classifier to classify new emails as spam or ham based on their features.
The features we will use for each email are the presence or absence of certain words.
For simplicity, let's say we only consider three words: "money", "lottery", and "buy".
We count the number of times each word appears in each email and represent it as a binary feature 
(1 if the word appears, 0 if it doesn't).

N_FEATURES = 3 (money , lottery , buy)
N_CLASSES = 2 (spam , ham)
'''

'\nLet\'s say we have a dataset of emails, where each email is labeled as either spam or not spam (ham).\nWe want to build a Naive Bayes classifier to classify new emails as spam or ham based on their features.\nThe features we will use for each email are the presence or absence of certain words.\nFor simplicity, let\'s say we only consider three words: "money", "lottery", and "buy".\nWe count the number of times each word appears in each email and represent it as a binary feature \n(1 if the word appears, 0 if it doesn\'t).\n\nN_FEATURES = 3 (money , lottery , buy)\nN_CLASSES = 2 (spam , ham)\n'

| Email                                          | Money | Lottery | Buy | Spam/Ham |
|------------------------------------------------|-------|---------|-----|----------|
| Want to win the lottery? Buy our tickets now!  | 0     | 1       | 1   | Spam     |
| Money back guarantee on all purchases!        | 1     | 0       | 1   | Ham      |
| Get rich quick with our investment scheme!    | 1     | 0       | 0   | Spam     |
| Buy one, get one free! Limited time offer!    | 0     | 0       | 1   | Ham      |
| You have been selected for a cash prize!      | 1     | 1       | 0   | Spam     |
| Congratulations! You have won a prize!        | 0     | 1       | 0   | Spam     |
| Buy now and save 50% on all purchases!        | 0     | 0       | 1   | Ham      |
| Our new product is now available for purchase | 0     | 0       | 1   | Ham      |

In [277]:
data = [
    {'Email': 'Want to win the lottery? Buy our tickets now!', 'Money': 0, 'Lottery': 1, 'Buy': 1, 'Spam/Ham': 'Spam'},
    {'Email': 'Money back guarantee on all purchases!', 'Money': 1, 'Lottery': 0, 'Buy': 1, 'Spam/Ham': 'Ham'},
    {'Email': 'Get rich quick with our investment scheme!', 'Money': 1, 'Lottery': 0, 'Buy': 0, 'Spam/Ham': 'Spam'},
    {'Email': 'Buy one, get one free! Limited time offer!', 'Money': 0, 'Lottery': 0, 'Buy': 1, 'Spam/Ham': 'Ham'},
    {'Email': 'You have been selected for a cash prize!', 'Money': 1, 'Lottery': 1, 'Buy': 0, 'Spam/Ham': 'Spam'},
    {'Email': 'Congratulations! You have won a prize!', 'Money': 0, 'Lottery': 1, 'Buy': 0, 'Spam/Ham': 'Spam'},
    {'Email': 'Buy now and save 50% on all purchases!', 'Money': 0, 'Lottery': 0, 'Buy': 1, 'Spam/Ham': 'Ham'},
    {'Email': 'Our new product is now available for purchase', 'Money': 0, 'Lottery': 1, 'Buy': 1, 'Spam/Ham': 'Ham'}
]

# Convert data to pandas dataframe
data = pd.DataFrame(data)
print(data)

                                           Email  Money  Lottery  Buy Spam/Ham
0  Want to win the lottery? Buy our tickets now!      0        1    1     Spam
1         Money back guarantee on all purchases!      1        0    1      Ham
2     Get rich quick with our investment scheme!      1        0    0     Spam
3     Buy one, get one free! Limited time offer!      0        0    1      Ham
4       You have been selected for a cash prize!      1        1    0     Spam
5         Congratulations! You have won a prize!      0        1    0     Spam
6         Buy now and save 50% on all purchases!      0        0    1      Ham
7  Our new product is now available for purchase      0        1    1      Ham


In [278]:
# Extract features
X = data.iloc[: , 1:4].values
y = data.iloc[: , 4].values
print("X = \n" , X)
print("y = \n" , y)

X = 
 [[0 1 1]
 [1 0 1]
 [1 0 0]
 [0 0 1]
 [1 1 0]
 [0 1 0]
 [0 0 1]
 [0 1 1]]
y = 
 ['Spam' 'Ham' 'Spam' 'Ham' 'Spam' 'Spam' 'Ham' 'Ham']


In [279]:
n_Features = X.shape[1]
print("Number of features = " , n_Features)
n_Classes = np.unique(y).shape[0]
print("Number of classes = " , n_Classes)


Number of features =  3
Number of classes =  2


In [280]:
# Prior probabilities
prior_spam = prior_prob(y , 'Spam')
print("Prior probability of spam = " , prior_spam)
prior_ham = prior_prob(y , 'Ham')
print("Prior probability of ham = " , prior_ham)

Prior probability of spam =  0.5
Prior probability of ham =  0.5


In [281]:
# Conditional probabilities
cond_prop(X , y, [0,1,2] , ['Money' , 'Lottery' , 'Buy'])

Ham
[False  True False  True False False  True  True]
Spam
[ True False  True False  True  True False False]


{(0, 'Ham'): 0.25,
 (1, 'Ham'): 0.25,
 (2, 'Ham'): 1.0,
 (0, 'Spam'): 0.5,
 (1, 'Spam'): 0.75,
 (2, 'Spam'): 0.25}

In [290]:
test_email = {'Money': 1, 'Lottery': 0, 'Buy': 1}
# P(Spam | test_email)
'''P(test_email | Spam) * P(Spam) / P(test_email)'''
'''P(test_email | Spam)'''
'''from conditional probabilities get P(Money | Spam) , P(Lottery | Spam) , P(Buy | Spam) '''
'''P(test_email | Spam) = P(Money | Spam) * P(Lottery | Spam) * P(Buy | Spam)'''
cond_prop_array = cond_prop(X , y, [0,1,2] , ['Money' , 'Lottery' , 'Buy']) 
P_test_email_Spam = cond_prop_array[(0 , 'Spam')] *  cond_prop_array[(2 , 'Spam')]
print("P(test_email | Spam) = " , P_test_email_Spam)
'''P(test_email | Ham)'''
'''from conditional probabilities get P(Money | Ham) , P(Lottery | Ham) , P(Buy | Ham) '''
'''P(test_email | Ham) = P(Money | Ham) * P(Lottery | Ham) * P(Buy | Ham)'''
P_test_email_Ham = cond_prop_array[(0 , 'Ham')] *  cond_prop_array[(2 , 'Ham')]
print("P(test_email | Ham) = " , P_test_email_Ham) 

'''P(Spam)'''
P_Spam = prior_prob(y , 'Spam')
print("P(Spam) = " , P_Spam)
'''P(Ham)'''
P_Ham = prior_prob(y , 'Ham') 
print("P(Ham) = " , P_Ham)

'''P(test_email)'''
'''P(test_email) = P(test_email | Spam) * P(Spam) + P(test_email | Ham) * P(Ham)'''
P_test_email = P_test_email_Spam * P_Spam + P_test_email_Ham * P_Ham
print("P(test_email) = " , P_test_email)


'''P(Spam | test_email)'''
'''P(Spam | test_email) = P(test_email | Spam) * P(Spam) / P(test_email)'''
P_Spam_test_email = P_test_email_Spam * P_Spam / P_test_email
print("P(Spam | test_email) = " , P_Spam_test_email)

'''P(Ham | test_email)'''
'''P(Ham | test_email) = P(test_email | Ham) * P(Ham) / P(test_email)'''
P_Ham_test_email = P_test_email_Ham * P_Ham / P_test_email
print("P(Ham | test_email) = " , P_Ham_test_email)

'''P(Spam | test_email) + P(Ham | test_email) = 1'''
print("P(Spam | test_email) + P(Ham | test_email) = " , P_Spam_test_email + P_Ham_test_email)

'''return the class with the highest probability'''
if P_Spam_test_email > P_Ham_test_email:
    print("The email is Spam")
else:
    print("The email is Ham")

Ham
[False  True False  True False False  True  True]
Spam
[ True False  True False  True  True False False]
P(test_email | Spam) =  0.125
P(test_email | Ham) =  0.25
P(Spam) =  0.5
P(Ham) =  0.5
P(test_email) =  0.1875
P(Spam | test_email) =  0.3333333333333333
P(Ham | test_email) =  0.6666666666666666
P(Spam | test_email) + P(Ham | test_email) =  1.0
The email is ham


## - `Referances`
---
- [naive-bayes-classifier](https://towardsdatascience.com/naive-bayes-classifier-81d512f50a7c)