# Bayesian inference and Naive Bayes classifier for SMS spam detection

## Step 1. Understanding the dataset

In [40]:
import numpy as np
import pandas as pd
import os

In [107]:
# Load the SMS Spam Collection corpus: tab-separated, label first, then the message text

dataset_path = os.path.join('smsspamcollection', 'SMSSpamCollection')
df = pd.read_table(dataset_path, sep='\t', names=['label', 'sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Step 2. Data preprocessing

In [108]:
# Encode the label column numerically: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) keep this cell idempotent: with only the
# string keys, re-running the cell on the already-encoded column would turn
# every label into NaN.

label_codes = {"ham": 0, "spam": 1, 0: 0, 1: 1}
df['label'] = df['label'].map(label_codes)

# Fail loudly if the file contained a label other than ham/spam
assert df['label'].isin([0, 1]).all(), 'unexpected value in label column'

print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Step 3. Bag of words

### Bag of words manual implementation example

In [70]:
# Transform text to lowercase

documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Normalise case so that e.g. 'Hello' and 'hello' count as the same word
lower_case_documents = list(map(str.lower, documents))
lower_case_documents

['hello, how are you!',
 'win money, win from home.',
 'call me now.',
 'hello, call hello you tomorrow?']

In [71]:
# Remove punctuation

import re
import string

# Build the deletion table once. Every character in string.punctuation is
# dropped outright (not replaced by a space), so adjacent text is joined.
punctuation_remover = str.maketrans('', '', string.punctuation)

sans_punctuation_documents = [
    document.translate(punctuation_remover) for document in lower_case_documents
]
sans_punctuation_documents

['hello how are you',
 'win money win from home',
 'call me now',
 'hello call hello you tomorrow']

In [76]:
# Tokenisation: split each cleaned sentence on whitespace into a list of words

preprocessed_documents = list(map(str.split, sans_punctuation_documents))
preprocessed_documents

[['hello', 'how', 'are', 'you'],
 ['win', 'money', 'win', 'from', 'home'],
 ['call', 'me', 'now'],
 ['hello', 'call', 'hello', 'you', 'tomorrow']]

In [81]:
# Count word frequencies

import pprint
from collections import Counter

# One Counter per document, mapping each token to its number of occurrences
frequency_list = list(map(Counter, preprocessed_documents))
frequency_list

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]

### Bag of words with scikit-learn

In [93]:
# Import the vectorizer and fit it to the raw example documents

from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()
count_vector.fit(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert to a plain list to keep the displayed vocabulary unchanged.
count_vector.get_feature_names_out().tolist()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [99]:
# Create a document-term array: one row per document, one column per vocabulary word

# Convert the transform() result to a dense array so the counts can be inspected directly
encoded_documents = count_vector.transform(documents)
doc_array = encoded_documents.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [104]:
# Wrap the count matrix in a DataFrame with one labelled column per vocabulary word

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names_out())
frequency_matrix



Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


## Step 4. Train - test split

In [110]:
from sklearn.model_selection import train_test_split

# Hold out part of the messages for evaluation; random_state pins the shuffle
# so the same split is produced on every run.
sms_texts, sms_labels = df['sms_message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(sms_texts, sms_labels, random_state=1)

print(f'Number of rows in the total set: {df.shape[0]}')
print(f'Number of rows in the training set: {X_train.shape[0]}')
print(f'Number of rows in the test set: {X_test.shape[0]}')

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


## Step 5. Applying bag of words to the dataset, and Bayes' theorem from scratch

In [130]:
# Learn the vocabulary from the training messages only, then encode both
# splits with that same vocabulary.
count_vector = CountVectorizer()

training_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

print(f'Training data shape: {training_data.shape}')
print(f'Test data shape: {test_data.shape}')
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
print(f'Total number of words: {len(count_vector.get_feature_names_out())}')

Training data shape: (4179, 7456)
Test data shape: (1393, 7456)
Total number of words: 7456


In [138]:
# Probability of a positive test result, P(Pos), by the law of total probability

p_diabetes = 0.01         # P(D): prior probability of having diabetes
p_no_diabetes = 0.99      # P(~D)
p_pos_diabetes = 0.9      # P(Pos|D): sensitivity
p_neg_no_diabetes = 0.9   # P(Neg|~D): specificity

# P(Pos) = P(D) * P(Pos|D) + P(~D) * P(Pos|~D), where P(Pos|~D) = 1 - P(Neg|~D).
# The false-positive rate must come from the specificity, not the sensitivity;
# the two only coincide here because both are set to 0.9.
p_pos = p_diabetes * p_pos_diabetes + p_no_diabetes * (1 - p_neg_no_diabetes)
print(f'The probability of getting a positive test result is: {p_pos}')

The probability of getting a positive test result is: 0.10799999999999998


In [141]:
# First posterior, by Bayes' theorem: P(D|Pos) = P(D) * P(Pos|D) / P(Pos)

true_positive_mass = p_diabetes * p_pos_diabetes
p_diabetes_pos = true_positive_mass / p_pos
print(f'Probability of an individual having diabetes, given that that individual got a positive test result is: {p_diabetes_pos}')

Probability of an individual having diabetes, given that that individual got a positive test result is: 0.08333333333333336


In [147]:
# Second posterior, by Bayes' theorem: P(~D|Pos) = P(~D) * P(Pos|~D) / P(Pos)

# The false-positive rate P(Pos|~D) is the complement of the specificity P(Neg|~D)
p_pos_no_diabetes = 1 - p_neg_no_diabetes
# Use the derived rate instead of a hard-coded 0.1 so the result tracks the inputs
p_no_diabetes_pos = (p_no_diabetes * p_pos_no_diabetes) / p_pos
print(f'Probability of an individual not having diabetes, given that individual got a positive test result is: {p_no_diabetes_pos}')

Probability of an individual not having diabetes, given that individual got a positive test result is: 0.9166666666666669


## Step 6. Naive Bayes implementation from scratch

In [153]:
# Jill Stein: joint probability of the speaker and both words

p_j = 0.5      # P(J)
p_j_f = 0.1    # P(F|J)
p_j_i = 0.1    # P(I|J)

# Naive Bayes treats the words as conditionally independent given the speaker,
# so the joint is P(F|J) * P(I|J) * P(J)
word_likelihood_j = p_j_f * p_j_i
p_j_text = word_likelihood_j * p_j
print(p_j_text)

0.005000000000000001


In [154]:
# Gary Johnson: joint probability of the speaker and both words

p_g = 0.5      # P(G)
p_g_f = 0.7    # P(F|G)
p_g_i = 0.2    # P(I|G)

# Naive Bayes treats the words as conditionally independent given the speaker,
# so the joint is P(F|G) * P(I|G) * P(G)
word_likelihood_g = p_g_f * p_g_i
p_g_text = word_likelihood_g * p_g
print(p_g_text)

0.06999999999999999


In [155]:
# P(F, I): total probability of both words, summed over the two candidate speakers
p_f_i = p_j_text + p_g_text
print('Probability of words freedom and immigration being said are: ', p_f_i)

Probability of words freedom and immigration being said are:  0.075


In [159]:
# Posterior P(J|F,I) = P(J) * P(F|J) * P(I|J) / P(F,I)
p_j_fi = (p_j * p_j_f * p_j_i) / p_f_i
# The value is the probability that Jill Stein is the speaker given the words,
# not the probability of her saying them, so the label states that.
print(f'The probability that Jill Stein was the speaker, given the words Freedom and Immigration: {p_j_fi}')

The probability of Jill Stein saying the words Freedom and Immigration: 0.06666666666666668


In [160]:
# Posterior P(G|F,I) = P(G) * P(F|G) * P(I|G) / P(F,I)
p_g_fi = (p_g * p_g_f * p_g_i) / p_f_i
# The value is the probability that Gary Johnson is the speaker given the words,
# not the probability of him saying them, so the label states that.
print(f'The probability that Gary Johnson was the speaker, given the words Freedom and Immigration: {p_g_fi}')

The probability of Gary Johnson saying the words Freedom and Immigration: 0.9333333333333332
