# Bayesian inference and Naive Bayes classifier for emails

In [40]:
import numpy as np
import pandas as pd
import os

In [35]:
# Read dataset

df = pd.read_table(os.path.join('smsspamcollection', 'SMSSpamCollection'), sep = '\t', names = ['label','sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
# change label format to numeric

df['label'] = df['label'].map({"ham":0, "spam":1})
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,,"Go until jurong point, crazy.. Available only ..."
1,,Ok lar... Joking wif u oni...
2,,Free entry in 2 a wkly comp to win FA Cup fina...
3,,U dun say so early hor... U c already then say...
4,,"Nah I don't think he goes to usf, he lives aro..."


## Bag of words manual implementation example

In [70]:
# Transorm text to lowercase

documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

lower_case_documents = [document.lower() for document in documents]
lower_case_documents

['hello, how are you!',
 'win money, win from home.',
 'call me now.',
 'hello, call hello you tomorrow?']

In [71]:
# Remove punctuation

import re
import string

# sans_punctuation_documents = [re.sub(r'[^a-zA-Z0-9]', ' ', document).strip() for document in lower_case_documents]
# sans_punctuation_documents = [re.sub(r'\s{2,}', ' ', document) for document in sans_punctuation_documents]

sans_punctuation_documents = [document.translate(str.maketrans('','', string.punctuation)) for document in lower_case_documents]
sans_punctuation_documents

['hello how are you',
 'win money win from home',
 'call me now',
 'hello call hello you tomorrow']

In [76]:
# tokenisation

preprocessed_documents = [sentence.split() for sentence in sans_punctuation_documents]
preprocessed_documents

[['hello', 'how', 'are', 'you'],
 ['win', 'money', 'win', 'from', 'home'],
 ['call', 'me', 'now'],
 ['hello', 'call', 'hello', 'you', 'tomorrow']]

In [81]:
# count word frequencies

import pprint
from collections import Counter

frequency_list = [Counter(document) for document in preprocessed_documents]
frequency_list

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]

# Bag of words Sklearn

In [93]:
# import vectorizer and fit it to the data

from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()
count_vector.fit(documents)
count_vector.get_feature_names()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [96]:
doc_array = count_vector.transform(documents).toarray()

In [97]:
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)