# GaussianNB

In [4]:
# Continuous, numerical data.
import pandas as pd
#     [0, 0, 1],  # Red, Round, Large (Apple)
#     [1, 1, 0],  # Yellow, Elongated, Small (Banana)
#     [0, 0, 0],  # Red, Round, Small (Apple)
#     [1, 1, 1]   # Yellow, Elongated, Large (Banana)

In [5]:
df = pd.read_csv('fruit.csv')
df

Unnamed: 0,"Color (0: Red, 1: Yellow)","Shape (0: Round, 1: Elongated)","Size (0: Small, 1: Large)","Fruit (0: Apple, 1: Banana)"
0,0,0,1,0
1,1,1,0,1
2,0,0,0,0
3,1,1,1,1


In [6]:
X = df.drop(columns='Fruit (0: Apple, 1: Banana)',axis=1)

In [7]:
y =df['Fruit (0: Apple, 1: Banana)']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
from sklearn.naive_bayes import GaussianNB

In [11]:
model = GaussianNB()

In [12]:
model.fit(X_train,y_train)

In [13]:
y_pred = model.predict(X_test)
y_pred

array([1], dtype=int64)

In [14]:
from sklearn.metrics import accuracy_score
print("Test Data:", X_test)
print("True Labels:", y_test)
print("Predicted Labels:", y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))

Test Data:    Color (0: Red, 1: Yellow)  Shape (0: Round, 1: Elongated)  \
1                          1                               1   

   Size (0: Small, 1: Large)  
1                          0  
True Labels: 1    1
Name: Fruit (0: Apple, 1: Banana), dtype: int64
Predicted Labels: [1]
Accuracy: 1.0


# BernoulliNB

In [15]:
from sklearn.naive_bayes import BernoulliNB

# Dataset
# Features: [Color, Shape, Size]
X = [
    [1, 1, 1],  # Red, Round, Large (Apple)
    [0, 0, 0],  # Not Red, Not Round, Not Large (Banana)
    [1, 1, 0],  # Red, Round, Not Large (Apple)
    [0, 0, 1]   # Not Red, Not Round, Large (Banana)
]
# Labels: [Fruit Type]
y = [0, 1, 0, 1]  # 0: Apple, 1: Banana

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

model = BernoulliNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Test Data:", X_test)
print("True Labels:", y_test)
print("Predicted Labels:", y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))


Test Data: [[0, 0, 0]]
True Labels: [1]
Predicted Labels: [1]
Accuracy: 1.0


In [16]:
from sklearn.datasets import load_iris

In [17]:
iris = load_iris()

In [18]:
X = iris.data
y=iris.target

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [20]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [21]:
y_pred = gnb.predict(X_test)

In [22]:
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 95.0


# MultinomialNB

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Dataset
# Features: [Buy, Win, Offer, Free]
X = [
    [2, 0, 1, 0],  # Not Spam: Moderate frequency of "Buy" and "Offer"
    [0, 1, 0, 3],  # Spam: High frequency of "Free" and "Win"
    [1, 0, 2, 0],  # Not Spam: Moderate frequency of "Buy" and "Offer"
    [0, 2, 1, 1]   # Spam: High frequency of "Win", moderate "Free" and "Offer"
]
# Labels: [Email Type]
y = [0, 1, 0, 1]  # 0: Not Spam, 1: Spam

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print results
print("Test Data:", X_test)
print("True Labels:", y_test)
print("Predicted Labels:", y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))


Test Data: [[0, 1, 0, 3], [0, 2, 1, 1]]
True Labels: [1, 1]
Predicted Labels: [0 0]
Accuracy: 0.0


In [24]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [25]:
# Tokenization: Breaking down text into smaller parts, like sentences or words

NLP

In [26]:
# NLTK, or Natural Language Toolkit, is a Python library that provides tools for processing and analyzing text data. 
# It's one of the most popular and powerful libraries for natural language processing (NLP)

In [27]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [28]:
#VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool 
#that is specifically attuned to sentiments expressed in social media, 
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [31]:
nltk.download("punkt_tab")
nltk.download("punkt")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [33]:
from nltk.tokenize import word_tokenize, sent_tokenize
text = "Welcome to the fun session of Artificial Intelligence at Islington!. But I live in Dharan"
print(sent_tokenize(text))  # Sentence Tokenization
print(word_tokenize(text))  # Word Tokenization

['Welcome to the fun session of Artificial Intelligence at Islington!.', 'But I live in Dharan']
['Welcome', 'to', 'the', 'fun', 'session', 'of', 'Artificial', 'Intelligence', 'at', 'Islington', '!', '.', 'But', 'I', 'live', 'in', 'Dharan']


In [None]:
# corpus: is a large and structured set of text.
# corpora: NLTK provides a collection of pre-loaded datasets, known as built-in corpora, for common NLP tasks.
# The movie_reviews corpus contains movie reviews 
# review is labeled as either Positive (pos) or Negative (neg)

In [None]:
from nltk.corpus import movie_reviews #imports movie reviews from nltk
from nltk.corpus import stopwords #imports stopwords from nltk
from nltk.corpus import wordnet #imports wordnet(lexical database for the english language) from nltk

In [None]:
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [None]:
movie_reviews.categories()

['neg', 'pos']

In [None]:
#inbuilt list of stopwords in nltk
stopwords.words('english')[:16]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself']

In [None]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [None]:
# Prints all words in movie_review with file id ‘....’
movie_reviews.words('neg/cv003_12683.txt')

['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...]

In [None]:
movie_reviews.raw('neg/cv003_12683.txt')

' " quest for camelot " is warner bros . \' first feature-length , fully-animated attempt to steal clout from disney\'s cartoon empire , but the mouse has no reason to be worried . \nthe only other recent challenger to their throne was last fall\'s promising , if flawed , 20th century fox production " anastasia , " but disney\'s " hercules , " with its lively cast and colorful palate , had her beat hands-down when it came time to crown 1997\'s best piece of animation . \nthis year , it\'s no contest , as " quest for camelot " is pretty much dead on arrival . \neven the magic kingdom at its most mediocre -- that\'d be " pocahontas " for those of you keeping score -- isn\'t nearly as dull as this . \nthe story revolves around the adventures of free-spirited kayley ( voiced by jessalyn gilsig ) , the early-teen daughter of a belated knight from king arthur\'s round table . \nkayley\'s only dream is to follow in her father\'s footsteps , and she gets her chance when evil warlord ruber ( ga

In [None]:
file = open("data2.txt","r")
# read the file "data2.txt" and split each
# line in the file using newline 
data = file.read().split("\n")

data

['Congrats, You have won!! reply to our sms for a free nokia mobile + free camcorder. \tspam',
 'Congrats! 1 year special cinema pass for 2 is yours. reply to this sms to claim your prize.\tspam',
 'I am pleased to tell you that you are awarded with a 1500 Bonus Prize, reply to this sms to claim your prize.\tspam',
 'Dont worry. I guess he is busy.\tnot spam',
 'Going for dinner. msg you later.\tnot spam',
 'Ok, I will call you up when I get some cash.\tnot spam',
 '']

In [None]:
# futher split each line using tab
data = [d.split("\t") for d in data]
data

[['Congrats, You have won!! reply to our sms for a free nokia mobile + free camcorder. ',
  'spam'],
 ['Congrats! 1 year special cinema pass for 2 is yours. reply to this sms to claim your prize.',
  'spam'],
 ['I am pleased to tell you that you are awarded with a 1500 Bonus Prize, reply to this sms to claim your prize.',
  'spam'],
 ['Dont worry. I guess he is busy.', 'not spam'],
 ['Going for dinner. msg you later.', 'not spam'],
 ['Ok, I will call you up when I get some cash.', 'not spam'],
 ['']]

In [None]:
# remove the last list from the data
del(data[-1])

data

[['Congrats, You have won!! reply to our sms for a free nokia mobile + free camcorder. ',
  'spam'],
 ['Congrats! 1 year special cinema pass for 2 is yours. reply to this sms to claim your prize.',
  'spam'],
 ['I am pleased to tell you that you are awarded with a 1500 Bonus Prize, reply to this sms to claim your prize.',
  'spam'],
 ['Dont worry. I guess he is busy.', 'not spam'],
 ['Going for dinner. msg you later.', 'not spam'],
 ['Ok, I will call you up when I get some cash.', 'not spam']]

In [None]:
df = pd.DataFrame(data,columns = ["text","label"])
df

Unnamed: 0,text,label
0,"Congrats, You have won!! reply to our sms for ...",spam
1,Congrats! 1 year special cinema pass for 2 is ...,spam
2,I am pleased to tell you that you are awarded ...,spam
3,Dont worry. I guess he is busy.,not spam
4,Going for dinner. msg you later.,not spam
5,"Ok, I will call you up when I get some cash.",not spam


In [None]:
vocab = []
text_vector = []

for each in df["text"]:
    print(each)
    # collect vocabularies (or words) from input 
    # datafile and store in a list called vocab
    vocab.extend(each.lower().\
                 replace(".","").\
                 replace(",","").\
                 replace("+","").\
                 replace("!","").\
                 replace("1","").\
                 replace("1500","").\
                 replace("2","").\
                 split())
    # collect vocabularies (or words) from input 
    # datafile, however store words in each line 
    # in a list called text_vector
    text_vector.append(each.lower().\
                       replace(".","").\
                       replace(",","").\
                       replace("+","").\
                       replace("!","").\
                       replace("1","").\
                       replace("1500","").\
                       replace("2","").\
                       split())

Congrats, You have won!! reply to our sms for a free nokia mobile + free camcorder. 
Congrats! 1 year special cinema pass for 2 is yours. reply to this sms to claim your prize.
I am pleased to tell you that you are awarded with a 1500 Bonus Prize, reply to this sms to claim your prize.
Dont worry. I guess he is busy.
Going for dinner. msg you later.
Ok, I will call you up when I get some cash.


In [None]:
print(vocab)

['congrats', 'you', 'have', 'won', 'reply', 'to', 'our', 'sms', 'for', 'a', 'free', 'nokia', 'mobile', 'free', 'camcorder', 'congrats', 'year', 'special', 'cinema', 'pass', 'for', 'is', 'yours', 'reply', 'to', 'this', 'sms', 'to', 'claim', 'your', 'prize', 'i', 'am', 'pleased', 'to', 'tell', 'you', 'that', 'you', 'are', 'awarded', 'with', 'a', '500', 'bonus', 'prize', 'reply', 'to', 'this', 'sms', 'to', 'claim', 'your', 'prize', 'dont', 'worry', 'i', 'guess', 'he', 'is', 'busy', 'going', 'for', 'dinner', 'msg', 'you', 'later', 'ok', 'i', 'will', 'call', 'you', 'up', 'when', 'i', 'get', 'some', 'cash']


In [None]:
len(vocab)

78

In [None]:
# remove dublicates words 
vocab = list(set(vocab))
print(vocab)

['free', 'sms', 'msg', 'cash', 'pass', 'mobile', 'some', 'get', 'with', 'is', 'prize', 'worry', 'guess', 'awarded', 'he', 'that', 'are', 'cinema', 'a', 'call', 'i', 'busy', 'yours', 'reply', 'dont', 'tell', 'have', '500', 'when', 'dinner', 'going', 'congrats', 'year', 'your', 'will', 'ok', 'claim', 'up', 'camcorder', 'this', 'to', 'am', 'pleased', 'bonus', 'later', 'won', 'nokia', 'special', 'for', 'you', 'our']


In [None]:
len(vocab)

51

In [None]:
# display words in each line from the input file
print(text_vector)

[['congrats', 'you', 'have', 'won', 'reply', 'to', 'our', 'sms', 'for', 'a', 'free', 'nokia', 'mobile', 'free', 'camcorder'], ['congrats', 'year', 'special', 'cinema', 'pass', 'for', 'is', 'yours', 'reply', 'to', 'this', 'sms', 'to', 'claim', 'your', 'prize'], ['i', 'am', 'pleased', 'to', 'tell', 'you', 'that', 'you', 'are', 'awarded', 'with', 'a', '500', 'bonus', 'prize', 'reply', 'to', 'this', 'sms', 'to', 'claim', 'your', 'prize'], ['dont', 'worry', 'i', 'guess', 'he', 'is', 'busy'], ['going', 'for', 'dinner', 'msg', 'you', 'later'], ['ok', 'i', 'will', 'call', 'you', 'up', 'when', 'i', 'get', 'some', 'cash']]


In [None]:
# Create a list of list containing number of words presented in each sentence or line

In [None]:
text_vector_num = []
for text in text_vector:
    vector = []
    # count number of words presented in vocabulary for each line
    for word in vocab:
        vector.append(text.count(word))
    text_vector_num.append(vector)

In [None]:
len(text_vector_num[0])

51

In [None]:
print(text_vector_num[1])

[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]


In [None]:
# Display new representation of each sentence in the dataset
for each in text_vector_num:
    print(each)

[2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1]
[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]
[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [None]:
# insert vocabulary list at index 0
text_vector_num.insert(0, vocab)
print(text_vector_num)

[['free', 'sms', 'msg', 'cash', 'pass', 'mobile', 'some', 'get', 'with', 'is', 'prize', 'worry', 'guess', 'awarded', 'he', 'that', 'are', 'cinema', 'a', 'call', 'i', 'busy', 'yours', 'reply', 'dont', 'tell', 'have', '500', 'when', 'dinner', 'going', 'congrats', 'year', 'your', 'will', 'ok', 'claim', 'up', 'camcorder', 'this', 'to', 'am', 'pleased', 'bonus', 'later', 'won', 'nokia', 'special', 'for', 'you', 'our'], [2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1], [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 2, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Display numerical vectors correspondind to spam messages

In [None]:
# get spam message vector
import numpy as np
spam = np.array(text_vector_num[1:4])
print('Spam message vectors \n', spam)
print()

Spam message vectors 
 [[2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0
  0 0 1 0 1 0 0 0 0 1 1 0 1 1 1]
 [0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0
  1 0 0 1 2 0 0 0 0 0 0 1 1 0 0]
 [0 1 0 0 0 0 0 0 1 0 2 0 0 1 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0
  1 0 0 1 3 1 1 1 0 0 0 0 0 2 0]]



In [None]:
# Display numerical vectors correspondind to non-spam messages¶

In [None]:
# get non-span message vector
ham = np.array(text_vector_num[4:])
print('Ham message vectors \n', ham)

Ham message vectors 
 [[0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 1 1 0]
 [0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
  0 1 0 0 0 0 0 0 0 0 0 0 0 1 0]]


In [None]:
# Compute  𝑝(𝑤𝑘│𝑠𝑝𝑎𝑚)
# Probability that a given word ( 𝑤𝑘
#   or feature) comes from spam messages

# Number of times a particular word occur in spam messages
# 𝑛𝑘

In [None]:
nk_spam = np.sum(spam,axis=0) 
nk_spam

array([2, 3, 0, 0, 1, 1, 0, 0, 1, 1, 3, 0, 0, 1, 0, 1, 1, 1, 2, 0, 1, 0,
       1, 3, 0, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0, 0, 2, 0, 1, 2, 6, 1, 1, 1,
       0, 1, 1, 1, 2, 3, 1])

In [None]:
# Number of words in spam messages
# 𝑛𝑠𝑝𝑎𝑚
n_spam = np.sum(spam) 
n_spam

54

In [None]:
prob_of_wk_given_spam_message = (nk_spam + 1) / (n_spam + len(vocab))
prob_of_wk_given_spam_message

array([0.02857143, 0.03809524, 0.00952381, 0.00952381, 0.01904762,
       0.01904762, 0.00952381, 0.00952381, 0.01904762, 0.01904762,
       0.03809524, 0.00952381, 0.00952381, 0.01904762, 0.00952381,
       0.01904762, 0.01904762, 0.01904762, 0.02857143, 0.00952381,
       0.01904762, 0.00952381, 0.01904762, 0.03809524, 0.00952381,
       0.01904762, 0.01904762, 0.01904762, 0.00952381, 0.00952381,
       0.00952381, 0.02857143, 0.01904762, 0.02857143, 0.00952381,
       0.00952381, 0.02857143, 0.00952381, 0.01904762, 0.02857143,
       0.06666667, 0.01904762, 0.01904762, 0.01904762, 0.00952381,
       0.01904762, 0.01904762, 0.01904762, 0.02857143, 0.03809524,
       0.01904762])

In [None]:
# Compute  𝑝(𝑤𝑘│ℎ𝑎𝑚)
 
# Probability that a given word ( 𝑤𝑘
#   or feature) comes from ham messages

# Number of times a particular word occur in ham messages
# 𝑛𝑘

In [None]:
nk_ham = np.sum(ham,axis=0) 
nk_ham

array([0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 3, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 2, 0])

In [None]:
# Number of words in ham messages
# 𝑛ℎ𝑎𝑚

In [None]:
n_ham = np.sum(ham) 
n_ham

24

In [None]:
prob_of_wk_given_ham_message = (nk_ham + 1) / (n_ham + len(vocab))
prob_of_wk_given_ham_message

array([0.01333333, 0.01333333, 0.02666667, 0.02666667, 0.01333333,
       0.01333333, 0.02666667, 0.02666667, 0.01333333, 0.02666667,
       0.01333333, 0.02666667, 0.02666667, 0.01333333, 0.02666667,
       0.01333333, 0.01333333, 0.01333333, 0.01333333, 0.02666667,
       0.05333333, 0.02666667, 0.01333333, 0.01333333, 0.02666667,
       0.01333333, 0.01333333, 0.01333333, 0.02666667, 0.02666667,
       0.02666667, 0.01333333, 0.01333333, 0.01333333, 0.02666667,
       0.02666667, 0.01333333, 0.02666667, 0.01333333, 0.01333333,
       0.01333333, 0.01333333, 0.01333333, 0.01333333, 0.02666667,
       0.01333333, 0.01333333, 0.01333333, 0.02666667, 0.04      ,
       0.01333333])

In [None]:
# Compute probabilities of a message being spam and ham
# There are six messages in the file. Out of six messages, 3 are spam and rest of the messages are ham or non-spam.

In [None]:
prob_message_is_spam,prob_message_is_ham = 3/6,3/6
prob_message_is_spam,prob_message_is_ham

(0.5, 0.5)

In [None]:
# Create new sample messages and process them

In [None]:
new = ["I am busy. I will msg you later.","Congrats! You are awarded a free mobile."]
new_words= []
for each in new:
    processed = each.lower().replace(".","").replace(",","").replace("+","").replace("!","").split()
    new_words.append(processed)
new_words

[['i', 'am', 'busy', 'i', 'will', 'msg', 'you', 'later'],
 ['congrats', 'you', 'are', 'awarded', 'a', 'free', 'mobile']]

In [None]:
# prediction
def predict(message_list,prob_spam,prob_ham,prob_of_feature_given_spam_message,prob_of_feature_given_ham_message,vocab):
    """
    Returns whether or not the given message (a list of words) is spam or ham
    
    
    Input Parameters
    ---------
    
    message_list: a list of words
    
    prob_spam: a float
    
    prob_ham: a float
    
    prob_of_feature_given_spam_message: an numpy array
    
    prob_of_feature_given_ham_message: an numpy array
    
    vocab: a list of words    
             
    
    """
    spam = prob_spam
    ham = prob_ham
    # compute 𝑝(𝑦) * ∏(𝑗=1 to 𝑑) 𝑝(𝑎_𝑗│𝑦)
    
    for word in message_list:
        spam *= prob_of_feature_given_spam_message[vocab.index(word)]
        ham *= prob_of_feature_given_ham_message[vocab.index(word)]
     
    print('-----------------------------------------')
    print('The given message: ', " ".join(message_list))
    print()
    print('Prob that the given message is spam = ',spam)
    print()
    print('Prob that the given message is ham = ',ham)
    print('------------------------------------------')
    
    
    if spam > ham:
        return " ".join(message_list),"spam"
    else:
        return " ".join(message_list),"ham"

In [None]:
result = predict(new_words[0],prob_message_is_spam,prob_message_is_ham,prob_of_wk_given_spam_message,prob_of_wk_given_ham_message,vocab)
print('Result: ', result[1])

-----------------------------------------
The given message:  i am busy i will msg you later

Prob that the given message is spam =  1.0829429792459005e-15

Prob that the given message is ham =  3.835668952903523e-13
------------------------------------------
Result:  ham


In [None]:
result = predict(new_words[1],prob_message_is_spam,prob_message_is_ham,prob_of_wk_given_spam_message,prob_of_wk_given_ham_message,vocab)
print('Result: ', result[1])

-----------------------------------------
The given message:  congrats you are awarded a free mobile

Prob that the given message is spam =  3.0701433461621255e-12

Prob that the given message is ham =  1.1237311385459537e-13
------------------------------------------
Result:  spam
