In [None]:
import math
import re
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Email:
    """A single labelled training example for the spam classifier."""

    def __init__(self, message: str, is_spam: bool) -> None:
        # message: raw text of the email; is_spam: True for spam, False otherwise.
        self.message, self.is_spam = message, is_spam

# Labelled training corpus, grouped by class. `emails` keeps the spam
# examples first, followed by the non-spam examples.
_SPAM_MESSAGES = [
    "Hey there! I thought you might find this interesting. Click here.",
    "Get viagra for a discount as much as 90%",
    "Viagra prescription for less",
    "Even better than Viagra, try this new prescription drug",
    "My name is Natasha, I want to meet you",
    "Meet the hottest singles on the #1 dating site",
]

_NON_SPAM_MESSAGES = [
    "Hey, I left my phone at home. Email me if you need anything. I'll be in a meeting for the afternoon.",
    "Please see attachment for notes on today's meeting. Interesting findings on your market research.",
    "An item on your Amazon wish list received a discount",
    "Your prescription drug order is ready",
    "Your Amazon account password has been reset",
    "Your Amazon order",
    "Hi, how are you doing today?",
]

emails = (
    [Email(text, True) for text in _SPAM_MESSAGES]
    + [Email(text, False) for text in _NON_SPAM_MESSAGES]
)

# Number of training emails in each class.
spam_email_count = len([mail for mail in emails if mail.is_spam])
non_spam_email_count = len([mail for mail in emails if not mail.is_spam])

# Helper function to break up words from a string
def break_up_words(str):
    """Return the lower-cased words of ``str`` with punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted (so "I'll" becomes "ill" and "90%" becomes "90"), then the text
    is split on runs of whitespace.

    Args:
        str: The text to tokenise.

    Returns:
        A list of word strings; empty when ``str`` contains no words.
    """
    lowered = str.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.split()

# Word-occurrence tables, filled in one pass over the training emails:
#   spam_count_by_word       - occurrences inside spam emails
#   non_spam_count_by_word   - occurrences inside non-spam emails
#   all_emails_count_by_word - occurrences across every email
# A word repeated within one email is counted once per occurrence.
spam_count_by_word = defaultdict(int)
non_spam_count_by_word = defaultdict(int)
all_emails_count_by_word = defaultdict(int)

for email in emails:
    # Pick the per-class table that matches this email's label.
    _class_count_by_word = spam_count_by_word if email.is_spam else non_spam_count_by_word
    for word in break_up_words(email.message):
        _class_count_by_word[word] += 1
        all_emails_count_by_word[word] += 1

# Create functions to calculate probability of word occurring in spam or not spam
# The .1 / .2 offsets keep the estimate strictly positive for unseen words
# (so its log is defined) and keep the denominator away from zero.
def prob_word_appears_in_spam(w):
    """Estimate how likely the word ``w`` is to appear in a spam email.

    Reads the module-level ``spam_count_by_word`` and ``spam_email_count``.

    Args:
        w: A single, already-normalised word.

    Returns:
        ``(.1 + occurrences of w in spam) / (.2 + number of spam emails)``.
        Because the numerator counts occurrences rather than emails, the
        value can exceed 1 for a word that occurs more often than there are
        spam emails.
    """
    occurrences = spam_count_by_word.get(w, 0)
    numerator = .1 + occurrences
    denominator = .2 + spam_email_count
    return numerator / denominator

def prob_word_appears_in_spam_laplace_smoothing(w, spam_email_count, spam_count_by_word, all_emails_count_by_word, alpha=1):
    """Additively smoothed estimate of the word ``w`` appearing in spam.

    Args:
        w: A single, already-normalised word.
        spam_email_count: Number of spam emails in the training set.
        spam_count_by_word: Mapping of word -> occurrences in spam emails.
        all_emails_count_by_word: Mapping of word -> occurrences in all emails.
        alpha: Smoothing strength added to the word's count.

    Returns:
        ``(count(w) + alpha) / (spam_email_count + alpha * T)`` where ``T``
        is the sum of every value in ``all_emails_count_by_word``.

    NOTE(review): textbook additive smoothing normalises by the class's total
    word count plus ``alpha`` times the vocabulary size; this denominator uses
    the email count and the total occurrence count instead - confirm that
    this is intended.
    """
    total_training_occurrences = np.sum(list(all_emails_count_by_word.values()))
    smoothed_count = spam_count_by_word.get(w, 0) + alpha
    normaliser = spam_email_count + alpha * total_training_occurrences
    return smoothed_count / normaliser

def prob_word_appears_in_non_spam(w):
    """Estimate how likely the word ``w`` is to appear in a non-spam email.

    Reads the module-level ``non_spam_count_by_word`` and
    ``non_spam_email_count``.

    Args:
        w: A single, already-normalised word.

    Returns:
        ``(.1 + occurrences of w in non-spam) / (.2 + number of non-spam
        emails)``. The offsets keep the value positive for unseen words; the
        value can exceed 1 because occurrences, not emails, are counted.
    """
    occurrences = non_spam_count_by_word.get(w, 0)
    numerator = .1 + occurrences
    denominator = .2 + non_spam_email_count
    return numerator / denominator

def prob_word_appears_in_non_spam_laplace_smoothing(w, non_spam_email_count, non_spam_count_by_word, all_emails_count_by_word, alpha=1):
    """Additively smoothed estimate of the word ``w`` appearing in non-spam.

    Args:
        w: A single, already-normalised word.
        non_spam_email_count: Number of non-spam emails in the training set.
        non_spam_count_by_word: Mapping of word -> occurrences in non-spam emails.
        all_emails_count_by_word: Mapping of word -> occurrences in all emails.
        alpha: Smoothing strength added to the word's count.

    Returns:
        ``(count(w) + alpha) / (non_spam_email_count + alpha * T)`` where
        ``T`` is the sum of every value in ``all_emails_count_by_word``.

    NOTE(review): textbook additive smoothing normalises by the class's total
    word count plus ``alpha`` times the vocabulary size; this denominator uses
    the email count and the total occurrence count instead - confirm that
    this is intended.
    """
    total_training_occurrences = np.sum(list(all_emails_count_by_word.values()))
    smoothed_count = non_spam_count_by_word.get(w, 0) + alpha
    normaliser = non_spam_email_count + alpha * total_training_occurrences
    return smoothed_count / normaliser

# Here we go! Naive Bayes happens here
def spam_score_for_message(message, use_laplace_smoothing=False, alpha=1):
    """Estimate the probability that ``message`` is spam.

    Per-word log-probabilities are summed separately for the spam and the
    non-spam class and the two totals are turned into a single score. No
    class-prior term is added, so both classes are weighted equally.

    Two scoring modes exist:

    * ``use_laplace_smoothing=False``: only words that occur in the training
      vocabulary (``all_emails_count_by_word``) are scored, each at most
      once, using ``prob_word_appears_in_spam`` /
      ``prob_word_appears_in_non_spam``. Vocabulary words absent from the
      message contribute nothing.
    * ``use_laplace_smoothing=True``: every token of the message is scored,
      repeats and unseen words included, using the ``*_laplace_smoothing``
      helpers with the given ``alpha``.

    Args:
        message: Raw text of the email to score.
        use_laplace_smoothing: Selects the scoring mode described above.
        alpha: Smoothing strength; only used when ``use_laplace_smoothing``
            is true.

    Returns:
        A float between 0 and 1; 0.5 when no word of the message is scored.
    """
    message_words = break_up_words(message)

    log_spam = 0.0
    log_not_spam = 0.0

    if use_laplace_smoothing:
        for w in message_words:
            log_spam += math.log(prob_word_appears_in_spam_laplace_smoothing(
                w, spam_email_count, spam_count_by_word, all_emails_count_by_word, alpha))
            log_not_spam += math.log(prob_word_appears_in_non_spam_laplace_smoothing(
                w, non_spam_email_count, non_spam_count_by_word, all_emails_count_by_word, alpha))
    else:
        # Set membership keeps the vocabulary scan linear in vocabulary size.
        words_in_message = set(message_words)
        for w in all_emails_count_by_word:
            if w in words_in_message:
                log_spam += math.log(prob_word_appears_in_spam(w))
                log_not_spam += math.log(prob_word_appears_in_non_spam(w))

    # e^a / (e^a + e^b) == 1 / (1 + e^(b - a)). Working with the difference
    # avoids the 0 / 0 that occurs when both e^a and e^b underflow to zero
    # for long messages.
    log_odds_against = log_not_spam - log_spam
    if log_odds_against > 0:
        # Mirror the formula so exp() only ever receives a non-positive
        # argument and therefore cannot overflow.
        decay = math.exp(-log_odds_against)
        return decay / (1.0 + decay)
    return 1.0 / (1.0 + math.exp(log_odds_against))

In [None]:
# Test new messages
message1 = "discount viagra wholesale, hurry while this offer lasts"
message2 = "interesting meeting on amazon cloud services discount program"
message3 = "i will pay you money for viagra"
message4 = "i will pay you money for viagra, please"
messages = [message1, message2, message3, message4]

# Smoothing strengths to compare; the same grid is used for every message.
alphas = np.linspace(0.2,1,5)

for message in messages:
    # Baseline score without Laplace smoothing.
    probability_of_spam = spam_score_for_message(message, use_laplace_smoothing=False)
    print(f"Message: {message}")
    print("Spam probability: {0}%".format(round(probability_of_spam * 100.0, 6)))

    # Score again once per smoothing strength.
    for alpha in alphas:
        probability_of_spam_laplace = spam_score_for_message(message, use_laplace_smoothing=True, alpha=alpha)
        print(f"Spam probability with Laplace smoothing (alpha={alpha}): {round(probability_of_spam_laplace * 100.0, 6)}%")
    print()

In [None]:
# Interactive scoring: keep asking for messages until the user submits a
# blank line. End-of-input or a kernel interrupt also ends the session, so
# the cell can always be left without a traceback.
alphas = np.linspace(0.2,1,5)
print("Enter a blank line to stop.")

while True:
    try:
        message = input("Input an email message to calculate probability of spam:")
    except (EOFError, KeyboardInterrupt):
        # No more input is available, or the user interrupted the prompt.
        break

    # A blank (or whitespace-only) entry is the signal to stop.
    if not message.strip():
        break

    probability_of_spam = spam_score_for_message(message, use_laplace_smoothing=False)
    print(f"Message: {message}")
    print("Spam probability: {0}%".format(round(probability_of_spam * 100.0, 6)))

    for alpha in alphas:
        probability_of_spam_laplace = spam_score_for_message(message, use_laplace_smoothing=True, alpha=alpha)
        print(f"Spam probability with Laplace smoothing (alpha={alpha}): {round(probability_of_spam_laplace * 100.0, 6)}%")