In [1]:
import pandas as pd
import numpy as np


# Naive Bayes Text Classification

In [8]:
spam_text = ['Send us your password', 'review us', 'Send your password', 'Send us your account']
ham_text = ['Send us your review', 'review your password']

spam_dict = {}
ham_dict = {}

for s in spam_text:
    for w in s.split(' '):
        w = w.lower()
        if w not in spam_dict.keys():
            spam_dict[w] = 1
        else:
            spam_dict[w] += 1
            
for s in ham_text:
    for w in s.split(' '):
        w = w.lower()
        if w not in ham_dict.keys():
            ham_dict[w] = 1
        else:
            ham_dict[w] += 1

print(spam_dict)
print(ham_dict)

{'send': 3, 'us': 3, 'your': 3, 'password': 2, 'review': 1, 'account': 1}
{'send': 1, 'us': 1, 'your': 2, 'review': 2, 'password': 1}


In [13]:
p_pass_in_spam = spam_dict['password']/sum(spam_dict.values())

In [12]:
p_pass_in_ham = ham_dict['password']/sum(ham_dict.values())

In [14]:
p_pass_in_spam * (4/6)

0.10256410256410256

In [15]:
p_pass_in_spam * (4/6) + p_pass_in_ham * (2/6)

0.15018315018315018

In [16]:
p_pass_in_spam * (4/6) / (p_pass_in_spam * (4/6) + p_pass_in_ham * (2/6))

0.6829268292682927

In [17]:
p_pass_in_ham * (2/6) / (p_pass_in_spam * (4/6) + p_pass_in_ham * (2/6))

0.3170731707317073

In [20]:
import re
import string
import math
import pandas as pd
from sklearn.model_selection import train_test_split

class SpamDetector(object):
    """Implementation of Naive Bayes for binary classification"""

    # clean up our string by removing punctuation
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)

    #  tokenize our string into words
    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)

    # count up how many of each word appears in a list of words.
    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Fit our classifier
        Arguments:
            X {list} -- list of document contents
            y {list} -- correct labels
        """
        self.num_messages = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        # Compute log class priors (the probability that any given message is spam/ham),
        # by counting how many messages are spam/ham, 
        # dividing by the total number of messages, and taking the log.
        n = len(X)
        self.num_messages['spam'] = sum(1 for label in Y if label == 'spam')
        self.num_messages['ham'] = sum(1 for label in Y if label == 'ham')
        self.log_class_priors['spam'] = math.log(self.num_messages['spam'] / n )
        self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n )
        self.word_counts['spam'] = {}
        self.word_counts['ham'] = {}

        # for each (document, label) pair, tokenize the document into words.
        for x, y in zip(X, Y):
            c = 'spam' if y == 'spam' else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            # For each word, either add it to the vocabulary for spam/ham, 
            # if it isn’t already there, and update the number of counts. 
            for word, count in counts.items():
                # Add that word to the global vocabulary.
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count

    # function to actually output the class label for new data.
    def predict(self, X):
        result = []
        # Given a document...
        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0
            # We iterate through each of the words...
            for word, _ in counts.items():
                if word not in self.vocab: continue
                # ... and compute log p(w_i|Spam), and sum them all up. The same will happen for Ham
                # add Laplace smoothing
                # https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf
                log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + 1) / (self.num_messages['spam'] + len(self.vocab)) )
                log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + 1) / (self.num_messages['ham'] + len(self.vocab)) )

                spam_score += log_w_given_spam
                ham_score += log_w_given_ham
            
            # Then we add the log class priors...
            spam_score += self.log_class_priors['spam']
            ham_score += self.log_class_priors['ham']

            # ... and check to see which score is bigger for that document.
            # Whichever is larger, that is the predicted label!
            if spam_score > ham_score:
                result.append('spam')
            else:
                result.append('ham')
        return result
        

# TODO: Fill in the below function to make a prediction, 
# your answer should match the final number in the below output (0.9641)
if __name__ == '__main__':
    pass

In [32]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
X = df["v2"]
Y = df["v1"]

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=.25, random_state=0)

s_det = SpamDetector()
s_det.fit(X_train, y_train)
y_pred = s_det.predict(X_test)

In [33]:
error = 0

for actual, pred in zip(y_test, y_pred):
    if actual != pred:
        error +=1 
        
error_per = error / len(actual)
print(f'Error %: {error_per}')

Error %: 18.0


In [37]:
len(y_pred)

1393

In [38]:
len(y_test)

1393

In [36]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred, average="micro")

ValueError: pos_label=1 is not a valid label: array(['ham', 'spam'], dtype='<U4')