# Unigrams, Bigrams, and Trigrams in Naive Bayes Classifiers

Math of Intelligence Week 6 Challenge - https://www.youtube.com/watch?v=PrkiRVcrxOs&t=7s

In this notebook I will explore the performance of ngram words in a naive bayes classifier. I will look at how they perform across two data sets: 
    1) A Spam SMS dataset 
    2) Rap lines from Biggie Smalls and 2Pac

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [91]:
raw_test_documents = [
    "Taipei Taiwan",
    "Macao Taiwan Shanghai",
    "Japan Sapporo",
    "Sapporo Osaka Taiwan"
]

documents_classes =  [1, 1, 0, 0]

test_set = pd.DataFrame({
    "label": documents_classes,
    "body": raw_test_documents
})

data = test_set.values
test_set

Unnamed: 0,body,label
0,Taipei Taiwan,1
1,Macao Taiwan Shanghai,1
2,Japan Sapporo,0
3,Sapporo Osaka Taiwan,0


In [83]:
class ngrams_bayes():
    
    def __init__(self, data, n=2, split=0.75):
        
        # split into training and testing data
        self.train_data, self.test_data = train_test_split(data,
                                                          train_size=split)
        # convert into n grams
        self.train_data = [[item[1], self.ngrams(n, item[0])] for item in self.train_data]
        self.test_data = [[item[1], self.ngrams(n, item[0])] for item in self.test_data]
        
        # count unique n grams in training data
        flattened = [gram for message in self.train_data for gram in message[1]]
        self.unique = len(set(flattened))
        
        # init dicts
        self.trainPositive = {}
        self.trainNegative = {}
        # counters
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        # priors
        self.pA = 0
        self.pNotA = 0
        
    def ngrams(self, n, text):
        text = text.split(' ')
        grams = []
        for i in range(len(text)-n+1):
            gram = ' '.join(text[i:i+n])
            grams.append(gram)
        return grams 
    
    def train(self):
        for item in self.train_data:
            label = item[0]
            grams = item[1]
            if label == 1:
                self.spamCount += 1   
            for gram in grams:
                if label == 1:
                    self.trainPositive[gram] = self.trainPositive.get(gram, 0) + 1
                    self.posGramCount += 1
                else:
                    self.trainNegative[gram] = self.trainNegative.get(gram, 0) + 1
                    self.negGramCount += 1
                    
        self.pA = self.spamCount/float(len(self.train_data))
        self.pNotA = 1.0 - self.pA
        
    def classify(self, text, alpha=1.0):
        
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text, 1)
        notSpam = self.pNotA * self.conditionalText(text, 0)
        if (isSpam > notSpam):
            return 1
        else:
            return 0
        
    def conditionalText(self, grams, label):
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram, label)
        return result
    
    def conditionalNgram(self, ngram, label):
        alpha = self.alpha
        if label == 1:
            return ((self.trainPositive.get(ngram,0)+alpha) /
                    float(self.posGramCount+alpha*self.unique))
        else:
            return ((self.trainNegative.get(ngram,0)+alpha) /
                    float(self.negGramCount+alpha*self.unique))
            
    def evaluate_test_data(self):
        results = []
        for test in self.test_data:
            label = test[0]
            text = test[1]
            ruling = self.classify(text)
            if ruling == label:
                results.append(1) 
            else:
                results.append(0) 
                
        print("Evaluated {} test cases. {:.2f}% Accuracy".format(len(results), 100.0*sum(results)/float(len(results))))
        return sum(results)/float(len(results))

In [84]:
unigram_bayes = ngrams_bayes(data,1)

In [85]:
unigram_bayes.train()

In [86]:
unigram_bayes.evaluate_test_data()

Evaluated 1 test cases. 100.00% Accuracy


1.0

In [90]:
bigram_sms= ngrams_bayes(data,2) 
bigram_sms.train()
bigram_sms.evaluate_test_data()


Evaluated 1 test cases. 0.00% Accuracy


0.0

In [88]:
trigram_sms = ngrams_bayes(data,3) 
trigram_sms.train()
trigram_sms.evaluate_test_data()

Evaluated 1 test cases. 100.00% Accuracy


1.0

In [92]:
test_text = "Taiwan Taiwan Sapporo"
print(unigram_bayes.classify(test_text))
print(bigram_sms.classify(test_text))
print(trigram_sms.classify(test_text))

0
1
0
