In [None]:
# default_exp models.naive_bayes

# models.naive_bayes

> Naive Bayes Algorithm for Binary Classification with Laplace Smoothing.

In [None]:
#export
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

In [None]:
#hide
# Load the tweets with their labels and hold out the tail of the file for testing.
df = pd.read_csv("./datasets/Kaggle/train.csv")
x, y = df["text"].tolist(), df["target"].tolist()

# The first 70% of the rows (in file order) are used for training.
train_percent = 0.70
cut = int(len(x) * train_percent)

# Tokenize every tweet on whitespace.
x_train = [tweet.split() for tweet in x[:cut]]
x_test = [tweet.split() for tweet in x[cut:]]
y_train, y_test = y[:cut], y[cut:]

Data is taken from the Kaggle competition [Real or Not? NLP with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started).

Data Sample: 

In [None]:
# Preview the first two tokenized training tweets together with their labels.
print(f"Two  tweets == {x_train[:2]}")
print(f"Tweet Label == {y_train[:2]}")

Two  tweets == [['Our', 'Deeds', 'are', 'the', 'Reason', 'of', 'this', '#earthquake', 'May', 'ALLAH', 'Forgive', 'us', 'all'], ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask.', 'Canada']]
Tweet Label == [1, 1]


In [None]:
#export
class NaiveBayes:
    """Naive Bayes Algorithm for Binary Classification.

    Samples are lists of tokens. Term likelihoods are estimated from term
    counts per class with Laplace (add-one) smoothing, and prediction compares
    the log-posteriors of the two classes.

    Attributes populated by `fit`:

        classes (list): the two labels, in the order they first appear in `y`.
        priors (Counter): label -> fraction of training samples with that label.
        class1_term_cf (defaultdict): term -> count within samples of `classes[0]`.
        class2_term_cf (defaultdict): term -> count within samples of `classes[1]`.
        class1_cf (int): total number of term occurrences in `classes[0]` samples.
        class2_cf (int): total number of term occurrences in `classes[1]` samples.
        vocab_length (int): number of distinct terms seen during training.
    """

    def __init__(self):
        self.class1_term_cf = defaultdict(int) #term frequency in class 1
        self.class2_term_cf = defaultdict(int) #term frequency in class 2
        self.classes = []

    def fit(self, X, y):
        """ Train Naive Bayes.

        Any state from a previous call to `fit` is discarded, so fitting the
        same instance twice gives the same model as fitting it once.

        Args:

            X (nested list): nested list of tokenized samples.
            y (list): list of corresponding labels.

        Raises:

            ValueError: if `X` and `y` differ in length, or if `y` does not
                contain exactly two distinct labels.
        """

        X = list(X)
        y = list(y)

        # Validate before touching any state so a failed fit leaves the model as it was.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        n_classes = len(set(y))
        if n_classes != 2:
            raise ValueError(
                f"NaiveBayes is a binary classifier: expected exactly 2 distinct "
                f"labels in y, got {n_classes}"
            )

        # Start from empty counters; otherwise a second fit would add to the first.
        self.class1_term_cf = defaultdict(int)
        self.class2_term_cf = defaultdict(int)

        self._get_priors(y)
        self._get_term_class_freq(X, y)
        self._get_vocab_length()

    def _get_priors(self, y_train):
        """Calculate the priors for both classes"""

        # Counter keeps first-seen order, which fixes the order of `self.classes`.
        self.priors = Counter(y_train)
        self.classes = list(self.priors.keys())
        total = sum(self.priors.values())
        for class_name in self.classes:
            self.priors[class_name] /= total

    def _get_term_class_freq(self, x_train, y_train):
        """Calculate term frequency in each class"""

        first_class, second_class = self.classes[0], self.classes[1]
        for sample, label in zip(x_train, y_train):
            if label == first_class:
                term_counts = self.class1_term_cf
            elif label == second_class:
                term_counts = self.class2_term_cf
            else:
                # Not one of the two modelled classes; nothing to count.
                continue
            for term in sample:
                term_counts[term] += 1

        self.class1_cf = sum(self.class1_term_cf.values())
        self.class2_cf = sum(self.class2_term_cf.values())

    def _get_vocab_length(self):
        """Count the distinct terms seen in either class"""

        self.vocab_length = len(self.class1_term_cf.keys() | self.class2_term_cf.keys())

    def predict(self, X):
        """ Predict the labels for samples.

        Args:

            X(nested list): nested list of tokenized samples.

        Returns:

            list of predicted label.
        """

        return [self._predict_label(sample) for sample in X]

    def _predict_label(self, sample):
        """predict label for one sample / instance"""

        first_class, second_class = self.classes[0], self.classes[1]

        # Work in log space: summing logs avoids underflow from multiplying
        # many small probabilities.
        log_likelihood_class1 = np.sum(
            [np.log(self._prob_term_given_label(term, label=first_class)) for term in sample]
        )
        log_likelihood_class2 = np.sum(
            [np.log(self._prob_term_given_label(term, label=second_class)) for term in sample]
        )
        score_class1 = np.log(self.priors[first_class]) + log_likelihood_class1
        score_class2 = np.log(self.priors[second_class]) + log_likelihood_class2

        # Ties go to the second class.
        return first_class if score_class1 > score_class2 else second_class

    def _prob_term_given_label(self, term , label):
        """calculate probability of term given some label / class ,laplace smoothing applied

        Raises:

            ValueError: if `label` is not one of the two fitted classes.
        """

        # `.get` rather than indexing: indexing a defaultdict would insert every
        # unseen term as a zero entry and grow the model during prediction.
        if label == self.classes[0]:
            term_count = self.class1_term_cf.get(term, 0)
            class_total = self.class1_cf
        elif label == self.classes[1]:
            term_count = self.class2_term_cf.get(term, 0)
            class_total = self.class2_cf
        else:
            raise ValueError(f"Unknown label: {label!r}")

        return (term_count + 1) / (class_total + self.vocab_length)

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# Render the documentation of the public training method.
show_doc(NaiveBayes.fit)

<h4 id="NaiveBayes.fit" class="doc_header"><code>NaiveBayes.fit</code><a href="__main__.py#L10" class="source_link" style="float:right">[source]</a></h4>

> <code>NaiveBayes.fit</code>(**`X`**, **`y`**)

Train Naive Bayes.

Args:

    X (nested list): nested list of tokenized samples.
    y (list): list of corresponding lables.

In [None]:
# Render the documentation of the public prediction method.
show_doc(NaiveBayes.predict)

<h4 id="NaiveBayes.predict" class="doc_header"><code>NaiveBayes.predict</code><a href="__main__.py#L49" class="source_link" style="float:right">[source]</a></h4>

> <code>NaiveBayes.predict</code>(**`X`**)

Predict the labels for samples.

Args:

    X(nested list): nested list of tokenized samples.
    
Returns:

    list of predicted label.

In [None]:
from scratch.models.naive_bayes import NaiveBayes

In [None]:
# Create an untrained classifier.
nb = NaiveBayes()

In [None]:
# Estimate class priors and per-class term counts from the training split.
nb.fit(x_train, y_train)

In [None]:
# Labels in the order they first appear in y_train.
nb.classes

[1, 0]

In [None]:
# Number of distinct terms seen across both classes during training.
nb.vocab_length

24501

In [None]:
# Predict a label for every tweet in the held-out split.
predictions = nb.predict(x_test)

In [None]:
#hide
def accuracy(y_test, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:

        y_test (list): true labels.
        y_pred (list): predicted labels, aligned position by position with `y_test`.

    Returns:

        float: number of matching positions divided by the number of labels.

    Raises:

        ValueError: if the inputs differ in length or are empty.
    """
    total = len(y_test)
    # A length mismatch means the labels are misaligned; scoring a prefix would hide that.
    if total != len(y_pred):
        raise ValueError(
            f"y_test and y_pred must have the same length, got {total} and {len(y_pred)}"
        )
    if total == 0:
        raise ValueError("accuracy is undefined for empty input")

    correct = sum(1 for truth, guess in zip(y_test, y_pred) if truth == guess)
    return correct / total

In [None]:
# Fraction of held-out tweets whose predicted label matches the true label.
accuracy(y_test, predictions)

0.7329246935201401

In [None]:
#hide
from nbdev.export import *
# nbdev export step; the output below lists the notebooks it converted.
notebook2script()

Converted 01_count_vectorizer.ipynb.
Converted 02_naive_bayes.ipynb.
Converted 03_KNN.ipynb.
Converted index.ipynb.
Converted main.ipynb.
