# ML Assignment 1 : Naive-Bayes Classifier
### Anirudh Agrawal: 2018A7PS0099H | Aviral Agarwal: 2018A7PS0192H | Vikramjeet Das: 2018A7PS0280H

In [1]:
import numpy as np
import pandas as pd
import math
from preprocessing import download_nltk_deps, process_string

%load_ext lab_black

In [2]:
def fetch_data(fname):
    """
    Parses the given dataset and extracts out the emails and their associated class

        Parameters:
            fname : Name of dataset file

        Returns:
            (X,y) : List of processed emails and their classes
    """
    X = list()
    y = list()
    data_file = open(fname, "r")
    row = 0

    for line in data_file:
        line = line.strip()  # every line ends with a '\n'
        spam = int(line[-1])  # last character of every line is the class
        email = line[:-1]
        words = list(
            set(process_string(email))
        )  # consider only single occurence of each word in an email
        X.append(words)
        y.append(spam)
        row += 1

    data_file.close()
    return X, y


def shuffle(X, y):
    """
    Shuffles rows of a dataframe and returns shuffled dataframe
    """
    permute = np.random.permutation(len(X))
    return X[permute], y[permute]


def train_test_split(X, y, folds=7):
    """
    Splits data into train-test split

        Parameters:
            X : Features
            y : Labels
            folds: number of partitions required for cross validation technique

        Returns:
            (X_train, y_train, X_test, y_test) : Train test split
    """
    if not isinstance(X, np.ndarray):
        X = np.array(X, dtype="object")
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    X, y = shuffle(X, y)

    for fold in range(0, folds):
        test_indices = np.array(
            [
                x >= int(fold * len(X) / folds) and x < int((fold + 1) * len(X) / folds)
                for x in range(0, len(X))
            ]
        )
        yield X[~test_indices], y[~test_indices], X[test_indices], y[test_indices]

In [3]:
class Naive_Bayes:
    def __init__(self):
        self.params = dict()  # dictionary to store parameter values
        self.p_spam = 0  # probability of spam emails
        self.p_ham = 0  # probability of ham emails

    def train(self, X, y):
        assert X.shape[0] == y.shape[0], "Data and target count do not match"
        assert len(np.unique(y)) == 2, "Targets cannot have more than 2 classes"
        c_spam = 0  # count of spam emails
        for words, spam in zip(X, y):
            if spam:
                c_spam += 1
            for word in words:
                if self.params.__contains__(word):
                    self.params[word][spam] += 1
                else:
                    self.params[word] = (
                        [1, 2] if spam else [2, 1]
                    )  # Initialized to non-zero in accordance with Laplace Smoothening
        self.p_spam = c_spam / len(X)
        self.p_ham = 1 - self.p_spam

    def predict(self, X):
        preds = [0 for x in range(0, len(X))]
        for ind in range(0, len(X)):
            words = X[ind]
            pred_spam = math.log(self.p_spam)
            pred_ham = math.log(self.p_ham)
            for word in words:
                if self.params.__contains__(word):
                    denominator = (
                        self.params[word][0] + self.params[word][1] + 2
                    )  # Added 2 in accordance with Laplace Smoothening
                    pred_spam += math.log(self.params[word][1] / denominator)
                    pred_ham += math.log(self.params[word][0] / denominator)
                else:
                    pass  # Ignore words which aren't encountered even once in the training dataset
            preds[ind] = 1 if pred_spam >= pred_ham else 0
        return preds

    def evaluate(self, X, y):
        preds = self.predict(X)
        return np.sum(preds == y) / len(X)

In [4]:
download_nltk_deps()
X, y = fetch_data("dataset_NB.txt")
split = train_test_split(X, y, 7)
nb = Naive_Bayes()
acc = list()
for i in range(0, 7):
    X_train, y_train, X_test, y_test = split.__next__()
    nb.__init__()
    nb.train(X_train, y_train)
    acc.append(nb.evaluate(X_test, y_test))
print("Accuracy for each fold= ", acc)
print("Average Accuracy= ", sum(acc) / 7)

[nltk_data] Downloading package punkt to /home/aviral/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/aviral/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy for each fold=  [0.7816901408450704, 0.8391608391608392, 0.7832167832167832, 0.7692307692307693, 0.7762237762237763, 0.8321678321678322, 0.8251748251748252]
Average Accuracy=  0.8009807094314138
