In [4]:
import nltk
import re
import pandas as pd
import numpy as np

from data_preprocessor import preprocess_comment
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# nltk.download('popular')

[nltk_data] Downloading package wordnet to /home/sbakhit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
class BernoulliNaiveBayes:
    def __init__(self):
        self.le = preprocessing.LabelEncoder()
        self.feature_names = cv.get_feature_names()
        self.marginal_prob = None
        self.conditional_prob = None

    def fit(self, features, targets):
        # encode classes [0, 19]
        targetscv = self.le.fit_transform(targets)
        
        # step1: marginal probability of each class P(y=k)
        # get number of docs(examples) per class (i.e. doc_count(y=k))
        classes, doc_count = np.unique(targetscv, return_counts=True)

        # get total number of docs(examples)
        total_doc_count = np.sum(doc_count)

        # P(y=k) = doc_count(y=k)/total_docs_count
        self.marginal_prob = np.divide(doc_count, total_doc_count)

        # step2: conditional probability of each class P(x_j=1 | y=k)
        # get word count per class
        class_word_count_list = []

        for class_ in classes:
            # indices where target is specified class
            class_indices = np.where(targetscv == class_)[0]
            # slice of all docs(examples) of specified class
            class_features = features[class_indices, :]
            
            # number of times each feature(word) appears in specified class (i.e. doc_count(x_j=1, y=k))
            class_word_count_list.append(class_features.sum(axis=0) + 1)
        class_word_count = np.concatenate(class_word_count_list, axis=0)

        # P(x_j=1 | y=k) =  doc_count(x_j=1, y=k)/doc_count(y=k)
        self.conditional_prob = np.divide(class_word_count, doc_count[:, None] + 2)

    def predict(self, features):       
        predictions = []
        # foreach example
        for i in range(np.size(features, axis=0)):
            class_prob = []
            for class_ in range(np.size(self.marginal_prob)):
                feature_likelihood = 0
                # foreach feature
                for j in range(np.size(features, axis=1)):
                    feature_likelihood += features[i, j]*np.log(self.conditional_prob[class_, j]) + \
                    (1 - features[i, j])*np.log(1 - self.conditional_prob[class_, j])
                class_prob.append(feature_likelihood + np.log(self.marginal_prob[class_]))
            predictions.append(np.argmax(class_prob))
        return self.le.inverse_transform(predictions)

## Data preprocessing

In [6]:
REDDIT_TRAIN_DATA_PATH = 'data_sources/reddit_train.csv'
df = pd.read_csv(REDDIT_TRAIN_DATA_PATH, quotechar='"', delimiter=',', skipinitialspace=True)

train_dataset = df.to_numpy()

features = train_dataset[:, 1]
targets = train_dataset[:, 2]

x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=4)

cv = CountVectorizer(binary=True, stop_words=stop_words.ENGLISH_STOP_WORDS, preprocessor=preprocess_comment, 
                     ngram_range=(1, 2), max_features=100)
x_traincv = cv.fit_transform(x_train)

  'stop_words.' % sorted(inconsistent))


In [13]:
x_traincv

<56000x100 sparse matrix of type '<class 'numpy.int64'>'
	with 203549 stored elements in Compressed Sparse Row format>

## fit and predict

In [7]:
bnb = BernoulliNaiveBayes(cv)
bnb.fit(x_traincv, y_train)

In [8]:
x_testcv = cv.transform(x_test)
x_testcv

<14000x100 sparse matrix of type '<class 'numpy.int64'>'
	with 50618 stored elements in Compressed Sparse Row format>

In [15]:
predictions = bnb.predict(x_testcv)

KeyboardInterrupt: 

In [10]:
np.mean(predictions == y_test)

0.14557142857142857

## Test BNB sklearn

In [11]:
clf = BernoulliNB(alpha=1.0)
clf.fit(x_traincv, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
np.mean(clf.predict(x_testcv) == y_test)

0.14557142857142857