In [1]:
from collections import defaultdict

import collections
import numpy as np
import operator

In [2]:
# Location of the comma-separated dataset read by load_dataset().
DATA_PATH = './data.csv'

In [37]:
def load_dataset(path=None, train_fraction=.8):
    """Read the CSV dataset and split it into train/test parts per class.

    The file is parsed as comma-separated strings; column 1 is used as the
    class label. For every distinct label the first ``train_fraction`` of its
    rows (in file order, rounded down) go to the training part and the
    remaining rows to the test part. Rows are not shuffled.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the module-level ``DATA_PATH``.
    train_fraction : float, optional
        Share of each class assigned to the training part, in ``[0, 1]``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train, test)`` string arrays, each grouped by class label.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError('train_fraction must be within [0, 1], got %r' % (train_fraction,))

    # Resolve the default at call time so a DATA_PATH re-assigned in a later
    # cell is honoured.
    source = DATA_PATH if path is None else path

    # genfromtxt returns a 1-D array for a single-row file; normalise to 2-D
    # so the column indexing below works for any number of rows.
    data = np.atleast_2d(np.genfromtxt(source, dtype=str, delimiter=','))
    print(data.shape)

    labels = data[:, 1]
    test = []
    train = []
    for clazz in np.unique(labels):
        data_class = data[labels == clazz]
        count = int(data_class.shape[0]*train_fraction)

        train.append(data_class[:count])
        test.append(data_class[count:])

    return np.concatenate(train), np.concatenate(test)

In [38]:
# Split the CSV into per-class train/test partitions and show both shapes.
train_samples, test_samples = load_dataset()
print(train_samples.shape, test_samples.shape)

(1118, 2)
(304, 2)
(590, 2)
(894, 2) (224, 2)


In [54]:
class NaiveBayes:
    def __init__(self):
        self.container = dict()
        self.prior = defaultdict(int)
    
        self.unique_words = []
    
    def initialize_dict_labels(self, labels):
        container = dict()
        for label in np.unique(labels):
            container[label] = defaultdict(int)
        
        return container
            
    def count_priors(self):
        for label in self.data[:,1]:
            self.prior[label] += 1

        shape = self.data.shape[0]
        for key, value in self.prior.items():
            self.prior[key] = value/shape
    
    def __fit_tfidf(self):
        self.idf_container = self.initialize_dict_labels(self.data[:,1])

        for clazz, values in self.container.items():
            for word, freq in values.items():
                for doc in self.data[:, 0]:
                    if word in doc:
                        self.idf_container[clazz][word] += 1
                
    def fit(self, data, tfidf=False):
        self.data = np.asarray(data)
        self.container = self.initialize_dict_labels(self.data[:,1])

        for sample in self.data:
            words = sample[0].lower().split(' ')
            for word in words:
                if (word not in self.unique_words): self.unique_words.append(word)
                self.container[sample[1]][word] +=1   
        
        if tfidf:
            self.__fit_tfidf()
            
        self.count_priors()
        self.container['0'][''] = 0
        self.container['1'][''] = 0
        
    def __predict_tfidf(self, document, logarifmic=True):
        words = document[0].lower().split(' ')
        class_predictions = dict()

        for clazz, values in self.container.items():
            total_count = sum(values.values())
            container = []
            
            for word in words:
                if word == '':
                    pass
                else: 
                    P = (1 + values[word])/(total_count + len(self.unique_words))*\
                         np.log(total_count/(self.idf_container[clazz][word] + 1))
                    container.append(P)
            
            if logarifmic:
                class_predictions[clazz] = np.log(self.prior[clazz])
                for value in container:
                    class_predictions[clazz] += np.log(value)

            else:
                class_predictions[clazz] = self.prior[clazz]
                for value in container:
                    class_predictions[clazz] *= value
                    
        return class_predictions

    
    def _predict(self, document, logarifmic=True):
        words = document[0].lower().split(' ')
        class_predictions = dict()

        for clazz, values in self.container.items():
            total_count = sum(values.values())
            container = []
            
            for word in words:
                if word == '':
                    pass
                else: 
                    if word in values.keys():
                        value = values[word]
                    else:
                        value = 0

                    P = (value + 1)/(total_count + len(self.unique_words))
                    container.append(P)
            
            if logarifmic:
                class_predictions[clazz] = np.log(self.prior[clazz])
                for value in container:
                    class_predictions[clazz] += np.log(value)

            else:
                class_predictions[clazz] = self.prior[clazz]
                for value in container:
                    class_predictions[clazz] *= value
                    
        return class_predictions

    def predict(self, samples, tfidf=False):
        container = []
        for i in samples:
            result = self.__predict_tfidf(i) if tfidf else self._predict(i)
            maximum = max(result.items(), key=operator.itemgetter(1))[0]
            container.append(maximum == i[1])
        return np.asarray(container).mean()

In [55]:
# Train on the training split; tfidf=True additionally builds the per-class
# document-frequency table used by the tf-idf scoring path.
nb = NaiveBayes()
nb.fit(train_samples, tfidf=True)

In [56]:
# Vocabulary size: number of distinct lower-cased tokens collected by fit().
len(nb.unique_words)

26457

In [57]:
# predict() returns accuracy: the fraction of rows whose highest-scoring class
# equals the label in column 1. Here it is measured on the training split.
result = nb.predict(train_samples,tfidf=True)
result

0.9854586129753915

In [58]:
# Accuracy on the held-out test split, using the tf-idf scoring path.
result = nb.predict(test_samples,tfidf=True)
result

0.9598214285714286

In [148]:
# Toy training set: each row is [document text, class label].
data = [["Chinese Beijing Chinese","0"],
            ["Chinese Chinese Shanghai","0"], 
            ["Chinese Macao","0"],
            ["Tokyo Japan Chinese","1"]]

# Fit a fresh model on the toy data (tfidf defaults to False, so only word
# counts and priors are estimated).
s = NaiveBayes()
s.fit(data)

In [149]:
# Model prediction: per-class scores for a single document. _predict returns
# log scores by default (logarifmic=True).
# NOTE(review): the stored output below shows raw probabilities, which is what
# logarifmic=False returns - it looks stale; re-run to confirm.
result = s._predict(["Chinese Chinese Chinese Tokyo Japan"])
result

5
5


{'0': 0.0003012137799726303, '1': 0.00013548070246744226}

In [None]:
# Expected prediction: ('Chinese Chinese Chinese Tokyo Japan', '0')
# probability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}
# or log      {'1': -7.906681345001262, '0': -7.10769031284391}