In [5]:
from collections import defaultdict

import numpy as np
import operator

In [6]:
DATA_PATH = './data.csv'

In [7]:
def load_dataset():
    data = np.genfromtxt(DATA_PATH, dtype=str, delimiter=',')
    train_samples_count = int(data.shape[0]*.8)
    train = data[:train_samples_count]
    test  = data[train_samples_count:]
    return train, test
#     return train[:,0], train[:,1], test[:,0], test[:,1]

# X_train, y_train, X_test, y_test = load_dataset()
# print(X_train.shape, X_test.shape)

In [12]:
train_samples, test_samples = load_dataset()
print(train_samples.shape, test_samples.shape)

(894, 2) (224, 2)


In [13]:
class NaiveBayes:
    def __init__(self):
        self.container = dict()
        self.prior = defaultdict(int)
    
        self.unique_words = []
    
    def initialize_dict_labels(self, labels):
        for label in labels:
            self.container[label] = defaultdict(int)
            
    def count_priors(self):
        for label in self.data[:,1]:
            self.prior[label] += 1

        shape = self.data.shape[0]
        for key, value in self.prior.items():
            self.prior[key] = value/shape
            
    def fit(self,data):
        self.data = np.asarray(data)
        self.initialize_dict_labels(self.data[:,1])

        for sample in self.data:
            words = sample[0].lower().split(' ')
            for word in words:
                if (word not in self.unique_words): self.unique_words.append(word)
                self.container[sample[1]][word] +=1   
        
        self.count_priors()
        
    def _predict(self, document):
        words = document[0].lower().split(' ')
        class_predictions = dict()

        for clazz, values in self.container.items():
            total_count = sum(values.values())
            container = []
            
            for word in words: 
                if word in values.keys():
                    value = values[word]
                else:
                    value = 0
                
                P = (value + 1)/(total_count + len(self.unique_words))
                container.append(P)
                
            class_predictions[clazz] = self.prior[clazz]
            for value in container:
                class_predictions[clazz] *= value
        
        return class_predictions

    def predict(self, samples):
        container = []
        for i in samples:
            result = self._predict(i[0])
            maximum = max(result.items(), key=operator.itemgetter(1))[0]
            container.append(maximum == i[1])
        return np.asarray(container).mean()

In [20]:
nb = NaiveBayes()
nb.fit(train_samples)

In [21]:
result = nb.predict(test_samples)
result

0.6339285714285714

In [17]:
# Train data
data = [["Chinese Beijing Chinese","0"],
            ["Chinese Chinese Shanghai","0"], 
            ["Chinese Macao","0"],
            ["Tokyo Japan Chinese","1"]]

# Fit model
s = NaiveBayes()
s.fit(data)

In [18]:
# Mpdel predict
result = s._predict(["Chinese Chinese Chinese Tokyo Japan"])
result

{'0': 0.00030121377997263036, '1': 0.00013548070246744226}

In [None]:
# Must return[ ('Chinese Chinese Chinese Tokyo Japan', '0')]
# pobability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}
# or log     {'1': -7.906681345001262, '0': -7.10769031284391}