In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [3]:
data = {
    "text": [
        "a great game",
        "the election was over",
        "very clean match",
        "a clean but forgettable game",
        "it was a close election"
    ],
    "tag": ["S", "NS", "S", "S", "NS"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,tag
0,a great game,S
1,the election was over,NS
2,very clean match,S
3,a clean but forgettable game,S
4,it was a close election,NS


In [22]:
class NaiveBayesClassifier:

    def __init__(self):
        self.vocab = set()
        self.label_count = defaultdict(int)
        self.labelled_word_count = defaultdict(lambda: defaultdict(int))
        self.word_count = defaultdict(int)
        self.class_probs = {}

    def fit(self, X, y):
        for text, label in zip(X, y):
            words = text.split()
            self.label_count[label] += 1

            for word in words:
                self.vocab.add(word)
                self.labelled_word_count[label][word] += 1
                self.word_count[word] += 1

        total_count = sum(self.label_count.values())
        for label, count in self.label_count.items():
            self.class_probs[label] = count/total_count

    def predict(self, X):
        predictions = {}

        for text in X:
            words = text.split()

            class_scores = {}

            for label in self.label_count.keys():
                class_score = self.class_probs[label] # np.log(self.class_probs[label])

                for word in words:
                    word_count = self.labelled_word_count[label][word]
                    total_count = self.label_count[label]+len(self.vocab)

                    class_score *=  (1+word_count)/(total_count)
                    class_scores[label] = class_score
            
            pred, label = 0, None
            for i, j in class_scores.items():
                if(j>pred):
                    pred=j
                    label=i
            predictions[label]=pred

        return predictions

In [23]:
nb = NaiveBayesClassifier()

X = df['text'].values
y = df['tag'].values

nb.fit(X, y)

input = ["a very close game", "game not election"]

y_pred = nb.predict(input)

print(y_pred)

{'S': 0.0003663749236718909}
