# Multinomial Naive Bayes Text Classifier

In [60]:
# imports

import math
from collections import Counter, defaultdict

import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics

The MNB Classifier (courtesy of ML hw1)

In [61]:
class MNB_TextClassifier:
    """Multinomial Naive Bayes classifier over tokenised documents.

    A document is an iterable of hashable terms (e.g. a list of word tokens).
    Term likelihoods use add-one (Laplace) smoothing.

    Attributes set by ``fit``:
        prior:       {class: p(class)}
        condprob:    {term: {class: p(term | class)}} for every training term
        dict_y_term: {class: total number of term occurrences in that class}
    """

    def __init__(self):
        self.prior = {}
        self.condprob = {}
        self.dict_y_term = {}

    def _term_proba(self, term, y):
        """Return the smoothed p(term | class=y).

        Terms never seen during training are NOT added to the vocabulary; they
        get the add-one estimate for a zero count, 1 / (class_terms + |V|).
        """
        class_probs = self.condprob.get(term)
        if class_probs is not None:
            return class_probs[y]
        return 1 / (self.dict_y_term[y] + len(self.condprob))

    def _log_proba_y_given_x(self, y, x):
        """Return log(p(y)) + sum(log p(term | y)) over the terms of x.

        Working in log space keeps long documents from underflowing to 0.0,
        which would make every class look equally (un)likely.
        """
        log_score = math.log(self.prior[y])
        for term in x:
            log_score += math.log(self._term_proba(term, y))
        return log_score

    def proba_y_given_x(self, y, x):
        """Return the unnormalised score p(y) * prod_i p(x[i] | y).

        This is proportional to p(class=y | x); it is not divided by p(x).
        Note that for long documents the product can underflow to 0.0;
        ``predict_single`` therefore ranks classes in log space instead.

        Args:
            y: a class label seen during ``fit``.
            x: iterable of terms making up one document.

        Raises:
            KeyError: if ``y`` was not seen during ``fit``.
        """
        prob_y = self.prior[y]
        prob_x = 1
        for term in x:
            prob_x *= self._term_proba(term, y)
        return prob_y * prob_x

    def proba_y(self, y):
        """Return the class prior p(class=y)."""
        return self.prior[y]

    def fit(self, X, y):
        """Estimate class priors and smoothed term likelihoods.

        Every call replaces the previously learned model, so classes and terms
        from an earlier fit never leak into later predictions.

        Args:
            X: iterable of documents, each an iterable of hashable terms.
            y: iterable of class labels, one per document, in the same order.

        Raises:
            ValueError: if ``X`` and ``y`` do not have the same length.
        """
        documents = [list(doc) for doc in X]
        labels = list(y)
        if len(documents) != len(labels):
            raise ValueError(
                "X and y must have the same length, got %d documents and %d labels"
                % (len(documents), len(labels))
            )

        # Counter keeps first-appearance order, which fixes the class order
        # used for tie-breaking in predict_single.
        docs_per_class = Counter(labels)
        n_documents = len(labels)
        prior = {label: count / n_documents for label, count in docs_per_class.items()}

        # Single pass over the corpus: per-term class counts and per-class totals.
        terms_per_class = {label: 0 for label in docs_per_class}
        term_class_counts = defaultdict(Counter)
        for doc, label in zip(documents, labels):
            terms_per_class[label] += len(doc)
            for term in doc:
                term_class_counts[term][label] += 1

        vocab_size = len(term_class_counts)
        condprob = {
            term: {
                label: (counts[label] + 1) / (terms_per_class[label] + vocab_size)
                for label in terms_per_class
            }
            for term, counts in term_class_counts.items()
        }

        self.prior = prior
        self.dict_y_term = terms_per_class
        self.condprob = condprob

    def predict_single(self, x):
        """Return the most probable class for one document.

        Ties are resolved in favour of the class that appeared first in the
        training labels.

        Raises:
            ValueError: if the classifier has not been fitted.
        """
        if not self.prior:
            raise ValueError("MNB_TextClassifier must be fitted before predicting")
        tokens = list(x)  # materialise once; the document is scored per class
        # max() returns the first maximal element, matching first-class tie-breaking.
        return max(self.prior, key=lambda label: self._log_proba_y_given_x(label, tokens))

    def predict(self, X):
        """Return a list with the predicted class of every document in X."""
        return [self.predict_single(x) for x in X]

Using the single-annotator CSV as the <b>TRAIN DATA</b>

In [62]:
# Load the single-annotator corpus and preview its first rows.
single_annotator = pd.read_csv(
    "training_clean/clean_single_annotator.csv",
    sep=",",
)
single_annotator.head()

Unnamed: 0.1,Unnamed: 0,kalimat_id,kata,sense,kalimat
0,0,336691,cerah,4801,cuaca cerah lazim
1,1,336270,cerah,4801,gambar dihasilkan layarnya cerah memiliki spea...
2,2,336555,cerah,4803,cerah pemuda berumur 20 prancis abad 17
3,3,336618,cerah,4801,cor caroli alpha canum venaticorum nama lengka...
4,4,336613,cerah,4801,sanders menyukai cat air lilo maksud menampilk...


In [63]:
# This is creating the MNB Text Classifier and fitting X to y

# Training rows for the target word 'cerah'.
df = single_annotator[single_annotator['kata'] == 'cerah']

# One token list per sentence.
arr = [nltk.word_tokenize(sentence) for sentence in df['kalimat']]

# The token lists are ragged, so the array must be built with dtype=object:
# np.array() on ragged nested lists raises ValueError on NumPy >= 1.24.
X = np.array(arr, dtype=object)

y = df['sense'].to_numpy()

clf = MNB_TextClassifier()
clf.fit(X, y)
clf.proba_y_given_x('4801', ['cuaca'])  # test proba

0.004127130354675265

Using the double-annotator (agreeing) CSV as the <b>TEST DATA</b>

In [64]:
# Load the sentences on which both annotators agreed; they serve as the test set.
double_annotator_agree = pd.read_csv("training_clean/clean_double_annotator_agree.csv", delimiter=",")

df_test = double_annotator_agree[double_annotator_agree['kata'] == 'cerah']

# One token list per sentence. The lists are ragged, so dtype=object is required:
# np.array() on ragged nested lists raises ValueError on NumPy >= 1.24.
test_arr = [nltk.word_tokenize(sentence) for sentence in df_test['kalimat']]
test_X = np.array(test_arr, dtype=object)

test_y = df_test['sense'].to_numpy()

y_pred = np.array(clf.predict(test_X))

# Fraction of test sentences whose predicted sense equals the annotated sense.
acc = sklearn.metrics.accuracy_score(test_y, y_pred)

acc

0.8421052631578947

## SANDBOX

In [11]:
#single experiment, 4 samples

# Tokenise the first four sentences of the training corpus.
a = nltk.word_tokenize(single_annotator['kalimat'][0])
b = nltk.word_tokenize(single_annotator['kalimat'][1])
c = nltk.word_tokenize(single_annotator['kalimat'][2])
d = nltk.word_tokenize(single_annotator['kalimat'][3])

# The four token lists have different lengths, so the array must be built with
# dtype=object: np.array() on ragged nested lists raises ValueError on NumPy >= 1.24.
X = np.array([a, b, c, d], dtype=object)

# Hand-assigned toy labels for the four samples.
y = np.array([1, 1, 3, 1])

# NOTE: this rebinds X, y and clf from the cells above.
clf = MNB_TextClassifier()
clf.fit(X, y)

X

array([list(['cuaca', 'cerah', 'lazim']),
       list(['gambar', 'dihasilkan', 'layarnya', 'cerah', 'memiliki', 'speaker', 'menghasilkan', 'suara', 'keras', 'jernih']),
       list(['cerah', 'pemuda', 'berumur', '20', 'prancis', 'abad', '17']),
       list(['cor', 'caroli', 'alpha', 'canum', 'venaticorum', 'nama', 'lengkapnya', 'cor', 'caroli', 'regis', 'martyris', 'bintang', 'cerah', 'rasi', 'canes', 'venatici'])],
      dtype=object)

In [13]:
# Show the learned p(term='cerah' | class) for every class of the current clf.
clf.condprob['cerah']

{1: 0.06666666666666667, 3: 0.05263157894736842}