In [1]:
import pandas as pd
import numpy as np
import re
pd.set_option('mode.chained_assignment', None)

In [2]:
data = pd.read_csv(r'dataset_NB.txt', sep='\n', header=None)
data.head()

Unnamed: 0,0
0,So there is no way for me to plug it in here i...
1,"Good case, Excellent value. 1"
2,Great for the jawbone. 1
3,Tied to charger for conversations lasting more...
4,The mic is great. 1


In [3]:
df = pd.DataFrame(data=np.asarray(data), columns=['EMAILS'])
df.insert(1, "sentiment", 'NULL')
df.head()


Unnamed: 0,EMAILS,sentiment
0,So there is no way for me to plug it in here i...,
1,"Good case, Excellent value. 1",
2,Great for the jawbone. 1,
3,Tied to charger for conversations lasting more...,
4,The mic is great. 1,


In [4]:
# sentiment is the last character in the string
for i in range(df.shape[0]):
    df['sentiment'][i] = int(df['EMAILS'][i][-1])
    df['EMAILS'][i] = df['EMAILS'][i][:-1]
    df['EMAILS'][i].strip()

In [5]:
df.head()

Unnamed: 0,EMAILS,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


# Working Equations:

$$ \text{message} = [x_1, x_2...., x_n] $$

$$ P(C_{neg} | x_1, x_2,..., x_n) = P(C_{neg})P(x_1 | C_{neg})P(x_2 | C_{neg}) ... P(x_n|C_{neg}) $$

$$ P(C_{pos} | x_1, x_2,..., x_n) = P(C_{pos})P(x_1 | C_{pos})P(x_2 | C_{pos}) ... P(x_n|C_{pos}) $$

In [6]:
def preprocess(text_str):
    text_str = re.sub(r'([^a-zA-Z ])', '', text_str)
    text_str = re.sub('(\s+)', ' ', text_str)
    text_str = re.sub(r'(^|\W)\d+', '', text_str)
    text_str = text_str.lower()
    text_str = text_str.strip().split()
    return text_str

In [7]:
class Naive_Bayes:
    def __init__(self, train, test):
        self.train = train
        self.test = test
        self.test['pred_sentiment'] = 'NULL'
        self.accuracy = []

    def train_NB(self):
        pos_tot = 0
        neg_tot = 0
        voc = dict()  # dictionary with (word, sentiment) as key
        pos_words = set([])
        neg_words = set([])  # set of words

        for i in range(self.train.shape[0]):
            sentiment = self.train['sentiment'][i]

            if sentiment == 1:
                pos_tot += 1
                for words in self.train['EMAILS'][i]:
                    pos_words.add(words)

            if sentiment == 0:
                neg_tot += 1
                for words in self.train['EMAILS'][i]:
                    neg_words.add(words)

            for word in self.train['EMAILS'][i]:
                sentiment = self.train['sentiment'][i]
                pair = (word, sentiment)
                if pair in voc.keys():
                    voc[pair] += 1
                else:
                    voc[pair] = 1

        unique = set([])
        
        for key in voc.keys():
            unique.add(key[0])

        num_vocab = len(unique)

        self.voc = voc
        self.num_vocab = num_vocab
        self.pos_tot = pos_tot
        self.neg_tot = neg_tot
        self.pos_words = pos_words
        self.neg_words = neg_words

    def test_NB(self, alpha):
        for i in range(self.test.shape[0]):
            accuracy = []
            pro_neg = self.neg_tot / \
                (self.pos_tot + self.neg_tot)  # prior probability
            pro_pos = self.pos_tot / (self.pos_tot + self.neg_tot)

            for word in self.test['EMAILS'][i]:
                pair1 = (word, 0)
                pair2 = (word, 1)
                if pair1 not in self.voc.keys():
                    pro1 = 0
                else:
                    pro1 = self.voc[pair1]
                if pair2 not in self.voc.keys():
                    pro2 = 0
                else:
                    pro2 = self.voc[pair2]

                pro_neg *= ((pro1 + alpha) /
                            (len(self.neg_words) + alpha*self.num_vocab))
                pro_pos *= ((pro2 + alpha) /
                            (len(self.pos_words) + alpha*self.num_vocab))

            if pro_neg > pro_pos:
                self.test['pred_sentiment'][i] = 0
            elif pro_pos > pro_neg:
                self.test['pred_sentiment'][i] = 1
        arr1 = self.test['sentiment'].to_numpy()
        arr2 = self.test['pred_sentiment'].to_numpy()
        accuracy.append((np.sum(arr1 == arr2)/self.test.shape[0]))
        return round(accuracy[0], 2)

In [8]:
for i in range(df.shape[0]):
    df['EMAILS'][i] = preprocess(df['EMAILS'][i])

sum = 0
alpha = 1    # laplace smoothing
kfold = 7
n = df.shape[0] // kfold
df = df.sample(frac=1)  # random
folds = [df[i: i + n] for i in range(0, len(df), n)]

In [9]:
for i in range(kfold):
    # k fold CV
    train_data = []
    test_data = folds[i]
    test_data = test_data.reset_index()

    for j in range(kfold):
        if i != j:
            train_data.append(folds[j])

    temp = train_data[0]
    for k in range(1, kfold-1):
        temp = pd.concat([temp, train_data[k]], ignore_index=True)

    NB = Naive_Bayes(temp, test_data)
    NB.train_NB()
    prob = (NB.test_NB(alpha))
    print(f"Accuracy [fold:{i}]: {prob}")
    sum = sum + prob

print("Avg Accuracy : {:0.2f}".format(sum / kfold))

accuracy [fold:0]: 0.75
accuracy [fold:1]: 0.82
accuracy [fold:2]: 0.82
accuracy [fold:3]: 0.84
accuracy [fold:4]: 0.82
accuracy [fold:5]: 0.79
accuracy [fold:6]: 0.87
avg accuracy : 0.82
