# Part A) Naive Bayes tf-idf language Model

In [1]:
import csv
import random
import re
import nltk
from collections import defaultdict
import math

# Reading file

In [2]:
def readCSV(fileName):
   
    rows  =  []
    result = []
    
    with open(fileName, 'r',encoding="utf8") as csvfile:

        # creating a csv reader object
        csvreader = csv.reader(csvfile)

        # extracting each data row one by one
        for row in csvreader:
            #print(row)
            rows.append(row)
    return rows
    

# Splitting testing and training data

In [3]:
def splitTrainAndTestData(rows, percent = 0.7,seed = 40):    
    random.seed(seed)
    rows.sort()
    trainSize = int(len(rows) * percent)
    random.shuffle(rows)
    return rows[:trainSize] , rows[trainSize:]

In [4]:
def preprocess(rows):
    for row in rows:
        row[0] = row[0].lower() 

# starting function execution

In [5]:
rows = readCSV(r"./CSV_FOLDER/message.csv")

In [6]:
preprocess(rows)

In [7]:
rows

[['i love this sandwich', 'pos'],
 ['this is an amazing place', 'pos'],
 ['i feel very good about these beers', 'pos'],
 ['this is my best work', 'pos'],
 ['what an awesome view', 'pos'],
 ['i do not like this restaurant', 'neg'],
 ['i am tired of this stuff', 'neg'],
 ['i can not deal with this', 'neg'],
 ['he is my sworn enemy', 'neg'],
 ['my boss is horrible', 'neg'],
 ['this is an awesome place', 'pos'],
 ['i do not like the taste of this juice', 'neg'],
 ['i love to dance', 'pos'],
 ['i am sick and tired of this place', 'neg'],
 ['what a great holiday', 'pos'],
 ['that is a bad locality to stay', 'neg'],
 ['we will have good fun tomorrow', 'pos'],
 ['i went to my enemy house today', 'neg']]

In [8]:
class NaiveBayes_tf_idf:
    
    def __init__(self, x_train, y_train):
        self.x_train = []
        self.y_train = y_train
        self.total_doc = len(y_train)
        print(self.total_doc)
        self.trainOnce = False
        for idx in range(len(x_train)):
            self.x_train.append(x_train[idx].split())
    
    def calculate_tf_idf(self):
        self.tf = [] 
        self.idf = defaultdict(int);
        distinct_words = set(); 
        word_in_distinct_doc = defaultdict(int)
        
        for idx in range(self.total_doc):
            self.tf.append(defaultdict(int))
            for word in self.x_train[idx]:
                distinct_words.add(word)

        # Now calculating tf
        for word in distinct_words:
            for idx in range(self.total_doc):
                self.tf[idx][word] = self.x_train[idx].count(word)/len(self.x_train[idx]);
        
        # Now calculating idf
        for word in distinct_words:
            for idx in range(self.total_doc):
                if self.x_train[idx].count(word):
                        word_in_distinct_doc[word] += 1;
        
        for word in distinct_words:
            self.idf[word] = math.log((1 + len(self.x_train))/(1 + word_in_distinct_doc[word]) + 1,10)
        #print(self.tf)
    
    
    def calculate_weight(self):
        self.weight = [] 
        for idx in range(len(self.tf)):
            self.weight.append(defaultdict(int));
            for word in self.tf[idx]:
                self.weight[idx][word] = self.tf[idx][word] * self.idf[word]
    
    def trainModel(self):
        if self.trainOnce == True:
            return
        self.trainOnce = True 

        self.calculate_tf_idf()
        self.calculate_weight()
        
        self.trainCount = defaultdict(lambda: defaultdict(int));
        self.totalWordsinClass = defaultdict(int);
        self.classCount = defaultdict(int)
        
        for idx in range(self.total_doc):
            self.classCount[self.y_train[idx]] += 1
            for word in self.x_train[idx]:
                self.trainCount[y_train[idx]][word]  += self.weight[idx][word]
                self.totalWordsinClass[y_train[idx]] += self.weight[idx][word]
    
    
    def testModel(self, sentence, tell_all_word_prob = False):
        def myPrint(*args):
            if tell_all_word_prob:
                print(" ".join(args))
            
        all_prob = []
        #return sentence
        myPrint(f'->Test Sentence is:\n',sentence,'\n')
        sentence = sentence.split()
        myPrint(f'tokens\n{sentence}')
        
        #print(sentence)
        #return self.classCount
        
        for key in self.classCount:
            myPrint(f"for class {key}")
            #print("hi")
            prob = 1.0
            for token in sentence:
                if token not in self.trainCount[key].keys():
                    num = 0
                else:
                    num = self.trainCount[key][token];
                    #print('found')
                denum = self.totalWordsinClass[key]
                num += 1                              # add one smooting
                denum += len(self.trainCount[key])    #add one smooting
                prob *= num / denum
                myPrint("\t",f"the probability of word ({token}) is {num / denum}")
            prob *= self.classCount[key] / self.total_doc
            #print(prob)
            
            all_prob.append((prob, key))
        all_prob.sort(reverse=True)
        total = 0
        for x in all_prob:
            total += x[0];
        #print(total)    
        return (round(all_prob[0][0]/total*100,2),all_prob[0][1])
        

In [9]:
training, testing = splitTrainAndTestData(rows)

In [10]:
x_train, y_train = [t[0] for t in rows] , [t[1] for t in rows]

In [11]:
x_test, y_test = [t[0] for t in testing] , [t[1] for t in testing]

In [12]:
model = NaiveBayes_tf_idf(x_train, y_train)

18


In [13]:
model.trainModel()

# Testing data

In [14]:
x_test[0]
model.testModel(x_test[0],True)

->Test Sentence is:
 my boss is horrible 

tokens
['my', 'boss', 'is', 'horrible']
for class pos
	 the probability of word (my) is 0.030409840793833933
	 the probability of word (boss) is 0.026763377511391826
	 the probability of word (is) is 0.0359144503121491
	 the probability of word (horrible) is 0.026763377511391826
for class neg
	 the probability of word (my) is 0.03323822630313742
	 the probability of word (boss) is 0.02972041356573502
	 the probability of word (is) is 0.031675040335868924
	 the probability of word (horrible) is 0.02972041356573502


(54.31, 'neg')

In [15]:
countCorrect = 0
for t in testing:
    to_string = t[0]
    print('*'*10,to_string,'*'*10,'\n')
    x = model.testModel(to_string)
    if t[1] == x[1]:
        countCorrect += 1
        print(f"it's a {x[1].upper()} with {x[0]}% surety 😁 ")
    else:
        print(f"my code says it is {x[1].upper()} with {x[0]}% surety but it is not 😨")
    print('-' * 100)    

********** my boss is horrible ********** 

it's a NEG with 54.31% surety 😁 
----------------------------------------------------------------------------------------------------
********** i do not like the taste of this juice ********** 

it's a NEG with 56.22% surety 😁 
----------------------------------------------------------------------------------------------------
********** i can not deal with this ********** 

it's a NEG with 51.73% surety 😁 
----------------------------------------------------------------------------------------------------
********** i am sick and tired of this place ********** 

my code says it is POS with 53.88% surety but it is not 😨
----------------------------------------------------------------------------------------------------
********** what a great holiday ********** 

it's a POS with 79.96% surety 😁 
----------------------------------------------------------------------------------------------------
********** this is my best work ********** 

it

In [16]:
model.tf

[defaultdict(int,
             {'will': 0.16666666666666666,
              'house': 0.0,
              'am': 0.0,
              'bad': 0.0,
              'stay': 0.0,
              'the': 0.0,
              'tomorrow': 0.16666666666666666,
              'very': 0.0,
              'tired': 0.0,
              'enemy': 0.0,
              'what': 0.0,
              'have': 0.16666666666666666,
              'sick': 0.0,
              'great': 0.0,
              'today': 0.0,
              'an': 0.0,
              'beers': 0.0,
              'locality': 0.0,
              'awesome': 0.0,
              'he': 0.0,
              'horrible': 0.0,
              'sworn': 0.0,
              'a': 0.0,
              'holiday': 0.0,
              'sandwich': 0.0,
              'and': 0.0,
              'best': 0.0,
              'place': 0.0,
              'do': 0.0,
              'good': 0.16666666666666666,
              'my': 0.0,
              'with': 0.0,
              'juice': 0.0,
            

In [17]:
model.idf

defaultdict(int,
            {'will': 1.021189299069938,
             'house': 1.021189299069938,
             'am': 0.8653014261025437,
             'bad': 1.021189299069938,
             'stay': 1.021189299069938,
             'the': 1.021189299069938,
             'tomorrow': 1.021189299069938,
             'very': 1.021189299069938,
             'tired': 0.8653014261025437,
             'enemy': 0.8653014261025437,
             'what': 0.8653014261025437,
             'have': 1.021189299069938,
             'sick': 1.021189299069938,
             'great': 1.021189299069938,
             'today': 1.021189299069938,
             'an': 0.7596678446896304,
             'beers': 1.021189299069938,
             'locality': 1.021189299069938,
             'awesome': 0.8653014261025437,
             'he': 1.021189299069938,
             'horrible': 1.021189299069938,
             'sworn': 1.021189299069938,
             'a': 0.8653014261025437,
             'holiday': 1.021189299069938,
  

# Using sklearn 

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [19]:
data = pd.DataFrame(data = rows,columns=['text','label'])

In [20]:
data['labelnum'] = data.label.map({'pos':1,'neg':0})

In [21]:
data

Unnamed: 0,text,label,labelnum
0,we will have good fun tomorrow,pos,1
1,this is an awesome place,pos,1
2,i feel very good about these beers,pos,1
3,he is my sworn enemy,neg,0
4,i went to my enemy house today,neg,0
5,i love this sandwich,pos,1
6,i love to dance,pos,1
7,i do not like this restaurant,neg,0
8,i am tired of this stuff,neg,0
9,what an awesome view,pos,1


In [22]:
vectorizer = TfidfVectorizer()
transformation = vectorizer.fit_transform(data.text)

In [23]:
transformation

<18x56 sparse matrix of type '<class 'numpy.float64'>'
	with 91 stored elements in Compressed Sparse Row format>

In [24]:
pd.DataFrame(data=transformation.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,about,am,amazing,an,and,awesome,bad,beers,best,boss,...,today,tomorrow,very,view,we,went,what,will,with,work
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.416445,0.0,0.0,0.416445,0.0,0.0,0.416445,0.0,0.0
1,0.0,0.0,0.0,0.484512,0.0,0.538998,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.416445,0.0,0.0,0.0,0.0,0.0,0.0,0.416445,0.0,0.0,...,0.0,0.0,0.416445,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.451709,0.0,0.0,0.0,0.0,0.451709,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.474251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.443222,0.0,0.493066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.563317,0.0,0.0,0.493066,0.0,0.0,0.0


In [30]:
xtrain,xtest,ytrain,ytest = train_test_split(transformation,data.labelnum,test_size=0.2,random_state=40)

In [31]:
clf = MultinomialNB().fit(xtrain,ytrain)

In [32]:
ytest

17    1
0     1
11    1
4     0
Name: labelnum, dtype: int64

In [33]:
predicted = clf.predict(xtest)
predicted

array([0, 1, 1, 0], dtype=int64)

In [34]:
#printing accuracy, Confusion matrix, Precision and Recall
print('\n Accuracy of the classifier is',metrics.accuracy_score(ytest,predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('\n The value of Precision', metrics.precision_score(ytest,predicted))
print('\n The value of Recall', metrics.recall_score(ytest,predicted))


 Accuracy of the classifier is 0.75

 Confusion matrix
[[1 0]
 [1 2]]

 The value of Precision 1.0

 The value of Recall 0.6666666666666666
