In [1]:
import csv
import random
import re
import nltk
import math
import pandas as pd
nltk.download('stopwords')
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/goldman1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stop_words = set(nltk.corpus.stopwords.words("english"))

In [3]:
def readCSV(fileName):
    rows  =  []
    result = []
    with open(fileName, 'r',encoding="ISO-8859-1") as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)
        for row in csvreader:
            rows.append(row[:2])
    return rows

In [4]:
def cleanAndTokenizeSentence(sentence):
        text = []
        sentence = sentence.strip().lower().split()
        for word in sentence:
            text.append("".join(w for w in re.findall("([a-z\-.']+)", word)))
        text = " ".join(text)
        for key, val in dict({'“': '"', '”': '"', '’': "'", '--': ','}).items():
            text = text.replace(key,val)
        
        cleanWords = []
        for word in nltk.tokenize.word_tokenize(text):
            if word not in stop_words and word != '.' * len(word):
                cleanWords.append(word)
        return cleanWords

In [5]:
def splitTrainAndTestData(rows, percent = 0.9,seed = 30):    
    random.seed(seed)
    rows.sort()
    trainSize = int(len(rows) * percent)
    random.shuffle(rows)
    return rows[:trainSize] , rows[trainSize:]

In [6]:
class NaiveBayes:
    def __init__(self, data):
        self.data = data
        self.tfValues = defaultdict(lambda: defaultdict(int))
        self.wordTfValues = defaultdict(lambda: defaultdict(int))
        self.nDocumentsContainingWord = defaultdict(lambda: defaultdict(int))
        self.trainCount = defaultdict(lambda: defaultdict(int));
        self.totalWordsinClass = defaultdict(int);
        self.classCount = defaultdict(int)
        self.trainOnce = False
    
    def train(self):
        if self.trainOnce == True:
            return
        self.trainOnce = True 
        for i in range(len(self.data)):
            self.data[i][1] = cleanAndTokenizeSentence(self.data[i][1])
            
            termVocab = set(self.data[i][1])
            termVocabSize = len(termVocab)
            termTfValues = {}
            
            for word in termVocab:
                self.nDocumentsContainingWord[word][self.data[i][0]] = self.nDocumentsContainingWord[word][self.data[i][0]]+ 1 
                self.nDocumentsContainingWord[word]['all'] = self.nDocumentsContainingWord[word]['all']+ 1
                
            for word in self.data[i][1]:
                termTfValues[word] = termTfValues.get(word, 0) + 1
            
            for value in termTfValues:
                termTfValues[value] = termTfValues[value]/termVocabSize
                self.wordTfValues[value][i] = termTfValues[value]
            
            self.tfValues[i][0] = self.data[i][0]
            self.tfValues[i][1] = termTfValues
            
            self.classCount[self.data[i][0]]+=1
            
            for word in self.data[i][1]:
                self.trainCount[self.data[i][0]][word]  += 1
                self.totalWordsinClass[self.data[i][0]] += 1

    
    def test(self, sentence, tell_all_word_prob = False):
        def myPrint(*args):
            if tell_all_word_prob:
                print(" ".join(args))
            
        all_prob = []
        myPrint(f'->Test Sentence is:\n',sentence,'\n')
        sentence = cleanAndTokenizeSentence(sentence)
        myPrint(f'Tokens:\n{sentence}')
        
        
        for key in self.classCount:
            myPrint(f"For class {key}")
            
            prob = 1.0
            
            for token in sentence:
                if token not in self.wordTfValues.keys():
                    num = 0
                else:
                    k = 0
                    num = 0
                    for entry in self.wordTfValues[token]:
                        if key != self.tfValues[k][0]:
                            continue
                        num += entry * math.log10(len(self.data)/self.nDocumentsContainingWord[token]['all'])
                        k = k+1
                
                denum = 0
                for i in range(len(self.tfValues)):
                    entry = self.tfValues[i]
                    if entry[0] != key:
                        continue
                    for entryWord in entry[1]:
                        denum += entry[1][entryWord]*math.log10(len(self.data)/self.nDocumentsContainingWord[entryWord]['all'])
             
                num += 1                             
                denum += len(self.trainCount[key])  
                prob *= num / denum
                myPrint("\t",f"The probability of word ({token}) is {round(num / denum, 4)}")
            prob *= self.classCount[key] / len(self.data)
            all_prob.append((prob, key))
        all_prob.sort(reverse=True)
    
        return (all_prob[0][0],all_prob[0][1])

In [7]:
rows = readCSV(r"./dataset.csv")

In [8]:
train, test = splitTrainAndTestData(rows)

In [9]:
model = NaiveBayes(train)

In [10]:
model.train()

In [11]:
for key in model.classCount:
    print(f"Prior probability of class ({key}) is {round(model.classCount[key]/len(model.data)*100,2)}")

Prior probability of class (ham) is 48.89
Prior probability of class (spam) is 51.11


In [12]:
prob, c =  model.test(test[0][1],True)

print("\n\nThe predicted class is " +  str(c) + " with probablity " + str(prob))

->Test Sentence is:
 Ffffffffff. Alright no way I can meet up with you sooner? 

Tokens:
['ffffffffff', 'alright', 'way', 'meet', 'sooner']
For class ham
	 The probability of word (ffffffffff) is 0.0014
	 The probability of word (alright) is 0.0014
	 The probability of word (way) is 0.1163
	 The probability of word (meet) is 0.5469
	 The probability of word (sooner) is 0.0014
For class spam
	 The probability of word (ffffffffff) is 0.0012
	 The probability of word (alright) is 0.0012
	 The probability of word (way) is 0.0012
	 The probability of word (meet) is 0.0012
	 The probability of word (sooner) is 0.0012


The predicted class is ham with probablity 9.419089393554257e-11


In [13]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [14]:
train = pd.DataFrame(train)
test = pd.DataFrame(test)

In [15]:
train_Y = train[0]
train_X = train[1]
train_X = [' '.join(map(str, l)) for l in train_X]

test_Y = test[0]
test_X = test[1]
test_X = [' '.join(map(str, l)) for l in test_X]
train_Y

0       ham
1       ham
2      spam
3       ham
4      spam
       ... 
175    spam
176    spam
177    spam
178    spam
179     ham
Name: 0, Length: 180, dtype: object

In [16]:
tf_vectorizer = TfidfVectorizer()
X_train_tf = tf_vectorizer.fit_transform(train_X)

X_test_tf = tf_vectorizer.transform(test_X)

In [17]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_Y)

MultinomialNB()

In [18]:
prediction = naive_bayes_classifier.predict(tf_vectorizer.transform(['Ffffffffff. Alright no way I can meet up with you sooner?']))

print("The prediction is: " + prediction[0])

The prediction is: ham
