In [1]:
import csv
import math
import random
import re
from collections import defaultdict

import nltk

# Generating stop words and stemmer

In [2]:
# English stop words as a set, used for membership tests when filtering tokens.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be installed
# locally — confirm the notebook environment provides it.
stop_words = set(nltk.corpus.stopwords.words("english"))
# Porter stemmer instance shared by the tokenizing code.
ps = nltk.stem.PorterStemmer()

# Reading file

In [3]:
def readCSV(fileName):
    """Read a CSV file and return the first two columns of every data row.

    The first row is treated as a column heading and skipped.

    Args:
        fileName: Path of a UTF-8 encoded CSV file.

    Returns:
        A list of rows, each a list holding at most the first two fields
        of the corresponding CSV record. Empty if the file has no data rows
        (including a completely empty file).
    """
    # newline='' hands raw line endings to the csv module so that newlines
    # embedded inside quoted fields are parsed correctly.
    with open(fileName, 'r', encoding="utf8", newline='') as csvfile:
        csvreader = csv.reader(csvfile)

        # Skip the column heading; the default keeps an empty file from
        # raising StopIteration.
        next(csvreader, None)

        return [row[:2] for row in csvreader]
    

# Cleaning and tokenizing data

In [4]:
def cleanAndTokenizeSentence(sentence):
    """Normalize a raw message and return its stemmed, filtered tokens.

    Steps: lowercase and split on whitespace; within each word keep only the
    characters a-z, '-', '.' and "'"; tokenize with NLTK; stem each token with
    the Porter stemmer; drop tokens found in ``stop_words`` and tokens made
    up solely of periods.

    Args:
        sentence: Raw message text.

    Returns:
        List of stemmed tokens.
    """
    # Raw string: "\-" is an invalid escape sequence in a normal string literal.
    allowed_chars = r"([a-z\-.']+)"
    filtered_words = [
        "".join(re.findall(allowed_chars, word))
        for word in sentence.strip().lower().split()
    ]
    text = " ".join(filtered_words)

    # Curly quotes are written as escapes so the file's encoding cannot
    # corrupt them. Because the filter above already removed everything
    # except a-z, '-', '.' and "'", only the '--' entry can match here.
    replacements = {'\u201c': '"', '\u201d': '"', '\u2019': "'", '--': ','}
    for key, val in replacements.items():
        text = text.replace(key, val)

    cleanWords = []
    for word in nltk.tokenize.word_tokenize(text):
        # NOTE(review): the stop-word check runs on the stemmed form, so a stop
        # word whose stem is not itself in stop_words would pass the filter —
        # confirm this ordering is intended.
        word = ps.stem(word)
        if word not in stop_words and word != '.' * len(word):
            cleanWords.append(word)
    return cleanWords

# Splitting testing and training data

In [5]:
def splitTrainAndTestData(rows, percent = 0.9, seed = 30):
    """Deterministically shuffle ``rows`` and split them into train and test.

    The rows are sorted first so the result depends only on their content
    and ``seed``, not on the order they arrive in.

    Args:
        rows: Sortable rows to split. The input list is left untouched.
        percent: Fraction of rows placed in the training part.
        seed: Seed for the shuffle.

    Returns:
        A ``(train, test)`` tuple of lists; ``train`` holds the first
        ``int(len(rows) * percent)`` shuffled rows, ``test`` the remainder.
    """
    # Work on a sorted copy with a private generator so neither the caller's
    # list nor the module-level random state is modified.
    shuffled = sorted(rows)
    random.Random(seed).shuffle(shuffled)
    trainSize = int(len(shuffled) * percent)
    return shuffled[:trainSize], shuffled[trainSize:]

# NaiveBayes class 

In [6]:
class naiveBayes:
    """Naive Bayes text classifier with add-one smoothing.

    ``data`` is a sequence of rows where ``row[0]`` is the class label and
    ``row[1]`` is the raw message text.
    """

    def __init__(self, data):
        self.data = data
        # trainCount[label][word]: occurrences of `word` in messages of `label`
        self.trainCount = defaultdict(lambda: defaultdict(int))
        # totalWordsinClass[label]: total number of tokens counted for `label`
        self.totalWordsinClass = defaultdict(int)
        # classCount[label]: number of training messages carrying `label`
        self.classCount = defaultdict(int)
        # Set by the first train() call so later calls do not count twice.
        self.trainOnce = False

    def train(self):
        """Count messages per class and word occurrences per class.

        Only the first call does any work. The rows in ``self.data`` are read
        but not modified, so the same rows can be used to train another model.
        """
        if self.trainOnce:
            return
        self.trainOnce = True
        for row in self.data:
            label = row[0]
            tokens = cleanAndTokenizeSentence(row[1])
            self.classCount[label] += 1
            for word in tokens:
                self.trainCount[label][word] += 1
                self.totalWordsinClass[label] += 1

    def test(self, sentence, tell_all_word_prob = False):
        """Classify ``sentence`` and report the winning class.

        Args:
            sentence: Raw message text.
            tell_all_word_prob: When true, print the tokens and the smoothed
                probability of every token for every class.

        Returns:
            ``(confidence, label)`` where ``confidence`` is the winning
            class's share of the summed class scores, as a percentage
            rounded to two decimals.

        Raises:
            IndexError: If no class has been counted (model not trained).
        """
        def myPrint(*args):
            if tell_all_word_prob:
                print(" ".join(args))

        myPrint('->Test Sentence is:\n', sentence, '\n')
        tokens = cleanAndTokenizeSentence(sentence)
        myPrint(f'tokens\n{tokens}')

        # Scores are accumulated as log-probabilities: multiplying many small
        # probabilities underflows to 0.0 for long messages, which made the
        # final normalisation divide by zero.
        log_scores = []
        for key in self.classCount:
            myPrint(f"for class {key}")
            word_counts = self.trainCount[key]
            # Add-one smoothing: +1 on the count, + number of distinct words
            # on the denominator.
            # NOTE(review): the denominator uses the distinct words seen in
            # this class, not the vocabulary across all classes — confirm
            # this is intended.
            denum = self.totalWordsinClass[key] + len(word_counts)
            log_prob = math.log(self.classCount[key] / len(self.data))
            for token in tokens:
                num = word_counts.get(token, 0) + 1
                log_prob += math.log(num / denum)
                myPrint("\t", f"the probability of word ({token}) is {num / denum}")
            log_scores.append((log_prob, key))

        log_scores.sort(reverse=True)
        best_log, best_key = log_scores[0]
        # Normalise relative to the best score so every exponent is <= 0;
        # the winner contributes exp(0) == 1, so the total is never zero.
        total = sum(math.exp(score - best_log) for score, _ in log_scores)
        return (round(1 / total * 100, 2), best_key)

# starting function execution

In [7]:
# Load the first two columns of every data row from the CSV file.
# The path is relative, so it resolves against the kernel's working directory.
rows = readCSV(r"./Dataset Message Spam - NLP Lab sample.csv - Dataset Message Spam - NLP Lab sample.csv")

In [8]:
# Split with the function's defaults: 90% of the rows for training, seed 30.
train, test = splitTrainAndTestData(rows)

# Creating and Training model

In [9]:
# Build the classifier over the training rows; nothing is counted until train().
model = naiveBayes(train)

In [10]:
# Count words per class. Re-running this cell is harmless: train() returns
# immediately once it has run on this model.
model.train()

# Class Probability

In [11]:
# Share of training messages per class, as a percentage.
for key, count in model.classCount.items():
    print(f"the probability of class ({key}) is {round(count/len(model.data)*100,2)}")

the probability of class (ham) is 48.89
the probability of class (spam) is 51.11


# Testing data

In [12]:
# Classify the first held-out message and print the per-word probabilities.
# (A bare `test[0]` before this call displayed nothing: only the last
# expression of a cell is shown.)
model.test(test[0][1],True)

->Test Sentence is:
 Ffffffffff. Alright no way I can meet up with you sooner? 

tokens
['ffffffffff', 'alright', 'way', 'meet', 'sooner']
for class ham
	 the probability of word (ffffffffff) is 0.0007598784194528875
	 the probability of word (alright) is 0.0007598784194528875
	 the probability of word (way) is 0.003799392097264438
	 the probability of word (meet) is 0.0022796352583586625
	 the probability of word (sooner) is 0.0007598784194528875
for class spam
	 the probability of word (ffffffffff) is 0.0005125576627370579
	 the probability of word (alright) is 0.0005125576627370579
	 the probability of word (way) is 0.0010251153254741158
	 the probability of word (meet) is 0.0015376729882111738
	 the probability of word (sooner) is 0.0005125576627370579


(94.48, 'ham')

In [13]:
# Classify every held-out row and count how many predictions match the label.
# Each row is [label, text].
countCorrect = 0
for t in test:
    to_string = t[1]
    print('*'*10,to_string,'*'*10,'\n')
    # x is (confidence percentage, predicted label)
    x = model.test(to_string)
    if t[0] == x[1]:
        countCorrect += 1
        print(f"it's a {x[1].upper()} with {x[0]}% surety 😁 ")
    else:
        print(f"my code says it is {x[1].upper()} with {x[0]}% surety but it is not 😨")
    print('-' * 100)

********** Ffffffffff. Alright no way I can meet up with you sooner? ********** 

it's a HAM with 94.48% surety 😁 
----------------------------------------------------------------------------------------------------
********** As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune ********** 

it's a HAM with 98.63% surety 😁 
----------------------------------------------------------------------------------------------------
********** Ok lar... Joking wif u oni... ********** 

it's a HAM with 99.94% surety 😁 
----------------------------------------------------------------------------------------------------
********** GENT! We are trying to contact you. Last weekends draw shows that you won a 澹1000 prize GUARANTEED. Call 09064012160. Claim Code K52. Valid 12hrs only. 150ppm ********** 

it's a SPAM with 100.0% surety 😁 
----------------------------------------------

# Accuracy

In [14]:
# Number of held-out messages and how many of them were classified correctly.
print(f"total count is {len(test)} and correct count is {countCorrect}")

total count is 20 and correct count is 18


In [15]:
# Accuracy as a percentage of the held-out messages.
print(f"Accuracy of the model is {countCorrect/len(test) * 100} % ")

Accuracy of the model is 90.0 % 
