# Naive Bayes Text Document Classifier

<h3>Importing Libraries</h3>

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
#this block generates class_id, document_id, count of documents per class and a list containing paths to all the 
#files in the training dataset
directory = "train"
def pre_process(directory):
    """Index a dataset laid out as ``directory/<class>/<document>``.

    Class ids and document ids are both numbered from 1 in the order in
    which ``os.listdir`` returns the entries.

    Args:
        directory: path of the dataset root; every entry inside it is
            treated as a class sub-directory.

    Returns:
        A tuple ``(Class_id, doc_id_dict, class_doc, count_perclass, paths)``:

        * ``Class_id`` -- dict mapping class (sub-directory) name to class id.
        * ``doc_id_dict`` -- dict mapping document file name to document id.
          It is keyed by the bare file name, so two documents with the same
          name in different classes share one entry (the later id wins).
        * ``class_doc`` -- DataFrame with one ``[class id, document id]`` row
          per document (column 0 is the class id, column 1 the document id).
        * ``count_perclass`` -- list with the number of documents per class.
        * ``paths`` -- list of ``directory/class/document`` paths.
    """
    paths = []              #contains paths to all the files of the dataset
    count_perclass = []     #contains count of documents in every class
    class_doc = []          #one [class id, document id] pair per document
    Class_id = {}           #assigns an id to each class
    doc_id_dict = {}        #assigns an id to each document
    doc_id = 1
    for class_number, subdir in enumerate(os.listdir(directory), start=1):
        Class_id[subdir] = class_number
        #list the class directory once and reuse it for ids, paths and counts
        files = os.listdir(directory + "/" + subdir)
        for document in files:
            doc_id_dict[document] = doc_id
            #record the pair before moving on to the next id, so that the id
            #stored here is the one that was just given to this document
            class_doc.append([class_number, doc_id])
            doc_id += 1
        paths.extend(directory + '/' + subdir + '/' + name for name in files)
        count_perclass.append(len(files))

    class_doc = pd.DataFrame(class_doc)

    return Class_id , doc_id_dict , class_doc , count_perclass , paths

In [None]:
import nltk
from nltk.stem import LancasterStemmer

In [None]:
#this block generates the total words dictionary and assigns a word ID to each word.
def generate_total_words():
    """Build the vocabulary of every file listed in the module-level ``paths``.

    Each line is split on single spaces; every token has newlines, full stops
    and commas removed, then "'s" removed, then every remaining letter "s"
    removed, and is finally stemmed with the Lancaster stemmer. The same
    normalisation must be used wherever these words are looked up again.

    Returns:
        dict mapping each normalised word to a word id, numbered from 1 in
        order of first appearance.
    """
    stemmer = LancasterStemmer()   #one stemmer reused for every token
    total_words = {}        #dictionary containing all the words as keys and their word id as values.
    for f in paths:
        #the with-block closes each file as soon as it has been read
        with open(f , "r") as file:
            for line in file:
                for word in line.split(" "):
                    word = word.replace("\n" , "").replace("." , "").replace("," , "")
                    #NOTE(review): the second replace strips every "s" in the
                    #token, not just a trailing plural -- confirm this is wanted
                    word = word.replace("'s" , "").replace("s" , "")
                    total_words[stemmer.stem(word)] = 0

    #dicts keep insertion order, so ids follow order of first appearance
    for w_id, word in enumerate(total_words, start=1):
        total_words[word] = w_id

    return total_words

In [None]:
#this block generates a data list which contains information about every word in the dataset
def generate_data_df(directory , Class_id , doc_id_dict , total_words):
    """Build the per-document word-frequency table of the dataset.

    Tokens are normalised in the same way as when the vocabulary is built:
    newlines, full stops and commas are removed, then "'s", then every
    remaining letter "s", and the result is Lancaster-stemmed.

    Args:
        directory: dataset root laid out as ``directory/<class>/<document>``.
        Class_id: dict mapping class (sub-directory) name to class id.
        doc_id_dict: dict mapping document file name to document id.
        total_words: dict mapping normalised word to word id.

    Returns:
        DataFrame with columns ``Class_id``, ``document_id``, ``word_id`` and
        ``word_frequency``; one row per distinct word of every document.

    Raises:
        KeyError: if a class, document or word is missing from the
            corresponding lookup dictionary.
    """
    stemmer = LancasterStemmer()   #one stemmer reused for every token
    data = []     #a list containing lists of all the words i.e [Class_id , document_id , word_id , word_frequency]
    for subdir in os.listdir(directory):
        #iterating over every document in every class
        for document in os.listdir(directory + "/" + subdir):
            word_frequency = {}

            #the with-block closes the document once it has been read
            with open(directory + "/" + subdir + "/" + document , "r") as file:
                #iterating over every line of every document in a class
                for line in file:
                    #iterating over every word of the line in a document
                    for word in line.split(" "):
                        word = word.replace("\n" , "").replace("." , "").replace("," , "")
                        word = word.replace("'s" , "").replace("s" , "")
                        word = stemmer.stem(word)
                        #the first occurrence counts as 1, so the stored value
                        #is the true number of occurrences in the document
                        word_frequency[word] = word_frequency.get(word, 0) + 1

            for word, count in word_frequency.items():
                data.append([Class_id[subdir] , doc_id_dict[document] , total_words[word] , count])

    data_df = pd.DataFrame(data , columns = ["Class_id" , "document_id" , "word_id" , "word_frequency"])

    return data_df

<h3>Pre-processing the training dataset to test against our naive bayes agent</h3>

In [None]:
# Run the pre-processing pipeline on the training directory. The order matters:
# generate_total_words() reads the module-level `paths` produced by pre_process(),
# and generate_data_df() needs the id dictionaries and the vocabulary built before it.
Class_id , doc_id_dict , class_doc , count_perclass , paths = pre_process(directory)
total_words = generate_total_words()
data_df = generate_data_df(directory , Class_id , doc_id_dict , total_words)

<h4>Pre-processed dataframe</h4>

In [None]:
# Bare expression: shows the (Class_id, document_id, word_id, word_frequency) table as the cell output.
data_df

In [None]:
# Save the word-frequency table to "data.csv" in the current working directory.
data_df.to_csv("data.csv")

<h3>Generating probability for every class</h3>

In [None]:
#this block generates the prior probability of every class in the training dataset
def generate_class_prob(directory):
    """Compute the prior probability of every class in the dataset.

    The prior of a class is the number of documents in its sub-directory
    divided by the number of entries in the module-level ``paths`` list.

    Args:
        directory: dataset root laid out as ``directory/<class>/<document>``.

    Returns:
        dict mapping class id (looked up in the module-level ``Class_id``)
        to that class's share of the documents.
    """
    priors = {}
    for class_name in os.listdir(directory):
        documents_in_class = len(os.listdir(directory + '/' + class_name))
        priors[Class_id[class_name]] = documents_in_class / len(paths)
    return priors

# Class priors keyed by class id; assign_class() reads this module-level dictionary.
prob_class = generate_class_prob(directory)
# Bare expression: shows the priors as the cell output.
prob_class

<h3>Applying laplace smoothing</h3>

In [None]:
#with laplace smoothing
#smoothing strength: a = 1 is classic Laplace (add-one) smoothing
a = 1

#probability of each word given class
pb_ij = data_df.groupby(['Class_id','word_id'])
pb_j = data_df.groupby(['Class_id'])

#total word frequency of every class, computed once and reused below
class_totals = pb_j['word_frequency'].sum()
#smoothed denominator: class total + a * vocabulary size (same value as before for a = 1)
smoothed_totals = class_totals + a * len(total_words)

Pr_a =  (pb_ij['word_frequency'].sum() + a) / smoothed_totals

#rows are class ids, columns are word ids; pairs never seen together become NaN
Pr_a = Pr_a.unstack()

#give every unseen (class, word) pair the smoothed floor probability of its class;
#the classes are taken from the table itself instead of assuming exactly ten of them
for c in Pr_a.index:
     Pr_a.loc[c,:] = Pr_a.loc[c,:].fillna(a / smoothed_totals[c])

#nested dictionary {word_id: {class_id: probability}}
Pr_dict_a = Pr_a.to_dict()


<h3>Probability without laplace smoothing</h3>

In [None]:
#without laplace smoothing

#probability of each word given class
pb_kl = data_df.groupby(['Class_id','word_id'])
pb_l = data_df.groupby(['Class_id'])
Pr =  (pb_kl['word_frequency'].sum()) / (pb_l['word_frequency'].sum())

#rows are class ids, columns are word ids; pairs never seen together become NaN
Pr = Pr.unstack()

#without smoothing a word that never occurs in a class has probability 0;
#leaving NaN here would turn every class score that touches it into NaN
Pr = Pr.fillna(0)

#nested dictionary {word_id: {class_id: probability}}
Pr_dict = Pr.to_dict()

In [None]:
#Common stop words
# English stop-word list from NLTK; used further down to find which vocabulary entries are stop words.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded beforehand -- confirm.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

<h3>Removing Stopwords</h3>

In [None]:
#column 0 holds the (normalised) word, column 1 its word id
total_words_df = pd.DataFrame(list(total_words.items()))

#word_id of all words
total = set(total_words_df[1])

#generating set of good words
total_words_df = total_words_df[~total_words_df[0].isin(stop_words)]
good = list(total_words_df[1])
good = set(good)

#generating set of stop words
stop = total - good

#total word frequency of every class, computed once instead of once per stop word
class_totals = pb_j['word_frequency'].sum()

#Pr_dict is laid out as {word_id: {class_id: probability}}, so the word id is the
#outer key and the class id the inner one
#NOTE(review): only the unsmoothed table Pr_dict is adjusted, Pr_dict_a is left as is
for bad in stop:
    for j in class_totals.index:
        Pr_dict.setdefault(bad, {})[j] = a/(class_totals[j] + len(total_words))    #removing stop words
        

<h3>Calculating probability of every class given document of test dataset and then assigning class to every document</h3>

In [None]:
def assign_class(Pr_dict , dict):
    """Pick the highest-scoring class for every document.

    The score of a class starts at 1, adds
    ``log(1 + frequency) * log(P(word | class))`` for every word of the
    document whose probability is positive, and finally adds the log of the
    class prior taken from the module-level ``prob_class``. The starting
    value is the same for every class, so it does not influence the winner.

    Args:
        Pr_dict: nested dict ``{word_id: {class_id: probability}}``.
        dict: nested dict ``{doc_id: {word_id: frequency}}``; document ids
            are expected to run from 1 to ``len(dict)``. (The name shadows
            the builtin but is kept for compatibility with callers.)

    Returns:
        list with the winning class id of documents ``1 .. len(dict)``, in
        that order. Classes are ``1 .. len(count_perclass)``, where
        ``count_perclass`` is a module-level list.
    """
    result = []
    class_ids = range(1, len(count_perclass) + 1)
    for doc_id in range(1, len(dict) + 1):
        word_counts = dict[doc_id]
        prob_dict = {}
        for class_id in class_ids:
            score = 1
            for word_id, frequency in word_counts.items():
                prob = Pr_dict[word_id][class_id]
                #only positive probabilities contribute; zero is skipped as
                #before, and NaN (a missing table entry) is skipped as well so
                #that it cannot turn the whole class score into NaN
                if prob > 0:
                    score += np.log(1 + frequency) * np.log(prob)
            #adding the log prior of the class
            prob_dict[class_id] = score + np.log(prob_class[class_id])

        #finding the class with the maximum score
        result.append(max(prob_dict, key=prob_dict.get))

    return result

<h2>Building a dictionary of word frequencies per document (document id → word id → frequency) and classifying with or without Laplace smoothing</h2>

In [None]:
def algorithm(df , laplace_smoothing= False):
    """Classify every document described by a word-frequency table.

    Args:
        df: DataFrame with the columns ``document_id``, ``word_id`` and
            ``word_frequency``, one row per (document, word) pair.
        laplace_smoothing: when true the module-level ``Pr_dict_a`` table is
            used, otherwise the module-level ``Pr_dict`` table.

    Returns:
        The list returned by ``assign_class``: one class id per document.
    """
    #new_dict is a dictionary containing keys as document id's and values as
    #dictionaries containing wordId as keys and word frequency as values.
    new_dict = {}

    #walk the three columns in lockstep; this does not depend on how the
    #DataFrame index is labelled, and tolist() yields plain Python numbers
    rows = zip(df['document_id'].tolist() , df['word_id'].tolist() , df['word_frequency'].tolist())
    for doc_id, word_id, frequency in rows:
        new_dict.setdefault(doc_id, {})[word_id] = frequency

    probability_table = Pr_dict_a if laplace_smoothing else Pr_dict
    return assign_class(probability_table , new_dict)

<h3>Testing our model against the training dataset and calculating the error</h3>

In [None]:
#the model is evaluated on the same table it was built from
test_data_df = data_df

classification_withLaplace = algorithm(test_data_df , True)
classification_withoutLaplace = algorithm(test_data_df , False)

#true class id of every document, in document order, for verification of our model
training_data = list(class_doc[0])

#despite their names, both counters hold the number of misclassified documents
correctness = sum(
    1
    for predicted, actual in zip(classification_withoutLaplace, training_data)
    if predicted != actual
)
print("Error without laplace:\t\t", "{0:.5f}".format(correctness/ len(training_data) *100), "%")

correctness_a = sum(
    1
    for predicted, actual in zip(classification_withLaplace, training_data)
    if predicted != actual
)
print("Error with laplace:\t\t", "{0:.5f}".format(correctness_a/ len(training_data) *100) , "%")