In [26]:
#In this Question, I have implemented Multinomial Naive Bayes from Scratch 

import pandas as pd 
import numpy as np 
import re
from unidecode import unidecode

In [27]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [28]:
#This function is used to preprocess strings in Abstract for the arxiv dataset
def preprocess_string(str_arg):
    str_arg=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE)
    str_arg=re.sub('(\s+)',' ',str_arg)
    str_arg=str_arg.lower()
    str_arg = unidecode(str_arg)
    str_arg = remove_html(str_arg)
    return str_arg

In [29]:
class NaiveBayes:
    
    def __init__(self):
        #Just initializing the object here, nothing to do
        pass
        

    def addToBow(self,example,dict_index):
        #Algorithm - I am storing counts of all words as per Category Index in BoW 
        example=example[0] #This is because the Example will have the shape - (1,)
        for token_word in example.split(): #for every word in preprocessed example
            considered_dict = self.bow_dicts[dict_index]
            intVal = 0
            if token_word in considered_dict:
                intVal = considered_dict[token_word]
                self.bow_dicts[dict_index][token_word] = intVal + 1
            else:
                self.bow_dicts[dict_index][token_word] = 1
            
    def train(self,dataset,labels):
        self.examples=dataset
        self.labels=labels
        
        self.classes = np.unique(labels)
        
        bag_of_word_list = []
        for index in range(self.classes.shape[0]):
            bag_of_word_list.append(dict())
        self.bow_dicts = np.array(bag_of_word_list) #We want to return 0 if value does not exist
        #For Bag of Words, we have one Dictionary for Every Category
        
        self.examples=np.array(self.examples)
        self.labels=np.array(self.labels)
            
        #constructing BoW for each category
        for cat_index,cat in enumerate(self.classes):
            np.apply_along_axis(self.addToBow, 1, pd.DataFrame(self.examples[self.labels==cat]),cat_index) #I am creating the BoW for each class

        prob_classes=np.empty(self.classes.shape[0])
        all_words=[]
        cat_word_counts=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):
           
            #Calculating prior probability p(c) for each class
            #For our particular dataset, we will observe that the prior values are the same after computation
            number_of_label_rows = self.labels.shape[0]
            number_of_rows_each_class = np.sum(self.labels==cat)
            prob_classes[cat_index]=float(number_of_rows_each_class/number_of_label_rows)
            
            #Calculating total counts of all the words of each class 
            count=np.asarray(list(self.bow_dicts[cat_index].values()))
            cat_word_counts[cat_index]=np.sum(count)+1
            
            #get all words of this category                                
            all_words+=self.bow_dicts[cat_index].keys()
                                                     
        #combine all words of every category & make them unique to get vocabulary -V- of entire training set
        self.vocab=np.unique(np.array(all_words))
        self.vocab_length=self.vocab.shape[0]
                                  
        #computing denominator array (which we use later) 
        denoms_array = []
        for cat_index,cat in enumerate(self.classes):
            denoms_array.append(cat_word_counts[cat_index]+self.vocab_length+1)
        denoms = np.asarray(denoms_array) #We typecast
        
        #Storing BoW, Probability and Denom for each category (in this order):
        arr_new = []
        for cat_index,cat in enumerate(self.classes):
            arr_new.append([self.bow_dicts[cat_index],prob_classes[cat_index],denoms[cat_index]])
        self.cats_info = np.array(arr_new)
                                              
                                              
    def getExampleProb(self,test_example):                                                                
        likelihood_prob=np.zeros(self.classes.shape[0]) #to store probability w.r.t each class
        post_prob=np.zeros(self.classes.shape[0])
        #finding probability w.r.t each class of the given test example
        for cat_index,cat in enumerate(self.classes): 
            for test_token in test_example.split():
                word_map = self.cats_info[cat_index][0]
                if test_token in word_map: 
                    c_val = self.cats_info[cat_index][0].get(test_token)
                else:
                    c_val = 0 #Take 0 if value is absent 
                test_token_counts=c_val+1 #If we don't add +1, we are going to run into Log0 or undefined statements
                denominator_values_arr = self.cats_info[cat_index][2]
                test_token_prob=float(test_token_counts/denominator_values_arr)                       
                likelihood_prob[cat_index]=likelihood_prob[cat_index]+np.log10(test_token_prob) #log stops underflow
                                              
        for cat_index,cat in enumerate(self.classes):
            post_prob[cat_index]=likelihood_prob[cat_index]+np.log10(self.cats_info[cat_index][1])                                  
      
        return post_prob
    
   
    def test(self,test_set):
        predictions=[]
        for example in test_set:                                
            post_prob=self.getExampleProb(example)
            predictions.append(self.classes[np.argmax(post_prob)])
                
        return np.array(predictions)

In [30]:
train_df = pd.read_csv('E:/kaggle1/train.csv') #Please Replace here

In [31]:
#I am dropping duplicates if there are any
train_df=train_df.drop_duplicates()

#Preprocess the string
train_df['Abstract'] =  train_df['Abstract'].apply(preprocess_string)

#Preprocess some more!
train_df['Abstract'] = train_df['Abstract'].apply(lambda x: x.replace('&gt;', ''))

In [32]:
#Forming the DataSets
#80-20 Split
total_rows = train_df.shape[0]
eighty_p = 80/100 * total_rows
remaining = total_rows - eighty_p

X_training = train_df['Abstract'][0:int(eighty_p)]
X_testing = train_df['Abstract'][int(eighty_p):]
Y_Training = train_df['Category'][0:int(eighty_p)]
Y_Testing = train_df['Category'][int(eighty_p):]

In [33]:
nb=NaiveBayes() #instantiate a NB class object

In [34]:
nb.train(X_training,Y_Training) #start tarining by calling the train function

In [35]:
pclasses=nb.test(X_testing) #get predcitions for test set

In [36]:
#check how many predcitions actually match original test labels
test_acc=np.sum(pclasses==Y_Testing)/float(Y_Testing.shape[0])

In [37]:
print ("Test Set Examples: ",Y_Testing.shape[0])
print ("Test Set Accuracy: ",test_acc*100,"%")

Test Set Examples:  1500
Test Set Accuracy:  78.93333333333334 %


In [38]:
#Reference I used code and guidance from while coding the class: 

#1. https://towardsdatascience.com/unfolding-na%C3%AFve-bayes-from-scratch-2e86dcae4b01
#2. https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/    
#3. https://github.com/aishajv/Unfolding-Naive-Bayes-from-Scratch/blob/master/%23%20Unfolding%20Na%C3%AFve%20Bayes%20from%20Scratch!%20Take-2%20%F0%9F%8E%AC.ipynb

In [39]:
#Now, let us run this against a CSV that can work in Kaggle

In [40]:
test_df = pd.read_csv('E:/kaggle1/test.csv')

In [41]:
#WE MUST apply the same preprocessing to the test dataframe. Otherwise we are making a grave mistake and might get worse predictions. 

#I am dropping duplicates if there are any
test_df=test_df.drop_duplicates()

#Preprocess the string
test_df['Abstract'] =  test_df['Abstract'].apply(preprocess_string)

#Preprocess some more!
test_df['Abstract'] = test_df['Abstract'].apply(lambda x: x.replace('&gt;', ''))

In [42]:
testFrame = test_df['Abstract']

In [43]:
pclasses2=nb.test(testFrame)

In [44]:
pclasses2

array(['stat.ML', 'astro-ph.SR', 'astro-ph.SR', ..., 'astro-ph.GA',
       'gr-qc', 'cond-mat.mes-hall'], dtype='<U17')

In [45]:
df_final_preds = pd.DataFrame(pclasses2)

In [46]:
df_final_preds = df_final_preds.rename(columns={0: "Category"})

In [47]:
df_final_preds["Id"] = df_final_preds.index

In [48]:
columns_titles = ["Id","Category"]
df_final_preds=df_final_preds.reindex(columns=columns_titles)

In [49]:
df_final_preds.to_csv("C:/kaggleMultiNB", index = False) #Generates File. It can be used to submit on Kaggle 

In [50]:
#Note: To improve performance, we can train on the whole dataset instead of 80%. 