In [21]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re
import nltk
import pickle
import sys


train = pd.read_csv('/home/stephen/Classes/Machine Learning/Hanna-hw2/training.csv')
test = pd.read_csv('/home/stephen/Classes/Machine Learning/Hanna-hw2/test.csv')

In [22]:
class NaiveBayes:
    
    def __init__(self,unique_classes):
        
        self.classes=unique_classes # Constructor is sinply passed with unique number of classes of the training set
        

    def addToBow(self,example,dict_index):
        
        if isinstance(example,np.ndarray): example=example[0]
     
        for token_word in example.split(): #for every word in preprocessed example
          
            self.bow_dicts[dict_index][token_word]+=1 #increment in its count
            
    def train(self,dataset,labels):
    
        self.examples=dataset
        self.labels=labels
        self.bow_dicts=np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        
        if not isinstance(self.examples,np.ndarray): self.examples=np.array(self.examples)
        if not isinstance(self.labels,np.ndarray): self.labels=np.array(self.labels)
            
        #constructing BoW for each category
        for cat_index,cat in enumerate(self.classes):
            
            examples=pd.DataFrame(train)
            filtered = examples[examples['label'] == cat]
            filtered.columns = ['review', 'label']
            cleaned_examples = filtered['review']
            cleaned_examples = pd.DataFrame(filtered)
            cleaned_examples = cleaned_examples.reset_index(drop=True)
            
            #now costruct BoW of this particular category
            np.apply_along_axis(self.addToBow,1,cleaned_examples,cat_index)
            
      
        prob_classes=np.empty(self.classes.shape[0])
        all_words=[]
        cat_word_counts=np.empty(self.classes.shape[0])
        topop = []
        for cat_index,cat in enumerate(self.classes):
            
            for key in self.bow_dicts[cat_index]:
                counter = self.bow_dicts[cat_index].get(key)
                if counter < 3:
                    topop.append(key)
            for i in topop:
                self.bow_dicts[cat_index].pop(i, None)
           
            #Calculating prior probability p(c) for each class
            prob_classes[cat_index]=np.sum(self.labels==cat)/float(self.labels.shape[0]) 
            
            #Calculating total counts of all the words of each class 
            count=list(self.bow_dicts[cat_index].values())
            cat_word_counts[cat_index]=np.sum(np.array(list(self.bow_dicts[cat_index].values())))+1 # |v| is remaining to be added
            
            #get all words of this category                                
            all_words+=self.bow_dicts[cat_index].keys()
                                                     
        #combine all words of every category & make them unique to get vocabulary -V- of entire training set
        
        self.vocab=np.unique(np.array(all_words))
        self.vocab_length=self.vocab.shape[0]
                                  
        #computing denominator value                                      
        denoms=np.array([cat_word_counts[cat_index]+self.vocab_length+1 for cat_index,cat in enumerate(self.classes)])                                                                          
        
        self.cats_info=[(self.bow_dicts[cat_index],prob_classes[cat_index],denoms[cat_index]) for cat_index,cat in enumerate(self.classes)]
        self.cats_info=np.array(self.cats_info)
                                              
                                              
    def getExampleProb(self,test_example):                                
                             
                                              
        likelihood_prob=np.zeros(self.classes.shape[0])#to store probability w.r.t each class
        
        #finding probability w.r.t each class of the given test example
        for cat_index,cat in enumerate(self.classes):
            for test_token in test_example.split(): #split the test example and get p of each test word
                
                #get total count of this test token from it's respective training dict to get numerator value                           
                test_token_counts=self.cats_info[cat_index][0].get(test_token,0)+1
                
                #now get likelihood of this test_token word                              
                test_token_prob=test_token_counts/float(self.cats_info[cat_index][2])                              
                
                #remember why taking log? To prevent underflow!
                likelihood_prob[cat_index]+=np.log(test_token_prob)                               
        # we have likelihood estimate of the given example against every class but we need posterior probility
        post_prob=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):
            post_prob[cat_index]=likelihood_prob[cat_index]+np.log(self.cats_info[cat_index][1])                                  
        
        return post_prob
    
   
    def test(self,test_set):
       
        predictions=[] #to store prediction of each test example
        for example in test_set: 
             
            #simply get the posterior probability of every example                                  
            post_prob=self.getExampleProb(example) #get prob of this example for both classes
            
            #simply pick the max value and map against self.classes!
            predictions.append(self.classes[np.argmax(post_prob)])
                
        return np.array(predictions)

In [23]:
train_data=train.review
train_labels=train.label
print ("Total Number of Training Examples: ",len(train_data))
print ("Total Number of Training Labels: ",len(train_labels))
nb=NaiveBayes(np.unique(train_labels)) #instantiate a NB class object

print ("---------------- Training In Progress --------------------")
 
nb.train(train_data,train_labels) #start tarining by calling the train function

print ('----------------- Training Completed ---------------------')

Total Number of Training Examples:  25000
Total Number of Training Labels:  25000
---------------- Training In Progress --------------------
----------------- Training Completed ---------------------


In [24]:
test_data=test.review #get test set examples
test_labels=test.label #get test set labels
print ("Number of Test Examples: ",len(test_data))
print ("Number of Test Labels: ",len(test_labels))

Number of Test Examples:  25000
Number of Test Labels:  25000


In [25]:
pclasses=nb.test(test_data) #get predcitions for test set

#check how many predcitions actually match original test labels
test_acc=np.sum(pclasses==test_labels)/float(test_labels.shape[0]) 

print ("Test Set Examples: ",test_labels.shape[0])
print ("Test Set Accuracy: ",test_acc*100,"%")

Test Set Examples:  25000
Test Set Accuracy:  83.04 %
