In [1]:
import numpy as np
import pandas as pd
import os
import glob
import string
import re
from matplotlib import pyplot
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [2]:
def dataLoader(dataType):
    '''Load one split of the review dataset into a DataFrame.

    Files are read from ``<cwd>/Dataset/<dataType>/neg`` (label 0) and
    ``<cwd>/Dataset/<dataType>/pos`` (label 1). Only the first line of each
    file is kept as the document text; an empty file contributes an empty
    string instead of raising ``IndexError``.

    Parameters
    ----------
    dataType : str
        Name of the split sub-directory, e.g. "train" or "test".

    Returns
    -------
    pandas.DataFrame
        Columns "Text" and "Label"; all negative documents first, then all
        positive ones, each group ordered by file path.
    '''
    tList = []
    label = []
    for folder, value in (("neg", 0), ("pos", 1)):
        pattern = os.path.join(os.getcwd(), "Dataset", dataType, folder, "*")
        # glob returns files in arbitrary order; sorting makes the rows reproducible
        for name in sorted(glob.glob(pattern)):
            with open(name, encoding="utf8") as f:
                # readline() yields "" for an empty file, where readlines()[0] would fail
                tList.append(f.readline())
            label.append(value)

    df = pd.DataFrame(list(zip(tList, label)), columns=["Text", "Label"])
    return df
    
    

In [3]:
#Reading both dataset splits from disk into labelled DataFrames
train, test = (dataLoader(split) for split in ("train", "test"))

In [4]:
#Reusing from PA3
def loadPositveNegativeStop():
    '''Load the stop-word list from ``<cwd>/Dataset/stop_words.txt``.

    Each line of the file is stripped of surrounding whitespace and treated
    as one stop word. Blank lines are skipped so that no empty string ends up
    in the list. If the file does not exist an empty list is returned.

    Returns
    -------
    list of str
        The stop words in file order.
    '''
    path = os.path.join(os.getcwd(), "Dataset", "stop_words.txt")
    if not os.path.isfile(path):
        return []

    with open(path, encoding="utf8") as f:
        stripped = (line.strip() for line in f)
        # an empty stop word would later turn into the regex r'\b\b'
        return [word for word in stripped if word]
    

In [5]:
#The stop words are loaded once here; preprocessing() reads this global list
stop = loadPositveNegativeStop() #loading stop words

In [6]:
#reusing from PA3
_STOPWORD_PATTERNS = {}


def _stopword_pattern(words):
    '''Return one compiled regex matching any of words as a whole word, or None if there are none.'''
    key = tuple(words)
    if key not in _STOPWORD_PATTERNS:
        # re.escape keeps regex metacharacters inside a stop word literal; empty
        # entries are dropped because r'\b\b' would match at every word boundary
        escaped = [re.escape(word) for word in key if word]
        _STOPWORD_PATTERNS[key] = (
            re.compile(r'\b(?:' + '|'.join(escaped) + r')\b') if escaped else None
        )
    return _STOPWORD_PATTERNS[key]


def preprocessing(text):
    '''Normalise one document for the bag-of-words models.

    Steps: lowercase the text, replace every character that is not an ASCII
    letter with a space, replace every whole-word occurrence of a stop word
    (taken from the global ``stop`` list) with a space, and strip leading and
    trailing whitespace. Runs of spaces inside the text are kept.

    All stop words are removed in a single regex pass using a pattern that is
    compiled once per distinct stop-word list, instead of one pass per word.
    No separate punctuation pass is needed: after the letters-only
    substitution the text contains only letters and spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    '''
    #lowercasing
    text = text.lower()

    #removing everything except alphabets (this also removes all punctuation)
    text = re.sub("[^a-zA-Z]", " ", text)

    #removing stopwords using a single pre-compiled regex
    pattern = _stopword_pattern(stop)
    if pattern is not None:
        text = pattern.sub(" ", text)

    return text.strip()
    
    

In [7]:
#Adding the cleaned 'Ptext' column to both the training and the testing frame
for frame in (train, test):
    frame['Ptext'] = frame['Text'].apply(preprocessing)


In [8]:
#Optional: uncomment to cache the preprocessed frames on disk so that preprocessing need not be re-run
# train.to_pickle("train.pkl")
# test.to_pickle("test.pkl")

In [9]:
#Optional: uncomment to reload previously cached preprocessed frames instead of re-running preprocessing
# train = pd.read_pickle("train.pkl")
# test = pd.read_pickle("test.pkl")

In [10]:
def createDictionary(data):
    '''Count the whitespace-separated tokens found in data['Ptext'].

    Returns a pair (V, sums): V is a Counter mapping each token to the number
    of times it occurs, sums is the total number of tokens over all documents.
    '''
    vocabulary = Counter()
    token_total = 0
    for document in data['Ptext']:
        tokens = document.split()
        vocabulary.update(tokens)
        token_total += len(tokens)

    return vocabulary, token_total
            
            

In [11]:
#Series.ravel() is deprecated in recent pandas releases, so to_numpy() is used instead
C = np.unique(train['Label'].to_numpy()) #Getting unique classes from data as said in the algorithm
D = train[['Ptext','Label']] #Creating Document as specified in the algorithm

In [12]:
def TrainNaiveBayes(D,C):
    '''Train a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    D : pandas.DataFrame
        Documents, with a 'Ptext' column (preprocessed text) and a 'Label'
        column (class of each document).
    C : iterable of int
        The class labels to train for. Labels must be non-negative integers
        because they are used as list indices; any number of classes is
        supported.

    Returns
    -------
    logprior : list
        logprior[c] is log(N_c / N_doc) for every c in C. The list always has
        at least two slots; slots of labels not in C stay 0.
    loglikelihood : collections.Counter
        Maps (word, c) to log((count(word, c) + 1) / (tokens_in_c + |V|)) for
        every vocabulary word and every c in C.
    V : collections.Counter
        Vocabulary (token -> frequency) over all documents in D.
    '''
    classes = [int(c) for c in C]
    Ndoc = len(D)

    #Calculating logpriors; at least two slots are kept for the binary case
    logprior = [0] * (max(classes + [1]) + 1)

    #creating dictionary
    V,_ = createDictionary(D)
    vocab_size = len(V)

    #Calculating loglikelihood class by class
    loglikelihood = Counter()

    for c in classes:
        class_docs = D[D['Label']==c]
        logprior[c] = np.log(len(class_docs)/Ndoc)

        #bigDoc of this class: token counts and total number of tokens
        counts,total = createDictionary(class_docs)
        denominator = total + vocab_size

        for word in V:
            #add-one smoothing so unseen (word, class) pairs keep a non-zero probability
            loglikelihood[(word,c)] = np.log((counts[word]+1)/denominator)

    return logprior,loglikelihood,V

In [13]:
#Training the hand-written Naive Bayes model; these three globals are also
#used as the default arguments of TestNaiveBayes
logprior,loglikelihood,V = TrainNaiveBayes(D,C)

In [14]:
def TestNaiveBayes(test,logprior=logprior,loglikelihood=loglikelihood,C=C,V=V):
    '''Classify one preprocessed document with the trained Naive Bayes model.

    Parameters
    ----------
    test : str
        Preprocessed document text; it is split on whitespace.
    logprior, loglikelihood, V :
        Model parameters as returned by TrainNaiveBayes. The defaults are
        bound when this function is defined, so the defining cell must be
        re-run after retraining for new defaults to take effect.
    C : iterable of int
        Class labels to score. Only these classes can be predicted.

    Returns
    -------
    tuple
        (predicted class label, its log score). On ties the class that comes
        first in C wins.

    Raises
    ------
    ValueError
        If C is empty.
    '''
    classes = list(C)
    if not classes:
        raise ValueError("C must contain at least one class")

    #ignoring words that are not in the dictionary
    known = [word for word in test.split() if word in V]

    scores = []
    for c in classes:
        #Adding logprior, then the log-likelihood of every known word
        score = logprior[c]
        for word in known:
            score += loglikelihood[(word,c)]
        scores.append(score)

    #returning the max probability class label and its probability;
    #only classes in C are scored, so an unscored class can never win
    scores = np.array(scores)
    best = np.argmax(scores)
    return classes[best],scores[best]

In [15]:
#Getting predictions for the testing data and printing the accuracy
#(to_numpy() replaces Series.ravel(), which is deprecated in recent pandas releases)
ytrue = test['Label'].to_numpy()
ypred = np.array([TestNaiveBayes(text)[0] for text in test['Ptext']])
accuracy = (ypred==ytrue).mean()
print("Accuracy on testing data is:",accuracy*100)

Accuracy on testing data is: 82.528


In [16]:
def confusionMatrix(Ytrue,Ypred):
    '''Print the 2x2 confusion matrix of binary labels (0 and 1).

    Rows are the true label and columns the predicted label, so the printed
    array is [[true0/pred0, true0/pred1], [true1/pred0, true1/pred1]].
    Pairs containing any other value are not counted. Nothing is returned.
    '''
    cells = [[0, 0], [0, 0]]

    for actual, predicted in zip(Ytrue, Ypred):
        for row in (0, 1):
            for col in (0, 1):
                if actual == row and predicted == col:
                    cells[row][col] += 1

    print(np.array(cells))
            

In [17]:
#ytrue and ypred hold the testing labels and the predictions made on the testing data
print("Confusion matrix on testing data is:")
confusionMatrix(ytrue,ypred)

Confusion matrix on training data is:
[[11022  1478]
 [ 2890  9610]]


### Part 2

In [18]:
#Creating the training and testing corpus as plain lists of strings
#(tolist() replaces list(Series.ravel()); Series.ravel() is deprecated in recent pandas releases)
train_corpus = train['Ptext'].tolist()
test_corpus = test['Ptext'].tolist()

In [19]:
#Creating an instance of CountVectorizer with its default settings
vectorizer = CountVectorizer()

In [20]:
#Fitting the vectorizer on the training corpus and transforming it, then only
#transforming (not re-fitting) the testing corpus with the fitted vectorizer
XTrain = vectorizer.fit_transform(train_corpus)
XTest = vectorizer.transform(test_corpus)

In [21]:
#Converting labels into arrays
#(to_numpy() replaces Series.ravel(), which is deprecated in recent pandas releases)
YTrain = train['Label'].to_numpy()
YTest = test['Label'].to_numpy()

In [22]:
#Creating a multinomial naive Bayes classifier with its default parameters
#and fitting it on the training count matrix and training labels
mnb = MultinomialNB()
mnb.fit(XTrain,YTrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
#Getting predictions from the model on testing data
#Calculating accuracy and printing the confusion matrix
#The predictions come from XTest, so they are scored against the testing labels (YTest)
ypredp2 = mnb.predict(XTest)
acc = accuracy_score(YTest,ypredp2)
print("Accuracy on testing data is:",acc*100)
print("The confusion matrix is as follows")
print(confusion_matrix(YTest,ypredp2,labels=[0,1]))

Accuracy on testing data is: 82.528
The confusion matrix is as follows
[[11026  1474]
 [ 2894  9606]]
