In [1]:
from __future__ import division
import numpy as npzz
from collections import Counter
from codecs import open
import numpy as np


def read_documents(doc_file):
    """Read a labelled, whitespace-tokenised corpus from a UTF-8 text file.

    Every non-blank line is split on whitespace. The second token is used as
    the document's label and everything from the fourth token onwards as the
    document's words; the first and third tokens are ignored (presumably a
    topic and a document id -- confirm against the data file).

    Blank or whitespace-only lines are skipped, so a stray empty line in the
    file no longer aborts loading with an IndexError.

    Args:
        doc_file: path of the corpus file.

    Returns:
        A pair ``(docs, labels)`` of equally long lists: ``docs[i]`` is the
        list of word tokens of the i-th document and ``labels[i]`` its label.

    Raises:
        IndexError: if a non-blank line contains fewer than two tokens.
    """
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            if not words:
                # Nothing to parse on an empty line; indexing it would fail.
                continue
            docs.append(words[3:])
            labels.append(words[1])
    return docs, labels

In [2]:
def train_naiveBayes(documents, labels):
    """Estimate the parameters of a two-class ('pos' / 'neg') Naive Bayes model.

    Prints both class priors and returns per-class relative word frequencies.

    Args:
        documents: sequence of tokenised documents (each a list of words).
        labels: labels aligned with ``documents``. Only the exact label 'pos'
            selects the positive class; documents with any other label are
            pooled with the negative documents.

    Returns:
        A 6-tuple ``(posprobs, negprobs, ProbabilityOfPositive,
        ProbabilityOfNegative, totalfreq_pos, totalfreq_neg)``:
        ``posprobs`` / ``negprobs`` are Counters mapping each word to its
        share of all word occurrences in that class (missing words read as 0),
        the two priors are the fractions of documents labelled 'pos' and
        'neg', and the two totals are the word-occurrence counts per class.
    """
    # Class priors: fraction of the training documents carrying each label.
    label_counts = Counter(labels)
    n_documents = len(documents)
    ProbabilityOfPositive = label_counts['pos'] / n_documents
    ProbabilityOfNegative = label_counts['neg'] / n_documents
    print('Probability that a review in a corpus is annotated as positive is: ', ProbabilityOfPositive)
    print('Probability that a review in a corpus is annotated as negative is: ', ProbabilityOfNegative)

    # Word counts per class, gathered in a single pass over the training set.
    pos_word_counts = Counter()
    neg_word_counts = Counter()
    for doc, label in zip(documents, labels):
        if label == 'pos':
            pos_word_counts.update(doc)
        else:
            neg_word_counts.update(doc)

    totalfreq_pos = sum(pos_word_counts.values())
    totalfreq_neg = sum(neg_word_counts.values())

    # Normalise counts to relative frequencies. Counters are returned so that
    # looking up a word never seen in a class yields 0 rather than a KeyError.
    posprobs = Counter({word: count / totalfreq_pos for word, count in pos_word_counts.items()})
    negprobs = Counter({word: count / totalfreq_neg for word, count in neg_word_counts.items()})

    return (posprobs, negprobs, ProbabilityOfPositive, ProbabilityOfNegative, totalfreq_pos, totalfreq_neg)



In [3]:
""" depending whether the prob of positive or negative is higher, annotate the high one as a predicted label """
def classify_label_byprobs(probOfPos, ProbOfNeg):
    """Return the label whose score is larger.

    Args:
        probOfPos: score of the positive class.
        ProbOfNeg: score of the negative class.

    Returns:
        'pos' if the positive score is strictly greater, otherwise 'neg'
        (so ties are resolved as 'neg').
    """
    return 'pos' if probOfPos > ProbOfNeg else 'neg'


In [4]:
""" Given the predicted and true labels calclutate the ration of right-classified and missclassified labels"""
def accuracy(true_labels, guessed_labels):
    """Return the fraction of guessed labels that equal the true labels.

    Labels are compared position by position over the length of
    ``guessed_labels``; any extra entries in ``true_labels`` are ignored.

    Args:
        true_labels: reference labels, indexable by position.
        guessed_labels: predicted labels.

    Returns:
        Number of matching positions divided by ``len(guessed_labels)``.

    Raises:
        ZeroDivisionError: if ``guessed_labels`` is empty.
        IndexError: if ``true_labels`` is shorter than ``guessed_labels``.
    """
    n_correct = sum(
        1
        for position, guess in enumerate(guessed_labels)
        if guess == true_labels[position]
    )
    return n_correct / len(guessed_labels)

In [5]:
"""  classify the reviews in the eval/test data as pos or negative labeled """
def classify_documents(docs, alpha, Posprobs, Negprobs, PriorPositive, PriorNegative, totalPos, totalNeg):
    """Label each document 'pos' or 'neg' from additively smoothed log-scores.

    For every word of a document the smoothed log-likelihood under each class
    is accumulated, the log-prior of the class is added, and the two totals
    are handed to ``classify_label_byprobs``.

    Args:
        docs: iterable of tokenised documents (each an iterable of words).
        alpha: additive (Laplace) smoothing constant.
        Posprobs, Negprobs: per-class word -> relative-frequency mappings.
            They are indexed with arbitrary words, so they must yield 0 for
            unknown words (e.g. ``collections.Counter``).
        PriorPositive, PriorNegative: class priors.
        totalPos, totalNeg: total word-occurrence counts per class; multiplied
            with a relative frequency they recover that word's count.

    Returns:
        List with one predicted label per document, in input order.
    """
    # The smoothing denominators depend only on the model, so build them once.
    # NOTE(review): each class uses the size of its *own* vocabulary here,
    # not the size of a shared vocabulary -- confirm this is intended.
    pos_denominator = alpha * len(Posprobs) + totalPos
    neg_denominator = alpha * len(Negprobs) + totalNeg

    predicted_labels = []
    for doc in docs:
        pos_log_likelihood = 0
        neg_log_likelihood = 0

        for word in doc:
            # Seen word: smoothed count; unseen word: only the pseudo-count.
            pos_prob = Posprobs[word]
            pos_numerator = pos_prob * totalPos + alpha if pos_prob > 0 else alpha
            pos_log_likelihood += np.log(pos_numerator / pos_denominator)

            neg_prob = Negprobs[word]
            neg_numerator = neg_prob * totalNeg + alpha if neg_prob > 0 else alpha
            neg_log_likelihood += np.log(neg_numerator / neg_denominator)

        # Posterior is proportional to likelihood * prior, i.e. a sum in log space.
        pos_log_posterior = pos_log_likelihood + np.log(PriorPositive)
        neg_log_posterior = neg_log_likelihood + np.log(PriorNegative)

        predicted_labels.append(classify_label_byprobs(pos_log_posterior, neg_log_posterior))

    return predicted_labels

In [6]:
"""  Reaading the corpus """
all_docs, all_labels = read_documents('all_sentiment_shuffled.txt')

# First 80% of the corpus is used for training, the remaining 20% for evaluation.
# The split is purely positional, so it relies on the file order already being
# random (the file name suggests it is shuffled -- confirm).
split_point = int(0.80*len(all_docs))
train_docs, eval_docs = all_docs[:split_point], all_docs[split_point:]
train_labels, eval_labels = all_labels[:split_point], all_labels[split_point:]

In [7]:
"""  send inputs to the functions and get outputs  """
# Fit the model on the training split.
(Posprobs, Negprobs,
 ProbabilityOfPositive, ProbabilityOfNegative,
 totalPos, totalNeg) = train_naiveBayes(train_docs, train_labels)

# Smoothing constant handed to the classifier as its alpha.
alpha_laplace = 1.0
predicted_labels1 = classify_documents(
    eval_docs,
    alpha_laplace,
    Posprobs,
    Negprobs,
    ProbabilityOfPositive,
    ProbabilityOfNegative,
    totalPos,
    totalNeg,
)
Accuracyy = accuracy(eval_labels, predicted_labels1)

print('The accuracy of predicted labels is: ', Accuracyy)

Probability that a review in a corpus is annotated as positive is:  0.5085510439618088
Probability that a review in a corpus is annotated as negative is:  0.49144895603819116
The accuracy of predicted labels is:  0.8124213176668066


In [8]:
"""  The function return shortest documents with desired lenght  """
def findSome_ShortestDocs(docs, labels, doc_length):
    """Select the documents that are shorter than ``doc_length`` tokens.

    Args:
        docs: iterable of tokenised documents.
        labels: labels aligned with ``docs``, indexable by position.
        doc_length: exclusive upper bound on the number of tokens.

    Returns:
        A pair ``(shortdocs, corresponinglabels)``: the documents with
        ``len(doc) < doc_length`` in their original order, and their labels.
    """
    selected = [
        (doc, labels[position])
        for position, doc in enumerate(docs)
        if len(doc) < doc_length
    ]
    shortdocs = [doc for doc, _ in selected]
    corresponinglabels = [label for _, label in selected]
    return (shortdocs, corresponinglabels)

In [9]:
"""  The function return missclassified documents and their true label  """    
def misclassificated_docs_And_lebels(test_docs ,true_labels, guessed_labels):
    """Collect the documents whose guessed label differs from the true label.

    Args:
        test_docs: documents, indexable by position.
        true_labels: reference labels aligned with ``test_docs``.
        guessed_labels: predicted labels; its length determines how many
            positions are compared.

    Returns:
        A pair ``(miss_docs_eval, truelabel_for_Misc)``: the misclassified
        documents in their original order, and their true labels.
    """
    wrong_positions = [
        position
        for position, guess in enumerate(guessed_labels)
        if guess != true_labels[position]
    ]
    miss_docs_eval = [test_docs[position] for position in wrong_positions]
    truelabel_for_Misc = [true_labels[position] for position in wrong_positions]
    return (miss_docs_eval, truelabel_for_Misc)

In [10]:
"""  The possible reasons behind the missclassifaication  """
missdocs, misslabels = misclassificated_docs_And_lebels(eval_docs, eval_labels, predicted_labels1)

# Keep only the misclassified reviews with fewer than 12 tokens so they can be read by eye.
shortestmiss_docs, correpondedlabels = findSome_ShortestDocs(missdocs, misslabels, 12)

print(shortestmiss_docs)
print(correpondedlabels)

# Relative frequency of a few words from those reviews, one value per line:
# positive-class value first, then negative-class value, for each word in turn.
print(
    *(probs[word] for word in ('not', 'great', 'goo') for probs in (Posprobs, Negprobs)),
    sep='\n'
)
print('One reaason of not-higher-accuracy-than-81% is that the model dont set higher negative prob. to the word "not" and higher positive prob. to the word "great" ')
print('Other reason can be failure in annotating the reviews and difficulty of annotating to neutral words such as "goo" ')


[['i', 'would', 'have', 'liked', 'a', 'more', 'real', 'story', 'in', 'this', 'dvd'], ['it', "'s", 'not', 'great', 'music', 'to', 'dance', 'to'], ['goo']]
['neg', 'neg', 'pos']
0.004602605646306818
0.007199447207783851
0.0027854225852272725
0.001152582960795394
1.3871626420454546e-06
4.196297187361872e-06
One reaason of not-higher-accuracy-than-81% is that the model dont set higher negative prob. to the word "not" and higher positive prob. to the word "great" 
Other reason can be failure in annotating the reviews and difficulty of annotating to neutral words such as "goo" 
