In [23]:
from collections import Counter
import numpy as np
import nltk
from nltk.corpus import stopwords
from codecs import open
from __future__ import division

In [44]:
stop_words = set(stopwords.words('english'))
#print(stop_words)

In [152]:
def read_documents(doc_file):
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            text_data = words[3:]
            #Filtering stop words and also words with length 2 or less
            text_data = [words for words in text_data if words not in stop_words and len(words)>=3]
            #print(text_data)
            docs.append(text_data)
            labels.append(words[1])
    return docs, labels 

In [153]:
docs, labels = read_documents(r'test.txt')

In [149]:
#Splitting the data set into training and validation data
train_docs=[]

split_point = int(0.80*len(docs))
train_docs = docs[:split_point]
train_labels = labels[:split_point]  
val_docs = docs[split_point:]
val_labels = labels[split_point:]
#print(train_docs, train_labels)

In [150]:
def sizeOfVocab(doc):
    length=0
    for line in doc:
        length += len(line) 
    print("length-->", length)
    return length

In [242]:
def likelihood(freq, total_count, sizeOfVocab): 
    likelihood={}
    #For smoothing.. using alpha = 1
    alpha = 1
    for word, count in freq.items():
        likelihood[word] = (count + alpha)/(total_count + alpha*sizeOfVocab)
        print(word, likelihood[word]) 
    return likelihood

In [241]:
# This method would check whether the word present in one label also present in other label. 
# If not present than fill their likelihood using smoothing function
def checkWords(likelihood_pos, likelihood_neg,total_pos_words,total_neg_words,length):
    alpha = 1
    #Check if words in pos labels are present in neg labels and if not then fill their likelihood using smoothing function
    for key in likelihood_pos.keys(): 
        if not key in likelihood_neg: 
            likelihood_neg[key]= alpha/(total_neg_words + alpha*length)
            #print("likelihood_neg",key, likelihood_neg[key])
    
    #Check if words in neg labels are present in pos labels and if not then fill their likelihood using smoothing function
    for key in likelihood_neg.keys(): 
        if not key in likelihood_pos: 
            likelihood_pos[key]= alpha/(total_neg_words + alpha*length)
            #print("likelihood_pos",key, likelihood_pos[key])
    

In [243]:
#Function to calculate posterior for the documents
def calculatePosterior(likelihood_pos, likelihood_neg, prior_pior_pos, prior_pior_neg):
    
    sum_likehood_pos = 0
    sum_likehood_neg = 0
   
    #Summing up the likelihood of all pos words
    for word, prob in likelihood_pos.items():
        sum_likehood_pos += np.log(prob)
    
    s_pos = prior_pior_pos*sum_likehood_pos
    print("############# s_pos ",s_pos)
    
    #Summing up the likelihood of all neg words
    for word, prob in likelihood_neg.items():
        sum_likehood_neg += np.log(prob)
    
    s_neg = prior_pior_neg*sum_likehood_neg
    print("############# s_neg ",s_neg)
   
    post_prob_pos = s_pos/(s_pos + s_neg)
    post_prob_neg = s_neg/(s_pos + s_neg)
    #print("\n############# Posterior P(pos|a doc) and P(neg|a doc)",post_prob_pos, post_prob_neg)
        
    return post_prob_pos, post_prob_neg
    

In [261]:
# Classifying new documents - "doc" with a label "label-pos or neg"
#def score_doc_label(document, label, <SOMETHING>):
def score_doc_label(doc, prior_pos, prior_neg, pos_likehood, neg_likehood):
    prob=1
    
    list_of_words = ' '.join(doc).split()
    print("\nlist of words -> ",list_of_words)
    
    for word in list_of_words:
        if word in pos_likehood:              
            print("\n  word and its likelihood-",word,pos_likehood[word])
            prob*=pos_likehood[word]
        else:
            #Case where word is not present in training set/Vocabulary.. Do nothing
            pass
    final_pos_prob=prior_pos*prob
    
    for word in list_of_words:
        if word in neg_likehood:
            print("\n word and its likelihood-",word,neg_likehood[word])
            prob*=neg_likehood[word]
        else:
            #Case where word is not present in training set/Vocabulary.. Do nothing
            pass
    final_neg_prob=prior_neg*prob
    
    print("\nfinal_pos_prob....", final_pos_prob)
    print("\nfinal_neg_prob....", final_neg_prob)
    
    return final_pos_prob, final_neg_prob

In [245]:
#Step 2: Estimate the prior P(spam) and P(not spam)
def prior_prob_event(labels, event):
    label_count = Counter(labels)
    prior = label_count[event]/sum(label_count.values())
    return prior
    

In [246]:
def word_freq(docs, labels):
    # 1.2 Count how many times each word appears in both spams and not spams:
    freq_pos = Counter()
    freq_neg = Counter()
    for line, label in zip(docs, labels):
        if label == 'pos':
            freq_pos.update(line)
        else:
            freq_neg.update(line)
    return freq_pos, freq_neg

In [247]:
prior_pos = prior_prob_event(labels, 'pos')
prior_neg = prior_prob_event(labels, 'neg')
#print('prior pos', prior_pos)
#print('prior neg', prior_neg)

In [264]:
def train_nb(docs, labels):

    # 1.2 Count how many times each word appears in both spams and not spams:
    freq_pos, freq_neg = word_freq(docs, labels)
        
    # 1.3 Count how many words in total for each class:
    total_pos_words = sum(freq_pos.values())
    total_neg_words = sum(freq_neg.values())
    #print("Total pos & neg values", total_pos_words,total_neg_words )
    
    # 1.4 Estimate the likelihood P(wordi| spam) and P(wordi| not spam) for all wi ∈ V:
    length = sizeOfVocab(docs)
    
    pos_likehood = likelihood(freq=freq_pos, total_count= total_pos_words, sizeOfVocab=length)
    neg_likehood = likelihood(freq=freq_neg, total_count= total_neg_words, sizeOfVocab=length)
    #print('pos likehood', pos_likehood)

    # Calculate likelihood for words present in one label but not in other
    checkWords(freq_pos,freq_neg,total_pos_words,total_neg_words,length)
    
    #Step 2: Estimate the prior P(spam) and P(not spam)
    prior_pos = prior_prob_event(labels, 'pos')
    prior_neg = prior_prob_event(labels, 'neg') # not positive
    #print("prior_pos,prior_neg", prior_pos,prior_neg)
    
    # Calculate Posterior probability
    s_pos,s_neg= calculatePosterior(pos_likehood,neg_likehood,prior_pior_pos,prior_pior_neg)
    
    # Sample document and label to classify
    doc=["excellent read highly recommended"]
    label="pos"

    #classify new doc/line
    final_pos_prob,final_neg_prob = score_doc_label(doc, prior_pos, prior_neg, pos_likehood, neg_likehood)
    
    if final_pos_prob > final_neg_prob:
        print("\nNew lines have positive label......")
    else:
        print("\nNew lines have negative label.......")


In [265]:
train_nb(train_docs, labels)


length--> 158
anything 0.010526315789473684
purchase 0.010526315789473684
left 0.010526315789473684
behind 0.010526315789473684
series 0.010526315789473684
excellent 0.010526315789473684
read 0.010526315789473684
books 0.010526315789473684
great 0.015789473684210527
close 0.010526315789473684
bible 0.010526315789473684
entire 0.010526315789473684
set 0.010526315789473684
amazon 0.010526315789473684
shopping 0.010526315789473684
site 0.010526315789473684
ship 0.010526315789473684
fast 0.010526315789473684
would 0.010526315789473684
recommend 0.010526315789473684
christian 0.010526315789473684
wanting 0.010526315789473684
know 0.010526315789473684
expect 0.010526315789473684
return 0.010526315789473684
christ 0.010526315789473684
fiction 0.010526315789473684
still 0.010526315789473684
makes 0.010526315789473684
good 0.010526315789473684
point 0.010526315789473684
bought 0.007042253521126761
album 0.01056338028169014
loved 0.007042253521126761
title 0.007042253521126761
song 0.01408450704