In [1]:
import numpy as np
import csv
import math
from nltk import word_tokenize
from collections import defaultdict
from collections import Counter
from functools import reduce
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Constants
WSD_TRAIN = 'wsd_train.txt'
WSD_TEST = 'wsd_test.txt'

In [3]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
import re
functionwords = {'everyone', 'himself', 'it', 'his', 'everything', 'little', 'those', 'inside', 'on', 'off', 'over', 
                 'of', 'first', 'within', 'around', 'near', 'so', 'would', 'else', 'for', 'moreover', 'besides', 
                 'into', 'while', 'here', 'never', 'such', 'each', 'who', 'anyone', 'through', 'despite', 'might',
                 'that', 'will', 'anything', 'in', 'therefore', 'your', 'someone', 'a', 'few', 'do', 'second', 'down',
                 'themself', 'usually', 'one', 'with', 'any', 'onto', 'all', 'to', 'must', 'herself', 'him', 'most', 'much',
                 'but', 'along', 'should', 'my', 'an', 'no', 'against', 'before', 'could', 'now', 'there', 'meanwhile',
                 'be', 'instead', 'during', 'them', 'from', 'less', 'if', 'something', 'ones', 'he', 'two', 'sometimes',
                 'yours', 'have', 'however', 'otherwise', 'its', 'though', 'often', 'toward', 'than', 'their', 'then',
                 'half', 'least', 'although', 'nothing', 'her', 'next', 'as', 'across', 'always', 'many', 'how', 'anyway',
                 'when', 'this', 'behind', 'own', 'both', 'at', 'itself', 'last', 'hers', 'other', 'they', 'our',
                 'incidentally', 'may', 'whose', 'beside', 'without', 'about', 'she', 'some', 'where', 'can', 'and',
                 'because', 'every', 'theirs', 'twice', 'another', 'since', 'what', 'after', 'which', 'these', 'more',
                 'shall', 'by', 'several', 'the', 'or','.'}
stopwords.update(functionwords) 


In [132]:
class Bayes1:
    
    def __init__(self, train):        
        self.prior = dict()
        self.likelihood = dict()  
        self.classes = None      
        self.classes_l= None
        self.bigdoc = defaultdict(list)
        self.word_counts =dict() 
        self.train_labels= []
        self.train_texts= [] 
        for i in train:
            if i[0]!='product':
                self.train_labels.append(i[0])
                self.train_texts.append(i[1]) 
            
      
    def stemm(self,filtered):
        stemmer = PorterStemmer()
        stemme = []        
        for t in filtered:            
            stemmed_word = stemmer.stem(t.lower())
            stemme.append(stemmed_word)            
        return stemme        
    
        
    def train(self):
        self.classes = set(self.train_labels)        
        self.classes_l= self.train_labels
        self.bigdoc = defaultdict(list)
        
        for e in range(len(self.train_texts)):
            text=re.sub('[^a-zA-Z.\d\s]', '',self.train_texts[e].lower())
            context=set(word_tokenize(text)).difference(stopwords)
            context=self.stemm(context)
            context=list(context)            
            self.bigdoc[self.train_labels[e]] += context
        
        self.class_counts = Counter(self.train_labels) 
        print(self.class_counts) 
        self.compute_prior()
        self.compute_likelihood()        
        
    
    def compute_prior(self):
        class_count = Counter(self.classes_l)
        self.prior = {c: class_count[c]/len(self.classes_l) for c in self.classes}
        print(self.prior)
        

    def compute_likelihood(self):
        self.word_counts1 = {c: dict(Counter(self.bigdoc[c]).most_common())  for c in self.classes}
                       
        self.likelihood=dict()        
        for c in self.classes:
            k=dict()
            print('for class',c, '----',len(self.word_counts1[c]),'No. of words present in its dictionary')
            for w in self.bigdoc[c]:                
                if self.word_counts1[c].get(w)!=None:
                    k.update({w: self.word_counts1[c][w]+1/len(self.bigdoc[c])+2})                    
                else:                    
                    k.update({w: 0.5})
            self.likelihood.update({c:k})                
        
            
    def log(self, n):
        return math.log(n, 2)
    
    def predict(self, sentence):        
        sentence= re.sub('[^a-zA-Z.\d\s]', '', sentence.lower())
        sentence=set(word_tokenize(sentence)).difference(stopwords)
        words=self.stemm(sentence )
        prediction = 'None'        
        z=[]
        for c in self.classes:            
            prior_score = self.log(self.prior[c])            
            l = lambda w: self.log(self.likelihood[c][w]) if w in self.likelihood[c] else self.log(1/len(words))                 
            word_likelihoods = list(map(l, words))             
            likelihood_score = sum(word_likelihoods)            
            score = prior_score + likelihood_score          
            z.append(score)
        
        z=np.array(z)         
        loc=np.argmax(z)         
        self.classes=list(self.classes)        
        prediction=self.classes[loc]         
        return prediction

    def report_score(self, num_correct, k):        
        print("Predicted word sense with {0}% accuracy\n".format(num_correct/k * 100))

    

In [133]:
with open(WSD_TRAIN, 'r') as f:
     data = list(csv.reader(f, delimiter='\t'))

with open(WSD_TEST, 'r') as f:
    tests = list(csv.reader(f, delimiter='\t'))

""" CREATE AND TRAIN THE MODEL """
bayes_classifier = Bayes1(data)
bayes_classifier.train()

test_labels=[]
test_texts=[]
num_correct = 0
k=0
for i in tests:
    if i[0]!='product':
        test_labels.append(i[0])
        test_texts.append(i[1]) 
        
for i in range(len(test_labels)):
    #print(test_labels[i],'----',bayes_classifier.predict(test_texts[i]))
    if (bayes_classifier.predict(test_texts[i]) == test_labels[i]):        
        k=k+1        
        num_correct += 1
    else:
        k=k+1
        pass

print("")
print("-----------------------")
bayes_classifier.report_score(num_correct, k)

Counter({'phone': 305, 'text': 283, 'cord': 268, 'division': 251, 'formation': 250})
{'cord': 0.19749447310243184, 'division': 0.184966838614591, 'text': 0.20854826823876196, 'phone': 0.2247605011053795, 'formation': 0.18422991893883567}
for class cord ---- 2357 No. of words present in its dictionary
for class division ---- 2637 No. of words present in its dictionary
for class text ---- 3123 No. of words present in its dictionary
for class phone ---- 2544 No. of words present in its dictionary
for class formation ---- 2805 No. of words present in its dictionary

-----------------------
Predicted word sense with 70.9090909090909% accuracy

