In [23]:
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class SimplifiedLesk:
    """Simplified Lesk word-sense disambiguation on top of WordNet.

    Every candidate sense is scored by how many non-stopword tokens of its
    signature (definition plus usage examples) also occur in the sentence.
    """

    def __init__(self):
        self.stopwords = set(stopwords.words('english'))

    def disambiguate(self, word, sentence):
        """Return the synset of ``word`` that best matches ``sentence``.

        The first synset returned by WordNet is the default (assumed to be
        the most frequent sense) and is kept unless another sense has a
        strictly larger overlap.  Returns ``None`` when WordNet has no synset
        for ``word``.
        """
        word_senses = wordnet.synsets(word)
        if not word_senses:
            # Unknown word: nothing to choose from (indexing [0] would raise).
            return None

        best_sense = word_senses[0]
        max_overlap = 0
        context = set(word_tokenize(sentence))
        for sense in word_senses:
            signature = self.tokenized_gloss(sense)
            overlap = self.compute_overlap(signature, context)
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense
        return best_sense

    def tokenized_gloss(self, sense):
        """Return the set of tokens in the sense's definition and examples."""
        tokens = set(word_tokenize(sense.definition()))
        for example in sense.examples():
            # update() mutates in place; union() would build a new set whose
            # result is thrown away, silently dropping the example tokens.
            tokens.update(word_tokenize(example))
        return tokens

    def compute_overlap(self, signature, context):
        """Count signature tokens, stopwords excluded, that are in context."""
        sig = signature.difference(self.stopwords)
        return len(sig.intersection(context))


import re
# Extra function words (pronouns, prepositions, conjunctions, auxiliaries,
# quantifiers, ...) that UpdatedLesk adds to NLTK's English stopword list so
# they never count towards a gloss/context overlap.
functionwords = {'everyone', 'himself', 'it', 'his', 'everything', 'little', 'those', 'inside', 'on', 'off', 'over', 
                 'of', 'first', 'within', 'around', 'near', 'so', 'would', 'else', 'for', 'moreover', 'besides', 
                 'into', 'while', 'here', 'never', 'such', 'each', 'who', 'anyone', 'through', 'despite', 'might',
                 'that', 'will', 'anything', 'in', 'therefore', 'your', 'someone', 'a', 'few', 'do', 'second', 'down',
                 'themself', 'usually', 'one', 'with', 'any', 'onto', 'all', 'to', 'must', 'herself', 'him', 'most', 'much',
                 'but', 'along', 'should', 'my', 'an', 'no', 'against', 'before', 'could', 'now', 'there', 'meanwhile',
                 'be', 'instead', 'during', 'them', 'from', 'less', 'if', 'something', 'ones', 'he', 'two', 'sometimes',
                 'yours', 'have', 'however', 'otherwise', 'its', 'though', 'often', 'toward', 'than', 'their', 'then',
                 'half', 'least', 'although', 'nothing', 'her', 'next', 'as', 'across', 'always', 'many', 'how', 'anyway',
                 'when', 'this', 'behind', 'own', 'both', 'at', 'itself', 'last', 'hers', 'other', 'they', 'our',
                 'incidentally', 'may', 'whose', 'beside', 'without', 'about', 'she', 'some', 'where', 'can', 'and',
                 'because', 'every', 'theirs', 'twice', 'another', 'since', 'what', 'after', 'which', 'these', 'more',
                 'shall', 'by', 'several', 'the', 'or'}

from nltk.stem import PorterStemmer
# Module-level Porter stemmer instance; UpdatedLesk.stem() reads it as a global.
ps=PorterStemmer()

class UpdatedLesk:
    """Lesk variant with POS filtering, stemming and an extended gloss.

    A sense's signature is built from its own definition and examples plus
    the definitions and examples of its hypernyms, hyponyms and part,
    substance and member meronyms.  Tokens are normalised with ``stem`` and
    stopwords/function words are ignored when counting the overlap.
    """

    # Everything except ASCII letters, digits, '.' and whitespace is removed
    # from related-synset text before it is tokenized.
    _NON_WORD = re.compile(r'[^a-zA-Z.\d\s]')

    # Synset methods whose results extend a sense's gloss.
    _RELATIONS = (
        'hypernyms',
        'hyponyms',
        'part_meronyms',
        'substance_meronyms',
        'member_meronyms',
    )

    # WordNet labels adjective synsets either 'a' or 's' (satellite); both
    # are accepted when an adjective sense is requested.
    _ADJECTIVE_POS = frozenset({'a', 's'})

    def __init__(self):
        self.stopwords = set(stopwords.words('english'))
        self.stopwords.update(functionwords)
        self.pos = None

    def disambiguate(self, word, pos, sentence):
        """Return the definition of the sense of ``word`` fitting ``sentence``.

        Args:
            word: The word to disambiguate.
            pos: WordNet POS letter as produced by ``change`` ('n', 'v',
                'r', 's' or 'a'); ``None`` means "no restriction".
            sentence: The sentence providing the context.

        Returns:
            The definition string of the best sense, or ``''`` when WordNet
            has no synset for ``word``.
        """
        word_senses = wordnet.synsets(word)
        if not word_senses:
            return ''

        wanted = self._ADJECTIVE_POS if pos in self._ADJECTIVE_POS else {pos}
        candidates = [sense for sense in word_senses if sense.pos() in wanted]
        if not candidates:
            # No sense carries the requested POS (or none was given): fall
            # back to every sense instead of failing on an empty list.
            candidates = word_senses

        best_sense = candidates[0]  # First sense is assumed most frequent.
        max_overlap = 0
        for sense in candidates:
            signature = self.tokenized_gloss(sense)
            overlap = self.overlapcontext(sense, signature, str(sentence))
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense

        return best_sense.definition()

    def find(self, sentence):
        """Return the ambiguous words of ``sentence`` as a sorted list.

        A word is ambiguous when, after stemming and stopword removal,
        WordNet lists more than one synset for it.
        """
        stems = {str(self.stem(token)) for token in word_tokenize(sentence)}
        stems.difference_update(self.stopwords)
        # Sorted so the result does not depend on set iteration order.
        return sorted(w for w in stems if len(wordnet.synsets(w)) > 1)

    def change(self, i):
        """Map a Penn Treebank tag to the WordNet POS letter used here.

        Verb tags give 'v', noun tags 'n', adverb/particle tags 'r',
        adjective tags 's'; every other tag gives 'a'.
        """
        if i in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
            return 'v'
        if i in ('NN', 'NNS', 'NNP', 'NNPS'):
            return 'n'
        if i in ('RB', 'RBR', 'RBS', 'RP'):
            return 'r'
        if i in ('JJ', 'JJR', 'JJS'):
            return 's'
        return 'a'

    def stem(self, i):
        """Return the Porter stem of ``i`` if WordNet treats it like ``i``.

        The stem is used only when it has exactly the same synsets as the
        original token; otherwise the token is returned unchanged.
        """
        stemmed = ps.stem(i)
        if wordnet.synsets(stemmed) == wordnet.synsets(i):
            return stemmed
        return i

    def findpos(self, sentence):
        """Return a numpy array of ``[stem, pos]`` rows for ``sentence``.

        Stopwords are skipped; rows follow sentence order and the POS column
        holds the letter returned by ``change``.  The array is empty when no
        token survives stopword removal.
        """
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        rows = [
            [str(self.stem(token)), str(self.change(tag))]
            for token, tag in tagged
            if token not in self.stopwords
        ]
        return np.array(rows)

    def tokenized_gloss(self, sense):
        """Return the stemmed tokens of a sense's definition and examples."""
        tokens = self.tokenized_sent(sense.definition())
        for example in sense.examples():
            # In-place union: set.union() alone returns a new set and would
            # leave the example tokens out of the signature.
            tokens |= self.tokenized_sent(example)
        return tokens

    def tokenized_sent(self, k):
        """Return the set of stemmed tokens of ``str(k)``."""
        return {str(self.stem(token)) for token in word_tokenize(str(k))}

    def overlapcontext(self, synset, glos, sentence):
        """Count the tokens shared by the extended gloss and ``sentence``.

        Args:
            synset: The sense being scored.
            glos: Token set of the sense's own gloss (not modified).
            sentence: The context sentence as a string.

        Returns:
            Number of non-stopword tokens common to both sides.
        """
        gloss = set(glos)
        for relation in self._RELATIONS:
            for related in getattr(synset, relation)():
                texts = (related.definition(), ' '.join(related.examples()))
                for text in texts:
                    cleaned = self._NON_WORD.sub('', str(text))
                    gloss |= self.tokenized_sent(cleaned)

        gloss.difference_update(self.stopwords)
        context = self.tokenized_sent(sentence).difference(self.stopwords)
        return len(gloss.intersection(context))
    
    
import tkinter as tk
def get_Words():
    """Show the sentence's ambiguous words in the read-only ``AW`` box.

    Reads the sentence from ``utext``, lower-cases it and lists the words
    returned by ``UpdatedLesk.find``.
    """
    sentence = utext.get('1.0', "end").strip().lower()

    ambiguous = UpdatedLesk().find(sentence)

    # The box is kept disabled, so enable it only while rewriting it.
    AW.config(state='normal')
    AW.delete('1.0', 'end')
    # Insert an explicit string: handing Tk a Python list leaves the
    # formatting (including any quoting of odd tokens) to Tcl's list rules.
    AW.insert('1.0', ' '.join(ambiguous))
    AW.config(state='disabled')

def get_sense():
    """Show the Simplified Lesk sense of the chosen word in ``summary``.

    Reads the sentence from ``utext`` and the word from ``Word``.  When the
    word is empty or unknown to WordNet a short notice is shown instead of
    letting the callback fail.
    """
    sentence = utext.get('1.0', "end").strip().lower()
    word = Word.get('1.0', "end").strip()

    sense = None
    # Only disambiguate words WordNet actually knows; without any synset
    # there is no sense to pick.
    if word and wordnet.synsets(word):
        sense = SimplifiedLesk().disambiguate(word, sentence)

    if sense is None:
        text = 'No WordNet sense found for the given word.'
    else:
        text = sense.definition()

    # The box is kept disabled, so enable it only while rewriting it.
    summary.config(state='normal')
    summary.delete('1.0', 'end')
    summary.insert('1.0', text)
    summary.config(state='disabled')
        
def get_updatedsense():
    """Show the POS tags and the Updated Lesk sense of the chosen word.

    The tagged tokens from ``UpdatedLesk.findpos`` go into ``poss``; the
    definition chosen by ``UpdatedLesk.disambiguate`` goes into ``summary1``.
    A notice is shown when no sense can be determined.
    """
    sentence = utext.get('1.0', "end").strip().lower()
    # The sentence is lower-cased above, so lower-case the word as well;
    # otherwise a capitalised entry never matches a tagged token.
    word = Word.get('1.0', "end").strip().lower()
    up = UpdatedLesk()
    pos = up.findpos(sentence)
    poss.config(state='normal')
    poss.delete('1.0', 'end')
    poss.insert('1.0', pos)
    poss.config(state='disabled')

    # findpos() stems its tokens, so accept the typed form or its stem.
    wanted = {word, str(up.stem(word))} if word else set()
    k = None
    for row in pos:
        if str(row[0]) in wanted:
            k = row[1]

    sense = ''
    if word:
        try:
            sense = up.disambiguate(word, k, sentence)
        except IndexError:
            # Defensive: an empty candidate-sense list can surface as an
            # IndexError; report it in the box instead of failing.
            sense = ''
    if not sense:
        sense = 'No WordNet sense found for the given word.'

    summary1.config(state='normal')
    summary1.delete('1.0', 'end')
    summary1.insert('1.0', sense)
    summary1.config(state='disabled')

        
        
# ---------------------------------------------------------------------------
# GUI layout. Widgets are packed top to bottom in creation order, so the
# statement order below is also the on-screen order.
# ---------------------------------------------------------------------------
root = tk.Tk()
root.title('Word Sense Predictor')
root.geometry('800x650')
# root['bg']='green'

# Sentence input (read by all three callbacks through the global `utext`).
ulabel = tk.Label(root, text='Give a Sentence',font='ar 15 bold')
ulabel.pack()
utext = tk.Text(root, height=1.5, width=80)
utext.pack()

# Button that runs get_Words.
b1 = tk.Button(root, text='Get_Words',width=10, height=1, command=get_Words,font='ar 10 bold')
b1.pack()

# Output box for the ambiguous words; kept disabled so the user cannot edit
# it (get_Words enables it only while writing).
wlabel = tk.Label(root, text='Ambiguous words',font='ar 10 bold')
wlabel.pack()
AW= tk.Text(root, height=4, width=80)
AW.config(state='disabled', bg='#dddddd')
AW.pack()

# Input for the word to disambiguate (read by get_sense / get_updatedsense).
alabel = tk.Label(root, text='Word To be disambiguated',font='ar 12 bold')
alabel.pack()
Word = tk.Text(root, height=1.5, width=20)
Word.pack()

# Output box written by get_sense.
slabel = tk.Label(root, text='Sense with Lesk Algorithm ',font='ar 15 bold')
slabel.pack()
summary = tk.Text(root, height=3, width=80)
summary.config(state='disabled', bg='#dddddd')
summary.pack()


# Button that runs get_sense.
b2 = tk.Button(root, text='Get_Sense', width=20, height=1,command=get_sense,font='ar 10 bold')
b2.pack()

# Output box for the POS tags written by get_updatedsense.
plabel = tk.Label(root, text='POS tags',font='ar 15 bold')
plabel.pack()
poss= tk.Text(root, height=4, width=80)
poss.config(state='disabled', bg='#dddddd')
poss.pack()


# Output box for the sense written by get_updatedsense.
s1label = tk.Label(root, text='Sense with Updated Lesk Algorithm',font='ar 15 bold')
s1label.pack()
summary1 = tk.Text(root, height=3, width=80)
summary1.config(state='disabled', bg='#dddddd')
summary1.pack()


# Button that runs get_updatedsense.
b3 = tk.Button(root, text='Get_Updated_Sense ', width=20, height=1, command=get_updatedsense,font='ar 10 bold')
b3.pack()

# Hand control to Tk's event loop.
root.mainloop()

In [17]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()


def stem(i):
    """Return the Porter stem of ``i`` when WordNet treats both alike.

    The stem replaces the token only if it has exactly the same synsets as
    the original token; otherwise the token comes back unchanged.
    """
    stemmed = ps.stem(i)
    same_senses = wordnet.synsets(stemmed) == wordnet.synsets(i)
    return stemmed if same_senses else i
        

In [29]:
# Demo: compare the raw tokens of a sentence with their stem() results.
input1 = 'he was cutting the carrot'
tokens = word_tokenize(input1)
tokenss = [str(stem(token)) for token in tokens]

print(tokens, '\n', tokenss)

['he', 'was', 'cutting', 'the', 'carrot'] 
 ['he', 'was', 'cutting', 'the', 'carrot']
