## In this assignment, we will implement a simple lesk-based WSD. We use SEMCOR WSD dataset for the purpose

In [1]:
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn

### Let us define a word class containing the following attributes

text --> the actual word
pos --> POS tag of the word
lemma --> Lemma of the word
wnsn --> wordnet synset id of the sense used

In [2]:
class Word:
    def __init__(self, text, pos=None, lemma=None, wnsn=None, lexsn=None):
        self.text = text
        self.pos = pos
        self.lemma = lemma
        self.wnsn = wnsn
        self.lexsn = lexsn

In [3]:
# let us read a sample xml file
tree = ET.parse('semcor/brown2/tagfiles/br-e22.xml')

# get the root element
root = tree.getroot()

documents = []
# let us read every sentence, one-by-one
# we are ignoring the paragraph structure
for sentence_tree in root.findall('context/p/'):
    sentence = []
#     for every word in that sentence
    for word_tree in sentence_tree:
#         get the word
        word = Word(word_tree.text)
        
#         if the word xml tag contains info about pos, lemma, wnsn, lexsn, then extract it
        if 'pos' in word_tree.attrib:
            word.pos = word_tree.attrib['pos']
            
        if 'lemma' in word_tree.attrib:
            word.lemma = word_tree.attrib['lemma']
            
        if 'wnsn' in word_tree.attrib:
            word.wnsn = word_tree.attrib['wnsn']
            
        if 'lexsn' in word_tree.attrib:
            word.lexsn = word_tree.attrib['lexsn']
        
        sentence.append( word )
    documents.append(sentence)

print('Read {0} number of documents '.format(len(documents)))

Read 84 number of documents 


In [4]:
from nltk.corpus import stopwords

In [5]:
stp = stopwords.words('english')

In [8]:
correct = 0
total = 0

# window is chosen as 5, vary the window size
window = 5

# for every sentence
for every_sentence in documents:
#     let's get the word in the sentence as a list
    sentence = [x.text for x in every_sentence] 
#     print("Sentence is {0}".format(' '.join(sentence))
    
# for every word in the sentence
    for word_index in range(len(every_sentence)):
#         not all words have sense info
        if every_sentence[word_index].wnsn is not None:
            context_bag = []
            every_word = every_sentence[word_index]
            
            for index in range( max(0, word_index - window), min( word_index + window, len(sentence) ) ):
                if index == word_index:
                    continue
                context_bag.append( sentence[index] )
                
            
#             we know the POS tag of the word in the sentence
# restrict ourselves to only the senses for that POS category
            if every_word.pos.startswith('V'):
                synsets = wn.synsets(every_word.text, pos=wn.VERB)
            elif every_word.pos.startswith('J'):
                synsets = wn.synsets(every_word.text, pos=wn.ADJ)
            elif every_word.pos.startswith('R'):
                synsets = wn.synsets(every_word.text, pos=wn.ADV)
            else:
                synsets = wn.synsets(every_word.text, pos=wn.NOUN)
                
#             all inflections of the word might not be present
# search based on lemma
            if len(synsets) == 0:
                if every_word.pos.startswith('V'):
                    synsets = wn.synsets(every_word.lemma, pos=wn.VERB)
                elif every_word.pos.startswith('J'):
                    synsets = wn.synsets(every_word.lemma, pos=wn.ADJ)
                elif every_word.pos.startswith('R'):
                    synsets = wn.synsets(every_word.lemma, pos=wn.ADV)
                else:
                    synsets = wn.synsets(every_word.lemma, pos=wn.NOUN)
                
            if len(synsets) == 0:
                continue
            
#             find the best synset based on simple word-overlap between word context and synset examples
            synset_score = -100
            synset_id = ""
            for every_synset in synsets:
                synset_bag = []
                for every_synset_example in every_synset.examples():
                    synset_bag.extend( every_synset_example.split(' ') )
                    
                synset_bag.extend(every_synset.definition().split())

                sign = []
                for each in synset_bag:
                    if each.lower() not in stp:
                        sign.append(each)
                
 
                matching_words = list( set( context_bag ).intersection( set(sign) ) )
                if len(matching_words) > synset_score:
                    synset_score = len(matching_words)
                    synset_id = every_synset.name().split('.')[-1]
                    
            if synset_id.startswith('0'):
                synset_id = synset_id[1:]

#             print('Best matching synset id is {0} with overlapping words {1}'.format( synset_id, synset_score ))
#             print('Actual synset id is {0}'.format( every_word.wnsn ))
            
            if synset_id == every_word.wnsn:
                correct = correct + 1
            total = total + 1

print('Accuracy is {0}'.format( (correct * 1.0)/ (total * 1.0) ))

Accuracy is 0.6745454545454546
