## In this assignment, we will implement a simple lesk-based WSD. We use SEMCOR WSD dataset for the purpose

In [1]:
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
import re
from nltk.corpus import stopwords
import os
import string

### Let us define a word class containing the following attributes

text --> the actual word
pos --> POS tag of the word
lemma --> Lemma of the word
wnsn --> wordnet synset id of the sense used

In [2]:
class Word:
    def __init__(self, text, pos=None, lemma=None, wnsn=None, lexsn=None):
        self.text = text
        self.pos = pos
        self.lemma = lemma
        self.wnsn = wnsn
        self.lexsn = lexsn

In [3]:
files_ls = []
for file in os.listdir("semcor/brown2/tagfiles/"):
    if file.endswith(".xml"):
        files_ls.append(file)
print(files_ls)

['br-e22.xml', 'br-e23.xml', 'br-e25.xml', 'br-e26.xml', 'br-e27.xml', 'br-e28.xml', 'br-e30.xml', 'br-e31.xml', 'br-f08.xml', 'br-f13.xml', 'br-f14.xml', 'br-f15.xml', 'br-f16.xml', 'br-f17.xml', 'br-f18.xml', 'br-f20.xml', 'br-f21.xml', 'br-f22.xml', 'br-f23.xml', 'br-f24.xml', 'br-f25.xml', 'br-f33.xml', 'br-f44.xml', 'br-g12.xml', 'br-g14.xml', 'br-g16.xml', 'br-g17.xml', 'br-g18.xml', 'br-g19.xml', 'br-g20.xml', 'br-g21.xml', 'br-g22.xml', 'br-g23.xml', 'br-g28.xml', 'br-g31.xml', 'br-g39.xml', 'br-g43.xml', 'br-g44.xml', 'br-h09.xml', 'br-h11.xml', 'br-h12.xml', 'br-h13.xml', 'br-h14.xml', 'br-h15.xml', 'br-h16.xml', 'br-h17.xml', 'br-h18.xml', 'br-h21.xml', 'br-h24.xml', 'br-j29.xml', 'br-j30.xml', 'br-j31.xml', 'br-j32.xml', 'br-j33.xml', 'br-j34.xml', 'br-j35.xml', 'br-j38.xml', 'br-j41.xml', 'br-j42.xml', 'br-l08.xml', 'br-l09.xml', 'br-l10.xml', 'br-l13.xml', 'br-l14.xml', 'br-l15.xml', 'br-l16.xml', 'br-l17.xml', 'br-l18.xml', 'br-n09.xml', 'br-n10.xml', 'br-n11.xml', 'br-n

In [4]:
# let us read a sample xml file
doc_ls = []
for each_file in files_ls:
    tree = ET.parse('semcor/brown2/tagfiles/' + each_file)

    # get the root element
    root = tree.getroot()
    # print(root)
    documents = []
    # let us read every sentence, one-by-one
    # we are ignoring the paragraph structure
    for sentence_tree in root.findall('context/p/'):
        sentence = []
    #     for every word in that sentence
        for word_tree in sentence_tree:
    #         get the word
            word = Word(word_tree.text)
        
    #         if the word xml tag contains info about pos, lemma, wnsn, lexsn, then extract it
            if 'pos' in word_tree.attrib:
                word.pos = word_tree.attrib['pos']
            
            if 'lemma' in word_tree.attrib:
                word.lemma = word_tree.attrib['lemma']
            
            if 'wnsn' in word_tree.attrib:
                word.wnsn = word_tree.attrib['wnsn']
            
            if 'lexsn' in word_tree.attrib:
                word.lexsn = word_tree.attrib['lexsn']
        
            sentence.append( word )
        documents.append(sentence)

#     print(documents)
    doc_ls.append(documents)
#     print('Read {0} number of documents '.format(len(documents)))

print(len(doc_ls))

83


In [7]:
# To calculate accuracy
stop_words = set(stopwords.words('english'))
acc_ls = []
for every_doc in doc_ls:
    correct = 0
    total = 0

    # window size varies the context window
    window = 3

    # for every sentence
    for every_sentence in every_doc:
        # let's get the word in the sentence as a list
        sentence = [x.text for x in every_sentence] 
    #     print("Sentence is {0}".format(' '.join(sentence)))
    
        # removing punctuations from sentence list
        sentence = [''.join(chr for chr in s if chr not in string.punctuation) for s in sentence] 
        sentence = [s for s in sentence if s]
        # Removing stop words from sentence for getting better context window
        sentence = [w for w in sentence if not w.lower() in stop_words]
        
        # for every word in the sentence
        for word_index in range(len(every_sentence)):
            # not all words have sense info
            if every_sentence[word_index].wnsn is not None:
                context_bag = []
                every_word = every_sentence[word_index]
            
                for index in range( max(0, word_index - window), min( word_index + window, len(sentence) ) ):
                    if index == word_index:
                        continue
                    context_bag.append( sentence[index] )
                
#                 print('Context bag', context_bag)
            
                # we know the POS tag of the word in the sentence restrict ourselves to only the senses for that POS category
                if every_word.pos.startswith('V'):
                    synsets = wn.synsets(every_word.text, pos=wn.VERB)
                elif every_word.pos.startswith('J'):
                    synsets = wn.synsets(every_word.text, pos=wn.ADJ)
                elif every_word.pos.startswith('R'):
                    synsets = wn.synsets(every_word.text, pos=wn.ADV)
                else:
                    synsets = wn.synsets(every_word.text, pos=wn.NOUN)
                
                # all inflections of the word might not be present search based on lemma
                if len(synsets) == 0:
                    if every_word.pos.startswith('V'):
                        synsets = wn.synsets(every_word.lemma, pos=wn.VERB)
                    elif every_word.pos.startswith('J'):
                        synsets = wn.synsets(every_word.lemma, pos=wn.ADJ)
                    elif every_word.pos.startswith('R'):
                        synsets = wn.synsets(every_word.lemma, pos=wn.ADV)
                    else:
                        synsets = wn.synsets(every_word.lemma, pos=wn.NOUN)
                
                if len(synsets) == 0:
                    continue
                
                # find the best synset based on simple word-overlap between word context and synset examples
                synset_score = -100
                synset_id = ""
                ############################ Adding words eg and gloss ##################################
                for every_synset in synsets:
                    synset_bag = []
                    for every_synset_example in every_synset.examples():
                        synset_bag.extend( every_synset_example.split(' ') )
                
                    gloss_ls = []
                    defi = every_synset.definition().split()
                    for each_wrd in defi: 
                        def_no_punc = re.sub(r'[^\w\s]', '', each_wrd) # removing punctuation from each word in list
                        gloss_ls.append(def_no_punc)
                    
                    synset_bag.extend(gloss_ls)
                
                    ############################ Adding hypernym eg and gloss ##################################
                    hypern = every_synset.hypernyms()
                    for each_hyper in hypern:
                        for hyper_eg in each_hyper.examples():
                            synset_bag.extend(hyper_eg.split()) # Adding hypernyms example to sign 
                        hyper_gloss = []
                        hyper_def = each_hyper.definition().split()
                        for e in hyper_def:
                            hyper_gloss.append(re.sub(r'[^\w\s]', '', e))
                        synset_bag.extend(hyper_gloss) # Adding hypernyms gloss to sign
                
                    ############################ Adding hyponym eg and gloss ##################################
                    hypon = every_synset.hyponyms()
                    for each_hypo in hypon:
                        for hypo_eg in each_hypo.examples():
                            synset_bag.extend(hypo_eg.split()) # Adding hyponyms example to sign 
                        hypo_gloss = []
                        hypo_def = each_hypo.definition().split()
                        for e in hypo_def:
                            hypo_gloss.append(re.sub(r'[^\w\s]', '', e))
                        synset_bag.extend(hypo_gloss) # Adding hyponyms gloss to sign
                
#                     print('Synset bag', synset_bag)
                    ############################ Filtering signature for stopwords ##################################
                    filtered_synset_bag = [w for w in synset_bag if not w.lower() in stop_words] # stop word removal
                                
                    ############################ Checking overlap between context and extended signature ############################    
                    matching_words = list( set( context_bag ).intersection( set(filtered_synset_bag) ) )
                    if len(matching_words) > synset_score:
                        synset_score = len(matching_words)
                        synset_id = every_synset.name().split('.')[-1]
                    
                if synset_id.startswith('0'):
                    synset_id = synset_id[1:]

    #             print('Best matching synset id is {0} with overlapping words {1}'.format( synset_id, synset_score ))
    #             print('Actual synset id is {0}'.format( every_word.wnsn ))
            
                if synset_id == every_word.wnsn:
                    correct = correct + 1
                total = total + 1

#     print('Accuracy is {0}'.format( (correct * 1.0)/ (total * 1.0) ))
    acc_ls.append((correct * 1.0)/ (total * 1.0))
    
print(len(acc_ls))
print(acc_ls)

83
[0.6727272727272727, 0.6333973128598849, 0.6866359447004609, 0.6551040634291377, 0.7806267806267806, 0.6453110492107706, 0.6838235294117647, 0.6772438803263826, 0.6817769718948323, 0.704052780395853, 0.6356192425793245, 0.6137667304015296, 0.7152255639097744, 0.7180020811654526, 0.6975124378109453, 0.6890424481737414, 0.681075888568684, 0.6620959843290891, 0.6579961464354528, 0.6802656546489564, 0.7303877366997295, 0.6908023483365949, 0.6809480401093893, 0.6167176350662589, 0.6598639455782312, 0.5954198473282443, 0.643574297188755, 0.584349593495935, 0.6739345887016849, 0.6705539358600583, 0.6132478632478633, 0.6487985212569316, 0.6881188118811881, 0.6855983772819473, 0.6835187057633973, 0.6633064516129032, 0.6035502958579881, 0.6413793103448275, 0.7046580773042617, 0.7455565949485501, 0.6991341991341992, 0.6445725264169068, 0.712776176753122, 0.704, 0.6977911646586346, 0.6995884773662552, 0.6147368421052631, 0.6889726672950047, 0.7515856236786469, 0.707227813357731, 0.6237524950099

In [8]:
file_num = 0
max_acc = 0
for i in range(len(acc_ls)):
    if acc_ls[i] > max_acc:
        max_acc = acc_ls[i]
        file_num = i
        
print('Maximum accuracy {0} is obtained in file {1}'.format(max_acc, files_ls[file_num]))

Maximum accuracy 0.7806267806267806 is obtained in file br-e27.xml


#### Some notable observations from the expreiment
- Initially in the original code accuracy incresed in the range 55.36 to 65.63 for context window in range 7 to 1 respectively. 
- Just by adding gloss(definition) to the signature, accuracy showed a dip by an average of 1 for corresponding window sizes.
- By removing stop words from the context and signature(gloss + example), a significant increase in accuracy was observed across different window sizes. (since it considered more important words)
- Apart from stopwords punctuations were also removed from the lists. 
- By further adding hypernyms and hyponyms gloss and examples to signature for the words, the accuracy scores showed even better accuracy. (more chances for getting overlap)
- Window size was an important parameter in this experiment, with a window size of around 2-4 across files better accuracies were obtained as compared to larger window sizes(>5 or 6). (This maybe because in most sentences, the words around the context word are more significant in shorter span size, increasing this window do not give any significant improvements, and even dropped few fractional accuracy points maybe because it included more irrelevant words in the context which may have increased overlap with other irrelevant sense)(Also the greater window size may have better impact in those datasets where there are long dependencies or relevance among words, like some compound sentences or phrases).
- Another approach which was not tried here, but maybe tried later was instead of removing stopwords using TF-IDF weighing for frequent words as mentioned in lectures also.