In [1]:
import nltk
# Fetch the WordNet corpus that the WordNet synset lookups in this notebook read from.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/abhishek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn

# Evaluation data; accuracy() reads its 'context' and 'sense_full' columns.
data = pd.read_csv('semcor1.csv')

In [5]:
def simple_lesk(cotext_sentence,amb_word):
    """Choose the WordNet sense of ``amb_word`` whose signature best overlaps the context.

    The context sentence is tokenized with ``nltk.word_tokenize`` and reduced to a
    set of tokens. For every synset of ``amb_word`` a signature is built from the
    tokenized definition, the lemma names, and the whitespace-split example
    sentences. The synset sharing the most tokens with the context wins; on ties
    the synset encountered first is kept.

    Args:
        cotext_sentence: Sentence in which the ambiguous word occurs.
        amb_word: The word to disambiguate.

    Returns:
        The best-overlapping synset, or ``None`` when no synset shares at least
        one token with the context (including when there are no synsets at all).
    """
    context_tokens = set(nltk.word_tokenize(cotext_sentence))

    best_sense, best_overlap = None, 0
    for candidate in wn.synsets(amb_word):
        # Signature = definition tokens + lemma names + example words.
        signature = set(nltk.word_tokenize(candidate.definition()))
        signature.update(candidate.lemma_names())
        for example_sentence in candidate.examples():
            # Examples are split on whitespace only, so punctuation stays attached.
            signature.update(example_sentence.split())

        shared = len(context_tokens & signature)
        # Strict comparison: a later sense must beat, not merely match, the best so far.
        if shared > best_overlap:
            best_sense, best_overlap = candidate, shared

    return best_sense

In [10]:
def accuracy(algorithm,data):
    """Return the percentage of rows for which ``algorithm`` predicts the labelled sense.

    For each row, the target word is the part of ``sense_full`` before the first
    ``'.'``; ``algorithm(context, word)`` is called and its result's ``name()`` is
    compared with ``sense_full``. Rows where the algorithm returns ``None`` count
    as incorrect.

    Args:
        algorithm: Callable ``(context, word) -> sense or None``; a returned sense
            must expose a ``name()`` method.
        data: Table with ``'context'`` and ``'sense_full'`` columns (for example a
            pandas DataFrame). Rows are read positionally, so the index labels do
            not need to be 0..n-1.

    Returns:
        Accuracy as a float percentage in [0, 100]; ``0.0`` when ``data`` has no rows.
    """
    contexts = data['context']
    gold_senses = data['sense_full']

    total = len(gold_senses)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    correct = 0
    # zip walks both columns by position; label lookups such as column[i] break
    # on frames whose index is not the default 0..n-1 (filtered, sampled, sliced).
    for context, gold_sense in zip(contexts, gold_senses):
        word = gold_sense.split('.')[0]
        pred_sense = algorithm(context, word)
        if pred_sense is None:
            continue

        if gold_sense == pred_sense.name():
            correct += 1

    return correct/total*100





#### Accuracy of Simple Lesk Algorithm

In [11]:
# Percentage of rows in `data` whose 'sense_full' equals the synset name simple_lesk predicts.
print(accuracy(simple_lesk, data), "%")


39.68529137218296 %


In [13]:

import pprint
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords


In [14]:
# NLTK's English stopword list as a set, built once; tokenize() filters tokens against it.
stopwords_en = set(stopwords.words('english'))


def tokenize(document, word):
    """Return the set of content tokens in ``document``, excluding ``word``.

    ``document`` is split into ``\\w+`` runs. A token is kept only if it is not in
    ``stopwords_en``, consists solely of alphabetic characters, and differs from
    ``word``. All comparisons are exact (case-sensitive); no lower-casing is done
    here.

    Args:
        document: Text to tokenize.
        word: Token to leave out of the result (the word being disambiguated).

    Returns:
        A ``set`` of the surviving tokens.
    """
    word_splitter = nltk.RegexpTokenizer(r'\w+')

    kept = set()
    for candidate in word_splitter.tokenize(document):
        # Drop stopwords and anything containing digits or underscores.
        if candidate in stopwords_en or not candidate.isalpha():
            continue
        # Drop the target word itself so it cannot contribute to overlap.
        if candidate == word:
            continue
        kept.add(candidate)
    return kept


In [17]:
def extended_lesk(context, word):
    """Pick a WordNet sense for ``word`` using a weighted gloss-overlap (Lesk-style) score.

    Both inputs are lower-cased first. Each context token gets a weight that
    starts at 1 and grows by the number of candidate senses whenever a sense's
    definition, examples, or lemma names contain the token as a substring. Each
    sense is then scored by summing ``log(weight / number_of_senses)`` over the
    context tokens found in that sense's signature (tokenized examples and
    definition, plus the lemma names as returned by WordNet).

    Args:
        context: Sentence (str) in which ``word`` occurs.
        word: The ambiguous word (str) to disambiguate.

    Returns:
        The highest-scoring synset (the first one wins ties), or ``None`` when
        WordNet has no synsets for ``word``.
    """
    context = context.lower()
    word = word.lower()

    synsets = wordnet.synsets(word)
    if not synsets:
        # No candidate senses: report "no prediction" the way simple_lesk does,
        # instead of letting max() raise ValueError on an empty score list.
        return None

    context_tokens = tokenize(context, word)
    n_senses = len(synsets)

    # Fetch each sense's texts once rather than once per context token.
    sense_texts = [
        (sense.definition(), sense.examples(), sense.lemma_names())
        for sense in synsets
    ]

    # Per-token weights. The `in` tests below are substring checks on raw
    # strings, not token matches.
    token_weights = {}
    for token in context_tokens:
        weight = 1
        for definition, examples, lemmas in sense_texts:
            if token in definition:
                weight += n_senses
                # A definition hit skips the example and lemma checks for this sense.
                continue
            # Examples and lemma names are checked independently, so one sense
            # can contribute up to twice here.
            if any(token in example for example in examples):
                weight += n_senses
            if any(token in lemma for lemma in lemmas):
                weight += n_senses
        token_weights[token] = weight

    # Score every sense by the weighted overlap between context and signature.
    sense_scores = []
    for definition, examples, lemmas in sense_texts:
        signature = set(lemmas)
        signature.update(tokenize(definition, word))
        for example in examples:
            signature.update(tokenize(example, word))

        score = 0
        for token in context_tokens:
            if token in signature:
                score += np.log(token_weights[token] / n_senses)
        sense_scores.append(score)

    # list.index returns the first position of the maximum, so ties go to the
    # earliest synset in WordNet's ordering.
    return synsets[sense_scores.index(max(sense_scores))]
                




### Accuracy of Extended Lesk Algorithm

In [18]:
# Percentage of rows in `data` whose 'sense_full' equals the synset name extended_lesk predicts.
print(accuracy(extended_lesk, data), "%")


55.36738231764235 %
