# Word Sense Disambiguation

In [1]:
import numpy as np
import random
import nltk
import re
from nltk import MWETokenizer, WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor
from nltk.corpus import stopwords

Estrazione stopwords

In [2]:
stop_words = set(stopwords.words('english'))
TOTAL_SEMCOR_SENTENCES = len(semcor.tagged_sents(tag = 'sem'))

multi_word_expressions = [x for x in wn.all_lemma_names() if '_' in x]
multi_word_expressions = [tuple(x.split('_')) for x in multi_word_expressions]
mwe_tokenizer = MWETokenizer(multi_word_expressions, separator=' ')
word_lemmatizer = WordNetLemmatizer()

### Semcor
Estrazione di 50 frasi casuali 

In [4]:
def preprocess_text(txt):
    """Preprocessa il testo rimuovendo punteggiatura, stop words, ecc."""
    txt = re.sub(r'[^\w\s]',' ',txt)
    txt = txt.lower()
    txt = mwe_tokenizer.tokenize(txt.split())
    txt = [word_lemmatizer.lemmatize(token) for token in txt]
    txt = [w for w in txt if not w in stop_words]
    return txt

### Lesk
Implementazione dell'algoritmo

In [5]:
def basic_lesk_algorithm(word, pos, context):
    """Implementazione semplificata dell'algoritmo Lesk."""
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    best_synset = wn.synsets(word)[0] if len(wn.synsets(word)) > 0 else None #wordnet mette in ordine i sysnet basandosi sulla frequenza
    max_overlap = 0

    for synset in wn.synsets(word, pos=pos):
        signature = set(preprocess_text(synset.definition())).union(set(preprocess_text(' '.join(synset.examples()))))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_synset = synset
    
    return best_synset

Valutazione dell'algoritmo

In [6]:

def evaluate_algorithm():
    """Valuta l'accuratezza dell'algoritmo su un campione di 50 frasi di SemCor."""
    tagged_sentences = semcor.tagged_sents(tag = 'sem')[:100]
    plain_sentences = semcor.sents()[:100]
    np.random.seed()
    selected_indices = np.random.permutation(len(tagged_sentences))[:50]

    sample_tagged_sentences = [tagged_sentences[i] for i in selected_indices]
    sample_plain_sentences = [plain_sentences[i] for i in selected_indices]
    total_sentences = len(sample_plain_sentences)
    correct_predictions = 0

    for i, sentence in enumerate(sample_plain_sentences):
        sentence_without_stopwords = [w for w in sentence if not w in stop_words and w.isalpha()]
        random_word = random.choice(sentence_without_stopwords)
        actual_synset = None
        predicted_synset = None

        found = False
        for j, cell in enumerate(sample_tagged_sentences[i]):
            if not found:
                if random_word in cell[0]:
                    label = cell.label() if isinstance(cell, nltk.tree.Tree) else None

                    if label and hasattr(label, 'synset'):
                        actual_synset = label.synset()
                        pos = actual_synset.pos()
                        context = set(sentence)
                        predicted_synset = basic_lesk_algorithm(random_word, pos, context)
                        found = True
        
        if actual_synset is not None and predicted_synset is not None:
            correct_predictions += predicted_synset == actual_synset
        else:
            total_sentences -= 1
                
    accuracy = correct_predictions / total_sentences
    print('accuracy: ', accuracy)
    
    return accuracy


Applicazione dell'algoritmo a frasi estratte casualmente

In [7]:
accuracies = []
for i in range(10):
    accuracies.append(evaluate_algorithm())

print('\n')
print('mean accuracy: ', np.mean(accuracies))

accuracy:  0.4090909090909091
accuracy:  0.43243243243243246
accuracy:  0.3695652173913043
accuracy:  0.43478260869565216
accuracy:  0.5238095238095238
accuracy:  0.45652173913043476
accuracy:  0.36585365853658536
accuracy:  0.5
accuracy:  0.38095238095238093
accuracy:  0.4666666666666667


mean accuracy:  0.433967513670589
