# Exercise 2
Cercare synset di WordNet corretto per un insieme di definizioni

In [1]:
#IMPORTS AND CONSTANTS

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
import statistics
import pandas as pd

# Shared WordNet lemmatizer instance used by lemmatized_tokens
lemmatizer = WordNetLemmatizer()

# Path of the tab-separated definitions file passed to parse_tsv_file
file_path = 'TLN-definitions-23.tsv'
# Number of most frequent definition words that are printed and whose synsets are explored
RELEVANT_WORD_SIZE_FOR_GENUS = 5
# A synset is kept only when its max_depth() is strictly greater than this value
MIN_SYNSET_HEIGHT = 2
# Number of top-scoring candidate synsets printed and returned for each term
MEANING_CANDIDATES_SIZE = 5
DEVIATION = 4 #Presumably the k tolerance for refine_dataset; NOTE(review): not referenced by the visible cells
#Global list used as a memo by calcSimilarityForSynset: it stores (name, definition, similarity) tuples
meaningCandidates = []

## Utility functions

In [2]:
def parse_tsv_file(file_path):
    """Load the input data.

    Reads the tab-separated file at ``file_path`` with pandas and returns the
    resulting DataFrame unchanged.
    """
    return pd.read_csv(file_path, sep='\t')

def average_length(dataset):
    """Return the average sentence length of ``dataset``.

    ``dataset`` is a list of sentences; the length of a sentence is whatever
    ``len`` reports for it. ``statistics.mean`` raises ``StatisticsError`` when
    ``dataset`` is empty.
    """
    sizes = list(map(len, dataset))
    return statistics.mean(sizes)

#Tokenize a text and return the lemmas of its content words.
#NOTE(review): a second definition of lemmatized_tokens appears later in this same cell
#and replaces this one at run time; one of the two should be removed.
def lemmatized_tokens(text):
    """Return the list of lemmas of the alphabetic, non stop-word tokens of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens are then
    POS-tagged so that verbs can be reduced to their base form, which avoids
    counting different verbal forms as different words.
    """
    tokens = word_tokenize(text.lower())
    # English stop words, as a set for fast membership tests
    stop_words = set(stopwords.words('english'))
    lemmas = []
    for token, tag in pos_tag(tokens):
        # Skip punctuation/numbers (non purely alphabetic tokens) and stop words
        if token.isalpha() and token not in stop_words:
            # Tags starting with 'VB' are lemmatized as verbs
            if tag.startswith('VB'):
                lemmas.append(lemmatizer.lemmatize(token, pos='v'))
            else:
                # Every other tag uses the lemmatizer's default part of speech
                lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def elaborate_dataset(dataframe):
    """Build the dictionary that stores the definitions of each term.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose first column is ignored and whose remaining columns each
        hold the free-text definitions of one term (the column name is the term).

    Returns
    -------
    dict
        Maps each term to a list of definitions, every definition being the
        list of lemmas produced by ``lemmatized_tokens``. The keys 'door',
        'ladybug', 'pain' and 'blurriness' are always present; any other
        column found in ``dataframe`` gets its own key instead of raising
        ``KeyError``.
    """
    dataset = {
        'door': [],
        'ladybug': [],
        'pain': [],
        'blurriness': []
    }
    # Drop the first column, which is not needed.
    definitions = dataframe.iloc[:, 1:]
    for column in definitions.columns:
        sentences = dataset.setdefault(column, [])
        # Rows are visited top to bottom, so each list keeps the file order.
        for text in definitions[column]:
            # A missing cell is read by pandas as NaN (a float): there is no
            # definition to tokenize, so skip it instead of crashing on .lower().
            if pd.isna(text):
                continue
            sentences.append(lemmatized_tokens(str(text)))
    return dataset

def refine_dataset(dataset,k):
    """Drop the sentences whose length is too far from the average length.

    For every key of ``dataset`` the average sentence length is computed with
    ``average_length`` and truncated to an integer; only the sentences whose
    length differs from that value by at most ``k`` are kept.

    ``dataset`` is modified in place (each key is rebound to the filtered list)
    and the same dictionary is returned. Keys whose list is empty are left
    untouched, because the average of no sentences is undefined.
    """
    for key, sentences in dataset.items():
        if not sentences:
            # statistics.mean raises on empty input; nothing to filter anyway.
            continue
        avg = int(average_length(sentences))
        dataset[key] = [elem for elem in sentences if abs(len(elem) - avg) <= k]
    return dataset

#Cache for the English stop-word set, filled the first time it is needed
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, building it from the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def lemmatized_tokens(text):
    """Tokenize ``text`` and return the lemmas of its content words.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic or that are English stop words are discarded.
    The remaining tokens are lemmatized: tokens whose POS tag starts with 'VB'
    are lemmatized as verbs (so different verbal forms collapse to one lemma),
    all the others with the lemmatizer's default part of speech.

    Returns a list of lemmas in the order the tokens appear in the text.
    """
    tokens = word_tokenize(text.lower())
    # The stop-word set does not depend on the text, so it is fetched from the
    # cache instead of being rebuilt on every call.
    stop_words = _english_stop_words()
    lemmas = []
    for token, tag in pos_tag(tokens):
        if not token.isalpha() or token in stop_words:
            continue
        if tag.startswith('VB'):
            lemmas.append(lemmatizer.lemmatize(token, pos='v'))
        else:
            lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def clear_candidate():
    """Empty the global ``meaningCandidates`` list.

    The list is emptied in place, so every function that refers to the global
    keeps seeing the same (now empty) list object.
    """
    del meaningCandidates[:]

## Similarity function

In [3]:
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    Used to compute the lexical overlap between two sentences. The order of
    ``lst1`` is preserved and an element repeated in ``lst1`` is repeated in
    the result as well.
    """
    shared = []
    for value in lst1:
        if value in lst2:
            shared.append(value)
    return shared

def calc_similarity(definition, definitions_dataset):
    """Average lexical overlap between a synset definition and a set of definitions.

    Parameters
    ----------
    definition : list
        Lemmas of the synset definition.
    definitions_dataset : list of list
        Lemmatized definitions taken from the dataset.

    Returns
    -------
    float
        Mean, over all the dataset definitions, of the overlap between the
        dataset definition and ``definition`` divided by the average of their
        two lengths. Returns 0.0 when ``definitions_dataset`` is empty.
    """
    if not definitions_dataset:
        # No definitions to compare with: avoid dividing by zero below.
        return 0.0
    result = 0.0
    for sentence in definitions_dataset:
        # Normalize by the average length of the two texts to avoid a bias
        # towards synsets with short definitions.
        mean_length = (len(sentence) + len(definition)) / 2
        if mean_length == 0:
            # Both texts are empty (e.g. only stop words): no overlap at all.
            continue
        result += len(intersection(sentence, definition)) / mean_length
    return result / len(definitions_dataset)



def calcSimilarityForSynset(synset, sentences):
    """Score ``synset`` and every synset of its hyponym tree against ``sentences``.

    The results are appended to the global list ``meaningCandidates`` as
    tuples ``(name, definition, similarity)``. A synset whose name is already
    in that list is not scored again, and a hyponym already in the list is not
    explored again, so subtrees shared between calls are computed only once.

    Parameters
    ----------
    synset : WordNet synset
        Root of the subtree to score.
    sentences : list of list
        Lemmatized definitions of the term, passed to ``calc_similarity``.
    """
    # Collect the names scored so far once, so the "already computed?" test is
    # a set lookup instead of a scan of the whole result list at every node.
    seen = {item[0] for item in meaningCandidates}
    _scoreSynsetSubtree(synset, sentences, seen)


def _scoreSynsetSubtree(synset, sentences, seen):
    """Depth-first worker of calcSimilarityForSynset; ``seen`` holds the names already scored."""
    name = synset.name()
    if name not in seen:
        definition = synset.definition()
        similarity = calc_similarity(lemmatized_tokens(definition), sentences)
        meaningCandidates.append((name, definition, similarity))
        seen.add(name)

    # Visit only the children that have not been scored yet.
    for hyponym in synset.hyponyms():
        if hyponym.name() not in seen:
            _scoreSynsetSubtree(hyponym, sentences, seen)


## Synset elaboration functions

In [4]:
def getWordsInOrder(dataset):
    """Return the words of ``dataset`` ordered by decreasing frequency.

    ``dataset`` is a list of sentences, each sentence being a list of words.
    The result is a list of ``(word, count)`` tuples; words with the same
    count keep the order in which they were first met. The function also
    prints the first RELEVANT_WORD_SIZE_FOR_GENUS entries of the ranking.
    """
    print("ELABORATING WORDS FROM DATASET ON ", len(dataset), " SENTENCES")
    frequencies = {}
    for sentence in dataset:
        for word in sentence:
            # Missing words start from zero
            frequencies[word] = frequencies.get(word, 0) + 1
    # Rank the (word, count) pairs from the most to the least frequent
    ranked = list(frequencies.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print("ELABORATED WORDS, SHOWING FIRST", RELEVANT_WORD_SIZE_FOR_GENUS, " RELEVANT WORDS FIND ON DESCRIPTIONS")
    # Only the top of the ranking is shown; the whole ranking is returned
    for pair in ranked[:RELEVANT_WORD_SIZE_FOR_GENUS]:
        print(pair)
    return ranked

def getSynsetsInOrderFromWordNet(words):
    """Collect the WordNet synsets of every word in ``words``.

    ``words`` is a list of tuples whose first element is the word. A synset is
    kept only when its ``max_depth()`` is greater than MIN_SYNSET_HEIGHT, i.e.
    when it is not too high in the hierarchy.

    Returns a list of ``(name, max_depth, synset)`` tuples sorted by
    increasing ``max_depth``, so the synsets with the lowest depth value come
    first. A synset reachable from several words appears once per word.
    """
    print("ELABORATING SYNSET SEARCH ON ", len(words), " WORDS")
    candidates = [
        (synset.name(), synset.max_depth(), synset)
        for word in words
        for synset in wordnet.synsets(word[0])
        if synset.max_depth() > MIN_SYNSET_HEIGHT
    ]
    # Stable ascending sort on the depth value
    candidates.sort(key=lambda entry: entry[1])
    print("FOUND A TOTAL OF ", len(candidates), " SYNSETS ")
    return candidates

def getMeaningCandidatesFromSynsets(synsets, sentences):
    """Return the synsets that best match the definitions of a term.

    Parameters
    ----------
    synsets : list of tuple
        Tuples whose third element is a WordNet synset (as returned by
        ``getSynsetsInOrderFromWordNet``).
    sentences : list of list
        Lemmatized definitions of the term.

    Returns
    -------
    list of tuple
        The first MEANING_CANDIDATES_SIZE ``(name, definition, similarity)``
        tuples, ordered by decreasing similarity. They are also printed.

    The scores are accumulated in the global list ``meaningCandidates``. That
    list caches scores by synset name only, while a score depends on
    ``sentences``; it is therefore emptied at the start of every call so that
    values computed for another term (or by a previous run of the cell) can
    never leak into this ranking. After the call it holds this call's scores.
    """
    print("ELABORATING MEANING ON ", len(synsets), " WITH A TOTAL OF ", len(sentences), " DEFINITIONS")
    # Start from an empty cache (cleared in place: other functions share the list).
    meaningCandidates.clear()
    for synset in synsets:
        # Scores the synset and its hyponym subtree, storing the results in
        # the global list meaningCandidates.
        calcSimilarityForSynset(synset[2], sentences)
    # Rank the candidates by similarity and keep only the best ones.
    ranked = sorted(meaningCandidates, key=lambda x: x[2], reverse=True)
    best = ranked[0:MEANING_CANDIDATES_SIZE]
    print("ELABORATED MEANING, SHOWING FIRST ", MEANING_CANDIDATES_SIZE, " RESULTS WITH SCORES: ")
    for item in best:
        print(item)
    return best


## Dataset loading

In [5]:
# Read the raw definitions table from the TSV file
df = parse_tsv_file(file_path)
# Turn it into a dict: term -> list of lemmatized definitions
dataset = elaborate_dataset(df)

## Elaborate meaning for word Door

In [6]:
print("ELABORATING DOOR")

# Rank the words of the 'door' definitions by frequency
doorWords = getWordsInOrder(dataset['door'])
# Collect the WordNet synsets of the most frequent words only
doorParentSynsetCandidates = getSynsetsInOrderFromWordNet(doorWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the definitions; prints and returns the best ones
doorMeaningCandidates = getMeaningCandidatesFromSynsets(doorParentSynsetCandidates, dataset['door'])
# Empty the global candidate list so the next term starts from scratch
clear_candidate()

ELABORATING DOOR
ELABORATING WORDS FROM DATASET ON  30  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('room', 14)
('object', 14)
('access', 11)
('open', 10)
('allow', 9)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  27  SYNSETS 
ELABORATING MEANING ON  27  WITH A TOTAL OF  30  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('doorway.n.01', 'the entrance (the space in a wall) through which you enter or leave a room or building; the space that a door can close', 0.1772477982213478)
('dining_room.n.01', 'a room used for dining', 0.1336919561919562)
('anteroom.n.01', 'a large entrance or reception room or area', 0.1231687897448767)
('room.n.04', 'the people who are present in a room', 0.12072125042713279)
('bedroom.n.01', 'a room used primarily for sleeping', 0.11999106066468496)


## Elaborate meaning for word Ladybug

In [7]:
print("ELABORATING LADYBUG")

# Rank the words of the 'ladybug' definitions by frequency
ladyBugWords = getWordsInOrder(dataset['ladybug'])
# Collect the WordNet synsets of the most frequent words only
ladyBugParentSynsetCandidates = getSynsetsInOrderFromWordNet(ladyBugWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the definitions; prints and returns the best ones
ladyBugMeaningCandidates = getMeaningCandidatesFromSynsets(ladyBugParentSynsetCandidates, dataset['ladybug'])
# Empty the global candidate list so the next term starts from scratch
clear_candidate()


ELABORATING LADYBUG
ELABORATING WORDS FROM DATASET ON  30  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('insect', 28)
('red', 26)
('black', 22)
('small', 20)
('dot', 18)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  20  SYNSETS 
ELABORATING MEANING ON  20  WITH A TOTAL OF  30  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('buffalo_carpet_beetle.n.01', 'a small black and red and white carpet beetle', 0.3372284554136081)
('aphid.n.01', 'any of various small plant-sucking insects', 0.3226517926517927)
('two-spotted_ladybug.n.01', 'red ladybug with a black spot on each wing', 0.29553888377417786)
('leaf_bug.n.01', 'small bright-colored insect that feeds on plant juices', 0.265823979941627)
('chinch_bug.n.01', 'small black-and-white insect that feeds on cereal grasses', 0.265823979941627)


## Elaborate meaning for word Pain

In [8]:
print("ELABORATING PAIN")

# Rank the words of the 'pain' definitions by frequency
painWords = getWordsInOrder(dataset['pain'])
# Collect the WordNet synsets of the most frequent words only
painParentSynsetCandidates = getSynsetsInOrderFromWordNet(painWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the definitions; prints and returns the best ones
painMeaningCandidates = getMeaningCandidatesFromSynsets(painParentSynsetCandidates, dataset['pain'])
# Empty the global candidate list so the next term starts from scratch
clear_candidate()


ELABORATING PAIN
ELABORATING WORDS FROM DATASET ON  30  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('physical', 14)
('feeling', 11)
('emotional', 10)
('sensation', 10)
('cause', 6)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  17  SYNSETS 
ELABORATING MEANING ON  17  WITH A TOTAL OF  30  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('feeling.n.04', 'a physical sensation that you experience', 0.2068701668701669)
('suffering.n.04', 'feelings of mental or physical pain', 0.20522015022015025)
('agony.n.01', 'intense feelings of suffering; acute mental or physical pain', 0.16490142210730444)
('wildness.n.01', 'a feeling of extreme emotional intensity', 0.15920061420061418)
('sensitivity.n.03', 'sensitivity to emotional feelings (of self and others)', 0.14241462241462244)


## Elaborate meaning for word Blurriness

In [9]:
print("ELABORATING BLURRINESS")

# Rank the words of the 'blurriness' definitions by frequency
blurrinessBugWords = getWordsInOrder(dataset['blurriness'])
# Collect the WordNet synsets of the most frequent words only
blurrinessParentSynsetCandidates = getSynsetsInOrderFromWordNet(blurrinessBugWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the definitions; prints and returns the best ones
blurrinessMeaningCandidates = getMeaningCandidatesFromSynsets(blurrinessParentSynsetCandidates,
                                                              dataset['blurriness'])
# Empty the global candidate list, as the other term cells do, so that
# re-running any cell afterwards does not reuse the scores computed here
clear_candidate()

ELABORATING BLURRINESS
ELABORATING WORDS FROM DATASET ON  30  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('see', 8)
('image', 6)
('visual', 6)
('something', 5)
('eye', 5)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  26  SYNSETS 
ELABORATING MEANING ON  26  WITH A TOTAL OF  30  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('memory_picture.n.01', 'a memory image that is similar to a visual perception', 0.0833507817718344)
('visual_image.n.01', 'a mental image that is similar to a visual perception', 0.07779522621627884)
('mental_picture.n.01', 'a clear and telling mental image', 0.055752765752765755)
('naked_eye.n.01', 'the eye unaided by any optical instrument that alters the power of vision or alters the apparent size or distance of objects', 0.05455394889033331)
('memory_image.n.01', 'a mental image of something previously experienced', 0.05365183148464882)
