In [27]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
import statistics
import pandas as pd

# Single WordNetLemmatizer instance, reused by lemmatized_tokens().
lemmatizer = WordNetLemmatizer()

In [28]:
# Path of the tab-separated definitions file read by parse_tsv_file().
file_path = 'TLN-definitions-23.tsv'
# How many of the most frequent words are printed and passed on as genus candidates.
RELEVANT_WORD_SIZE_FOR_GENUS = 5
# A synset is kept only when its max_depth() is strictly greater than this value.
MIN_SYNSET_HEIGHT = 2
# How many top-ranked meaning candidates are printed and returned.
MEANING_CANDIDATES_SIZE = 5
DEVIATION = 4 # Max allowed distance from the truncated average definition length; passed to refine_dataset()

# Shared accumulator filled by calcSimilarityForSynset() and emptied by clear_candidate().
meaningCandidates = []


In [29]:
def average_length(list):
    """Return the arithmetic mean of the lengths of the items in ``list``.

    Args:
        list: Non-empty collection of sized items (for example token lists).
            The name shadows the built-in ``list``; it is kept so that
            keyword callers keep working.

    Returns:
        The result of ``statistics.mean`` over the item lengths.

    Raises:
        statistics.StatisticsError: If ``list`` is empty.
    """
    return statistics.mean(map(len, list))


def refine_dataset(dataset, k):
    """Drop, in place, the definitions whose length is far from the average.

    For every key the average definition length is truncated to an int, and
    only the definitions whose length differs from it by at most ``k`` are
    kept.

    Args:
        dataset: Dict mapping a key to a list of token lists. The lists are
            replaced in place; the set of keys is not changed.
        k: Maximum allowed absolute difference between a definition's length
            and the truncated average length.

    Returns:
        None.
    """
    for key in dataset:
        definitions = dataset[key]
        if not definitions:
            # Nothing to average: average_length() raises on an empty list.
            continue
        avg = int(average_length(definitions))
        # The average is not recomputed after filtering: the value was never
        # used, and the call raised whenever the filter removed every item.
        dataset[key] = [elem for elem in definitions if abs(len(elem) - avg) <= k]


def parse_tsv_file(file_path):
    """Read a tab-separated file into a pandas DataFrame.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with a tab separator.
    """
    return pd.read_csv(file_path, sep='\t')


def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order of ``lst1`` is preserved and repeated items of ``lst1`` are
    repeated in the result.
    """
    shared = []
    for candidate in lst1:
        if candidate in lst2:
            shared.append(candidate)
    return shared

# Lexical overlap
def calc_similarity(definition, lists):
    """Return the average lexical overlap between ``definition`` and ``lists``.

    For each token list, the overlap is the number of its tokens that also
    occur in ``definition``, divided by the mean of the two lengths.

    Args:
        definition: List of tokens (strings in this notebook).
        lists: List of token lists to compare against ``definition``.

    Returns:
        The mean of the per-list overlaps, or ``0.0`` when ``lists`` is empty.
        A pair in which both token lists are empty contributes ``0``.
    """
    if not lists:
        # No sentences to compare with: avoid dividing by zero below.
        return 0.0
    definition_tokens = set(definition)
    result = 0
    for list1 in lists:
        # Normalize by the mean of the two lengths rather than by the minimum,
        # to avoid a bias towards synsets with short descriptions over synsets
        # with longer ones.
        mean_length = (len(list1) + len(definition)) / 2
        if mean_length == 0:
            # Both token lists are empty, so there is no overlap to measure.
            continue
        shared = sum(1 for token in list1 if token in definition_tokens)
        result += shared / mean_length
    return result / len(lists)


def lemmatized_tokens(text):
    """Return the lemmas of the alphabetic, non-stopword tokens of ``text``.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are English stopwords are dropped. Tokens whose POS
    tag starts with ``'VB'`` are lemmatized as verbs, all others with the
    lemmatizer's default part of speech.

    Args:
        text: String to process.

    Returns:
        List of lemma strings, in token order.
    """
    words = word_tokenize(text.lower())
    ignored = set(stopwords.words('english'))

    def to_lemma(word, tag):
        # Verb tags get the verb lemmatizer; everything else uses the default.
        if tag.startswith('VB'):
            return lemmatizer.lemmatize(word, pos='v')
        return lemmatizer.lemmatize(word)

    return [
        to_lemma(word, tag)
        for word, tag in pos_tag(words)
        if word.isalpha() and word not in ignored
    ]

# The genus candidates are the terms that occur most often in the descriptions
def getWordsInOrder(sentences):
    """Count the tokens of ``sentences`` and return them by decreasing count.

    The first ``RELEVANT_WORD_SIZE_FOR_GENUS`` entries are printed, but the
    whole ranking is returned.

    Args:
        sentences: List of token lists.

    Returns:
        List of ``(word, count)`` tuples sorted by decreasing count; words
        with the same count keep their first-seen order.
    """
    print("ELABORATING WORDS FROM DATASET ON ", len(sentences), " SENTENCES")
    frequencies = {}
    for tokens in sentences:
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0) + 1
    # Stable ascending sort on the negated count, i.e. decreasing count.
    ranked = sorted(frequencies.items(), key=lambda pair: -pair[1])
    print("ELABORATED WORDS, SHOWING FIRST", RELEVANT_WORD_SIZE_FOR_GENUS, " RELEVANT WORDS FIND ON DESCRIPTIONS")
    # Only the top RELEVANT_WORD_SIZE_FOR_GENUS entries are shown
    for entry in ranked[:RELEVANT_WORD_SIZE_FOR_GENUS]:
        print(entry)
    return ranked


def getSynsetsInOrderFromWordNet(words):
    """Collect the WordNet synsets of ``words`` that are deep enough.

    Args:
        words: Sequence of items whose first element is the word to look up
            (for example the ``(word, count)`` tuples of ``getWordsInOrder``).

    Returns:
        List of ``(synset name, max depth, synset)`` tuples for the synsets
        whose ``max_depth()`` is greater than ``MIN_SYNSET_HEIGHT``, sorted by
        increasing depth.
    """
    print("ELABORATING SYNSET SEARCH ON ", len(words), " WORDS")
    collected = []
    for entry in words:
        for candidate in wordnet.synsets(entry[0]):
            depth = candidate.max_depth()
            if depth <= MIN_SYNSET_HEIGHT:
                continue
            collected.append((candidate.name(), depth, candidate))
    collected.sort(key=lambda triple: triple[1])
    print("FOUND A TOTAL OF ", len(collected), " SYNSETS ")
    return collected


def calcSimilarityForSynset(synset, sentences):
    """Score ``synset`` and, recursively, its hyponyms against ``sentences``.

    Each synset that is not yet present in the shared ``meaningCandidates``
    list is appended to it as ``(name, definition, similarity)``, where the
    similarity is ``calc_similarity`` between the lemmatized synset definition
    and ``sentences``.

    Args:
        synset: WordNet synset to start from.
        sentences: List of token lists to compare the definitions with.

    Returns:
        None; results are accumulated in ``meaningCandidates``.
    """
    def already_scored(name):
        # True when a candidate with this synset name was recorded before.
        for candidate in meaningCandidates:
            if candidate[0] == name:
                return True
        return False

    if not already_scored(synset.name()):
        gloss_tokens = lemmatized_tokens(synset.definition())
        score = calc_similarity(gloss_tokens, sentences)
        meaningCandidates.append((synset.name(), synset.definition(), score))

    for child in synset.hyponyms():
        if already_scored(child.name()):
            continue
        calcSimilarityForSynset(child, sentences)


def getMeaningCandidatesFromSynsets(synsets, sentences):
    """Score the given synsets (and their hyponyms) and return the best ones.

    Every synset is passed to ``calcSimilarityForSynset``, which accumulates
    ``(name, definition, similarity)`` tuples in the shared
    ``meaningCandidates`` list. The candidates are then ranked by decreasing
    similarity and the first ``MEANING_CANDIDATES_SIZE`` are printed and
    returned.

    Args:
        synsets: Sequence of tuples whose third element is a WordNet synset
            (as returned by ``getSynsetsInOrderFromWordNet``).
        sentences: List of token lists the synset definitions are compared to.

    Returns:
        List with at most ``MEANING_CANDIDATES_SIZE`` candidate tuples,
        highest similarity first.
    """
    print("ELABORATING MEANING ON ", len(synsets), " WITH A TOTAL OF ", len(sentences), " DEFINITIONS")
    for synset in synsets:
        calcSimilarityForSynset(synset[2], sentences)
    # Rank on the similarity score (index 2). Index 1 holds the definition
    # text, and sorting on it ordered the candidates alphabetically.
    sortedmeaningCandidates = sorted(meaningCandidates, key=lambda x: x[2], reverse=True)
    print("ELABORATED MEANING, SHOWING FIRST ", MEANING_CANDIDATES_SIZE, " RESULTS WITH SCORES: ")
    topCandidates = sortedmeaningCandidates[0:MEANING_CANDIDATES_SIZE]
    for item in topCandidates:
        print(item)
    return topCandidates

def clear_candidate():
    """Empty the shared ``meaningCandidates`` list in place."""
    del meaningCandidates[:]

def elaborate_dataset(dataframe):
    """Turn the definitions table into per-column lists of lemmatized tokens.

    The first column of ``dataframe`` is dropped. Every remaining column
    becomes a key of the result, and each of its non-missing cells is
    converted with ``lemmatized_tokens``. The result is then filtered in place
    by ``refine_dataset`` using ``DEVIATION``.

    Args:
        dataframe: pandas DataFrame, for example the one returned by
            ``parse_tsv_file``.

    Returns:
        Dict mapping each remaining column name to a list of token lists.
    """
    definitions = dataframe.iloc[:, 1:]  # Drop the first column
    # Keys come from the table itself, so a file with other column names does
    # not raise KeyError.
    dataset = {column: [] for column in definitions.columns}
    for column, cells in definitions.items():
        for text in cells:
            if pd.isna(text):
                # Missing cells cannot be tokenized; skip them.
                continue
            dataset[column].append(lemmatized_tokens(text))
    print("Refining dataset removing sentences that are at least ", DEVIATION, " apart from avarage lenght")
    refine_dataset(dataset, DEVIATION)
    return dataset

In [30]:
# Load the definitions table from file_path, then lemmatize and filter it.
df = parse_tsv_file(file_path)
dataset = elaborate_dataset(df)

Refining dataset removing sentences that are at least  4  apart from avarage lenght


## Elaborate Door

In [31]:
print("ELABORATING DOOR")

# Rank the words of the 'door' definitions by frequency.
doorWords = getWordsInOrder(dataset['door'])
# Look up WordNet synsets for the top RELEVANT_WORD_SIZE_FOR_GENUS words only.
doorParentSynsetCandidates = getSynsetsInOrderFromWordNet(doorWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the 'door' definitions.
doorMeaningCandidates = getMeaningCandidatesFromSynsets(doorParentSynsetCandidates, dataset['door'])
# Empty the shared meaningCandidates list so the next concept starts clean.
clear_candidate()

ELABORATING DOOR
ELABORATING WORDS FROM DATASET ON  24  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('room', 11)
('access', 10)
('object', 10)
('open', 7)
('allow', 7)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  27  SYNSETS 
ELABORATING MEANING ON  27  WITH A TOTAL OF  24  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('doorway.n.01', 0.18539547791676272, 'the entrance (the space in a wall) through which you enter or leave a room or building; the space that a door can close')
('dining_room.n.01', 0.1396843896843897, 'a room used for dining')
('anteroom.n.01', 0.13119924057424057, 'a large entrance or reception room or area')
('exterior_door.n.01', 0.1300189393939394, 'a doorway that allows entrance to or exit from a building')
('bedroom.n.01', 0.12582163207163208, 'a room used primarily for sleeping')


## Elaborate Ladybug

In [32]:
print("ELABORATING LADYBUG")

# Rank the words of the 'ladybug' definitions by frequency.
ladyBugWords = getWordsInOrder(dataset['ladybug'])
# Look up WordNet synsets for the top RELEVANT_WORD_SIZE_FOR_GENUS words only.
ladyBugParentSynsetCandidates = getSynsetsInOrderFromWordNet(ladyBugWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the 'ladybug' definitions.
ladyBugMeaningCandidates = getMeaningCandidatesFromSynsets(ladyBugParentSynsetCandidates, dataset['ladybug'])
# Empty the shared meaningCandidates list so the next concept starts clean.
clear_candidate()


ELABORATING LADYBUG
ELABORATING WORDS FROM DATASET ON  26  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('insect', 24)
('red', 22)
('small', 17)
('black', 17)
('dot', 16)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  20  SYNSETS 
ELABORATING MEANING ON  20  WITH A TOTAL OF  26  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('buffalo_carpet_beetle.n.01', 0.3422185594696907, 'a small black and red and white carpet beetle')
('aphid.n.01', 0.33959822229052994, 'any of various small plant-sucking insects')
('two-spotted_ladybug.n.01', 0.290547059777829, 'red ladybug with a black spot on each wing')
('leaf_bug.n.01', 0.2776053006822238, 'small bright-colored insect that feeds on plant juices')
('chinch_bug.n.01', 0.2776053006822238, 'small black-and-white insect that feeds on cereal grasses')


## Elaborate Pain

In [33]:
print("ELABORATING PAIN")

# Rank the words of the 'pain' definitions by frequency.
painWords = getWordsInOrder(dataset['pain'])
# Look up WordNet synsets for the top RELEVANT_WORD_SIZE_FOR_GENUS words only.
painParentSynsetCandidates = getSynsetsInOrderFromWordNet(painWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the 'pain' definitions.
painMeaningCandidates = getMeaningCandidatesFromSynsets(painParentSynsetCandidates, dataset['pain'])
# Empty the shared meaningCandidates list so the next concept starts clean.
clear_candidate()


ELABORATING PAIN
ELABORATING WORDS FROM DATASET ON  26  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('physical', 12)
('feeling', 11)
('emotional', 9)
('sensation', 9)
('cause', 6)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  17  SYNSETS 
ELABORATING MEANING ON  17  WITH A TOTAL OF  26  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('suffering.n.04', 0.22616977040053962, 'feelings of mental or physical pain')
('feeling.n.04', 0.2213675213675214, 'a physical sensation that you experience')
('agony.n.01', 0.181472480510942, 'intense feelings of suffering; acute mental or physical pain')
('wildness.n.01', 0.1781985108908186, 'a feeling of extreme emotional intensity')
('sensitivity.n.03', 0.1591963591963592, 'sensitivity to emotional feelings (of self and others)')


## Elaborate Blurriness

In [34]:
print("ELABORATING BLURRINESS")

# Rank the words of the 'blurriness' definitions by frequency.
blurrinessBugWords = getWordsInOrder(dataset['blurriness'])
# Look up WordNet synsets for the top RELEVANT_WORD_SIZE_FOR_GENUS words only.
blurrinessParentSynsetCandidates = getSynsetsInOrderFromWordNet(blurrinessBugWords[0:RELEVANT_WORD_SIZE_FOR_GENUS])
# Score those synsets and their hyponyms against the 'blurriness' definitions.
blurrinessMeaningCandidates = getMeaningCandidatesFromSynsets(blurrinessParentSynsetCandidates,
                                                              dataset['blurriness'])
# Empty the shared meaningCandidates list, as the other concept cells do, so
# that re-running any cell does not mix in the blurriness candidates.
clear_candidate()

ELABORATING BLURRINESS
ELABORATING WORDS FROM DATASET ON  22  SENTENCES
ELABORATED WORDS, SHOWING FIRST 5  RELEVANT WORDS FIND ON DESCRIPTIONS
('see', 7)
('image', 6)
('visual', 5)
('border', 4)
('eye', 4)
ELABORATING SYNSET SEARCH ON  5  WORDS
FOUND A TOTAL OF  34  SYNSETS 
ELABORATING MEANING ON  34  WITH A TOTAL OF  22  DEFINITIONS
ELABORATED MEANING, SHOWING FIRST  5  RESULTS WITH SCORES: 
('memory_picture.n.01', 0.10887546796637704, 'a memory image that is similar to a visual perception')
('visual_image.n.01', 0.10129971039061947, 'a mental image that is similar to a visual perception')
('mental_picture.n.01', 0.07097599370326645, 'a clear and telling mental image')
('eye.n.03', 0.06633644133644134, 'attention to what is seen')
('naked_eye.n.01', 0.060905235000304696, 'the eye unaided by any optical instrument that alters the power of vision or alters the apparent size or distance of objects')
