In [51]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet as wn
from googletrans import Translator
from prettytable import PrettyTable

# import nltk
# nltk.download('omw-1.4')

import string
import pandas as pd

In [52]:
import deepl

import os

# The DeepL key is a credential and must not be stored in the notebook.
# Export it as the DEEPL_API_KEY environment variable before starting the
# kernel; an empty string is used when the variable is not set.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY", "")


def traduci_definizioni():
    """Translate the Italian definitions CSV to US English with DeepL.

    Reads ``data/definizioni_column.csv``, translates every cell from
    Italian to US English and writes the result, with the same column
    names, to ``data/definizioni_column_en_2.csv``.

    Missing cells are not sent to the translation service; they are stored
    as the placeholder string ``"nan"``.
    """
    translator = deepl.Translator(DEEPL_API_KEY)
    definitions = pd.read_csv("data/definizioni_column.csv")
    columns = definitions.columns.tolist()

    translated_rows = []
    for _, row in definitions.iterrows():
        new_row = []
        for definition in row:
            # pandas loads empty cells as float NaN, which never compares
            # equal to the string "nan"; pd.isna detects them reliably.
            if pd.isna(definition) or definition == "nan":
                new_row.append("nan")
                continue

            translated = translator.translate_text(
                str(definition), target_lang="EN-US", source_lang="IT"
            )
            new_row.append(translated.text)
        translated_rows.append(new_row)

    new_df = pd.DataFrame(translated_rows, columns=columns)
    new_df.to_csv("data/definizioni_column_en_2.csv", index=False)


#traduci_definizioni()


In [53]:
def get_k_genus(definitions, k=1):
    """Return the ``k`` most frequent content words across ``definitions``.

    Each definition is lower-cased and tokenized; English stop words and
    tokens made only of punctuation characters are discarded before
    counting.

    Args:
        definitions: iterable of definition strings.
        k: number of tokens to return.

    Returns:
        A list with at most ``k`` tokens, most frequent first. Tokens with
        the same count keep the order in which they were first seen.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    # count the frequency of the tokens
    genus_counts = {}
    for definition in definitions:
        for token in word_tokenize(definition.lower()):
            if token in stop_words:
                continue
            # A character-wise check also removes multi-character tokens
            # such as "``", "''" or "...", which a substring test against
            # string.punctuation lets through.
            if all(char in punctuation_chars for char in token):
                continue
            genus_counts[token] = genus_counts.get(token, 0) + 1

    sorted_genus_counts = sorted(genus_counts.items(), key=lambda item: item[1], reverse=True)

    return [token for token, _ in sorted_genus_counts[:k]]

def load_embeddings(embeddings_path):
    """Load a synset embedding space from an ``.npz`` archive.

    The archive must contain two arrays: ``'synsets'`` (the keys) and
    ``'embeddings'`` (one entry per key, in the same order).

    Args:
        embeddings_path: path of the ``.npz`` file.

    Returns:
        dict mapping each entry of ``'synsets'`` to the entry of
        ``'embeddings'`` at the same position.
    """
    # SECURITY: allow_pickle=True can execute arbitrary code while loading;
    # only use it on archives produced by trusted code.
    # The context manager closes the underlying file once both arrays have
    # been read into memory.
    with np.load(embeddings_path, allow_pickle=True) as data:
        embeddings = data['embeddings']
        synset_names = data['synsets']

    # create a dict [synset name: embedding]
    return dict(zip(synset_names, embeddings))

def preprocess_text(text):
    """Normalize ``text`` into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized, English stop words and tokens
    made only of punctuation characters are dropped, and the remaining
    tokens are stemmed with the English Snowball stemmer.

    Args:
        text: the string to normalize.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation. The character-wise check also drops
    # multi-character tokens such as "``", "''" or "...", which a substring
    # test against string.punctuation lets through.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    tokens = [
        token
        for token in tokens
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]

    # Stemming
    stemmer = SnowballStemmer("english")

    return ' '.join(stemmer.stem(token) for token in tokens)


In [54]:
# English definitions table (the file written by traduci_definizioni).
definitions = pd.read_csv("data/definizioni_column_en_2.csv")

# Column of `definitions` whose entries are evaluated below.
categoria = "Pantalone[CG]"
# WordNet synset name treated as the correct answer for that column.
concetto = "trouser.n.01"



**CONTENT TO FORM USING WORDNET**

In [55]:
def onomasiological_by_word_freq(definitions, k=1):
    """Rank WordNet synsets for each definition by gloss word overlap.

    The most frequent content word of ``definitions`` is taken as the
    genus. The hyponyms of every WordNet synset of that word form the
    candidate set. Each definition is compared with each candidate gloss
    using the Jaccard similarity of their preprocessed token sets.

    Args:
        definitions: iterable of definition strings.
        k: number of best candidates to keep for each definition.

    Returns:
        A list of ``(definition_id, best_synsets)`` tuples, where
        ``definition_id`` starts at 1 and ``best_synsets`` holds at most
        ``k`` candidates, most similar first.
    """
    # get the possible genus for the definitions
    genus = get_k_genus(definitions, 1)

    print(f"The genus are: {genus}")

    # set of the synsets associated to the genus (and their lemmas)
    # NOTE(review): lemmas are kept as in the original code; hyponymy is
    # normally a synset-level relation, so they probably add no candidates
    # - confirm before removing.
    genus_synsets = set()
    for genus_name in genus:
        for current_synset in wn.synsets(genus_name):
            genus_synsets.add(current_synset)
            genus_synsets.update(current_synset.lemmas())

    # build the set of genus hyponyms (where the search takes place)
    possible_synsets = set()
    for genus_synset in genus_synsets:
        possible_synsets.update(genus_synset.hyponyms())

    # Set iteration order is not guaranteed to be the same on every run;
    # sorting by name makes tie-breaking between equally similar
    # candidates reproducible.
    candidates = sorted(possible_synsets, key=lambda synset: synset.name())

    # Candidate glosses do not depend on the definition being scored, so
    # they are preprocessed once instead of once per definition.
    # split() (no argument) yields no token at all for an empty text,
    # whereas split(" ") would yield a spurious "" token.
    candidate_tokens = [
        (synset, set(preprocess_text(synset.definition()).split()))
        for synset in candidates
    ]

    # list of (DEFINITION_ID, POSSIBLE_SYNSETS)
    results = []

    for index, definition in enumerate(definitions):
        definition_tokens = set(preprocess_text(definition).split())

        synset_similarities = []
        for synset, synset_tokens in candidate_tokens:
            union = definition_tokens | synset_tokens
            intersection = definition_tokens & synset_tokens

            # Jaccard similarity; two empty token sets share nothing.
            similarity = len(intersection) / len(union) if union else 0.0

            synset_similarities.append((similarity, synset))

        synset_similarities.sort(key=lambda pair: pair[0], reverse=True)
        best_synsets = [synset for _, synset in synset_similarities[:k]]

        results.append((index + 1, best_synsets))

    return results



# Score the word-overlap approach: a definition counts as a hit when the
# expected synset is among its returned candidates.
results = onomasiological_by_word_freq(definitions[categoria], k=4)
correct_synset = wn.synset(concetto)

results_table = PrettyTable(field_names=["Definition ID", "Synsets", "Correct"])

correct_count = 0
for definition_id, candidate_synsets in results:
    correct = correct_synset in candidate_synsets
    if correct:
        correct_count += 1
    results_table.add_row([f"P{definition_id}", candidate_synsets, correct])


print(f"Correct synset guessed: {correct_count}/{len(results)}")

print(results_table)



The genus are: ['garment']
Correct synset guessed: 11/39
+---------------+------------------------------------------------------------------------------------------------------------+---------+
| Definition ID |                                                  Synsets                                                   | Correct |
+---------------+------------------------------------------------------------------------------------------------------------+---------+
|       P1      |       [Synset('sweater.n.01'), Synset('trouser.n.02'), Synset('shirt.n.01'), Synset('romper.n.02')]        |  False  |
|       P2      |      [Synset('camlet.n.01'), Synset('sunsuit.n.01'), Synset('legging.n.01'), Synset('trouser.n.01')]       |   True  |
|       P3      |    [Synset('gown.v.01'), Synset('prim.v.03'), Synset('overgarment.n.01'), Synset('undergarment.n.01')]     |  False  |
|       P4      |   [Synset('overgarment.n.01'), Synset('undergarment.n.01'), Synset('legging.n.01'), Synset('robe.n.01')

**CONTENT TO FORM: Sentence Embeddings**

In [56]:
def onomasiological_by_sentence_embeddings(definitions, k=1):
    """Rank WordNet synsets for each definition by sentence-embedding similarity.

    The most frequent content word of ``definitions`` is taken as the
    genus. The hyponyms of every WordNet synset of that word form the
    candidate set. Definitions and candidate glosses are encoded with the
    ``thenlper/gte-small`` sentence-transformer model (loaded from the
    local ``model`` folder) and compared by cosine similarity.

    Args:
        definitions: iterable of definition strings.
        k: number of best candidates to keep for each definition.

    Returns:
        A list of ``(definition_id, best_synsets)`` tuples, where
        ``definition_id`` starts at 1 and ``best_synsets`` holds at most
        ``k`` candidates, most similar first.
    """
    from sentence_transformers import SentenceTransformer

    # A plain list keeps positional indexing valid whatever index the
    # incoming pandas Series carries.
    definition_texts = list(definitions)

    # get the possible genus for the definitions
    genus = get_k_genus(definition_texts, 1)

    print(f"The genus are: {genus}")

    # set of the synsets associated to the genus (and their lemmas)
    # NOTE(review): lemmas are kept as in the original code; hyponymy is
    # normally a synset-level relation, so they probably add no candidates
    # - confirm before removing.
    genus_synsets = set()
    for genus_name in genus:
        for current_synset in wn.synsets(genus_name):
            genus_synsets.add(current_synset)
            genus_synsets.update(current_synset.lemmas())

    # build the set of genus hyponyms (where the search takes place)
    possible_synsets = set()
    for genus_synset in genus_synsets:
        possible_synsets.update(genus_synset.hyponyms())

    # Sorting by name gives the candidates a reproducible order, so row
    # positions in the similarity matrix map to the same synsets each run.
    candidates = sorted(possible_synsets, key=lambda synset: synset.name())

    # Nothing to rank against: every definition gets an empty candidate list.
    if not candidates:
        return [(def_index + 1, []) for def_index in range(len(definition_texts))]

    candidate_glosses = [synset.definition() for synset in candidates]

    model = SentenceTransformer("thenlper/gte-small", cache_folder="model", local_files_only=True)

    # encode() has no documented random_state parameter, so none is passed.
    synsets_embeddings = model.encode(candidate_glosses, show_progress_bar=True)
    definitions_embeddings = model.encode(definition_texts, show_progress_bar=True)

    # one row per definition, one column per candidate synset
    similarities = cosine_similarity(definitions_embeddings, synsets_embeddings)

    # list of (DEFINITION_ID, POSSIBLE_SYNSETS)
    results = []

    for def_index, row in enumerate(similarities):
        # positions of the k highest similarities, best first
        best_k_indices = np.argsort(row)[::-1][:k]
        best_synsets = [candidates[position] for position in best_k_indices]

        results.append((def_index + 1, best_synsets))

    return results

# Score the genus-restricted embedding approach: a definition counts as a
# hit when the expected synset is among its returned candidates.
results = onomasiological_by_sentence_embeddings(definitions[categoria], k=4)

correct_synset = wn.synset(concetto)

results_table = PrettyTable(field_names=["Definition ID", "Synsets", "Correct"])

correct_count = 0
for definition_id, candidate_synsets in results:
    correct = correct_synset in candidate_synsets
    if correct:
        correct_count += 1
    results_table.add_row([f"P{definition_id}", candidate_synsets, correct])


print(f"Correct synset guessed: {correct_count}/{len(results)}")

print(results_table)




The genus are: ['garment']


Batches: 100%|██████████| 2/2 [00:00<00:00, 14.63it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 65.71it/s]

Correct synset guessed: 21/39
+---------------+------------------------------------------------------------------------------------------------------------+---------+
| Definition ID |                                                  Synsets                                                   | Correct |
+---------------+------------------------------------------------------------------------------------------------------------+---------+
|       P1      | [Synset('shirt.n.01'), Synset('undergarment.n.01'), Synset('sweater.n.01'), Synset('head_covering.n.01')]  |  False  |
|       P2      |   [Synset('trouser.n.01'), Synset('legging.n.01'), Synset('separate.n.02'), Synset('overgarment.n.01')]    |   True  |
|       P3      |      [Synset('legging.n.01'), Synset('overgarment.n.01'), Synset('shirt.n.01'), Synset('weeds.n.01')]      |  False  |
|       P4      |    [Synset('legging.n.01'), Synset('trouser.n.01'), Synset('overgarment.n.01'), Synset('camlet.n.01')]     |   True  |
|       P5 




**GLOBAL EMBEDDINGS**

In [57]:
def global_onomasiological_by_sentence_embeddings(definitions, k=1):
    """Rank every synset in the stored embedding space for each definition.

    The embeddings in ``./data/embeddings.npz`` are loaded with
    ``load_embeddings``. Each definition is encoded with the
    ``thenlper/gte-small`` sentence-transformer model (loaded from the
    local ``model`` folder) and compared with all stored vectors by cosine
    similarity.

    NOTE(review): the stored vectors are presumably gloss embeddings
    produced with the same model - confirm against the code that wrote the
    archive.

    Args:
        definitions: iterable of definition strings.
        k: number of best synsets to keep for each definition.

    Returns:
        A list of ``(definition_id, best_synsets)`` tuples, where
        ``definition_id`` starts at 1 and ``best_synsets`` holds at most
        ``k`` synsets, most similar first.
    """
    from sentence_transformers import SentenceTransformer

    embeddings_path = "./data/embeddings.npz"

    embeddings = load_embeddings(embeddings_path)

    # Materialize names and vectors once; both lists share the dict order,
    # so a column position in the similarity matrix identifies a name.
    synset_names = list(embeddings.keys())
    synset_vectors = list(embeddings.values())

    model = SentenceTransformer("thenlper/gte-small", cache_folder="model", local_files_only=True)

    # A plain list keeps positional indexing valid whatever index the
    # incoming pandas Series carries. encode() has no documented
    # random_state parameter, so none is passed.
    definitions_embeddings = model.encode(list(definitions), show_progress_bar=True)

    # one row per definition, one column per stored synset
    similarities = cosine_similarity(definitions_embeddings, synset_vectors)

    # list of (DEFINITION_ID, POSSIBLE_SYNSETS)
    results = []

    for def_index, row in enumerate(similarities):
        # positions of the k highest similarities, best first
        best_k_indices = np.argsort(row)[::-1][:k]
        best_synsets = [wn.synset(synset_names[position]) for position in best_k_indices]

        results.append((def_index + 1, best_synsets))

    return results

# Score the unrestricted embedding approach: a definition counts as a hit
# when the expected synset is among its returned candidates.
results = global_onomasiological_by_sentence_embeddings(definitions[categoria], k=4)

correct_synset = wn.synset(concetto)

results_table = PrettyTable(field_names=["Definition ID", "Synsets", "Correct"])

correct_count = 0
for definition_id, candidate_synsets in results:
    correct = correct_synset in candidate_synsets
    if correct:
        correct_count += 1
    results_table.add_row([f"P{definition_id}", candidate_synsets, correct])


print(f"Correct synset guessed: {correct_count}/{len(results)}")

print(results_table)




Batches: 100%|██████████| 2/2 [00:00<00:00, 69.21it/s]


Correct synset guessed: 3/39
+---------------+-----------------------------------------------------------------------------------------------------------------------+---------+
| Definition ID |                                                        Synsets                                                        | Correct |
+---------------+-----------------------------------------------------------------------------------------------------------------------+---------+
|       P1      |                [Synset('shirt.n.01'), Synset('person.n.02'), Synset('back.n.07'), Synset('skirt.n.01')]               |  False  |
|       P2      |          [Synset('trouser.n.01'), Synset('legging.n.01'), Synset('leg.n.07'), Synset('piece_of_cloth.n.01')]          |   True  |
|       P3      |         [Synset('cross-legged.r.01'), Synset('barelegged.s.01'), Synset('kirtle.n.02'), Synset('bodice.n.01')]        |  False  |
|       P4      |            [Synset('legging.n.01'), Synset('pant_leg.n.01'), Syns

In [58]:



# Look up the synset used as the expected answer (same name as `concetto`)
# and print its WordNet gloss for reference.
tmp = wn.synset("trouser.n.01")

print(tmp.definition())



(usually in the plural) a garment extending from the waist to the knee or ankle, covering each leg separately
