In [1]:
import sys

import numpy as np
from nltk.corpus import wordnet as wn
from numpy.linalg import norm
from prettytable import PrettyTable
import textwrap
from sys import exit


from torchgen.native_function_generation import return_str


**UTILS**

In [2]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (any array-like accepted by ``np.dot`` / ``norm``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so that
        callers can still sort the results.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        # A zero vector has no direction: avoid the 0/0 division (NaN plus a
        # RuntimeWarning) and report "no similarity".
        return 0.0
    return np.dot(v1, v2) / denominator

def get_synset_or_first(word_or_synset: str):
    """Resolve a string to a WordNet synset.

    The input is first tried as a synset name (e.g. ``"dog.n.01"``). If that
    lookup fails, it is treated as a plain word and the first synset returned
    by ``wn.synsets`` is used (usually the most common sense).

    Args:
        word_or_synset: A synset name or a word.

    Returns:
        The matching synset, or ``None`` when the word has no synsets.
    """
    try:
        # Try to interpret the input as a synset name ("dog.n.01")
        return wn.synset(word_or_synset)
    except Exception:
        # Not a valid/known synset name: fall through to the word lookup.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    synsets = wn.synsets(word_or_synset)
    if not synsets:
        return None
    # first synset in the list (usually the most common sense)
    return synsets[0]


def wrap_text(text, width=20):
    """Wrap ``text`` to lines of at most ``width`` characters.

    Returns a single string whose wrapped lines are separated by newlines.
    """
    return textwrap.fill(text, width=width)

**EMBEDDING SPACE BUILDING**

In [3]:
from sentence_transformers import SentenceTransformer

# Materialise the synsets once: the definitions, the embeddings and the saved
# names are all derived from this single list, so they stay index-aligned.
all_synsets = list(wn.all_synsets())

# load the definition of each synset in wordnet
synsets_tokens = [synset.definition() for synset in all_synsets]

# map the definitions into an embedding space
model = SentenceTransformer("thenlper/gte-small", cache_folder="model", local_files_only=True)
# NOTE: `random_state` was dropped from this call; it does not appear to be an
# `encode` parameter (see the library's API reference).
synsets_embeddings = model.encode(synsets_tokens, show_progress_bar=True, convert_to_numpy=True)

save_path = "embeddings.npz"

# Persist embeddings together with the synset names, in the same order.
print(f"Saving embeddings in {save_path} ...")
np.savez_compressed(save_path, embeddings=synsets_embeddings, synsets=[s.name() for s in all_synsets])
print("Fatto.")



  from .autonotebook import tqdm as notebook_tqdm
Batches:   6%|▌         | 217/3677 [00:09<02:27, 23.45it/s]


KeyboardInterrupt: 

**EMBEDDING SPACE USAGE**

In [5]:


def load_embeddings(embeddings_path):
    """Load the embedding space saved as an ``.npz`` archive.

    The archive must contain two arrays: ``embeddings`` (one row per synset)
    and ``synsets`` (the synset names, in the same order).

    Args:
        embeddings_path: Path of the ``.npz`` file to read.

    Returns:
        A dict mapping each synset name to its embedding vector.

    Raises:
        ValueError: If the two arrays do not have the same length.
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code when the
    # file comes from an untrusted source. It is kept for compatibility with
    # existing archives; consider dropping it if the archive only ever holds
    # plain numeric/string arrays.
    # The context manager closes the underlying archive file once both arrays
    # have been read into memory.
    with np.load(embeddings_path, allow_pickle=True) as data:
        embeddings = data['embeddings']
        synset_names = data['synsets']

    if len(synset_names) != len(embeddings):
        # zip() would silently drop the unmatched tail.
        raise ValueError(
            f"{embeddings_path}: {len(synset_names)} synset names "
            f"but {len(embeddings)} embeddings"
        )

    # create a dict {synset name: embedding}
    synsets_embeddings = dict(zip(synset_names, embeddings))

    return synsets_embeddings

def most_similar_embedings(synsets_embeddings, target_synset, pos=None, k=5):
    """Rank WordNet synsets by cosine similarity to a target synset.

    Args:
        synsets_embeddings: Dict mapping synset names to embedding vectors.
        target_synset: Synset whose nearest neighbours are wanted.
        pos: Optional part-of-speech tag; when given, only synsets whose
            ``pos()`` equals it are considered.
        k: Number of results to return.

    Returns:
        A list of up to ``k`` ``(synset, similarity)`` tuples, most similar
        first. The target itself is excluded. Synsets with no entry in
        ``synsets_embeddings`` are skipped.

    Raises:
        KeyError: If the target synset has no embedding.
    """
    target_name = target_synset.name()
    if target_name not in synsets_embeddings:
        raise KeyError(f"no embedding stored for synset {target_name!r}")

    # get the embedding associated with the target synset
    target_vec = synsets_embeddings[target_name]

    similarities = []
    # compute the cosine similarity between the target embedding and every
    # other candidate in the vector space
    for syn in wn.all_synsets():
        syn_name = syn.name()
        if syn_name == target_name:
            continue
        # Filter on part of speech *before* computing the similarity so no
        # work is spent on candidates that would be discarded anyway.
        if pos and syn.pos() != pos:
            continue
        candidate_vec = synsets_embeddings.get(syn_name)
        if candidate_vec is None:
            # No embedding for this synset: it cannot be ranked.
            continue
        similarities.append((syn, cosine_similarity(target_vec, candidate_vec)))

    # sort by decreasing similarity
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:k]


In [6]:
embeddings_path = "embeddings.npz"
embeddings = load_embeddings(embeddings_path)

# TARGET WORD
target = "computer"

# get the WordNet synset associated with the target word
target_synset = get_synset_or_first(target)

if target_synset is None:
    # Raise instead of sys.exit(0): exiting with status 0 would report
    # success, and an exception stops the notebook with a clear message.
    raise LookupError(f"synset for {target} not found.")

# get the most (semantically) similar synsets
similar_synsets = most_similar_embedings(embeddings, target_synset, pos=target_synset.pos(), k=5)


# RESULT OUTPUT
width = 30          # column width used when wrapping table cells
max_related = 5     # how many hypernyms / hyponyms to list


def _preview(related, limit=max_related):
    """Render at most `limit` items, adding '...' only if some were left out."""
    shown = str(related[:limit])
    return shown + "..." if len(related) > limit else shown


print(f"--- WORDNET RESULT FOR {target_synset.name()} ---")

wordnet_table = PrettyTable(hrules=True, header=False)
wordnet_table.add_row(["GLOSS (DEFINITION)", wrap_text(str(target_synset.definition()), width)])
if target_synset.examples():
    wordnet_table.add_row(["USAGE EXAMPLE", wrap_text(str(target_synset.examples()[0]), width)])

wordnet_table.add_row(["LEMMAS", wrap_text(str(target_synset.lemma_names()), width)])

wordnet_table.add_row(["HYPERNYMS", wrap_text(_preview(target_synset.hypernyms()), width)])
wordnet_table.add_row(["HYPONYMS", wrap_text(_preview(target_synset.hyponyms()), width)])

print(wordnet_table)

embeddings_table = PrettyTable(hrules=True, field_names = ["SYNSET", "DESCRIPTION", "SEMSIM"])

# one row per neighbour: name, wrapped gloss, similarity rounded to 2 decimals
for similar_synset, similarity in similar_synsets:
    embeddings_table.add_row([
        similar_synset.name(),
        wrap_text(str(similar_synset.definition()), width),
        round(float(similarity), 2),
    ])

print(f"--- EMBEDDINGS RESULT FOR {target_synset.name()} ---")
print(embeddings_table)



--- WORDNET RESULT FOR computer.n.01 ---
+--------------------+--------------------------------+
| GLOSS (DEFINITION) |    a machine for performing    |
|                    |   calculations automatically   |
+--------------------+--------------------------------+
|       LEMMAS       |          ['computer',          |
|                    |      'computing_machine',      |
|                    |      'computing_device',       |
|                    |       'data_processor',        |
|                    | 'electronic_computer', 'inform |
|                    |   ation_processing_system']    |
+--------------------+--------------------------------+
|     HYPERNYMS      |  [Synset('machine.n.01')]...   |
+--------------------+--------------------------------+
|      HYPONYMS      | [Synset('home_computer.n.01'), |
|                    |         Synset('pari-          |
|                    |     mutuel_machine.n.01'),     |
|                    | Synset('turing_machine.n.01'), |
|      