In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from prettytable import PrettyTable


import string
import pandas as pd


definitions = pd.read_csv("data/definizioni_column.csv")

CONCEPTS = [concept for concept in definitions.columns]

In [2]:
def preprocess_text(text):
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('italian'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

    # Stemming
    stemmer = SnowballStemmer("italian")

    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

***simlex value: terms overlap between definitions***

In [3]:

results = []

for concept in CONCEPTS:

    # get all the defition for the concept
    concept_definitions = definitions[concept].to_list()
    simsem_values = {}

    # compute the simlex value for each pair of definitions
    for index1, definition1 in enumerate(concept_definitions):
        for index2, definition2 in enumerate(concept_definitions):
            set1 = set(preprocess_text(definition1).split(" "))
            set2 = set(preprocess_text(definition2).split(" "))

            intersection = len(set1.intersection(set2))
            union = len(set1.union(set2))
            key = tuple(sorted([index1, index2]))

            simsem_values[key] = intersection / union

    # Print all SimLex Values Sorted
    # print(f"Values of {concept}:")
    # print(sorted(simsem_values.items(), key=lambda x:x[1], reverse=True))

    simlex_sum = 0
    for simlex_value in simsem_values.values():
        simlex_sum += simlex_value

    percentage = round((simlex_sum / len(simsem_values)) * 100, 2)


    results.append(percentage)

print("---AGGREGATE SIMLEX VALUES---")
table = PrettyTable(field_names=CONCEPTS)
table.add_row(results)
print(table)




---AGGREGATE SIMLEX VALUES---
+---------------+-----------------+--------------+---------------+
| Pantalone[CG] | Microscopio[CS] | Pericolo[AG] | Euristica[AS] |
+---------------+-----------------+--------------+---------------+
|     22.91     |      18.43      |    15.36     |      9.14     |
+---------------+-----------------+--------------+---------------+


***simsem value: cosine similarity between sentence embeddings***

In [4]:

results = []

for concept in CONCEPTS:

    # get all the defition for the concept
    concept_definitions = definitions[concept].to_list()

    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("thenlper/gte-small", cache_folder="model", local_files_only = True)
    embeddings = model.encode(concept_definitions, show_progress_bar=True, random_state=42)

    simsem_values = {}

    for index1, embedding1 in enumerate(embeddings):
        for index2, embedding2 in enumerate(embeddings):
            key = tuple(sorted([f"P{index1}", f"P{index2}"]))
            simsem_values[key] = cosine_similarity([embedding1], [embedding2]).tolist()[0][0]

    simsem_sum = 0
    for simsem_value in simsem_values.values():
        simsem_sum += simsem_value

    percentage = round((simsem_sum / len(simsem_values)) * 100, 2)
    results.append(percentage)

print("---AGGREGATE SIMSEM VALUES---")
table = PrettyTable(field_names=CONCEPTS)
table.add_row(results)
print(table)





  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.34it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 11.11it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  7.79it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 22.37it/s]


---AGGREGATE SIMSEM VALUES---
+---------------+-----------------+--------------+---------------+
| Pantalone[CG] | Microscopio[CS] | Pericolo[AG] | Euristica[AS] |
+---------------+-----------------+--------------+---------------+
|     87.83     |      88.24      |    86.35     |     85.15     |
+---------------+-----------------+--------------+---------------+
