In [None]:
import numpy as np
import pandas as pd
import deepl
from keybert import KeyBERT
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import yellowbrick
import scipy.cluster.hierarchy as shc
import nltk
from nltk.tokenize import word_tokenize as wt
from nltk.corpus import stopwords
# Fetch the NLTK resources used by this notebook: WordNet (for the
# lemmatizer) and the stop-word lists.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

# English stop words extended with filler words that should never be picked
# as a cluster label.
stop_words = set(stopwords.words('english')) | {
    "something",
    "like",
    "multiple",
    "lower",
    "upper",
    "typical",
    "maybe",
    "center",
    "least",
    "most",
    "previous",
    "next",
    "left",
    "right",
}

# Class names; a label id is the 1-based position of the name in this list.
LABELS = ['Cassette Player', 'Chainsaw', 'Church', 'Dog', 'French Horn', 'Garbage Truck', 'Gas Pump', 'Golf Ball', 'Parachute', 'Fish']

# Load the collected answers.
df = pd.read_csv('db_for_data_analysis.csv')

# Class analysed by this run, plus its Italian name.
target_class = 'Parachute'
target_class_it = 'Paracadute'
target_label = 1 + LABELS.index(target_class)

# Characteristics given for the target class, split on the language_en flag:
# data2 holds the 'f' rows (the ones later translated from Italian),
# data holds the 't' rows.
is_target = df.guessed_label_id == target_label
data2 = df.loc[(df.language_en == 'f') & is_target, 'characteristic'].to_list()
data = df.loc[(df.language_en == 't') & is_target, 'characteristic'].to_list()

data2

In [None]:
if False:
    # Disabled by default: flip the condition (and supply a DeepL auth key)
    # to re-translate the Italian answers and refresh the cached .npy file.
    translator = deepl.Translator("")

    result = translator.translate_text(data2, target_lang="EN-US", source_lang="IT", context=target_class)

    # Original Italian sentence -> English translation.
    translation_dict = {data2[i]: result[i].text for i in range(len(data2))}

    # np.save pickles the dict; it is written to '<class>_translation.npy'.
    np.save(target_class.lower() + '_translation', translation_dict, allow_pickle=True)


In [None]:
# Read back the cached translations. The dict was pickled into a 0-d object
# array by np.save, so .item() is needed to unwrap it.
# NOTE(review): allow_pickle=True unpickles the file content - only load
# files produced by this notebook.
translation_file = target_class.lower() + '_translation.npy'
translation_dict = np.load(translation_file, allow_pickle=True).item()

# Quick check: how was the Italian class name itself translated (None if absent)?
print(translation_dict.get(target_class_it.lower()))

In [None]:
# Normalise the translations once: lower-cased and stripped. Later cells look
# sentences up by their lower-cased text, so everything is kept lower case.
for k in list(translation_dict):
    translation_dict[k] = translation_dict[k].lower().strip()

# Pad every sentence with one space on each side so that the class name can be
# matched as a whole word (' parachute ') even at the start or end of a
# sentence. The English answers are lower-cased here as well: previously only
# the translations were, so a mixed-case English answer could never be matched
# by the lower-cased lookup done when the synonyms are written back.
# Stripping first keeps the cell idempotent when it is re-run.
data = [' ' + item.strip().lower() + ' ' for item in data]
padded_sentences = data + [' ' + v + ' ' for v in translation_dict.values()]
original = [item.strip() for item in padded_sentences]

# Patterns to blank out: the class name and its possessive form, both for the
# English name and (when available) for the translation of the Italian name.
# The order of the replacements is the same as it has always been.
class_name = target_class.lower()
translated_name = translation_dict.get(target_class_it.lower())
patterns = [' ' + class_name + ' ']
if translated_name is not None:
    patterns.append(' ' + translated_name + ' ')
    patterns.append(' ' + translated_name + '\'s ')
patterns.append(' ' + class_name + '\'s ')

data2 = []
for item in padded_sentences:
    for pattern in patterns:
        item = item.replace(pattern, ' ')
    data2.append(item.strip())

# Original (stripped, lower-cased) sentence -> sentence without the class name.
dictionary = dict(zip(original, data2))

In [None]:
from sentence_transformers import SentenceTransformer
import nmslib

# Creating the BERT embeddings
model = SentenceTransformer('all-mpnet-base-v2')

# Unique cleaned sentences in first-seen order. The empty string (left behind
# when an answer consisted only of the class name) is filtered out instead of
# being removed with list.remove(''), which raises ValueError when no such
# answer exists.
distinct_data = [sentence for sentence in dict.fromkeys(data2) if sentence != '']
encoding_ = model.encode(distinct_data)
data_encoding = np.array(encoding_)

# The embeddings are written to disk and read back, so 'encoding.npy' always
# reflects the vectors used by the rest of the notebook.
np.save('encoding.npy', data_encoding)
embedding = np.load('encoding.npy')

# One embedding vector per sentence, kept as a list so that each DataFrame
# cell holds a whole vector.
vector_list = list(embedding)
vect = np.array(vector_list)

# A freshly built frame already has a clean 0..n-1 index.
new_df = pd.DataFrame({"vector": vector_list, "title": distinct_data})
new_df

In [None]:
from sklearn.cluster import AgglomerativeClustering
from yellowbrick.cluster import silhouette_visualizer
from sklearn import metrics

vectors = new_df['vector'].tolist()

# Try every cluster count from 2 up to len(vectors) - 1 and record the
# silhouette score of the resulting complete-linkage, cosine clustering.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -
# confirm against the installed version.
# NOTE(review): silhouette_score is called with its default metric while the
# clustering uses cosine distance - confirm this is intended.
x = list(range(2, len(vectors)))
y = []
for k in x:
    agglomerative = AgglomerativeClustering(n_clusters=k, affinity='cosine', linkage='complete')
    labels = agglomerative.fit_predict(vectors)
    sc = metrics.silhouette_score(vectors, labels)
    y.append(sc)

# Silhouette score as a function of the number of clusters.
plt.plot(x, y)
plt.show()

In [None]:
# Cluster count with the highest silhouette score in the sweep above.
best_n_clusters = x[np.argmax(y)]
print(best_n_clusters)

# Re-fit with that count and attach the cluster id of each sentence.
agglomerative = AgglomerativeClustering(n_clusters=best_n_clusters, affinity='cosine', linkage='complete')
labels = agglomerative.fit_predict(vectors)
new_df['cluster'] = labels

new_df.head(50)

In [None]:
# Create a list of lists of similar words
from collections import defaultdict
t = defaultdict(list)

# One list per cluster id holding the (de-duplicated) sentences assigned to it.
syns = [
    list(dict.fromkeys(new_df.loc[new_df["cluster"] == g, "title"].to_list()))
    for g in set(new_df["cluster"].to_list())
]

# Re-introduce multiplicity: a sentence that occurs n times in data2 appears
# n times in its cluster (the extra n - 1 copies are appended after the
# distinct sentences, grouped per sentence).
for cluster_idx, members in enumerate(syns):
    extra_copies = [s for s in members for _ in range(data2.count(s) - 1)]
    syns[cluster_idx] = members + extra_copies

merged_syns = syns

In [None]:
if(False):
    # Disabled variant: label each cluster with its best one- or two-word
    # phrase. The result (one label per cluster, '' when a cluster yields no
    # candidate) is stored in top_2.
    top_2 = []

    for index in range(len(merged_syns)):
        # Candidate labels for this cluster, in first-seen order.
        candidates = []
        for sentence in merged_syns[index]:
            words = sentence.split(' ')
            # Every ordered pair of words from the same sentence.
            w_product = list(product(words, words))
            for pair in w_product:
                # Skip any pair that contains a stop word.
                if((pair[0] in stop_words) or (pair[1] in stop_words)):
                    continue
                # A word paired with itself contributes the single word;
                # any other pair contributes the two-word phrase.
                if (pair[0] == pair[1]):
                    if(pair[0] not in candidates): candidates.append(pair[0])
                else:
                    word = pair[0] + ' ' + pair[1]
                    if(word not in candidates): candidates.append(word)

        candidates_vectors = model.encode(candidates)
        sentences_vectors = model.encode(merged_syns[index])
        # Score of a candidate = sum of its cosine similarities to every
        # sentence of the cluster (repeated sentences count repeatedly).
        candidates_scores = []
        for i in range(len(candidates)):
            sum_dist = 0
            for j in range(len(merged_syns[index])):
                dist = cosine_similarity([candidates_vectors[i]], [sentences_vectors[j]])[0][0]
                sum_dist += dist
            candidates_scores.append(sum_dist)

        # Highest-scoring candidate wins; np.argmax keeps the first one on ties.
        if(candidates == []):
            top = ''
        else:
            top = candidates[np.argmax(candidates_scores)]
        #print(top, ': ', merged_syns[index])
        top_2.append(top)
    

In [None]:
if(False):
    # Disabled variant: label each cluster with its best phrase of up to
    # three words. Unlike the one- and two-word variants, stop words are not
    # filtered out here. The result (one label per cluster) is stored in top_3.
    top_3 = []

    for index in range(len(merged_syns)):
        # Candidate labels for this cluster, in first-seen order.
        candidates = []
        for sentence in merged_syns[index]:
            words = sentence.split(' ')
            # Every ordered triple of words from the same sentence.
            w_product = list(product(words, words, words))
            for pair in w_product:
                # Triples with repeated words collapse to a shorter phrase:
                # (a, a, a) -> 'a'; (a, a, b) -> 'a b'; (a, b, b) -> 'a b';
                # (a, b, a) -> 'a b'; three distinct words -> 'a b c'.
                if (pair[0] == pair[1] and pair[1] == pair[2]):
                    if(pair[0] not in candidates): candidates.append(pair[0])
                elif (pair[0] == pair[1] and pair[1] != pair[2]):
                    word = pair[1] + ' ' + pair[2]
                    if(word not in candidates): candidates.append(word)
                elif (pair[0] != pair[1] and pair[1] == pair[2]):
                    word = pair[0] + ' ' + pair[1]
                    if(word not in candidates): candidates.append(word)
                elif (pair[0] != pair[1] and pair[0] == pair[2]):
                    word = pair[0] + ' ' + pair[1]
                    if(word not in candidates): candidates.append(word)
                else:
                    word = pair[0] + ' ' + pair[1] + ' ' + pair[2]
                    if(word not in candidates): candidates.append(word)

        candidates_vectors = model.encode(candidates)
        sentences_vectors = model.encode(merged_syns[index])
        # Score of a candidate = sum of its cosine similarities to every
        # sentence of the cluster (repeated sentences count repeatedly).
        candidates_scores = []
        for i in range(len(candidates)):
            sum_dist = 0
            for j in range(len(merged_syns[index])):
                dist = cosine_similarity([candidates_vectors[i]], [sentences_vectors[j]])[0][0]
                sum_dist += dist
            candidates_scores.append(sum_dist)


        # Highest-scoring candidate wins; np.argmax keeps the first one on ties.
        # NOTE(review): no guard for an empty candidate list here, unlike the
        # two-word variant.
        top = candidates[np.argmax(candidates_scores)]
        #print(top, ': ', merged_syns[index])
        top_3.append(top)

In [None]:
if True:
    # Label each cluster with the single word that best represents it.
    # top_1[i] is the label of merged_syns[i], or '' when the cluster contains
    # nothing but stop words.
    top_1 = []
    for index in range(len(merged_syns)):
        sentences = merged_syns[index]

        # Candidate labels: the distinct non-stop-words of the cluster, in
        # first-seen order. split() (rather than split(' ')) avoids the empty
        # tokens produced by consecutive spaces, which are not words.
        candidates = []
        for sentence in sentences:
            for word in sentence.split():
                if word in stop_words or word in candidates:
                    continue
                candidates.append(word)

        # Nothing to rank: skip the (pointless) encoding of an empty list.
        if not candidates:
            top_1.append('')
            continue

        candidates_vectors = model.encode(candidates)
        sentences_vectors = model.encode(sentences)

        # One pairwise call gives the (n_candidates, n_sentences) similarity
        # matrix; a candidate's score is the sum of its row, i.e. its total
        # similarity to every sentence (repeated sentences count repeatedly).
        candidates_scores = cosine_similarity(candidates_vectors, sentences_vectors).sum(axis=1)

        # np.argmax returns the first maximum, so ties are resolved
        # deterministically in favour of the earliest candidate (sorting a
        # set of (index, score) pairs left the tie order arbitrary).
        top = candidates[int(np.argmax(candidates_scores))]

        top_1.append(top)

In [None]:
# Show the label chosen for each cluster next to the sentences of that cluster.
for cluster_idx, cluster_sentences in enumerate(merged_syns):
    print("top_1:", top_1[cluster_idx], '\t\t', cluster_sentences)

In [None]:
# Compute lemmas
# Download the model behind nltk.pos_tag; the tags it produces are used below
# to choose the part of speech handed to the lemmatizer.
nltk.download('averaged_perceptron_tagger')

def verb_to_wordnet(verb_tag):
    """Map a part-of-speech tag to the matching WordNet POS letter.

    The decision is made on the first letter of the tag:
    'N...' -> 'n', 'V...' -> 'v', 'J...' -> 'a', 'R...' -> 'r'.

    Args:
        verb_tag: POS tag string (e.g. as returned by nltk.pos_tag).

    Returns:
        One of 'n', 'v', 'a', 'r', or None when the tag starts with none of
        the recognised letters.
    """
    prefix_to_pos = (('N', 'n'), ('V', 'v'), ('J', 'a'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_pos:
        if verb_tag.startswith(prefix):
            return wordnet_pos
    return None

# Lemma of every cluster label, in the same order as top_1. A label whose POS
# tag has no WordNet equivalent is kept unchanged.
lems = []
for label in top_1:
    # Tag the label on its own; pos_tag returns a list of (token, tag) pairs.
    tagged = nltk.pos_tag([label])
    wordnet_pos = verb_to_wordnet(tagged[0][1])
    lems.append(label if wordnet_pos is None else lem.lemmatize(label, wordnet_pos))

# Sanity check: both lengths must be equal.
print(len(lems))
print(len(top_1))

In [None]:
from collections import Counter

# Group the cluster labels by lemma, keeping their order of appearance.
words_by_lemma = {}
for word, lemma in zip(top_1, lems):
    words_by_lemma.setdefault(lemma, []).append(word)

# For each lemma, the surface form used most often as a label
# (the first one encountered wins a tie).
most_frequent = {
    lemma: Counter(words).most_common(1)[0][0]
    for lemma, words in words_by_lemma.items()
}

# Label -> most frequent label sharing its lemma.
word_dict = {word: most_frequent[lemma] for word, lemma in zip(top_1, lems)}

# Replace every label by that representative form (top_1 is updated in place).
top_1[:] = [word_dict.get(word) for word in top_1]
    
    

In [None]:
# Label of the cluster each cleaned sentence belongs to. Built once, so the
# whole `dictionary` no longer has to be rescanned for every (repeated)
# sentence of every cluster.
label_by_sentence = {}
for i, cluster_sentences in enumerate(merged_syns):
    for sentence in cluster_sentences:
        label_by_sentence[sentence] = top_1[i]

# Original sentence -> label of the cluster of its cleaned form. Sentences
# whose cleaned form was not clustered (e.g. the empty string) are left out.
synonims_dict = {
    original_sentence: label_by_sentence[cleaned_sentence]
    for original_sentence, cleaned_sentence in dictionary.items()
    if cleaned_sentence in label_by_sentence
}

In [None]:
# Reload the raw answers and keep only those given for the target class.
df = pd.read_csv('db_for_data_analysis.csv')
df = df[df.guessed_label_id == target_label]

# .copy() makes the two halves independent frames, so adding a column below
# does not write into a slice of df (SettingWithCopyWarning).
df_en = df[df.language_en == 't'].copy()
df_it = df[df.language_en == 'f'].copy()

# English text of every answer: the 'f' rows go through the translation
# dictionary, the 't' rows are only lower-cased.
df_it['characteristic_en'] = df_it['characteristic'].str.lower().map(translation_dict)
df_en['characteristic_en'] = df_en['characteristic'].str.lower()

df = pd.concat([df_it, df_en])

# Rows without a translation hold NaN in characteristic_en. The .str accessor
# propagates NaN, whereas .apply(str.lower) raises TypeError on it.
df['synonym'] = df['characteristic_en'].str.lower().map(synonims_dict)
print(df['synonym'])
df.to_csv(target_class.lower()+'_with_synonyms.csv')