### Prompt to generate list of words
Please produce list of 7 word as python list of tuples in English and Russian language
like [('snow', 'cнег'), ('winter sport', 'зимние виды спорта'), ...]
Some of that words should be semantically close, some of them - not, so that we can use these words for exploration of cousin similarity using vector embeddings

In [34]:

from sentence_transformers import SentenceTransformer, util

def calculate_similarity(model, sent1, sent2):
    embeddings = model.encode([sent1, sent2])
    return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

models = [
    SentenceTransformer('all-mpnet-base-v2'),
    SentenceTransformer('ai-forever/sbert_large_nlu_ru'),
    SentenceTransformer('distiluse-base-multilingual-cased-v1'),
]
# 'DeepPavlov/rubert-base-cased-sentence' getting 'No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with MEAN pooling.'





You try to use a model that was created with version 3.0.0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


In [78]:
import itertools
from tqdm import tqdm
import pandas as pd

words_en_ru = [
    ('snow', 'снег'),
    ('winter sports', 'зимние виды спорта'),
    ('Ski', 'лыжи'),
    ('Sad', 'грустный'),
    ('sorrowful', 'печальный'),
    ('wish yor were here', 'хотел бы чтобы ты был здесь'),
    ('Pink floyd', 'Pink floyd'),
    ('pink panter', 'розовая пантера'),
    ('cafe', 'кафе'),
    ('tasty food', 'вкусная еда'),
    ('music', 'музыка'),
    ('guitar', 'гитара')
]

# Generate all pairs of sentences, excluding identical pairs
pairs = list(itertools.combinations(words_en_ru, 2))
# Calculate similarities and store results
results = []
columns=['Word 1 EN', 'Word 2 EN', 'Sim(EN by Base)', 'Sim(EN by RU)' , 'Sim(EN by Mult)', 
         'Word 1 RU', 'Word 2 RU', 'Sim(RU by Base)', 'Sim(RU by RU)' , 'Sim(RU by Mult)']
for sent1, sent2 in tqdm(pairs): 
    row_array = []   
    for i_lang, lang in enumerate(['EN', 'RU']):
        row_array.extend([sent1[i_lang], sent2[i_lang]])
        sims = []
        for model in models:
            ss = (sent1[i_lang], sent2[i_lang])
            sims.append(calculate_similarity(model, *ss))
        row_array.extend(sims)
    results.append(row_array)
df = pd.DataFrame(results, columns=columns)
df.shape



100%|██████████| 66/66 [00:13<00:00,  5.02it/s]


(66, 10)

In [82]:
pd.set_option('display.max_rows', 100)  
word_columns=['Word 1 EN', 'Word 2 EN', 'Word 1 RU', 'Word 2 RU',]
sim_columns = columns=['Sim(EN by Base)', 'Sim(EN by RU)' , 'Sim(EN by Mult)', 'Sim(RU by Base)', 'Sim(RU by RU)' , 'Sim(RU by Mult)']

for col in sim_columns: 
    df.sort_values(by=col, inplace=True,ascending=False)
    df['Rank'] = df[col].rank(method='min', ascending=False)
    cc = word_columns + [col, 'Rank']
    subset_df = df[cc]
    print(f'sorted by {col}')
    display(subset_df)


sorted by Sim(EN by Base)


Unnamed: 0,Word 1 EN,Word 2 EN,Word 1 RU,Word 2 RU,Sim(EN by Base),Rank
11,winter sports,Ski,зимние виды спорта,лыжи,0.70117,1.0
0,snow,winter sports,снег,зимние виды спорта,0.590515,2.0
30,Sad,sorrowful,грустный,печальный,0.576371,3.0
54,Pink floyd,music,Pink floyd,музыка,0.501653,4.0
1,snow,Ski,снег,лыжи,0.472321,5.0
55,Pink floyd,guitar,Pink floyd,гитара,0.410168,6.0
65,music,guitar,музыка,гитара,0.399208,7.0
63,tasty food,music,вкусная еда,музыка,0.383167,8.0
51,Pink floyd,pink panter,Pink floyd,розовая пантера,0.36001,9.0
60,cafe,tasty food,кафе,вкусная еда,0.318768,10.0


sorted by Sim(EN by RU)


Unnamed: 0,Word 1 EN,Word 2 EN,Word 1 RU,Word 2 RU,Sim(EN by RU),Rank
51,Pink floyd,pink panter,Pink floyd,розовая пантера,0.924591,1.0
65,music,guitar,музыка,гитара,0.85308,2.0
56,pink panter,cafe,розовая пантера,кафе,0.826093,3.0
53,Pink floyd,tasty food,Pink floyd,вкусная еда,0.824252,4.0
57,pink panter,tasty food,розовая пантера,вкусная еда,0.819045,5.0
26,Ski,cafe,лыжи,кафе,0.803761,6.0
34,Sad,cafe,грустный,кафе,0.801131,7.0
1,snow,Ski,снег,лыжи,0.79152,8.0
55,Pink floyd,guitar,Pink floyd,гитара,0.78978,9.0
5,snow,Pink floyd,снег,Pink floyd,0.786584,10.0


sorted by Sim(EN by Mult)


Unnamed: 0,Word 1 EN,Word 2 EN,Word 1 RU,Word 2 RU,Sim(EN by Mult),Rank
30,Sad,sorrowful,грустный,печальный,0.912052,1.0
51,Pink floyd,pink panter,Pink floyd,розовая пантера,0.819969,2.0
1,snow,Ski,снег,лыжи,0.71941,3.0
65,music,guitar,музыка,гитара,0.594409,4.0
0,snow,winter sports,снег,зимние виды спорта,0.5671,5.0
11,winter sports,Ski,зимние виды спорта,лыжи,0.565557,6.0
29,Ski,guitar,лыжи,гитара,0.396769,7.0
10,snow,guitar,снег,гитара,0.377323,8.0
3,snow,sorrowful,снег,печальный,0.376097,9.0
28,Ski,music,лыжи,музыка,0.363767,10.0


sorted by Sim(RU by Base)


Unnamed: 0,Word 1 EN,Word 2 EN,Word 1 RU,Word 2 RU,Sim(RU by Base),Rank
37,Sad,guitar,грустный,гитара,0.928943,1.0
28,Ski,music,лыжи,музыка,0.794958,2.0
18,winter sports,tasty food,зимние виды спорта,вкусная еда,0.726518,3.0
65,music,guitar,музыка,гитара,0.721338,4.0
2,snow,Sad,снег,грустный,0.712263,5.0
1,snow,Ski,снег,лыжи,0.711367,6.0
36,Sad,music,грустный,музыка,0.705755,7.0
10,snow,guitar,снег,гитара,0.699609,8.0
63,tasty food,music,вкусная еда,музыка,0.693135,9.0
26,Ski,cafe,лыжи,кафе,0.691134,10.0


sorted by Sim(RU by RU)


Unnamed: 0,Word 1 EN,Word 2 EN,Word 1 RU,Word 2 RU,Sim(RU by RU),Rank
30,Sad,sorrowful,грустный,печальный,0.941616,1.0
65,music,guitar,музыка,гитара,0.870257,2.0
11,winter sports,Ski,зимние виды спорта,лыжи,0.795187,3.0
29,Ski,guitar,лыжи,гитара,0.628148,4.0
1,snow,Ski,снег,лыжи,0.599416,5.0
28,Ski,music,лыжи,музыка,0.587576,6.0
19,winter sports,music,зимние виды спорта,музыка,0.566123,7.0
61,cafe,music,кафе,музыка,0.544291,8.0
0,snow,winter sports,снег,зимние виды спорта,0.523946,9.0
9,snow,music,снег,музыка,0.523527,10.0


sorted by Sim(RU by Mult)


Unnamed: 0,Word 1 EN,Word 2 EN,Word 1 RU,Word 2 RU,Sim(RU by Mult),Rank
30,Sad,sorrowful,грустный,печальный,0.886939,1.0
51,Pink floyd,pink panter,Pink floyd,розовая пантера,0.864301,2.0
1,snow,Ski,снег,лыжи,0.663878,3.0
65,music,guitar,музыка,гитара,0.56746,4.0
0,snow,winter sports,снег,зимние виды спорта,0.488915,5.0
11,winter sports,Ski,зимние виды спорта,лыжи,0.48448,6.0
60,cafe,tasty food,кафе,вкусная еда,0.399614,7.0
29,Ski,guitar,лыжи,гитара,0.396394,8.0
2,snow,Sad,снег,грустный,0.39041,9.0
26,Ski,cafe,лыжи,кафе,0.351549,10.0


In [30]:
import openai
from numpy import dot
from numpy.linalg import norm
import numpy as np

client = OpenAI() # api_key=''

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

def cosine_similarity(vec1, vec2):
    return dot(vec1, vec2) / (norm(vec1) * norm(vec2))

   # Words for which embeddings are required
word1 = "money"
word2 = "bank"

embedding1 = get_embedding( word1)
embedding2 = get_embedding( word2)

print(f"Embedding for '{word1}': {embedding1}")
print(f"Embedding for '{word2}': {embedding2}")
# Compute cosine similarity
similarity = cosine_similarity(np.array(embedding1), np.array(embedding2))

# Output the result
print(f"Cosine Similarity between '{word1}' and '{word2}': {similarity}")

Embedding for 'money': [0.022288266569375992, -0.015567759051918983, -0.003337214235216379, 0.07599561661481857, -0.0010394456330686808, -0.04349898546934128, -0.007092688232660294, 0.06159047782421112, -0.0499926395714283, -0.058868248015642166, 0.01616324670612812, 0.018857121467590332, -0.03433981165289879, -0.05535203218460083, 0.03553078696131706, 0.021253252401947975, -0.015326728112995625, 0.006178188603371382, -0.019495144486427307, 0.007213203236460686, 0.0023146052844822407, 0.010229634121060371, 0.000506873766425997, 0.004267664160579443, 0.0242590494453907, 0.010938548482954502, 0.01984960213303566, -0.054132699966430664, 0.037742599844932556, -0.060002509504556656, 0.02510974556207657, -0.038876861333847046, 0.029207270592451096, -0.027463341131806374, -0.022997181862592697, 0.004590220283716917, -0.034935299307107925, -0.012682477943599224, 0.027633480727672577, -0.0587548203766346, -0.04605107754468918, -0.016375921666622162, 0.01262576412409544, 0.0008161376463249326, 0