<a href="https://colab.research.google.com/github/FaezeM/NLP_Bambara/blob/main/similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
bam_embeddings = np.load('/content/drive/MyDrive/embeddings/bam_embeddings.npy')
en_embeddings = np.load('/content/drive/MyDrive/embeddings/en_embeddings.npy')
fr_embeddings = np.load('/content/drive/MyDrive/embeddings/fr_embeddings.npy')

In [4]:
print(bam_embeddings.shape)

(500, 354, 768)


In [8]:
def cosine_sim(em1, em2):

    em1_means = np.mean(em1, axis=1)
    em2_means = np.mean(em2, axis=1)

    similarities = []
    for em1_vec, em2_vec in zip(em1_means, em2_means):
        similarity = np.dot(em1_vec, em2_vec) / (np.linalg.norm(em1_vec) * np.linalg.norm(em2_vec))
        similarities.append(similarity)

    return np.array(similarities)


In [11]:
def euclidean_dist(em1, em2):

    em1_means = np.mean(em1, axis=1)
    em2_means = np.mean(em2, axis=1)

    similarities = []
    for em1_vec, em2_vec in zip(em1_means, em2_means):
        distance = np.linalg.norm(em1_vec - em2_vec)
        similarity = 1 / (1 + distance)
        similarities.append(similarity)

    return np.array(similarities)

In [9]:
cosine_sim(bam_embeddings, bam_embeddings)

array([1.0000001 , 1.0000001 , 0.99999994, 0.9999999 , 0.99999994,
       1.        , 1.        , 0.99999994, 1.        , 1.0000001 ,
       1.        , 1.        , 1.        , 1.0000001 , 1.0000001 ,
       0.99999994, 1.0000001 , 0.99999994, 1.        , 1.        ,
       1.        , 1.        , 0.99999994, 1.0000001 , 1.0000001 ,
       1.0000001 , 1.        , 0.99999994, 1.        , 1.        ,
       1.        , 1.0000001 , 1.0000001 , 0.99999994, 0.99999994,
       1.0000001 , 1.        , 1.        , 1.0000001 , 1.        ,
       1.        , 1.        , 1.0000001 , 1.        , 1.        ,
       1.0000001 , 1.        , 0.99999994, 0.99999994, 1.        ,
       0.9999999 , 1.        , 0.99999994, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.0000001 , 1.0000001 ,
       1.0000001 , 1.        , 1.        , 0.99999994, 0.99999994,
       0.99999994, 1.0000001 , 1.        , 1.        , 0.99999994,
       1.        , 0.99999994, 0.99999994, 1.        , 1.     

In [12]:
euclidean_dist(bam_embeddings, bam_embeddings)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [13]:
cosine_sim(bam_embeddings, en_embeddings)

array([0.6385678 , 0.6034815 , 0.6798109 , 0.7027718 , 0.68825674,
       0.71049356, 0.6809772 , 0.7147039 , 0.6684008 , 0.76904   ,
       0.62084836, 0.5320029 , 0.6927854 , 0.7316845 , 0.5120958 ,
       0.5336267 , 0.6742702 , 0.70131266, 0.46950346, 0.5985009 ,
       0.62156385, 0.5611829 , 0.6034633 , 0.7112521 , 0.6826683 ,
       0.69211817, 0.48167446, 0.509577  , 0.67326534, 0.6828997 ,
       0.6132888 , 0.57203245, 0.664309  , 0.48700753, 0.73934686,
       0.6234639 , 0.6884891 , 0.7556099 , 0.56380016, 0.44762215,
       0.49958566, 0.58818144, 0.63170093, 0.53749925, 0.81196296,
       0.68336207, 0.5767435 , 0.6800613 , 0.57867765, 0.7161753 ,
       0.56470144, 0.73212934, 0.5461634 , 0.77635485, 0.61217505,
       0.68673354, 0.6381255 , 0.6489724 , 0.60850674, 0.6420348 ,
       0.5842957 , 0.6695852 , 0.65913516, 0.67687356, 0.63971996,
       0.69086444, 0.6520214 , 0.6115442 , 0.55010086, 0.40674078,
       0.58296454, 0.74386936, 0.5351383 , 0.6001566 , 0.67429

In [14]:
cosine_sim(bam_embeddings, fr_embeddings)

array([0.5972741 , 0.659186  , 0.65096545, 0.64135236, 0.58851504,
       0.70666885, 0.5822021 , 0.8236556 , 0.64326096, 0.63238907,
       0.53828126, 0.627107  , 0.7155786 , 0.6401367 , 0.5142811 ,
       0.48237917, 0.6361347 , 0.74010664, 0.36546052, 0.65221894,
       0.6326266 , 0.609091  , 0.61460054, 0.6267824 , 0.6675623 ,
       0.67399293, 0.49388427, 0.503334  , 0.5947177 , 0.6195743 ,
       0.6053467 , 0.559947  , 0.6848413 , 0.46005628, 0.73715264,
       0.55423754, 0.67054117, 0.69452536, 0.54053825, 0.48572782,
       0.5382509 , 0.6662663 , 0.70388186, 0.44459274, 0.7912384 ,
       0.668031  , 0.5409309 , 0.5313578 , 0.5206986 , 0.6526096 ,
       0.50035673, 0.69163954, 0.5589744 , 0.6925965 , 0.6566788 ,
       0.6796825 , 0.6240511 , 0.46996158, 0.693651  , 0.66435647,
       0.5960584 , 0.6169896 , 0.6562961 , 0.56121397, 0.6523594 ,
       0.61158097, 0.6095707 , 0.48528546, 0.50206876, 0.35523793,
       0.60265356, 0.701286  , 0.51878023, 0.5087875 , 0.63885

In [15]:
euclidean_dist(bam_embeddings, en_embeddings)

array([0.0481327 , 0.04619471, 0.05707689, 0.05759201, 0.05565496,
       0.0587354 , 0.05758184, 0.05937605, 0.05048163, 0.06485593,
       0.05573678, 0.04337219, 0.05803744, 0.06005725, 0.03870824,
       0.04950384, 0.05106488, 0.06241307, 0.03639894, 0.04372556,
       0.04930075, 0.04146808, 0.05026526, 0.05039528, 0.0468374 ,
       0.05386747, 0.03715495, 0.04168659, 0.05658503, 0.05791732,
       0.04493883, 0.04745696, 0.05105077, 0.03737586, 0.05705628,
       0.04681007, 0.0576201 , 0.06312949, 0.04175195, 0.03787231,
       0.04011328, 0.04760102, 0.04988319, 0.04077486, 0.05693331,
       0.05587696, 0.04259352, 0.05278032, 0.04288343, 0.05065078,
       0.0444351 , 0.06849094, 0.04043722, 0.06124246, 0.04647907,
       0.05177992, 0.04750439, 0.05291354, 0.0473597 , 0.04544364,
       0.0461547 , 0.05250074, 0.0599065 , 0.04693602, 0.04749188,
       0.05148654, 0.05300809, 0.04900416, 0.04078552, 0.03651477,
       0.0461706 , 0.05641316, 0.04371611, 0.04439914, 0.05019

In [16]:
euclidean_dist(bam_embeddings, fr_embeddings)

array([0.04491478, 0.05118275, 0.05504689, 0.05015549, 0.04697656,
       0.0597861 , 0.0470521 , 0.08126238, 0.04861514, 0.04310212,
       0.0481363 , 0.04842369, 0.05681019, 0.04878467, 0.03878921,
       0.04329356, 0.04817123, 0.06695019, 0.03151331, 0.04910023,
       0.05044316, 0.04761935, 0.05071794, 0.04167085, 0.04593246,
       0.05368403, 0.03943036, 0.04131065, 0.04829571, 0.05291674,
       0.04609429, 0.04581309, 0.05159269, 0.03528783, 0.05737395,
       0.04025037, 0.05732593, 0.05588174, 0.04090182, 0.03818607,
       0.04307191, 0.05782612, 0.0583871 , 0.03536439, 0.05405185,
       0.05470268, 0.03823314, 0.03806414, 0.03942496, 0.04540789,
       0.04008888, 0.06115809, 0.04104344, 0.05309736, 0.05092302,
       0.05159858, 0.04858074, 0.03611582, 0.05528988, 0.0490892 ,
       0.04786438, 0.04799471, 0.05765816, 0.03892647, 0.04774006,
       0.04533438, 0.04834342, 0.03682609, 0.03696631, 0.03279521,
       0.04868721, 0.05388683, 0.04142889, 0.04023274, 0.04814