In [44]:
!pip install gensim



In [1]:
# restart session before running this cell
import gensim

from gensim.models import KeyedVectors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#### Data Preperation

In [3]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz

--2025-04-07 10:07:41--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.34.53, 13.226.34.7, 13.226.34.122, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.34.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz.1’


2025-04-07 10:07:51 (126 MB/s) - ‘cc.en.300.vec.gz.1’ saved [1325960915/1325960915]

--2025-04-07 10:07:51--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.34.53, 13.226.34.7, 13.226.34.122, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.34.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1118942272 (1.0G) [binary/octet-stream]
Saving to: ‘cc.hi.300.vec.gz.1’


2025-04-07 10:08:01 (109 MB/s) - ‘cc.hi.300.vec.gz.1’ saved [11

In [4]:
# using pre-trained FastText embeddings for English and Hindi languages
english_embeddings = KeyedVectors.load_word2vec_format('cc.en.300.vec.gz', limit=100000)
hindi_embeddings = KeyedVectors.load_word2vec_format('cc.hi.300.vec.gz', limit=100000)

In [7]:
print("number of English and Hindi embeddings: ")
print(len(english_embeddings.index_to_key))
print(len(hindi_embeddings.index_to_key))

number of English and Hindi embeddings: 
100000
100000


In [8]:
# get the English-Hindi bilingual lexicon from MUSE train dataset
!wget https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt

# MUSE test dataset
!wget https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.5000-6500.txt

--2025-04-07 10:10:22--  https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.34.122, 13.226.34.53, 13.226.34.7, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.34.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 930856 (909K) [text/x-c++]
Saving to: ‘en-hi.txt.1’


2025-04-07 10:10:22 (21.3 MB/s) - ‘en-hi.txt.1’ saved [930856/930856]

--2025-04-07 10:10:22--  https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.5000-6500.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.34.122, 13.226.34.53, 13.226.34.7, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.34.122|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 52464 (51K) [text/plain]
Saving to: ‘en-hi.5000-6500.txt.1’


2025-04-07 10:10:22 (6.04 MB/s) - ‘en-hi.5000-6500.txt.1’ saved [52464/52464]



In [9]:
# Extract the English-Hindi bilingual lexicon from the downloaded .txt file
# the final lexicon contains word pairs like (english_word, hindi_word)
def read_muse(file_path):
  enhi_lexicon = []
  with open(file_path,'r') as f:
    for entry in f:
      english_word, hindi_word = entry.split()
      if english_word in english_embeddings.index_to_key and hindi_word in hindi_embeddings.index_to_key:
        enhi_lexicon.append((english_word,hindi_word))
    return enhi_lexicon

enhi_lexicon = read_muse('en-hi.txt')

#### Embedding Alignment

In [11]:
# Prepare the english and hindi matrices for alignment using the MUSE en-hi lexicon and the pre-trained FastText embeddings
def prepare_x_and_y(enhi_lexicon, english_embeddings, hindi_embeddings):
  X, Y = [], []
  for entry in enhi_lexicon:
    en_word, hi_word = entry[0], entry[1]
    if en_word in english_embeddings.index_to_key and hi_word in hindi_embeddings.index_to_key:
      X.append(english_embeddings[en_word])
      Y.append(hindi_embeddings[hi_word])
  X, Y = np.array(X, dtype=np.float32), np.array(Y, dtype=np.float32)

  return X, Y

In [12]:
def compute_procrustes(x,y):
  u, s, vt = np.linalg.svd(np.dot(x.T,y))
  w = np.dot(u,vt)
  return w

In [13]:
# use the matrices X and Y to learn the transformation matrix W
X, Y = prepare_x_and_y(enhi_lexicon, english_embeddings, hindi_embeddings)

W = compute_procrustes(X, Y)

In [17]:
print("Number of aligned pairs: "+str(len(X)))
print("Shape of the transformation matrix: "+str(W.shape))

number of aligned pairs: 18972
shape of the transformation matrix: (300, 300)


In [115]:
aligned_english_embeddings = np.dot(english_embeddings.vectors,W).astype(np.float32)

In [116]:
# verify the translation using an example
word = 'cat'
en_vec = aligned_english_embeddings[english_embeddings.key_to_index[word]]
similar_results = cosine_similarity(en_vec.reshape(1,-1), hindi_embeddings.vectors)
most_similar = hindi_embeddings.index_to_key[similar_results.argmax()]

print('English word: '+word+'\n'+'Hindi word (translated): '+most_similar)

English word: cat
Hindi word (translated): बिल्ली


In [24]:
muse_test_set = read_muse('en-hi.5000-6500.txt')
# muse_test_set = enhi_lexicon[1500:3000]

X_test, Y_test = prepare_x_and_y(muse_test_set, english_embeddings, hindi_embeddings)
print("Number of pairs in the test set: "+str(len(X_test)))

Number of pairs in the test set: 1600


In [None]:
aligned_test_embeddings = np.dot(X_test,W).astype(np.float32)

#### Evaluation

In [103]:
# evaluation: calculate Precision@1 and Precision@5 metrics for word translation
def calculate_precision(test_data, aligned_embeddings, hindi_embeddings):
  first = 0
  fifth = 0
  p = 5
  for word_en, word_hi in test_data:
    en_vec = aligned_english_embeddings[english_embeddings.key_to_index[word]]
    similar_results = cosine_similarity(en_vec.reshape(1,-1), hindi_embeddings.vectors)
    # indices = similar_results[0].argsort()[-p:][::-1]
    indices = similar_results[0].argsort()[-p:][::-1]
    words = [hindi_embeddings.index_to_key[i] for i in indices]
    if word_hi==words[0]:
      first += 1
    if word_hi in words:
      fifth += 1
  first_precision = first / len(test_data)
  fifth_precision = fifth / len(test_data)

  return first_precision, fifth_precision

In [108]:
precision_at_1, precision_at_5 = calculate_precision(muse_test_set, aligned_english_embeddings, hindi_embeddings)

print('Precision@1= '+str(precision_at_1))
print('Precision@5= '+str(precision_at_5))

Precision@1= 0.0
Precision@5= 0.0


In [109]:
# evaluation: cosine similarities between word pairs to assess cross-lingual semantic similarity
def calculate_similarities(bi_lexicon, aligned_embeddings, hindi_embeddings, pairs=10):
  similarity = []
  i = 0

  for word_en, word_hi in bi_lexicon:
    if word_en in english_embeddings and word_hi in hindi_embeddings:
      en_vec = aligned_embeddings[english_embeddings.key_to_index[word_en]]
      hi_vec = hindi_embeddings[word_hi]

      sim = cosine_similarity(en_vec.reshape(1,-1), hi_vec.reshape(1,-1))
      similarity.append((word_en, word_hi, sim))

    i+=1
    if i>=pairs:
      break
  return similarity

In [112]:
word_pair_cosine_similarities = calculate_similarities(enhi_lexicon, aligned_english_embeddings, hindi_embeddings)

print('{English word}  {Hindi word}  {Similarity}')
for en,hi,sim in word_pair_cosine_similarities:
  print(f"{en}  {hi}  {sim}")

{English word}  {Hindi word}  {Similarity}
and  और  [[0.14246619]]
was  था  [[0.42320135]]
was  थी  [[0.410759]]
for  लिये  [[0.26180744]]
that  उस  [[0.35282412]]
that  कि  [[0.1700798]]
with  साथ  [[0.18845299]]
from  से  [[0.12805238]]
from  इससे  [[0.26954097]]
this  ये  [[0.22080687]]


In [113]:
# evaluation: ablation study to assess the impact of bilingual lexicon size on alignment quality
def ablation_study(bi_lexicon, english_embeddings, hindi_embeddings, dictionary_size=[5000,10000,20000]):
  ablation_results = []

  for size in dictionary_size:
    lexicon = bi_lexicon[:size]
    X, Y = [], []
    for entry in lexicon:
      en_word, hi_word = entry[0], entry[1]
      X.append(english_embeddings[en_word])
      Y.append(hindi_embeddings[hi_word])
    X = np.array(X)
    Y = np.array(Y)

    W = compute_procrustes(X, Y)
    aligned_english_embeddings = np.dot(english_embeddings.vectors, W)

    precision_one, precision_five = calculate_precision(lexicon, aligned_english_embeddings, hindi_embeddings)
    ablation_results.append((size, precision_one, precision_five))
  return ablation_results

In [114]:
ablation_results = ablation_study(enhi_lexicon, english_embeddings, hindi_embeddings)

print("{Bilingual dictionary size}   {Precision@1}   {Precision@5}")
for size, p1, p5 in ablation_results:
  print(f"{size}  {p1}  {p5}")

KeyboardInterrupt: 