In [2]:
!pip install fasttext nltk stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.3.0->stanza)
  Downloading nvi

In [None]:
# Fetch the pretrained fastText cc.*.300 binaries for English (en) and Hindi (hi).
# -c resumes a partial download; -P sets the destination directory (../models).
!wget -c -P ../models https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!wget -c -P ../models https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz

In [None]:
# Fetch the Hindi-English bilingual dictionary into ../datasets; it is parsed
# later as tab-separated (hindi, english) word pairs by load_dictonary.
!wget -c -P ../datasets https://dl.fbaipublicfiles.com/arrival/dictionaries/hi-en.txt

--2025-08-23 20:19:14--  https://dl.fbaipublicfiles.com/arrival/dictionaries/hi-en.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.96, 3.163.189.108, 3.163.189.14, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827554 (808K) [text/plain]
Saving to: ‘hi-en.txt’


2025-08-23 20:19:14 (21.5 MB/s) - ‘hi-en.txt’ saved [827554/827554]



In [None]:
!gunzip ../models/cc.en.300.bin.gz
!gunzip ../models/cc.hi.300.bin.gz

In [None]:
# Load the unpacked English fastText binary from ../models.
import fasttext
en_model = fasttext.load_model("../models/cc.en.300.bin")

In [7]:
# Reduce the English model to 100 dimensions. The call's return value (the model
# object) is what this cell displays.
import fasttext.util
fasttext.util.reduce_model(en_model, 100)

<fasttext.FastText._FastText at 0x7ee3c93e3ad0>

In [None]:
# Load the Hindi fastText binary and reduce it to 100 dimensions, the same size
# used for the English model, so both embedding spaces have equal width.
hi_model = fasttext.load_model("../models/cc.hi.300.bin")
fasttext.util.reduce_model(hi_model, 100)

<fasttext.FastText._FastText at 0x7ee3ca254f10>

In [9]:
def load_dictonary(input_file_path: str):
  """Load (hindi, english) word pairs from a tab-separated dictionary file.

  Each usable line holds exactly two tab-separated fields: the Hindi word
  followed by the English word. Blank lines and lines that do not split into
  exactly two fields are skipped. A pair is kept only when the English word
  is alphanumeric (``str.isalnum``).

  Args:
    input_file_path: Path to the dictionary text file (read as UTF-8).

  Returns:
    A list of ``(hi_word, en_word)`` tuples in file order.
  """
  word_pairs = []
  # Explicit UTF-8: the file contains Devanagari text and the platform default
  # encoding is not guaranteed to be UTF-8.
  with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
      fields = line.strip().split('\t')
      if len(fields) != 2:
        # Blank or malformed line: skip it instead of failing on unpacking.
        continue
      hi_word, en_word = fields
      if en_word.isalnum():
        word_pairs.append((hi_word, en_word))
  return word_pairs

In [None]:
# Parse the downloaded dictionary into a list of (hindi, english) tuples.
pairs = load_dictonary("../datasets/hi-en.txt")

In [11]:
import numpy as np
from scipy.linalg import orthogonal_procrustes
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
len(pairs)

30923

In [13]:
pairs[10]

('है', 'is')

In [14]:
def get_ortho_matrix(en_model, hi_model, word_pairs):
    """Fit the orthogonal matrix that maps English vectors onto Hindi vectors.

    Args:
        en_model: Object exposing ``get_word_vector(word)`` for English words.
        hi_model: Object exposing ``get_word_vector(word)`` for Hindi words.
        word_pairs: Sequence of ``(hindi_word, english_word)`` tuples.

    Returns:
        The matrix ``R`` from ``scipy.linalg.orthogonal_procrustes`` such that
        ``english_vectors @ R`` best matches ``hindi_vectors``.
    """
    source_rows = []
    target_rows = []
    for hindi_word, english_word in word_pairs:
        source_rows.append(en_model.get_word_vector(english_word))
        target_rows.append(hi_model.get_word_vector(hindi_word))

    # The second return value (a scale) is not needed here.
    rotation, _scale = orthogonal_procrustes(np.array(source_rows), np.array(target_rows))
    return rotation

In [15]:
# Fit the English -> Hindi orthogonal map on every dictionary pair.
# NOTE(review): the evaluation further down scores pairs[:1000], which are part
# of this fitting set, so that score is not a held-out measurement.
omega = get_ortho_matrix(en_model, hi_model, pairs)

In [16]:
from tqdm import tqdm

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
pairs[0]

('के', 'of')

In [19]:
def calculate_precisions(en_model, hi_model, word_pairs, omega, k, verbose=True):
    """Compute Precision@k (in percent) for English -> Hindi word retrieval.

    Each English word is mapped into the Hindi space with ``omega`` and the
    ``k`` Hindi vocabulary words closest in Euclidean distance are retrieved.
    A pair counts as correct when its Hindi word is among those ``k`` words.
    Pairs whose English or Hindi word is not in the respective model
    vocabulary are skipped and do not count towards the total.

    Args:
        en_model: English model exposing ``get_words()`` and ``get_word_vector()``.
        hi_model: Hindi model exposing ``get_words()`` and ``get_word_vector()``.
        word_pairs: Iterable of ``(true_hindi_word, english_word)`` tuples.
        omega: Matrix applied as ``english_vector @ omega``.
        k: Number of nearest Hindi words to retrieve per query.
        verbose: When True (default), print each query and its top-k words.

    Returns:
        Precision@k as a percentage; 0 when no pair could be evaluated.
    """
    hindi_words = list(hi_model.get_words())
    # Build the vocabulary lookups once. Calling get_words() or scanning a list
    # inside the loop repeats a full pass over the vocabulary for every pair.
    hindi_vocab = set(hindi_words)
    english_vocab = set(en_model.get_words())

    print("Getting Hindi word Vectors")
    hindi_vectors = np.array([hi_model.get_word_vector(word) for word in hindi_words])

    correct_predictions = 0
    total = 0

    print("Starting Evaluation")
    for true_hindi_word, english_word in tqdm(word_pairs, desc=f"Calculating Precision@{k}"):
        if english_word not in english_vocab or true_hindi_word not in hindi_vocab:
            continue
        map_vector = np.dot(en_model.get_word_vector(english_word), omega)

        # Rank every Hindi vocabulary vector by distance to the mapped query.
        distances = np.linalg.norm(hindi_vectors - map_vector, axis=1)
        top_k_words = [hindi_words[i] for i in np.argsort(distances)[:k]]
        if verbose:
            print(f"English word: {english_word} True Hindi word: {true_hindi_word}")
            print(f"Top K words: {top_k_words}")
        if true_hindi_word in top_k_words:
            correct_predictions += 1
        total += 1

    precision = correct_predictions / total if total > 0 else 0
    return precision*100

In [20]:
import stanza
stanza.download('hi')

# Only the POS tag (.upos) is read from this pipeline in the cells shown here,
# so load just the tokenizer and tagger rather than every default Hindi processor.
pos_tagger = stanza.Pipeline('hi', processors='tokenize,pos')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.10.0/models/default.zip:   0%|          | …

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

In [21]:
pos_tagger('प्रकार').sentences[0].words[0].upos

'NOUN'

In [22]:
pairs[0]

('के', 'of')

In [26]:
# Group the first 1000 dictionary pairs by the universal POS tag that the
# Stanza pipeline assigns to the Hindi word of each pair.
test_set = pairs[:1000]
tagged_words = {}

for word_pair in tqdm(test_set, desc="POS Tagging words..."):
    # Each Hindi entry is tagged on its own; take the tag of its first token.
    tag = pos_tagger(word_pair[0]).sentences[0].words[0].upos
    tagged_words.setdefault(tag, []).append(word_pair)

POS Tagging words...: 100%|██████████| 1000/1000 [01:28<00:00, 11.34it/s]


In [33]:
tagged_words.keys()

dict_keys(['ADP', 'CCONJ', 'AUX', 'NUM', 'NOUN', 'PART', 'VERB', 'PRON', 'DET', 'SCONJ', 'PROPN', 'ADV', 'ADJ'])

In [None]:
tagged_words

In [None]:
# NOTE(review): `random` is not used in the cells visible here.
import random
# Precision@20 (percent) of the learned mapping on test_set.
# NOTE(review): test_set is pairs[:1000] and omega was fitted on all of pairs,
# so this measures fit on training pairs rather than held-out accuracy.
calculate_precisions(en_model, hi_model,test_set, omega, 20)