In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fast-text-embeddings-without-subwords/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec
/kaggle/input/fast-text-embeddings-without-subwords/crawl-300d-2M.vec/crawl-300d-2M.vec
/kaggle/input/fasttext-hindi-300-vec/cc.hi.300.vec/cc.hi.300.vec
/kaggle/input/english-to-hindi-anchor-pair/New Text Document.txt
/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv


In [2]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt

### Cross Lingual Alignment

In [3]:
# embedding addresses
from gensim.models import KeyedVectors

english_embedding_addr = '/kaggle/input/fast-text-embeddings-without-subwords/crawl-300d-2M.vec/crawl-300d-2M.vec'
hindi_embedding_addr = '/kaggle/input/fasttext-hindi-300-vec/cc.hi.300.vec/cc.hi.300.vec'

In [4]:
english_model = KeyedVectors.load_word2vec_format(english_embedding_addr, binary=False)
print("English Model read from embeddings")

English Model read from embeddings


In [5]:
hindi_model = KeyedVectors.load_word2vec_format(hindi_embedding_addr, binary=False)
print("Hindi Model read from emeddings")

Hindi Model read from emeddings


### Procrustes: Single and Iterative

In [6]:
import random

anchor_pairs = []

with open('/kaggle/input/english-to-hindi-anchor-pair/New Text Document.txt', encoding='utf-8', errors='ignore') as f:
    for line in f:
        line = line.strip().split()
        if len(line) != 2:
            continue
        en_word = line[0]
        
        try:
            hin_word = line[1].encode('cp1252').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            hin_word = line[1]

        if en_word in english_model and hin_word in hindi_model:
            if en_word != hin_word:
                anchor_pairs.append([en_word, hin_word])


num_test_samples = 200
test_anchor_pairs = random.sample(anchor_pairs, num_test_samples)
anchor_pairs = [p for p in anchor_pairs if p not in test_anchor_pairs]

print(f"Total valid anchor pairs: {len(anchor_pairs)}")
for en, hi in anchor_pairs[:10]:
    print(f"English: {en} — Hindi: {hi}")


Total valid anchor pairs: 14602
English: and — Hindi: और
English: was — Hindi: था
English: was — Hindi: थी
English: for — Hindi: लिये
English: that — Hindi: कि
English: with — Hindi: साथ
English: from — Hindi: से
English: from — Hindi: इससे
English: this — Hindi: ये
English: this — Hindi: यह


In [7]:
for en, hi in test_anchor_pairs[:10]:
    print(f"English: {en} — Hindi: {hi}")

English: bathinda — Hindi: भटिंडा
English: stop — Hindi: रोकें
English: genoa — Hindi: जेनोआ
English: kuta — Hindi: कूट
English: soybean — Hindi: सोयाबीन
English: sunny — Hindi: सनी
English: netherlands — Hindi: नीदरलैंड
English: valentino — Hindi: वैलेंटिनो
English: bull — Hindi: बैल
English: namibia — Hindi: नामीबिया


In [8]:
# making matrices for the embeddings
X = []  # English
Y = []  # Hindi

for en, hi in anchor_pairs:
    if en in english_model and hi in hindi_model:
        X.append(english_model[en])
        Y.append(hindi_model[hi])

X = np.array(X)
Y = np.array(Y)

print("Embedding matrix shapes:", X.shape, Y.shape)

Embedding matrix shapes: (14602, 300) (14602, 300)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
def procrustes(X, Y):
    X_mean, Y_mean = X.mean(axis=0), Y.mean(axis=0)
    X_cent, Y_cent = X - X_mean, Y - Y_mean
    U, _, Vt = np.linalg.svd(Y_cent.T @ X_cent)
    W = U @ Vt
    return W, X_mean, Y_mean


def iterative_procrustes(anchor_pairs, en_model, hi_model, iterations=5, vocab_limit=20000, top_k=5000):
    anchor_pairs = [tuple(pair) for pair in anchor_pairs]  # Ensure tuples at start
    for iter_num in range(iterations):
        print(f"\nIteration {iter_num + 1}")

        # Prepare embedding matrices
        X = np.array([en_model[en] for en, _ in anchor_pairs])
        Y = np.array([hi_model[hi] for _, hi in anchor_pairs])

        # Compute transformation
        W, X_mean, Y_mean = procrustes(X, Y)

        # Align English embeddings
        en_words = en_model.index_to_key[:vocab_limit]
        hi_words = hi_model.index_to_key[:vocab_limit]

        aligned_en = np.array([((en_model[w] - X_mean) @ W) + Y_mean for w in en_words])
        hi_emb_matrix = np.array([hi_model[w] for w in hi_words])

        # Mutual Nearest Neighbor dictionary update
        sim_matrix = cosine_similarity(aligned_en, hi_emb_matrix)

        # English to Hindi
        en2hi = sim_matrix.argmax(axis=1)
        # Hindi to English
        hi2en = sim_matrix.argmax(axis=0)

        # Mutual Nearest Neighbors
        new_anchor_pairs = []
        for en_idx, hi_idx in enumerate(en2hi):
            if hi2en[hi_idx] == en_idx:
                new_anchor_pairs.append((en_words[en_idx], hi_words[hi_idx]))
            if len(new_anchor_pairs) >= top_k:
                break

        # Compute overlap (fixed with tuple conversion)
        overlap = len(set(new_anchor_pairs).intersection(anchor_pairs))
        print(f"Anchor pairs updated: {len(new_anchor_pairs)}, overlap: {overlap}")

        anchor_pairs = new_anchor_pairs

    return W, X_mean, Y_mean, anchor_pairs



In [10]:
# Single-step Procrustes Alignment
W_single, X_mean_single, Y_mean_single = procrustes(X , Y)


In [11]:
# Iterative Procrustes Alignment
W_iter, X_mean_iter, Y_mean_iter, final_anchor_pairs = iterative_procrustes(anchor_pairs, english_model, hindi_model, iterations=2)



Iteration 1
Anchor pairs updated: 5000, overlap: 0

Iteration 2
Anchor pairs updated: 4310, overlap: 0


In [12]:
def translate(word, en_model, hi_model, W, X_mean, Y_mean, topn=5, dont_align= False):
    if word not in en_model:
        return f"{word} not found in English embeddings."
    if dont_align:
        aligned_vec = en_model[word]
    else:
        aligned_vec = ((en_model[word] - X_mean) @ W) + Y_mean
    similarities = cosine_similarity([aligned_vec], hi_model.vectors)[0]
    top_indices = similarities.argsort()[::-1][:topn]
    return [(hi_model.index_to_key[i], similarities[i]) for i in top_indices]


In [13]:
word = 'politician'

In [14]:
print(f"\nNo alignment translations for {word}:")
print(translate(word, english_model, hindi_model, W_single, X_mean_single, Y_mean_single,topn=5, dont_align=False))


No alignment translations for politician:
[('बदचलन', 0.31706637), ('पॄष्ठोदयादि', 0.29870856), ('हीअक्सर', 0.29508567), ('परअक्सर', 0.29399613), ('हूंअक्सर', 0.29346555)]


In [15]:
print(f"\nSingle-step alignment translations for {word}:")
print(translate(word, english_model, hindi_model, W_single, X_mean_single, Y_mean_single))


Single-step alignment translations for politician:
[('बदचलन', 0.31706637), ('पॄष्ठोदयादि', 0.29870856), ('हीअक्सर', 0.29508567), ('परअक्सर', 0.29399613), ('हूंअक्सर', 0.29346555)]


In [16]:
print(f"\nIterative alignment translations for {word}:")
print(translate(word, english_model, hindi_model, W_iter, X_mean_iter, Y_mean_iter))


Iterative alignment translations for politician:
[('छुटभैया', 0.34693155), ('नेता', 0.33297762), ('राजनेता', 0.33034098), ('मुख़्यमंत्री', 0.3215288), ('भाजपानेता', 0.31550306)]


In [17]:
def report_accuracy(anchor_pair_test_set, top_k, english_model, hindi_model, W, X, Y, dont_align= False):
    correct = 0
    total_samples = 0
    correct_idx = []

    for idx, pair in enumerate(anchor_pair_test_set):
        print(f"Evaluating {idx}/{len(anchor_pair_test_set)}", end='\r')
        en_word = pair[0]
        hin_word = pair[1]
        if en_word  in english_model and hin_word in hindi_model:
        
            total_samples +=1
            translations = translate(en_word, english_model, hindi_model, W, X, Y, top_k, dont_align)
            predicted = [pred[0] for pred in translations ]
            if hin_word in predicted:
                correct +=1
                correct_idx.append(idx)
                
    accuracy = correct/total_samples
    print("Number of total samples:", total_samples)
    print("Number of correct samples:", correct)
    print("Accuracy", accuracy)
    return correct_idx

In [18]:
# without_procrustes
_ = report_accuracy(test_anchor_pairs, 5, english_model, hindi_model, W_single, X_mean_single, Y_mean_single, dont_align= True)

Number of total samples: 200
Number of correct samples: 0
Accuracy 0.0


In [19]:
# single
_ = report_accuracy(test_anchor_pairs, 5, english_model, hindi_model, W_single, X_mean_single, Y_mean_single)

Number of total samples: 200
Number of correct samples: 0
Accuracy 0.0


In [20]:
# iterative
ci = report_accuracy(test_anchor_pairs, 5, english_model, hindi_model, W_iter, X_mean_iter, Y_mean_iter)

Number of total samples: 200
Number of correct samples: 26
Accuracy 0.13


### Contrastive loss and Sentence level alignment

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from sentence_transformers import SentenceTransformer

class NTXentLoss(nn.Module):

    def __init__(self, temperature=0.05):
        super(NTXentLoss, self).__init__()
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, embeddings_a, embeddings_b):
        
        norm_a = nn.functional.normalize(embeddings_a, p=2, dim=1)
        norm_b = nn.functional.normalize(embeddings_b, p=2, dim=1)

        logits = torch.mm(norm_a, norm_b.t()) / self.temperature

   
        batch_size = embeddings_a.size(0)
        labels = torch.arange(batch_size).to(embeddings_a.device)

        # Compute loss in both directions (symmetrically)
        loss_a = self.criterion(logits, labels)
        loss_b = self.criterion(logits.t(), labels)
        loss = (loss_a + loss_b) / 2
        return loss


In [22]:
# multilingual Sentence-BERT model.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')


print(model)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [23]:
# # freeze all parameters
# for param in model.parameters():
#     param.requires_grad = False

# # Unfreeze only the embedding layer weights
# for name, param in model._first_module().named_parameters():
#     if "embeddings" in name:
#         param.requires_grad = True
#         print(f"Unfreezing {name}")
#     else:
#         param.requires_grad = False
#         print(f"Freezing {name}")

In [24]:
dataset_pair_addr = '/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv'

df = pd.read_csv(dataset_pair_addr)

df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [25]:
df['english'] = df['english'].astype(str)
df['hindi'] = df['hindi'].astype(str)

dataset_pair = list(zip(df['english'].tolist(), df['hindi'].tolist()))
dataset_pair = random.sample(dataset_pair, 5000)
test_dataset_pair = random.sample(dataset_pair, 50)
dataset_pair = [pair for pair in dataset_pair if pair not in test_dataset_pair]
for pair in dataset_pair[:5]:
    print(pair)

('Interface & name:', 'इंटरफेस नामः (n) ')
('He treated them all alike, as his own kith and kin.', 'उन सबको वे अपने बंधु-बांधव समझकर उनसे एकसमान व्यवहार करते थे। ')
('Marc Ravalomanana', 'मार्क रावलोमनना')
('lower status', 'असारता')
('States are increasingly borrowing even to finance their current consumption.', 'यहां तक कि राज्य अपने वर्तमान उपभोग के लिए वित्त जुटाने के वास्ते उधार ले रहे हैं। ')


In [26]:
def training_step(sentences_en, sentences_hi, contrastive_loss, model, optimizer, device='cuda'):
    # Tokenize the sentences, returns a dict of tensors
    inputs_en = model.tokenize(sentences_en)
    inputs_en = {key: value.to(device) for key, value in inputs_en.items()}
    
    inputs_hi = model.tokenize(sentences_hi)
    inputs_hi = {key: value.to(device) for key, value in inputs_hi.items()}

    outputs_en = model(inputs_en)
    outputs_hi = model(inputs_hi)
    
    embeddings_en = outputs_en['sentence_embedding']
    embeddings_hi = outputs_hi['sentence_embedding']
    
    loss = contrastive_loss(embeddings_en, embeddings_hi)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss


In [27]:
import torch
import torch.optim as optim
from sentence_transformers import SentenceTransformer

loss_fn = NTXentLoss(temperature=0.05)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.train() 
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 5
batch_size = 2

for epoch in range(num_epochs):
    total_loss = 0.0
    
    random.shuffle(dataset_pair)
    for i in range(0, len(dataset_pair), batch_size):
        print(f"{i}/{len(dataset_pair)}", end='\r')
        batch = dataset_pair[i:i+batch_size]
        
        if len(batch) < batch_size:
            continue
        sentences_en = [pair[0] for pair in batch]
        sentences_hi = [pair[1] for pair in batch]
        
        loss = training_step(sentences_en, sentences_hi, loss_fn, model, optimizer, device)
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}: Average Loss = {total_loss / (len(dataset_pair)//batch_size):.4f}")


Epoch 1: Average Loss = 0.0570
Epoch 2: Average Loss = 0.0446
Epoch 3: Average Loss = 0.0375
Epoch 4: Average Loss = 0.0357
Epoch 5: Average Loss = 0.0274


In [28]:
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_similarity(model, sentence1, sentence2, device='cuda'):

    emb1 = model.encode(sentence1, convert_to_tensor=True, device=device)
    emb2 = model.encode(sentence2, convert_to_tensor=True, device=device)
    
    emb1_np = emb1.cpu().detach().numpy().reshape(1, -1)
    emb2_np = emb2.cpu().detach().numpy().reshape(1, -1)
    
    sim = cosine_similarity(emb1_np, emb2_np)[0][0]
    return sim

base_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
trained_model = model

device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_model.to(device)
trained_model.to(device)


print("Similarity between English and Hindi sentences:")
base_similarities= []
trained_similarities = []
for eng, hin in test_dataset_pair:
    base_sim = compute_similarity(base_model, eng, hin, device)
    trained_sim = compute_similarity(trained_model, eng, hin, device)
    base_similarities.append(base_sim)
    trained_similarities.append(trained_sim)

Similarity between English and Hindi sentences:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
print("Avergae score before aligning:", np.mean(base_similarities))
print("Avergae score after aligning:", np.mean(trained_similarities))

Avergae score before aligning: 0.6921228
Avergae score after aligning: 0.7922775
