## Load these models and check embedding simmilarity for normal hindi, english and hinglish
  - bert-base-multilingual-uncased
  - xlm-roberta-base
  - MuRil (for embedding simmilarity)

##  Check if bert-base-multilingual-uncased can handle multilingual two languages in the same bot
##  Check if xlm-roberta-base satisfies following conditions  

In [1]:
!pip install transformers
!pip install tokenizers
!pip install sentencepiece
!pip install scipy
!pip install -U sentence-transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.2 MB/s[0m eta [36m0:00:0

In [2]:
import torch

In [3]:
access_token = "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"
model_name_1 = "bert-base-multilingual-uncased"
model_name_2 = "xlm-roberta-base"
model_name_3 = "google/muril-large-cased"
model_name_4 = "sentence-transformers/LaBSE"

In [4]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

In [5]:
model_1 = AutoModel.from_pretrained(model_name_1, use_auth_token=access_token)



Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

In [6]:
tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

In [7]:
print(tokenizer_1)

BertTokenizerFast(name_or_path='bert-base-multilingual-uncased', vocab_size=105879, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [8]:
print(model_1)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [9]:
text_1 = "The sun rises in the east."
text_2 = "Elephants are intelligent creatures."

encoded_input_1 = tokenizer_1(text_1, return_tensors='pt',padding=True, truncation=True)
encoded_input_2 = tokenizer_1(text_2, return_tensors='pt',padding=True, truncation=True)
with torch.no_grad():
  embeddings1 = model_1(**encoded_input_1).last_hidden_state.mean(dim=1)  # Taking the mean of token embeddings
  embeddings2 = model_1(**encoded_input_2).last_hidden_state.mean(dim=1)

from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(embeddings1,embeddings2)
print(f"Cosine Similarity between {[text_1,text_2]}:", similarity)

Cosine Similarity between ['The sun rises in the east.', 'Elephants are intelligent creatures.']: [[0.51830876]]


## making it generic

In [29]:
def get_embeddings(model, tokenizer, examples:dict):
  embeddings = {}
  for key in examples:
    encoded_input_1 = tokenizer(examples[key][0], return_tensors='pt',padding=True, truncation=True)
    encoded_input_2 = tokenizer(examples[key][1], return_tensors='pt',padding=True, truncation=True)
    with torch.no_grad():
      embeddings1 = model(**encoded_input_1).last_hidden_state.mean(dim=1)  # Taking the mean of token embeddings
      embeddings2 = model(**encoded_input_2).last_hidden_state.mean(dim=1)
    embeddings[key] = [embeddings1,embeddings2]
  print("embedding size : ",embeddings[0][0].shape)
  return embeddings

def get_cosine_simmilarity(examples,embeddings):
  from sklearn.metrics.pairwise import cosine_similarity
  # cosine simmilarity
  results = []
  for key in embeddings:
    similarity = cosine_similarity(embeddings[key][0], embeddings[key][1])
    results.append(similarity)
    print(f"Cosine Similarity between {examples[key]}:", similarity)
  return results


In [30]:
examples = {
    0:["The sun rises in the east.","Elephants are intelligent creatures."]
}
embeddings = get_embeddings(model_1, tokenizer_1, examples)
result = get_cosine_simmilarity(examples,embeddings)

embedding size :  torch.Size([1, 768])
Cosine Similarity between ['The sun rises in the east.', 'Elephants are intelligent creatures.']: [[0.51830876]]


## few examples

In [31]:
examples = {
    0: ["The sun rises in the east.","Elephants are intelligent creatures."],   # it is to check model's accuracy - 2 very dissimilar text
    1: ["i love cats","i love dogs"],                                           # it is to check model's accuracy - 2 very similar text

    2:["में आज पेमेंट कर दूंगा","I'll make payment today"],                            # one hindi one english text
    3:["में आज पेमेंट कर दूंगा","आज साम को ही कर दूंगा पेमेंट"],                              # both hindi text
    4:["i'll complete transaction by end of the day","i will make payment today"], # both english text

    5:["me aaj payment kar dunga", "aaj saam se pehle payment ho jaye ga"],     # both hinglish text
    6:["do din pehle hi payment ho gaya ","payment is already completed 2 days back"],  # hinglish and english text
    7:["do din pehle hi payment ho gaya ", "दो दिन पहले ही पेमेंट हो गया है"]  ,          # hinglish and hindi

    8:["आज नहीं कर पाउगा पेमेंट will do it tomorrow morning","aaj nahi kar payga payment will do it tomorrow morning"],
    9:["आज नहीं कर पाउगा पेमेंट will do it tomorrow morning","i'll not be able to do payment today will do it tomorrow morning"],

    # examples with entity
    10:["में गूगलपे से कर दूंगा","I will pay via googlepay"],
    11:["में गूगलपे से कर दूंगा","me googlepay se kar dunga"],
    12:["me googlepay kar denga", "i will pay using googlepay"],

    13:["में googlepay से कर दूंगा","I will pay via googlepay"],
    14:["में googlepay से कर दूंगा","me googlepay se kar dunga"],

    15:["mera name ganesh he","My name is ganesh"],
    16:["में 31st august को कर दूंगा","me 31st august ko kar dunga"],
    17:["में 31st august को कर दूंगा","i'll do it by 31st august"],

    # some examples where examples are not simmiler

    18:["में पेमेंट नहीं करूँगा","i'll not do payment"],
    19:["में पेमेंट नहीं करूँगा","i'll do payment later"],
    20:["पेमेंट बाद में करूँगा","i'll not do payment"],
    21:["पेमेंट बाद में करूँगा","i'll do payment later"]

}

## check for bert-base-multilingual-uncased

In [32]:
embeddings_1 = get_embeddings(model_1, tokenizer_1, examples)
results_1 = get_cosine_simmilarity(examples,embeddings_1)

embedding size :  torch.Size([1, 768])
Cosine Similarity between ['The sun rises in the east.', 'Elephants are intelligent creatures.']: [[0.51830876]]
Cosine Similarity between ['i love cats', 'i love dogs']: [[0.966812]]
Cosine Similarity between ['में आज पेमेंट कर दूंगा', "I'll make payment today"]: [[0.5628961]]
Cosine Similarity between ['में आज पेमेंट कर दूंगा', 'आज साम को ही कर दूंगा पेमेंट']: [[0.88884926]]
Cosine Similarity between ["i'll complete transaction by end of the day", 'i will make payment today']: [[0.70899254]]
Cosine Similarity between ['me aaj payment kar dunga', 'aaj saam se pehle payment ho jaye ga']: [[0.7154846]]
Cosine Similarity between ['do din pehle hi payment ho gaya ', 'payment is already completed 2 days back']: [[0.41329274]]
Cosine Similarity between ['do din pehle hi payment ho gaya ', 'दो दिन पहले ही पेमेंट हो गया है']: [[0.4324294]]
Cosine Similarity between ['आज नहीं कर पाउगा पेमेंट will do it tomorrow morning', 'aaj nahi kar payga payment will d

In [33]:
print(examples[2],results_1[2])

['में आज पेमेंट कर दूंगा', "I'll make payment today"] [[0.5628961]]


# check for xlm-roberta-base

In [34]:
model_2 = model = AutoModel.from_pretrained(model_name_2, use_auth_token=access_token)
tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2)



In [35]:
embeddings_2 = get_embeddings(model_2, tokenizer_2, examples)
results_2 = get_cosine_simmilarity(examples,embeddings_2)

embedding size :  torch.Size([1, 768])
Cosine Similarity between ['The sun rises in the east.', 'Elephants are intelligent creatures.']: [[0.99575526]]
Cosine Similarity between ['i love cats', 'i love dogs']: [[0.9995593]]
Cosine Similarity between ['में आज पेमेंट कर दूंगा', "I'll make payment today"]: [[0.99506414]]
Cosine Similarity between ['में आज पेमेंट कर दूंगा', 'आज साम को ही कर दूंगा पेमेंट']: [[0.99903774]]
Cosine Similarity between ["i'll complete transaction by end of the day", 'i will make payment today']: [[0.997167]]
Cosine Similarity between ['me aaj payment kar dunga', 'aaj saam se pehle payment ho jaye ga']: [[0.99608314]]
Cosine Similarity between ['do din pehle hi payment ho gaya ', 'payment is already completed 2 days back']: [[0.99220085]]
Cosine Similarity between ['do din pehle hi payment ho gaya ', 'दो दिन पहले ही पेमेंट हो गया है']: [[0.99329627]]
Cosine Similarity between ['आज नहीं कर पाउगा पेमेंट will do it tomorrow morning', 'aaj nahi kar payga payment will

## check with google/muril-large-cased

In [36]:
model_3 = model = AutoModel.from_pretrained(model_name_3, use_auth_token=access_token)
tokenizer_3 = AutoTokenizer.from_pretrained(model_name_3)

Some weights of the model checkpoint at google/muril-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
embeddings_3 = get_embeddings(model_3, tokenizer_3, examples)
results_3 = get_cosine_simmilarity(examples,embeddings_3)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


embedding size :  torch.Size([1, 1024])
Cosine Similarity between ['The sun rises in the east.', 'Elephants are intelligent creatures.']: [[0.9125152]]
Cosine Similarity between ['i love cats', 'i love dogs']: [[0.9658783]]
Cosine Similarity between ['में आज पेमेंट कर दूंगा', "I'll make payment today"]: [[0.86679196]]
Cosine Similarity between ['में आज पेमेंट कर दूंगा', 'आज साम को ही कर दूंगा पेमेंट']: [[0.952119]]
Cosine Similarity between ["i'll complete transaction by end of the day", 'i will make payment today']: [[0.90588737]]
Cosine Similarity between ['me aaj payment kar dunga', 'aaj saam se pehle payment ho jaye ga']: [[0.60462487]]
Cosine Similarity between ['do din pehle hi payment ho gaya ', 'payment is already completed 2 days back']: [[-0.67666674]]
Cosine Similarity between ['do din pehle hi payment ho gaya ', 'दो दिन पहले ही पेमेंट हो गया है']: [[-0.51547605]]
Cosine Similarity between ['आज नहीं कर पाउगा पेमेंट will do it tomorrow morning', 'aaj nahi kar payga payment wi

## check with sentence-trasnformer (LaBSE)

In [40]:
model = SentenceTransformer('sentence-transformers/LaBSE')
def get_cosine_simmilarity_for_labse(input_text_pair_list):
  from sklearn.metrics.pairwise import cosine_similarity
  import torch
  # cosine simmilarity
  result = []
  for key, value in input_text_pair_list.items():
    embeddings = model.encode(value)
    results = []
    similarity = cosine_similarity(torch.tensor(embeddings[0].reshape(1, -1)), torch.tensor(embeddings[1].reshape(1, -1)))
    result.append(similarity)
  return result


In [41]:
result_4 = get_cosine_simmilarity_for_labse(examples)

## combine LaBSE + XLM-Roberta

In [86]:
# XLM-Roberta embeddings

# embeddings_2[17][0].shape
from sklearn.metrics.pairwise import cosine_similarity
result_combind = []
result_combind_mul = []
ans={}
for key, value in examples.items():
  ans[key] = model.encode(value)
  example_1_1 = embeddings_2[0][0]
  example_1_2 = embeddings_2[0][1]

  example_2_1 = torch.tensor(ans[0][0].reshape(1, -1))
  example_2_2 = torch.tensor(ans[0][1].reshape(1, -1))

# apply operation here...........
  combined_embeddings_1 = torch.cat((example_1_1, example_2_1), dim=1)
  combined_embeddings_2 = torch.cat((example_1_2, example_2_2), dim=1)

  # vector multiplication is good for machine translation task
  combined_embeddings_1_1 = example_1_1 * example_2_1
  combined_embeddings_2_1 = example_1_2 * example_2_2
# apply operation here...........


  result = cosine_similarity(combined_embeddings_1,combined_embeddings_2)
  result_combind.append(result)

  result_1 = cosine_similarity(combined_embeddings_1_1,combined_embeddings_2_1)
  result_combind_mul.append(result_1)




In [89]:
print(result_combind)
print(result_combind_mul)

[array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32), array([[0.9932801]], dtype=float32)]
[array([[0.60988194]], dtype=float32), array([[0.60988194]], dtype=float32), array([[0.60988194]], dtype=float32), array([[0.60988194]], dtype=float32), array([[0.60988194]], dtype=floa

## export as csv

In [90]:
j = {}
j["examples"] = examples.values()
j["bert-base-multilingual-uncased"] = results_1
j["xlm-roberta-base"] = results_2
j["google/muril-large-cased"] = results_3
j["sentence-transformers/LaBSE"]= result_4
j["concat_XLM_LaBSE"] = result_combind
j["multiply_XLM_LaBSE"]=result_combind_mul

In [91]:
import pandas as pd
data = pd.DataFrame(j)
print(data.head())
data.to_csv("results.csv")
# data.to_excel("output.xlsx")

                                            examples  \
0  [The sun rises in the east., Elephants are int...   
1                         [i love cats, i love dogs]   
2  [में आज पेमेंट कर दूंगा, I'll make payment today]   
3  [में आज पेमेंट कर दूंगा, आज साम को ही कर दूंगा...   
4  [i'll complete transaction by end of the day, ...   

  bert-base-multilingual-uncased xlm-roberta-base google/muril-large-cased  \
0                 [[0.51830876]]   [[0.99575526]]            [[0.9125152]]   
1                   [[0.966812]]    [[0.9995593]]            [[0.9658783]]   
2                  [[0.5628961]]   [[0.99506414]]           [[0.86679196]]   
3                 [[0.88884926]]   [[0.99903774]]             [[0.952119]]   
4                 [[0.70899254]]     [[0.997167]]           [[0.90588737]]   

  sentence-transformers/LaBSE concat_XLM_LaBSE multiply_XLM_LaBSE  
0              [[0.13202727]]    [[0.9932801]]     [[0.60988194]]  
1               [[0.9373624]]    [[0.9932801]]     [[0.609

In [92]:
data.tail(4)

Unnamed: 0,examples,bert-base-multilingual-uncased,xlm-roberta-base,google/muril-large-cased,sentence-transformers/LaBSE,concat_XLM_LaBSE,multiply_XLM_LaBSE
18,"[में पेमेंट नहीं करूँगा, i'll not do payment]",[[0.6542227]],[[0.99630284]],[[0.790928]],[[0.97075427]],[[0.9932801]],[[0.60988194]]
19,"[में पेमेंट नहीं करूँगा, i'll do payment later]",[[0.5818778]],[[0.9965709]],[[0.7940219]],[[0.774329]],[[0.9932801]],[[0.60988194]]
20,"[पेमेंट बाद में करूँगा, i'll not do payment]",[[0.5954785]],[[0.9959558]],[[0.9137787]],[[0.7267952]],[[0.9932801]],[[0.60988194]]
21,"[पेमेंट बाद में करूँगा, i'll do payment later]",[[0.58610404]],[[0.9968151]],[[0.91630363]],[[0.9339098]],[[0.9932801]],[[0.60988194]]


## Embedding dimension
- bert-base-multilingual-uncased : 768
- xlm-roberta-base : 768
- google/muril-large-cased : 1024

## The model architecture is one of the supported language models (check that the model_type in config.json is listed in the table's column model_name)
  - bert-base-multilingual-uncased (BERT is supported)
  - xlm-roberta-base (RoBERTa is supported)
  - google/muril-large-cased (it is BERT model trained on 17 indian language so it is also supported)


## The model has pretrained Tensorflow weights (check that the file tf_model.h5 exists)
  - bert-base-multilingual-uncased (.h5 file exists)
  - xlm-roberta-base (.h5 file exists)
  - google/muril-large-cased (.h5 is not exsist but we can get it from pytorch_model.bin)

## The model uses the default tokenizer (config.json should not contain a custom tokenizer_class setting)
  - bert-base-multilingual-uncased (does not have any custom tokenizer)
  - xlm-roberta-base (does not have any custom tokenizer)
  - google/muril-large-cased (does not have any custom tokenizer)

In [None]:
!pip install nbconvert

In [None]:
!jupyter nbconvert --to html /content/Compare_Multilingual_Models.ipynb

In [None]:
!pip install ipywidgets