In [74]:
#installing prerequisites to our environment..
!pip install bert-for-tf2
!pip install tensorflow-text



In [75]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance
import turtle 

In [76]:
def get__model(model_url, maxseq_length):
  inputs = dict(
    input_word_ids=tf.keras.layers.Input(shape=(maxseq_length,), dtype=tf.int32),
    input_mask=tf.keras.layers.Input(shape=(maxseq_length,), dtype=tf.int32),
    input_type_ids=tf.keras.layers.Input(shape=(maxseq_length,), dtype=tf.int32),
    )

  muril_layer = hub.KerasLayer(model_url, trainable=True)
  outputs = muril_layer(inputs)

  assert 'sequence_output' in outputs
  assert 'pooled_output' in outputs
  assert 'encoder_outputs' in outputs
  assert 'default' in outputs
  return tf.keras.Model(inputs=inputs,outputs=outputs["pooled_output"]), muril_layer

In [77]:
maxseq_length = 128
muril_model, muril_layer = get__model(
    model_url="https://tfhub.dev/google/MuRIL/1", maxseq_length=maxseq_length)#model url This code section takes time much than expected...

In [78]:
vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [79]:
def create_input(input_strings, tokenizer, maxseq_length):
  input_ids_all, input_mask_all, input_type_ids_all = [], [], []
  for input_string in input_strings:
    input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    sequence_length = min(len(input_ids), maxseq_length)
    
    if len(input_ids) >= maxseq_length:
      input_ids = input_ids[:maxseq_length]
    else:
      input_ids = input_ids + [0] * (maxseq_length - len(input_ids))

    input_mask = [1] * sequence_length + [0] * (maxseq_length - sequence_length)

    input_ids_all.append(input_ids)
    input_mask_all.append(input_mask)
    input_type_ids_all.append([0] * maxseq_length)
  
  return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)

In [80]:
def encode(input_text):
  input_ids, input_mask, input_type_ids = create_input(input_text, 
                                                       tokenizer, 
                                                       maxseq_length)
  inputs = dict(
      input_word_ids=input_ids,
      input_mask=input_mask,
      input_type_ids=input_type_ids,
  )
  return muril_model(inputs)

In [81]:
sent = ["Sports","క్రీడలు","खेल"]

In [82]:
embeddings = encode(sent)





In [83]:
embeddings.shape
embeddings #returns a tensor with values

<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[ 0.00937692,  0.01861928, -0.00740274, ..., -0.02161782,
        -0.01006092, -0.01130441],
       [ 0.00937908,  0.01894466, -0.00822009, ..., -0.02112483,
        -0.00997349, -0.01120048],
       [ 0.0092281 ,  0.01850078, -0.00721172, ..., -0.02058332,
        -0.01039925, -0.01179705]], dtype=float32)>

In [84]:
for i in range(len(sent)):
    for j in range(i + 1, len(sent)):
        ds= distance.euclidean(embeddings[i],embeddings[i+1])
        print(ds)


0.009948350489139557
0.009948350489139557
0.01155528612434864
