## Full-Text Search using SQLite

In [1]:
#!pip3 install --upgrade tensorflow==2.5.0 numpy==1.19.5 sentencepiece==0.1.95 scikit-learn==0.24.2 tensorflow_text==2.5.0 tqdm==4.61.0

In [2]:
import os
import requests
import tarfile
import tensorflow as tf
import tensorflow_text
import sentencepiece as spm
import numpy as np
from tqdm import tqdm

In [None]:
# Destination of the converted TF-Lite model (current working directory).
path_to_save_lite_model = os.path.join(os.getcwd(), "model.tflite")

# TF Hub URL of the compressed SavedModel.
model_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3?tf-hub-format=compressed"

# Archive file name = "<model>_<version>.tar.gz", built from the last two
# path segments of the URL once the query string has been dropped.
url_without_query = model_url.split("?")[0]
url_segments = url_without_query.split("/")
model_name = f"{url_segments[-2]}_{url_segments[-1]}.tar.gz"

# Where the downloaded archive is stored (current working directory).
path_to_save = os.path.join(os.getcwd(), model_name)


In [3]:
def download(url, path_to_save, chunk_size=1024, timeout=60):
    """Stream the resource at ``url`` into the file ``path_to_save``.

    The response body is read in chunks and written to disk as it arrives,
    with a tqdm progress counter (one tick per chunk).

    Args:
        url: Address to fetch with an HTTP GET request.
        path_to_save: Path of the file to create (opened in binary write mode).
        chunk_size: Number of bytes requested per chunk from the response.
        timeout: Seconds passed to ``requests.get`` as its timeout;
            pass ``None`` to wait indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating the file, so an HTTP error page is never
        # saved to disk under the archive's name.
        r.raise_for_status()
        with open(path_to_save, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=chunk_size), desc=f"Donwloading model from {url}"):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

def unarchive(path, path_to_extract):
    """Extract a gzip-compressed tar archive into ``path_to_extract``.

    Paths that do not end with "tar.gz" are ignored and nothing is extracted.

    Args:
        path: Path of the archive file.
        path_to_extract: Directory that receives the archive contents.
    """
    if path.endswith("tar.gz"):
        # NOTE(security): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        # The with-block closes the archive even when extraction fails.
        with tarfile.open(path, "r:gz") as tar:
            tar.extractall(path=path_to_extract)

In [4]:
# Download the compressed model archive.
download(model_url, path_to_save)

# Directory for the extracted model: the archive path minus its ".tar.gz"
# suffix. Only the real suffix is removed, so the name does not end in a
# stray "." and an earlier "tar.gz" substring in the path is left alone.
archive_suffix = ".tar.gz"
if path_to_save.endswith(archive_suffix):
    saved_model_dir = path_to_save[:-len(archive_suffix)]
else:
    saved_model_dir = path_to_save

# Unpack the archive so the model can be loaded from saved_model_dir.
unarchive(path_to_save, saved_model_dir)

Donwloading model from https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3?tf-hub-format=compressed: 310455it [00:07, 39356.29it/s]


In [7]:
# Convert the extracted SavedModel to the TF-Lite format.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS, # enable TensorFlow Lite ops.
  tf.lite.OpsSet.SELECT_TF_OPS # enable TensorFlow ops.
]
tflite_model = converter.convert()

# Write the converted model inside a with-block so the file handle is
# flushed and closed before the model is loaded again later on.
with open(path_to_save_lite_model, "wb") as lite_file:
    bytes_written = lite_file.write(tflite_model)

# Last expression of the cell: shows the size of the written model in bytes.
bytes_written

339420524

In [8]:
def load_tflite_model(path_to_save_model):
    """Create a TF-Lite interpreter for the model file and prepare it for use.

    Args:
        path_to_save_model: Path of the ``.tflite`` model file.

    Returns:
        A tuple ``(interpreter, input_details, output_details)``: the
        interpreter with its tensors allocated, followed by the results of
        its ``get_input_details()`` and ``get_output_details()`` calls.
    """
    lite_interpreter = tf.lite.Interpreter(model_path=path_to_save_model)
    lite_interpreter.allocate_tensors()

    # Tensor descriptions are queried after allocation, inputs first.
    return (
        lite_interpreter,
        lite_interpreter.get_input_details(),
        lite_interpreter.get_output_details(),
    )

def tf_lite_inference(input_str, interpreter, input_details, output_details):
    """Run the TF-Lite interpreter on one input and return its output tensor.

    Args:
        input_str: Value to feed to the model; it is wrapped in a
            one-element NumPy array before being passed on.
        interpreter: Interpreter object providing ``set_tensor``, ``invoke``
            and ``get_tensor``.
        input_details: Sequence whose first item holds the input tensor's
            ``'index'``.
        output_details: Sequence whose first item holds the output tensor's
            ``'index'``.

    Returns:
        Whatever ``interpreter.get_tensor`` returns for the first output.
    """
    # Wrap the single input in a one-element array.
    batch = np.array([input_str])

    interpreter.set_tensor(input_details[0]['index'], batch)
    interpreter.invoke()

    return interpreter.get_tensor(output_details[0]['index'])


In [10]:
# Smoke test: embed one sample sentence with the converted TF-Lite model.
input_str = "I'm just a text"

# Load the model written to path_to_save_lite_model.
interpreter, input_details, output_details = load_tflite_model(path_to_save_lite_model)

# Compute the embedding and print it.
emb = tf_lite_inference(input_str, interpreter, input_details, output_details)
print(emb)

[[-2.20920071e-02  4.54927161e-02 -4.57857773e-02  4.30787578e-02
   6.46174476e-02  8.54882225e-02 -3.06253638e-02 -6.05369657e-02
  -4.28862032e-03  4.34924588e-02  5.98257687e-03  2.37084907e-02
   1.61413534e-03 -7.31369630e-02 -1.88882090e-02  1.46713550e-03
  -7.78279267e-03  6.29442977e-03 -5.46357259e-02  3.71543281e-02
   4.24982570e-02  2.63022967e-02 -1.17640961e-02  4.08129431e-02
   3.98756191e-02  3.70400585e-03 -1.54423993e-02  1.08362380e-02
   2.07452495e-02  1.43263945e-02 -3.72592770e-02  3.46207209e-02
  -3.84902954e-02 -7.08461255e-02  1.63013134e-02 -4.56018709e-02
  -1.83822494e-02  2.07007974e-02 -2.91418619e-02 -7.38913864e-02
   1.55426934e-02 -5.36717987e-03  1.13091394e-02 -8.70303996e-03
  -1.01367170e-02  7.90371224e-02  4.28929590e-02  5.69643266e-03
   4.40499149e-02  4.72852066e-02 -1.47618195e-02  7.59420320e-02
   1.17016584e-03 -6.17320873e-02  2.17185002e-02  5.31647615e-02
  -9.58393216e-02 -9.15163085e-02 -8.32637399e-02 -8.61318130e-03
   2.18917

In [None]:
# TODO: compare embeddings from the TF-Lite model with embeddings from the regular model
# TODO: compare embeddings across different languages