# Embedding Encoding

## Install Python Packages

In [1]:
%%capture
!pip3 install seaborn
!pip3 install tensorflow
!pip3 install tensorflow_hub
!pip3 install tensorflow_datasets
!pip install google-cloud-aiplatform

## Remove existing metadata and coefficients

In [2]:
!rm -r tmp
!rm -r tmp2 # for gecko embeddings (not currently used)

## Create metadata path 

In [3]:
import os

# Create the log directories. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of os.path.exists followed by os.makedirs.
log_dir='tmp'
os.makedirs(log_dir, exist_ok=True)

log_dir='tmp2' # for gecko embedding (not currently used)
os.makedirs(log_dir, exist_ok=True)

## Load the Universal Sentence Encoder's TF Hub module
https://arxiv.org/pdf/1803.11175.pdf

https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import json # required to save embeddings (then upload to Matching Engine)
import numpy as np
import pandas as pd
import csv
from tensorboard.plugins import projector # tensorboard visualizer
from google.cloud import aiplatform
from vertexai.preview.language_models import TextEmbeddingModel

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Keep the Universal Sentence Encoder under its own name. A later cell rebinds
# the notebook-level name `model` to a different embedding model, and `embed`
# must keep calling the encoder loaded here even when cells are re-run.
use_model = hub.load(module_url)
model = use_model  # alias kept so existing references to `model` still work

def embed(input):
  """Encode `input` with the TF Hub module loaded from `module_url`.

  Args:
    input: Passed unchanged to the loaded module; this notebook passes a
      tf.string tensor of business summaries.

  Returns:
    Whatever the module returns for `input` (the notebook treats it as a
    tensor of embedding vectors, one per input string).
  """
  return use_model(input)

2023-08-21 20:15:04.751053: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-21 20:15:06.764128: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-08-21 20:15:06.764310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

## Load Business Descriptions from BigQuery into Pandas Dataframe
Select ~10,000 samples from 100 million rows

In [5]:
%%bigquery df --project cf-data-analytics
# Draw a random sample of business rows; the cell magic above stores the result in `df`.
# Each row is kept with probability 10000 / row_count, so the sample size is only
# approximately 10,000 and the selected rows differ from run to run.
SELECT
  name,
  description,
  country,
  language,
  postal,
  city,
  state,
  address_l1,
  industry,
  summary
FROM
  `cf-data-analytics.us_businesses.description_v2`
  WHERE RAND() < 10000/(SELECT COUNT(*) FROM `cf-data-analytics.us_businesses.description_v2`) # keep each row with probability 10000 / total row count
  # the summary filters below are applied in addition to the random draw,
  # so the result can contain fewer rows than the random draw alone would give
  and summary is not null
  and summary != 'None' # also drop rows whose summary is the literal string 'None'

Query is running:   0%|          |

Downloading:   0%|          |

In [6]:
df.sample(5) # sample of data

Unnamed: 0,name,description,country,language,postal,city,state,address_l1,industry,summary
5476,"Klk Investments, L.P.",Investor,US,EN,92040,Lakeside,CA,12417 Vigilante Rd,"Investors, not elsewhere classified","Klk Investments, L.P. is located in the US and..."
5582,Truckmasters Inc,Ret new/used automobiles,US,EN,85032,Phoenix,AZ,3110 E Bell Rd,Motor vehicle dealers (new and used),Truckmasters Inc is located in the US and best...
3893,"All Purpose Trucking, L.L.C.",Local trucking operator,US,EN,53022,Germantown,WI,W156n10318 Pilgrim Rd,Local trucking without storage,"All Purpose Trucking, L.L.C. is located in the..."
9239,Sharkey's Massage Therapy Llc,Specialty outpatient clinic,US,EN,70422,Amite City,LA,101 S Bay St,"Specialty outpatient facilities, not elsewhere...",Sharkey's Massage Therapy Llc is located in th...
7058,Bustin' For Badges,Whol nondurable goods,US,EN,79701,Midland,TX,601 N Loraine St,"Nondurable goods, not elsewhere classified",Bustin' For Badges is located in the US and be...


## Convert Pandas dataframe to TF Tensor

In [7]:
# Drop every run of non-ASCII characters from the summary text (in place on df),
# then expose the cleaned summaries as a plain Python list for later encoding.
df['summary'] = df['summary'].str.replace(
    pat='[^\u0000-\u007F]+',
    repl='',
    regex=True,
)
lst = df['summary'].values.tolist()

In [8]:
# Only the first 700 rows of the sample are visualised
# (per the original note, the projector visualizer maxes out at 1000 vectors).
df2 = df.head(700)
lst2 = df2['summary'].values.tolist() # summaries as a plain list; required for tensor conversion

# Turn the list of summaries into a tf.string tensor
input_tensor = tf.convert_to_tensor(lst2, dtype=tf.string)

## Encode Embeddings using Universal Sentence Encoder _(512 Dimensions)_

In [9]:
message_embeddings = embed(input_tensor) # return array of vectors as tensor

# Visualize Embeddings _(Dimension Reduction)_

https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin

## Create metadata for Embeddings Visualization

In [10]:
lst_meta = df2.to_numpy() # one metadata row per visualised business

# Header row for the metadata file. Taken from df2, the same frame the rows
# come from, so header and rows always describe the same columns.
col_lst = list(df2.columns)

col_array = np.array([col_lst]) # wrapped in an outer list: a single header row (for metadata)

In [11]:
# NumPy versions of the projector inputs: metadata rows and embedding vectors.
y2 = np.array(lst_meta)
x2 = np.array(message_embeddings) # tf tensor -> numpy array

## Write metadata _.tsv_ and checkpoint _.ckpt_
Declare ProjectorConfig()

In [12]:
def register_embedding(embedding_tensor_name, meta_data_fname, log_dir):
    """Register one embedding with the TensorBoard projector for `log_dir`.

    Args:
        embedding_tensor_name: Name of the tensor the projector should display.
        meta_data_fname: Metadata file path stored on the embedding entry.
        log_dir: Directory handed to `projector.visualize_embeddings`.
    """
    projector_config = projector.ProjectorConfig()
    entry = projector_config.embeddings.add()
    entry.metadata_path = meta_data_fname
    entry.tensor_name = embedding_tensor_name
    projector.visualize_embeddings(log_dir, projector_config)

def save_labels_tsv(labels, label_headers, filepath, log_dir):
    """Write header rows followed by label rows to a tab-separated file.

    Args:
        labels: Iterable of rows (each row an iterable of cell values).
        label_headers: Iterable of header rows, written before `labels`.
            Pass a list containing one list to get a single header line.
        filepath: File name, joined onto `log_dir`.
        log_dir: Directory the file is written into; it must already exist.
    """
    # newline='' lets the csv module control line endings itself (otherwise
    # Windows produces blank lines between rows); UTF-8 keeps non-ASCII
    # metadata values from depending on the platform default encoding.
    with open(os.path.join(log_dir, filepath), 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(label_headers)  # header row(s) first
        writer.writerows(labels)  # then one line per label row

# Names and locations shared by the projector config, metadata file and checkpoint.
LOG_DIR = 'tmp'  # tensorboard log dir
EMBEDDINGS_TENSOR_NAME = 'message_embeddings'
META_DATA_FNAME = 'meta.tsv'  # labels will be stored here
STEP = 0
# checkpoint file contains the coefficients
EMBEDDINGS_FPATH = os.path.join(LOG_DIR, f"{EMBEDDINGS_TENSOR_NAME}.ckpt")

x, y = x2, y2

# Write the projector config first, then the .tsv metadata it points to.
register_embedding(EMBEDDINGS_TENSOR_NAME, META_DATA_FNAME, LOG_DIR)
save_labels_tsv(y, col_array, META_DATA_FNAME, LOG_DIR)

In [13]:
print(x)

[[ 0.00064967 -0.00977142  0.00123326 ... -0.00765369  0.02207767
  -0.03573458]
 [-0.02549926  0.00014862 -0.05294493 ... -0.01382775 -0.045181
  -0.01690979]
 [-0.03411159  0.03754803 -0.06858061 ...  0.00637971  0.03873222
  -0.04947431]
 ...
 [-0.01361914  0.03074034  0.01503934 ... -0.04498469  0.03818046
  -0.03166247]
 [-0.02490119  0.03474186 -0.00148704 ... -0.02573494  0.03344541
  -0.05940924]
 [-0.01998372  0.03797555 -0.06666351 ... -0.03110051  0.01287225
  -0.02513331]]


In [14]:
# Wrap the embedding array in a named variable and checkpoint it under EMBEDDINGS_FPATH.
tensor_embeddings = tf.Variable(initial_value=x, name=EMBEDDINGS_TENSOR_NAME)
saver = tf.compat.v1.train.Saver(var_list=[tensor_embeddings])
saver.save(sess=None, save_path=EMBEDDINGS_FPATH, global_step=STEP)





'tmp/message_embeddings.ckpt-0'

## Load Tensorboard

In [15]:
# Best-effort Colab setup: any failure of the magic below is deliberately ignored.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Any exception raised by the magic is swallowed on purpose; nothing else to do here.
  pass

# Register the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [43]:
%tensorboard --logdir tmp --load_fast=false

Reusing TensorBoard on port 6006 (pid 3886364), started 0:08:09 ago. (Use '!kill 3886364' to kill it.)

# Create Embeddings from Generative AI Google Model _(textembedding-gecko@001)_
https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings

In [17]:
# Point the Vertex AI SDK at the project, then load the gecko text-embedding model.
# Note: this rebinds the notebook-level name `model`.
aiplatform.init(project='cf-data-analytics')
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [18]:
from typing import Generator, List, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
import math
import functools
from typing import List, Optional
import time

In [19]:
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    """Embed one batch of sentences with the notebook-level `model`.

    This is best-effort: if the request raises, the failure is logged and a
    list of `None` placeholders (one per sentence) is returned, so callers can
    keep results positionally aligned with their inputs.

    Args:
        sentences: Texts to embed in a single `model.get_embeddings` call.

    Returns:
        One entry per sentence: the embedding's `.values`, or `None` for every
        sentence in the batch when the call failed.
    """
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception as exc:
        # Local import keeps this cell self-contained. Log instead of failing
        # silently so quota or auth problems are visible.
        import logging

        logging.getLogger(__name__).warning(
            "Embedding request for %d sentence(s) failed: %r", len(sentences), exc
        )
        return [None] * len(sentences)
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """Yield consecutive slices of `sentences`, each at most `batch_size` long.

    The final slice is shorter when the length is not a multiple of
    `batch_size`; an empty input yields nothing.
    """
    total = len(sentences)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield sentences[start:stop]

def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:
    """Embed `sentences` in batches on a thread pool, with rate-limited submission.

    Args:
        sentences: Texts to embed.
        api_calls_per_second: Submission rate; the loop sleeps
            `1 / api_calls_per_second` seconds after submitting each batch.
        batch_size: Number of sentences sent per request.

    Returns:
        A pair `(is_successful, embeddings)`:
        * `is_successful[i]` is True when sentence `i` produced an embedding.
        * `embeddings` is always 2-D with one row per successful sentence, in
          input order. It has shape `(0, 0)` when nothing succeeded
          (including empty input).
    """
    embeddings_list: List[Optional[List[float]]] = []

    # Batches are produced lazily by a generator.
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Throttle submissions to stay under the requested call rate.
            time.sleep(seconds_per_job)

        # Collect in submission order so results stay aligned with `sentences`.
        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for _, embedding in zip(sentences, embeddings_list)
    ]
    successful = [embedding for embedding in embeddings_list if embedding is not None]
    if not successful:
        # np.stack raises on an empty list; return an empty 2-D array instead.
        return is_successful, np.empty((0, 0))

    # Reshape to (n_successful, dim) rather than squeezing: squeeze would turn
    # a single successful embedding into a 1-D vector and break row iteration.
    stacked = np.stack(successful)
    return is_successful, stacked.reshape(len(successful), -1)

In [20]:
is_successful, question_embeddings = encode_text_to_embedding_batched(lst)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [21]:
question_embeddings.shape

(9998, 768)

# Create Matching Engine ANN Index
https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/matching_engine/sdk_matching_engine_for_indexing.ipynb

## Save Embedding Vectors as .json files for Upload

In [22]:
aiplatform.init(project="cf-data-analytics", location='us-central1', staging_bucket='gs://cf-matching-engine-temp')

In [23]:
DIMENSIONS = 768
DISPLAY_NAME = "business_embeddings2"
BUCKET_URI = 'gs://cf-matching-engine-temp'

In [24]:
# `question_embeddings` only contains rows whose embedding request succeeded,
# so enumerating it would shift every id after the first failure. Recover each
# row's position in the original sentence list from `is_successful` and use
# that as the id, so ids keep pointing at the matching DataFrame row.
source_positions = [position for position, ok in enumerate(is_successful) if ok]
assert len(source_positions) == len(question_embeddings), (
    "is_successful and question_embeddings are out of sync"
)

# One JSON object per line, newline-terminated.
with open("values.json", "w") as f:
  embeddings_formatted = [
      json.dumps(
          {
              "id": str(index),
              "embedding": [str(value) for value in embedding],
              "restricts": [
                  {
                      "namespace": "class",
                      "allow_list": ["even" if index % 2 == 0 else "odd"],
                  }
              ],
          }
      )
      + "\n"
      for index, embedding in zip(source_positions, question_embeddings)
  ]
  f.writelines(embeddings_formatted)

## Upload .json files to GCS

In [25]:
EMBEDDINGS_INITIAL_URI = f"{BUCKET_URI}/matching_engine/initial/"
! gsutil cp values.json {EMBEDDINGS_INITIAL_URI}

Copying file://values.json [Content-Type=application/json]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][177.4 MiB/177.4 MiB]                                                
Operation completed over 1 objects/177.4 MiB.                                    


## Create Matching Engine Index

In [26]:
# tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
#     display_name=DISPLAY_NAME,
#     contents_delta_uri=EMBEDDINGS_INITIAL_URI,
#     dimensions=DIMENSIONS,
#     approximate_neighbors_count=10,
#     distance_measure_type="DOT_PRODUCT_DISTANCE",
#     leaf_node_embedding_count=500,
#     leaf_nodes_to_search_percent=7,
#     description="USE Encoder",
#     labels={"label_name": "label_value"},
# )

## Create Matching Engine Endpoint

In [27]:
# my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
#     display_name="business_demo_endpoint",
#     description="business endpoint description",
#     network="projects/1089470781238/global/networks/matching-engine-vpc-network"
# )

## Deploy Index to Matching Engine Endpoint

In [28]:
# deployed_index = my_index_endpoint.deploy_index(
#     index=tree_ah_index, deployed_index_id="deployed_business_endpoint"
# )

In [29]:
# !gcloud ai index-endpoints list --project="cf-data-analytics" --region="us-central1"

In [30]:
# index = aiplatform.MatchingEngineIndex(index_name='projects/1089470781238/locations/us-central1/indexes/8592225974232285184')

In [31]:
# deployed_index = my_index_endpoint.deploy_index(
#     index=index, deployed_index_id="similar_text_index_deployed"
# )

# Query Index

In [32]:
# aiplatform.init(project='cf-data-analytics',)
# model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

# model.get_embeddings()

In [33]:
endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name='4192772088244731904'
)

In [34]:
query_sample = ["sandwich shop in new york city ny"]
embedded_output = model.get_embeddings(query_sample)

In [35]:
response = endpoint.match(
    deployed_index_id="deployed_business_endpoint",
    queries=[embedded_output[0].values],
    num_neighbors=3)

response

[[MatchNeighbor(id='601', distance=0.7504278421401978),
  MatchNeighbor(id='490', distance=0.7379012107849121),
  MatchNeighbor(id='379', distance=0.7363015413284302)]]

In [36]:
response = endpoint.match(
    deployed_index_id="deployed_business_endpoint",
    queries=[embedded_output[0].values],
    num_neighbors=5
)

print(response)

# Neighbour ids are 0-based row positions written when the index data was
# exported, not index labels, so look the row up positionally with .iloc.
# NOTE(review): this only returns the right business if the deployed index was
# built from this same `df`; the BigQuery sample is random per run. Confirm.
loc = int(response[0][0].id)
df.iloc[loc]

[[MatchNeighbor(id='601', distance=0.7504278421401978), MatchNeighbor(id='490', distance=0.7379012107849121), MatchNeighbor(id='379', distance=0.7363015413284302), MatchNeighbor(id='594', distance=0.7311584949493408), MatchNeighbor(id='380', distance=0.7269077897071838)]]


name                                      Sams Italian Foods Inc
description                                         Eating place
country                                                       US
language                                                      EN
postal                                                     04240
city                                                    Lewiston
state                                                         ME
address_l1                                         902 Lisbon St
industry                                           Eating places
summary        Sams Italian Foods Inc is located in the US an...
Name: 601, dtype: object

# Delete Index

In [37]:
# x = aiplatform.MatchingEngineIndexEndpoint.list()

In [38]:
# for item in x:
#     print(item)

In [39]:
# x = aiplatform.MatchingEngineIndex.list()

In [40]:
# for item in x:
#     print(item)

In [41]:
# index = aiplatform.MatchingEngineIndex(
#     index_name='6814078177606893568'
# )

In [42]:
# index.delete()