# Quickstart for creating a Vertex Vector Search Index

## Setup

In [None]:
# !gcloud services enable compute.googleapis.com \
#                         aiplatform.googleapis.com \
#                         storage.googleapis.com \
#                         iam.googleapis.com

### pip installs

In [1]:
# !pip install --upgrade --user google-cloud-aiplatform google-cloud-storage

In [2]:
# # Restart kernel after installs so that your environment can access the new packages
# import IPython
# import time

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

### imports

In [3]:
import pandas as pd
import numpy as np
import json
import uuid

from google.cloud import aiplatform
from google.cloud import aiplatform_v1 as aipv1
from google.cloud import storage
from google.cloud import bigquery

# logging
import logging
logging.disable(logging.WARNING)

#python warning 
import warnings
warnings.filterwarnings("ignore")

In [4]:
print(f'BigQuery SDK version      : {bigquery.__version__}')
print(f'Vertex AI SDK version     : {aiplatform.__version__}')
print(f'Cloud Storage SDK version : {storage.__version__}')

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### Define vars and GCP env config

`CREATE_NEW_ASSETS`
* True creates new GCS buckets and Vector Search instances, etc.
* False skips these steps (in case you need to re-run notebook to include new variables you create)

In [5]:
# create new gcs bucket, vs index, etc.?
CREATE_NEW_ASSETS         = True 

**Edit these to define naming conventions and stay organized across different versions**

In [6]:
# naming convention for all cloud resources
VERSION        = "pubv2"                     # TODO
PREFIX         = f'vvs-vectorio-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = vvs-vectorio-pubv2


#### Edit these as needed

`VPC_NETWORK_NAME`
* if index will be **deployed to a private endpoint** within a VPC network, edit this with the name of your VPC network name
* if index will be **deployed to a public endpoint**, leave blank

> For details on configuring VPC for Vertex AI Vector Search, see **[Set up a VPC Network Peering connection](https://cloud.google.com/vertex-ai/docs/vector-search/setup/vpc)**

<div class="alert alert-block alert-warning">
    <b>⚠️ if deploying index to index endpoint within a VPC network, you must interact with the index endpoint from within the VPC network, i.e., run this notebook in a Vertex Workbench instance within that VPC ⚠️</b>
</div>


In [7]:
VPC_NETWORK_NAME = ""  # e.g., "your-vpc-name" | ""

In [8]:
if VPC_NETWORK_NAME:
    USE_PUBLIC_ENDPOINTS = False
else:
    USE_PUBLIC_ENDPOINTS = True
    
print(f"VPC_NETWORK_NAME     = {VPC_NETWORK_NAME}")
print(f"USE_PUBLIC_ENDPOINTS = {USE_PUBLIC_ENDPOINTS}")

VPC_NETWORK_NAME     = 
USE_PUBLIC_ENDPOINTS = True


In [9]:
# locations / regions for cloud resources
REGION            = 'us-central1'        
BQ_REGION         = 'US'

print(f"REGION    = {REGION}")
print(f"BQ_REGION = {BQ_REGION}")

REGION    = us-central1
BQ_REGION = US


#### Let these ride

In [10]:
# let these ride
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

# service account
VERTEX_SA                = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com'

# full VPC network name
VPC_NETWORK_FULL         = f"projects/{PROJECT_NUM}/global/networks/{VPC_NETWORK_NAME}"

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

print(f"PROJECT_ID       = {PROJECT_ID}")
print(f"PROJECT_NUM      = {PROJECT_NUM}")
print(f"VERTEX_SA        = {VERTEX_SA}")
print(f"VPC_NETWORK_FULL = {VPC_NETWORK_FULL}")
print(f"BUCKET_NAME      = {BUCKET_NAME}")
print(f"BUCKET_URI       = {BUCKET_URI}")

PROJECT_ID       = hybrid-vertex
PROJECT_NUM      = 934903580331
VERTEX_SA        = 934903580331-compute@developer.gserviceaccount.com
VPC_NETWORK_FULL = projects/934903580331/global/networks/
BUCKET_NAME      = vvs-vectorio-pubv2-hybrid-vertex
BUCKET_URI       = gs://vvs-vectorio-pubv2-hybrid-vertex


In [11]:
!gcloud config set project $PROJECT_ID

Updated property [core/project].


### Create GCS bucket

In [13]:
if CREATE_NEW_ASSETS:
    
    # create new bucket
    ! gsutil mb -l $REGION $BUCKET_URI
    
    # ### give Service account Admin to GCS
    # !gcloud projects add-iam-policy-binding $PROJECT_ID \
    #     --member=serviceAccount:$VERTEX_SA \
    #     --role=roles/storage.admin
    
    ### uncomment if org policy prevents granting Admin:
    # ! gsutil iam ch serviceAccount:{$VERTEX_SA}:roles/storage.objects.get $BUCKET_URI
    # ! gsutil iam ch serviceAccount:{$VERTEX_SA}:roles/storage.objects.create $BUCKET_URI
    # ! gsutil iam ch serviceAccount:{$VERTEX_SA}:roles/storage.objects.list $BUCKET_URI
    
    
print(f"{VERTEX_SA} should have access to {BUCKET_URI}")

Creating gs://vvs-vectorio-pubv2-hybrid-vertex/...
934903580331-compute@developer.gserviceaccount.com should have access to gs://vvs-vectorio-pubv2-hybrid-vertex


### Save notebook config 

> to easily use in other GCP related notebooks

In [14]:
config = f"""
PREFIX                   = \"{PREFIX}\"
VERSION                  = \"{VERSION}\"

PROJECT_ID               = \"{PROJECT_ID}\"
PROJECT_NUM              = \"{PROJECT_NUM}\"

REGION                   = \"{REGION}\"
BQ_REGION                = \"{BQ_REGION}\"

VERTEX_SA                = \"{VERTEX_SA}\"

VPC_NETWORK_NAME         = \"{VPC_NETWORK_NAME}\"
VPC_NETWORK_FULL         = \"{VPC_NETWORK_FULL}\"

USE_PUBLIC_ENDPOINTS     = \"{USE_PUBLIC_ENDPOINTS}\"

BUCKET_NAME              = \"{BUCKET_NAME}\"
BUCKET_URI               = \"{BUCKET_URI}\"
"""
print(config)


PREFIX                   = "vvs-vectorio-pubv2"
VERSION                  = "pubv2"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = ""
VPC_NETWORK_FULL         = "projects/934903580331/global/networks/"

USE_PUBLIC_ENDPOINTS     = "True"

BUCKET_NAME              = "vvs-vectorio-pubv2-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-pubv2-hybrid-vertex"



In [15]:
!echo '{config}' | gsutil cp - {BUCKET_URI}/config/notebook_env.py

Copying from <STDIN>...
/ [1 files][    0.0 B/    0.0 B]                                                
Operation completed over 1 objects.                                              


### Initialize Google Cloud SDK Clients

In [16]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aiplatform.init(project=PROJECT_ID, location=REGION)

# bigquery client
bq_client = bigquery.Client(
    project=PROJECT_ID,
    # location=BQ_REGION
)

# Prepare StackOverflow data
You use the [Stack Overflow dataset](https://console.cloud.google.com/marketplace/product/stack-exchange/stack-overflow) of question and answers hosted on BigQuery.

> This public dataset is hosted in Google BigQuery and is included in BigQuery's 1TB/mo of free tier processing. This means that each user receives 1TB of free BigQuery processing every month, which can be used to run queries on this public dataset.

The BigQuery table is too large to fit into memory, so you need to write a generator called `query_bigquery_chunks` to yield chunks of the dataframe for processing. Additionally, an extra column `title_with_body` is added, which is a concatenation of the question title and body.

In [17]:
import math
from typing import Any, Generator

In [18]:
QUERY_TEMPLATE = """
        SELECT DISTINCT q.id, 
           q.title, 
           q.body, 
           q.score, 
           q.tags,
        FROM (
            SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` 
            WHERE score > 0 
            ORDER BY view_count DESC
            ) AS q 
        LIMIT {limit} OFFSET {offset};
        """


def query_bigquery_chunks(
    max_rows: int, 
    rows_per_chunk: int, 
    start_chunk: int = 0
) -> Generator[pd.DataFrame, Any, None]:
    
    for offset in range(start_chunk, max_rows, rows_per_chunk):
        query = QUERY_TEMPLATE.format(limit=rows_per_chunk, offset=offset)
        query_job = bq_client.query(query)
        rows = query_job.result()
        df = rows.to_dataframe()
        df["title_with_body"] = df.title + "\n" + df.body
        df['tags_split_1'] = df['tags'].apply(lambda x: x.split('|', maxsplit=1)[0])
        df['tags_split_2'] = df['tags'].apply(lambda x: x.split('|', maxsplit=1)[0])
        df.drop(columns=["body","tags"], inplace=True)
        yield df

In [19]:
# Get a dataframe of 1000 rows for demonstration purposes
df_test = next(
    query_bigquery_chunks(
        max_rows=500, 
        rows_per_chunk=100,
        start_chunk=0
    )
)

# Examine the data
print(f"df shape: {df_test.shape}")
df_test.head()

df shape: (100, 5)


Unnamed: 0,id,title,score,title_with_body,tags_split
0,54839558,Call JS onsubmit only when form is valid,1,Call JS onsubmit only when form is valid\n<p>I...,javascript
1,54736309,How to use output for one function as paramete...,1,How to use output for one function as paramete...,python
2,54991610,How to look up previous values in an R data fr...,1,How to look up previous values in an R data fr...,r
3,55018619,Adding Firebase Crashlytics either crash in ru...,6,Adding Firebase Crashlytics either crash in ru...,android
4,54813658,Shopify create user activation email,1,Shopify create user activation email\n<p>How c...,c#


## Instantiate the text encoding model

> Use the [Vertex AI Embeddings for Text API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) developed by Google for converting text to embeddings

### Define helper functions

* `encode_texts_to_embeddings()`: convert sentences to embeddings

* `generate_batches()`: splits sentences into batches of 5 before sending to the embedding API

* `encode_text_to_embedding_batched()`: calls `generate_batches()` to handle batching, calls embedding API via `encode_texts_to_embeddings()`, handles rate-limiting using `time.sleep` *(Note: For production use cases, use more sophisticated rate-limiting mechanism that takes retries into account)*

In [20]:
import os 

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [21]:
from typing import List, Optional

# Load the "Vertex AI Embeddings for Text" model
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")


# Define an embedding method that uses the model
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]

In [22]:
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple, Dict

import numpy as np
from tqdm.auto import tqdm


# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], 
    batch_size: int
) -> Generator[List[str], None, None]:
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]


def encode_text_to_embedding_batched(
    sentences: List[str], 
    api_calls_per_second: int = 10, 
    batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:

    embeddings_list: List[List[float]] = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(
                executor.submit(functools.partial(encode_texts_to_embeddings), batch)
            )
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for sentence, embedding in zip(sentences, embeddings_list)
    ]
    embeddings_list_successful = np.squeeze(
        np.stack([embedding for embedding in embeddings_list if embedding is not None])
    )
    return is_successful, embeddings_list_successful

*test encoding function*

In [23]:
# Encode a subset of questions for validation
questions = df_test.title.tolist()[:500]

is_successful, question_embeddings = encode_text_to_embedding_batched(
    sentences=df_test.title.tolist()[:500],
)

# Filter for successfully embedded sentences
questions = np.array(questions)[is_successful]

  0%|          | 0/20 [00:00<?, ?it/s]

In [24]:
print(questions.shape)
questions[0]

(100,)


'Call JS onsubmit only when form is valid'

In [25]:
print(question_embeddings.shape)
# question_embeddings[0]

(100, 768)


In [26]:
# len(is_successful)
is_successful[0]

True

In [28]:
DIMENSIONS = len(question_embeddings[0])

print(DIMENSIONS)

768


In [29]:
import random

question_index = random.randint(0, 99)

print(f"Query question = {questions[question_index]}")

# Get similarity scores for each embedding by using dot-product.
scores = np.dot(question_embeddings[question_index], question_embeddings.T)

# Print top 20 matches
for index, (question, score) in enumerate(
    sorted(zip(questions, scores), key=lambda x: x[1], reverse=True)[:20]
):
    print(f"\t{index}: {question}: {score}")

Query question = How to look up previous values in an R data frame by ID and year?
	0: How to look up previous values in an R data frame by ID and year?: 0.9999994333182649
	1: Needing YTD Data from SQL: 0.6629148880643077
	2: Multiple Regression - Error in model.frame.default variable lengths differ: 0.6301273325358208
	3: How to add Standard error to a nls plot using ggplot2?: 0.6184790667532307
	4: SQL QUERY : multiple records against one column value . need to compare another column value to fetch one record: 0.5868768786140578
	5: overwrite data in google sheet using python: 0.5858167421303139
	6: How to cache subquery result in WITH clause in Spark SQL: 0.5764415522845228
	7: How to hide repetitive data while retrieving data from database: 0.5739066996016109
	8: How to use output for one function as parameter for another: 0.5704656084304427
	9: How to combine Images of different channel?: 0.5640245568680563
	10: How to use threading with selenium for web scraping?: 0.556584139572

In [30]:
import tempfile
from pathlib import Path

# Create temporary file to write embeddings to
embeddings_file_path = Path(tempfile.mkdtemp())

print(f"Embeddings directory: {embeddings_file_path}")

Embeddings directory: /var/tmp/tmp7wy_c7yy


### formatting vectors for Vertex Vector Search

**embedding_vector**
* Encode the file using UTF-8.
* Make each line a valid `.json` object to be interpreted as a record.
* Include in each record a field named `id` that requires a valid UTF-8 string that is the ID of the vector.
* Include in each record a field named embedding that requires an array of numbers. This is the feature vector.

Filtering with **String Namespaces**

The value of the field `restricts`, if present, should be an array of objects, each is turned into a TokenNamespace in restricts.

For each vector's record, add a field called restricts, to contain an array of objects, each of which is a namespace.

* Each object must have a field named namespace. This field is the TokenNamespace.namespace, namespace.
* The value of the field allow, if present, is an array of strings. This array of strings is the TokenNamespace.string_tokens list.
* The value of the field deny, if present, is an array of strings. This array of strings is the TokenNamespace.string_denylist_tokens list.

```
{
    "id": "42", 
    "embedding": [0.5, 1.0], 
    "restricts": [
        {"namespace": "class","allow": ["cat", "pet"]},
        {"namespace": "category", "allow": ["feline"]}
    ]
}
{
    "id": "43", 
    "embedding": [0.6, 1.0], 
    "restricts": [
        {"namespace": "class", "allow": ["dog", "pet"]},
        {"namespace": "category", "allow": ["canine"]}
    ]
}
```

Filtering with **Numeric namespaces**
For each vector's record, add a field called `numeric_restricts`, to contain an array of objects, each of which is a numeric restrict.

* Each object must have a field named namespace. This field is the NumericRestrictNamespace.namespace, namespace.
* Each object must have one of `value_int`, `value_float`, and `value_double`.
* Each object must not have a field named op. This field is only for query.

```
{
    "id": "42", 
    "embedding": [0.5, 1.0], 
    "numeric_restricts": [
        {"namespace": "size", "value_int": 3},
        {"namespace": "ratio", "value_float": 0.1}
    ]
}
{
    "id": "43", 
    "embedding": [0.6, 1.0], 
    "numeric_restricts": [
        {"namespace":"weight", "value_double": 0.3}
    ]
}
```

**crowding tag**

The value of the field `crowding_tag`, if present, should be a string

```
{
    "id": "43", 
    "embedding": [0.6, 1.0], 
    "numeric_restricts": [
        {"namespace":"weight", "value_double": 0.3}
    ],
    "crowding_tag": "shoes"
}
```

In [31]:
# print(len(question_embeddings))
# question_embeddings[0]

# df_test.head()

In [32]:
import gc
import json

BQ_NUM_ROWS = 100
BQ_CHUNK_SIZE = 100
BQ_NUM_CHUNKS = math.ceil(BQ_NUM_ROWS / BQ_CHUNK_SIZE)

START_CHUNK = 0

# Create a rate limit of 300 requests per minute. Adjust this depending on your quota.
API_CALLS_PER_SECOND = 300 / 60
# According to the docs, each request can process 5 instances per request
ITEMS_PER_REQUEST = 5

print(f"BQ_NUM_ROWS          : {BQ_NUM_ROWS}")
print(f"BQ_CHUNK_SIZE        : {BQ_CHUNK_SIZE}")
print(f"BQ_NUM_CHUNKS        : {BQ_NUM_CHUNKS}")
print(f"API_CALLS_PER_SECOND : {API_CALLS_PER_SECOND}")
print(f"ITEMS_PER_REQUEST    : {ITEMS_PER_REQUEST}")


# Loop through each generated dataframe, convert
for i, df in tqdm(
    enumerate(
        query_bigquery_chunks(
            max_rows=BQ_NUM_ROWS, 
            rows_per_chunk=BQ_CHUNK_SIZE, 
            start_chunk=START_CHUNK
        )
    ),
    total=BQ_NUM_CHUNKS - START_CHUNK,
    position=-1,
    desc="Chunk of rows from BigQuery",
):
    print(f"Starting: {i} of {BQ_NUM_CHUNKS} loops")
    
    # Create a unique output file for each chunk
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{i+START_CHUNK}.json"
    )
    with open(chunk_path, "a") as f:
        id_chunk = df.id
        scores_chunk = df.score
        tags_restrict = df.tags_split_1
        tags_crowd = df.tags_split_2

        # Convert batch to embeddings
        is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
            # sentences=df.title_with_body,
            sentences=df.title_with_body.tolist(), #[:500]
            api_calls_per_second=API_CALLS_PER_SECOND,
            batch_size=ITEMS_PER_REQUEST,
        )

        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [
                        str(value) for value in embedding
                    ],
                    "restricts": [
                        {"namespace": "tags", "allow": [str(tag_r)]}
                    ],
                    "numeric_restricts": [
                        {"namespace": "score", "value_int": int(score)}
                    ],
                    "crowding_tag": str(tag_crowd)
                }
            )
            + "\n"
            # for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
            for id, embedding, tag_r, score, tag_crowd in zip(
                id_chunk[is_successful], 
                question_chunk_embeddings, 
                tags_restrict, 
                scores_chunk, 
                tags_crowd
            )
        ]
        f.writelines(embeddings_formatted)

        # Delete the DataFrame and any other large data structures
        del df
        gc.collect()

BQ_NUM_ROWS          : 100
BQ_CHUNK_SIZE        : 100
BQ_NUM_CHUNKS        : 1
API_CALLS_PER_SECOND : 5.0
ITEMS_PER_REQUEST    : 5


Chunk of rows from BigQuery:   0%|          | 0/1 [00:00<?, ?it/s]

Starting: 0 of 1 loops


  0%|          | 0/20 [00:00<?, ?it/s]

#### example output

a single `embeddings_formatted` entry should resemble:

```
{
    "id": "67574726", 
    "embedding": [
        "0.875, ..., 1.000"
    ], 
    "restricts": [
        {"namespace": "tags", "allow": ["c#"]}
    ], 
    "numeric_restricts": [
        {"namespace": "score", "value_int": 1}
    ],
    "crowding_tag": "c#"
}
```

In [33]:
print(len(embeddings_formatted))
print(len(embeddings_formatted[0]))
# embeddings_formatted[0]

100
18734


In [34]:
REMOTE_GCS_FOLDER = f"{BUCKET_URI}/{PREFIX}/embedding_indexes/{embeddings_file_path.stem}/"
print(f"REMOTE_GCS_FOLDER: {REMOTE_GCS_FOLDER}")

REMOTE_GCS_FOLDER: gs://vvs-vectorio-pubv2-hybrid-vertex/vvs-vectorio-pubv2/embedding_indexes/tmp7wy_c7yy/


In [35]:
! gsutil -m cp -r {embeddings_file_path}/* {REMOTE_GCS_FOLDER}

Copying file:///var/tmp/tmp7wy_c7yy/tmp7wy_c7yy_0.json [Content-Type=application/json]...
/ [1/1 files][  1.8 MiB/  1.8 MiB] 100% Done                                    
Operation completed over 1 objects/1.8 MiB.                                      


In [36]:
! gsutil ls $REMOTE_GCS_FOLDER

gs://vvs-vectorio-pubv2-hybrid-vertex/vvs-vectorio-pubv2/embedding_indexes/tmp7wy_c7yy/tmp7wy_c7yy_0.json


# Create Vertex Vector Search index and endpoint

## Create VS Index

In [37]:
CREATE_NEW_VS_INDEX = True

# !gcloud ai indexes list \
#   --project=$PROJECT_ID \
#   --region=$REGION

In [38]:
# if using exsiting index
if not CREATE_NEW_VS_INDEX:
    EXISTING_INDEX_ID = "..TODO.."
    EXISTING_INDEX_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexes/{EXISTING_INDEX_ID}'
    print(f"EXISTING_INDEX_NAME  : {EXISTING_INDEX_NAME}")

In [39]:
# ANN index config
APPROX_NEIGHBORS           = 150
DISTANCE_MEASURE           = "DOT_PRODUCT_DISTANCE"
LEAF_NODE_EMB_COUNT        = 500
LEAF_NODES_SEARCH_PERCENT  = 80
# DIMENSIONS                 = 768
INDEX_UPDATE_METHOD        = "STREAM_UPDATE"

DISPLAY_NAME = f"soverflow_{PREFIX}".replace("-","_")
DESCRIPTION = "sample index for vectorio demo"

print(f"APPROX_NEIGHBORS          : {APPROX_NEIGHBORS}")
print(f"DISTANCE_MEASURE          : {DISTANCE_MEASURE}")
print(f"LEAF_NODE_EMB_COUNT       : {LEAF_NODE_EMB_COUNT}")
print(f"LEAF_NODES_SEARCH_PERCENT : {LEAF_NODES_SEARCH_PERCENT}")
print(f"DIMENSIONS                : {DIMENSIONS}")
print(f"INDEX_UPDATE_METHOD       : {INDEX_UPDATE_METHOD}")
print(f"DISPLAY_NAME              : {DISPLAY_NAME}")
print(f"DESCRIPTION               : {DESCRIPTION}")

APPROX_NEIGHBORS          : 150
DISTANCE_MEASURE          : DOT_PRODUCT_DISTANCE
LEAF_NODE_EMB_COUNT       : 500
LEAF_NODES_SEARCH_PERCENT : 80
DIMENSIONS                : 768
INDEX_UPDATE_METHOD       : 2
DISPLAY_NAME              : soverflow_vvs_vectorio_pubv2
DESCRIPTION               : sample index for vectorio demo


In [50]:
if CREATE_NEW_VS_INDEX:
    
    print(f"Creating new index: {DISPLAY_NAME} ...")
    start = time.time()
    
    tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=DISPLAY_NAME,
        contents_delta_uri=REMOTE_GCS_FOLDER,
        dimensions=DIMENSIONS,
        approximate_neighbors_count=APPROX_NEIGHBORS,
        distance_measure_type=DISTANCE_MEASURE,
        leaf_node_embedding_count=LEAF_NODE_EMB_COUNT,
        leaf_nodes_to_search_percent=LEAF_NODES_SEARCH_PERCENT,
        description=DESCRIPTION,
        index_update_method=INDEX_UPDATE_METHOD,
        sync=True,
        labels={
            "prefix": f'{PREFIX}',
        },
    )
    end = time.time()
    print(f"elapsed time: {end - start}")
else:
    tree_ah_index = aiplatform.MatchingEngineIndex(EXISTING_INDEX_NAME)
    
INDEX_RESOURCE_NAME = tree_ah_index.resource_name

print(f"DISPLAY_NAME        : {tree_ah_index.display_name}\n")
print(f"INDEX_RESOURCE_NAME : {INDEX_RESOURCE_NAME}")

Creating new index: soverflow_vvs_vectorio_pubv2 ...
elapsed time: 18.084052801132202
DISPLAY_NAME        : soverflow_vvs_vectorio_pubv2
check display_name  : soverflow_vvs_vectorio_pubv2

INDEX_RESOURCE_NAME : projects/934903580331/locations/us-central1/indexes/7264767993832275968


In [51]:
tree_ah_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/7264767993832275968',
 'displayName': 'soverflow_vvs_vectorio_pubv2',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'etag': 'AMEw9yM-qUn2SNdvuQMsHCRK5yySxjExQqVz3XgPmhye_4Bu8GZjw7O1Mj6DZgTFXoc=',
 'labels': {'prefix': 'vvs-vectorio-pubv2'},
 'createTime': '2024-01-30T03:36:56.951497Z',
 'updateTime': '2024-01-30T03:37:10.208162Z',
 'indexStats': {'vectorsCount': '100', 'shardsCount': 1},
 'indexUpdateMethod': 'STREAM_UPDATE',
 'encryptionSpec': {}}

*list all indexes:*

In [None]:
list_of_indices = tree_ah_index.list()
list_of_indices[0].resource_name

'projects/934903580331/locations/us-central1/indexes/7264767993832275968'

*Using the resource name, you can retrieve an existing MatchingEngineIndex:*

In [53]:
# tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

#### Troubleshooting

If index creation fails, grab `OPERATION_ID` and `FAILED_INDEX_ID` from the operation resource name in the error message, for example:

> `Please check the details in the metadata of operation: projects/934903580331/locations/us-central1/indexes/FAILED_INDEX_ID/operations/OPERATION_ID.`

Then, use `gcloud ai operations describe` to get the error details:

```
!gcloud ai operations describe $OPERATION_ID \
    --index=$FAILED_INDEX_ID \
    --project=$PROJECT_ID \
    --region=$REGION
```

In [54]:
# OPERATION_ID    = "2710742495469240320"
# FAILED_INDEX_ID = "4846053518957608960"

# !gcloud ai operations describe $OPERATION_ID \
#     --index=$FAILED_INDEX_ID \
#     --project=$PROJECT_ID \
#     --region=$REGION

## Create VS Index Endpoint

In [55]:
CREATE_NEW_VS_INDEX_ENDPOINT = True

# !gcloud ai index-endpoints list \
#   --project=$PROJECT_ID \
#   --region=$REGION

In [56]:
# if using exsiting index endpoint
if not CREATE_NEW_VS_INDEX_ENDPOINT:
    EXISTING_ENDPOINT_ID = "..TODO.."
    EXISTING_ENDPOINT_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexEndpoints/{EXISTING_ENDPOINT_ID}'
    print(f"EXISTING_ENDPOINT_NAME  : {EXISTING_ENDPOINT_NAME}")

In [57]:
ENDPOINT_DISPLAY_NAME = f'{DISPLAY_NAME}_endpoint'
ENDPOINT_DESCRIPTION  = "index endpoint for vectorio demo"

# USE_PUBLIC_ENDPOINTS  = True

print(f"ENDPOINT_DISPLAY_NAME : {ENDPOINT_DISPLAY_NAME}")
print(f"ENDPOINT_DESCRIPTION  : {ENDPOINT_DESCRIPTION}")
print(f"USE_PUBLIC_ENDPOINTS  : {USE_PUBLIC_ENDPOINTS}")

ENDPOINT_DISPLAY_NAME : soverflow_vvs_vectorio_pubv2_endpoint
ENDPOINT_DESCRIPTION  : index endpoint for vectorio demo
USE_PUBLIC_ENDPOINTS  : True


In [58]:
if CREATE_NEW_VS_INDEX_ENDPOINT:
    
    print(f"Creating new index endpoint: {ENDPOINT_DISPLAY_NAME} ...")
    start = time.time()
    
    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=ENDPOINT_DISPLAY_NAME,
        description=ENDPOINT_DESCRIPTION,
        network=VPC_NETWORK_FULL if not USE_PUBLIC_ENDPOINTS else None,
        public_endpoint_enabled=USE_PUBLIC_ENDPOINTS,
        sync=True,
        labels={
            "prefix": f'{PREFIX}',
        },
    )
    end = time.time()
    print(f"elapsed time: {end - start}")
else:
    my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(EXISTING_ENDPOINT_NAME)
    
ENDPOINT_RESOURCE_NAME = my_index_endpoint.resource_name

print(f"ENDPOINT_DISPLAY_NAME  : {ENDPOINT_DISPLAY_NAME}")
print(f"check display_name     : {my_index_endpoint.display_name}\n")
print(f"VPC_NETWORK_FULL       : {VPC_NETWORK_FULL}")
print(f"ENDPOINT_RESOURCE_NAME : {ENDPOINT_RESOURCE_NAME}")

Creating new index endpoint: soverflow_vvs_vectorio_pubv2_endpoint ...
elapsed time: 1.221855878829956
ENDPOINT_DISPLAY_NAME  : soverflow_vvs_vectorio_pubv2_endpoint
check display_name     : soverflow_vvs_vectorio_pubv2_endpoint

VPC_NETWORK_FULL       : projects/934903580331/global/networks/
ENDPOINT_RESOURCE_NAME : projects/934903580331/locations/us-central1/indexEndpoints/5915095480504680448


## Deploy Indexes

* [src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/matching_engine/matching_engine_index_endpoint.py#L822)

In [60]:
DEPLOY_NEW_VS_INDEX = True

In [61]:
# if using exsiting deployed index
if not DEPLOY_NEW_VS_INDEX:
    EXISTING_DEPLOYED_INDEX_ID = "..TODO.."
    EXISTING_DEPLOYED_INDEX_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexEndpoints/{EXISTING_DEPLOYED_INDEX_ID}'
    print(f"EXISTING_DEPLOYED_INDEX_NAME  : {EXISTING_DEPLOYED_INDEX_NAME}")

In [62]:
import time 

TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
DEPLOYED_INDEX_ID = f'deployed_{TIMESTAMP}'.replace("-","_")
# MACHINE_TYPE = "XXXX"
MIN_REPLICAS = 1
MAX_REPLICAS = 1

print(f"DEPLOYED_INDEX_ID : {DEPLOYED_INDEX_ID}")
print(f"MIN_REPLICAS      : {MIN_REPLICAS}")
print(f"MAX_REPLICAS      : {MAX_REPLICAS}")

DEPLOYED_INDEX_ID : deployed_20240130_034024
MIN_REPLICAS      : 1
MAX_REPLICAS      : 1


In [63]:
if DEPLOY_NEW_VS_INDEX:
    
    print(f"Deploying {DISPLAY_NAME} to endpoint: {ENDPOINT_DISPLAY_NAME} ...")
    start = time.time()
    
    deployed_index = my_index_endpoint.deploy_index(
        index=tree_ah_index, 
        deployed_index_id=DEPLOYED_INDEX_ID,
        min_replica_count=MIN_REPLICAS,
        max_replica_count=MAX_REPLICAS,
        
    )
    end = time.time()
    print(f"elapsed time: {end - start}")
    
else:
    deployed_index = aiplatform.MatchingEngineIndexEndpoint(EXISTING_DEPLOYED_INDEX_NAME)

DEPLOYED_INDEX_RESOURCE_NAME = deployed_index.resource_name
DEPLOYED_INDEX_DISPLAY_NAME  = deployed_index.display_name

print(f"DEPLOYED_INDEX_RESOURCE_NAME : {DEPLOYED_INDEX_RESOURCE_NAME}")
print(f"DEPLOYED_INDEX_DISPLAY_NAME  : {DEPLOYED_INDEX_DISPLAY_NAME}")

Deploying soverflow_vvs_vectorio_pubv2 to endpoint: soverflow_vvs_vectorio_pubv2_endpoint ...
elapsed time: 1547.031239271164
DEPLOYED_INDEX_RESOURCE_NAME : projects/934903580331/locations/us-central1/indexEndpoints/5915095480504680448
DEPLOYED_INDEX_DISPLAY_NAME  : soverflow_vvs_vectorio_pubv2_endpoint


In [64]:
print(f"Deployed indexes on the index endpoint:")
for d in my_index_endpoint.deployed_indexes:
    print(f"    {d.id}")

Deployed indexes on the index endpoint:
    deployed_20240130_034024


In [65]:
DEPLOYED_INDEX_ID_TEST = deployed_index.deployed_indexes[0].id
print(f"DEPLOYED_INDEX_ID_TEST : {DEPLOYED_INDEX_ID_TEST}")

DEPLOYED_INDEX_ID_TEST : deployed_20240130_034024


### Confirm matches

In [66]:
number_of_vectors = sum(
    aiplatform.MatchingEngineIndex(
        deployed_index.index
    )._gca_resource.index_stats.vectors_count
    for deployed_index in my_index_endpoint.deployed_indexes
)

print(f"Expected: {BQ_NUM_ROWS}, Actual: {number_of_vectors}")

Expected: 100, Actual: 100


## Create online queries

After you build your indexes, you may query against the deployed index to find nearest neighbors.

Note: For the `DOT_PRODUCT_DISTANCE` distance type, the "distance" property returned with each MatchNeighbor actually refers to the similarity.

In [67]:
test_embeddings = encode_texts_to_embeddings(sentences=["Install GPU for Tensorflow"])

In [68]:
# Test query
NUM_NEIGHBOURS = 10

start = time.time()

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
    return_full_datapoint=True,
)

elapsed_ann_time = time.time() - start
elapsed_ann_time = round(elapsed_ann_time, 4)
print(f'ANN latency: {elapsed_ann_time} seconds')

ANN latency: 0.1461 seconds


In [69]:
response

[[MatchNeighbor(id='62031683', distance=0.697461724281311),
  MatchNeighbor(id='68085958', distance=0.6485500335693359),
  MatchNeighbor(id='73139204', distance=0.6429802179336548),
  MatchNeighbor(id='68295202', distance=0.6405211687088013),
  MatchNeighbor(id='68127625', distance=0.6347805857658386),
  MatchNeighbor(id='61620454', distance=0.6322963237762451),
  MatchNeighbor(id='70713265', distance=0.6302939057350159),
  MatchNeighbor(id='68352333', distance=0.6260936260223389),
  MatchNeighbor(id='65599903', distance=0.6254304647445679),
  MatchNeighbor(id='73178654', distance=0.6228378415107727)]]

In [70]:
# Test query
NUM_NEIGHBOURS = 10

start = time.time()

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
    return_full_datapoint=False,
)

elapsed_ann_time = time.time() - start
elapsed_ann_time = round(elapsed_ann_time, 4)
print(f'ANN latency: {elapsed_ann_time} seconds')

ANN latency: 0.0557 seconds


In [71]:
response

[[MatchNeighbor(id='62031683', distance=0.697461724281311),
  MatchNeighbor(id='68085958', distance=0.6485500335693359),
  MatchNeighbor(id='73139204', distance=0.6429802179336548),
  MatchNeighbor(id='68295202', distance=0.6405211687088013),
  MatchNeighbor(id='68127625', distance=0.6347805857658386),
  MatchNeighbor(id='61620454', distance=0.6322963237762451),
  MatchNeighbor(id='70713265', distance=0.6302939057350159),
  MatchNeighbor(id='68352333', distance=0.6260936260223389),
  MatchNeighbor(id='65599903', distance=0.6254304647445679),
  MatchNeighbor(id='73178654', distance=0.6228378415107727)]]

In [72]:
for match_index, neighbor in enumerate(response[0]):
    print(f"https://stackoverflow.com/questions/{neighbor.id}")

https://stackoverflow.com/questions/62031683
https://stackoverflow.com/questions/68085958
https://stackoverflow.com/questions/73139204
https://stackoverflow.com/questions/68295202
https://stackoverflow.com/questions/68127625
https://stackoverflow.com/questions/61620454
https://stackoverflow.com/questions/70713265
https://stackoverflow.com/questions/68352333
https://stackoverflow.com/questions/65599903
https://stackoverflow.com/questions/73178654


### Retrieving metadata

In [68]:
# read_response = my_index_endpoint.read_index_datapoints(
#     deployed_index_id=DEPLOYED_INDEX_ID, 
#     ids= ["43070568", "54122858"],
# )
# # read_response
# print(f"crowding_tag       : {read_response[0].crowding_tag}")
# print(f"crowding_attribute : {read_response[0].crowding_tag.crowding_attribute}")
# print(f"datapoint_id       : {read_response[0].datapoint_id}")
# print(f"restricts          : {read_response[0].restricts}")
# print(f"allow_list         : {read_response[0].restricts[0].allow_list}")
# print(f"deny_list          : {read_response[0].restricts[0].deny_list}")
# print(f"namespace          : {read_response[0].restricts[0].namespace}")

# Clean-up

> TODO

In [None]:
# import os

# delete_bucket = False

# # Force undeployment of indexes and delete endpoint
# my_index_endpoint.delete(force=True)

# # Delete indexes
# tree_ah_index.delete()

# if delete_bucket or os.getenv("IS_TESTING"):
#     ! gsutil rm -rf {BUCKET_URI}