# Testing export class for Vertex AI Vector Search

In [1]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/jupyter/vector-io/notebooks


In [2]:
import os

# The notebook lives one level below the repository root; the `src.*` imports
# used later only resolve when the working directory is the repository root.
root_path = '..'

# Only move up when the `src` package is not already reachable from the
# current directory. This keeps the cell idempotent: re-running it no longer
# climbs one more directory level each time.
if not os.path.isdir('src'):
    os.chdir(root_path)

os.getcwd()

'/home/jupyter/vector-io'

In [55]:
# Shared naming convention: every cloud resource name is derived from PREFIX.
VERSION = "pubv3"  # TODO
PREFIX = 'vvs-vectorio-{}'.format(VERSION)  # TODO

print("PREFIX = {}".format(PREFIX))

PREFIX = vvs-vectorio-pubv3


## Load notebook config

> If these variables were already defined in the quickstart notebook, load them with the cell below.

In [56]:
# staging GCS
# Resolve the active gcloud project via a shell escape; the captured output is
# a list of lines and the first line is taken as the project id.
# NOTE(review): if gcloud prints nothing, GCP_PROJECTS[0] raises IndexError.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
# The bucket name is derived from the naming prefix and the project id.
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Download the saved notebook environment file, echo it, then execute it so
# its assignments become notebook variables.
# NOTE(review): exec() runs whatever the bucket object contains — only use
# this with a bucket you control. The command output is not checked, so a
# failed `gsutil cat` would presumably surface here as a confusing error from
# exec() rather than a clear download failure — confirm.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PREFIX                   = "vvs-vectorio-pubv3"
VERSION                  = "pubv3"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = ""
VPC_NETWORK_FULL         = "None"

USE_PUBLIC_ENDPOINTS     = "True"

BUCKET_NAME              = "vvs-vectorio-pubv3-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-pubv3-hybrid-vertex"

REMOTE_GCS_FOLDER        = "gs://vvs-vectorio-pubv3-hybrid-vertex/vvs-vectorio-pubv3/embedding_indexes/tmpig2laxe1/"

SO_PARQUET_GCS_DIR       = "gs://vvs-vectorio-pubv3-hybrid-vertex/emb_vector_parquet/so_0_1000_200/tmpnb32rkne/"

LOCAL_TEST_DIR           = "data/stack_overflow_parquet"
LOCAL_TEST_DATA_DIR      = "data/stack_overflow_parquet/files"

DIMENSIONS               = "768"

INDEX_DISPLAY_NAME       = "soverflow_vvs_vectorio_pubv3"

## Imports

In [57]:
import pandas as pd
import numpy as np
import itertools
import time 
import json
import uuid

from pprint import pprint

from google.cloud import aiplatform as aip
from google.cloud import storage
from google.cloud import bigquery

# Mute log records at WARNING severity and below for this session.
import logging
logging.disable(level=logging.WARNING)

# Mute Python warnings as well.
import warnings
warnings.filterwarnings(action="ignore")

# Report which client library versions this run used.
sdk_version_lines = (
    f'BigQuery SDK version      : {bigquery.__version__}',
    f'Vertex AI SDK version     : {aip.__version__}',
    f'Cloud Storage SDK version : {storage.__version__}',
)
print("\n".join(sdk_version_lines))

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [58]:
# Cloud Storage client bound to the notebook's project.
storage_client = storage.Client(
    project=PROJECT_ID,
)

# Initialise the Vertex AI SDK defaults (project and region) for later calls.
aip.init(
    project=PROJECT_ID,
    location=REGION,
)

### Vertex AI Vector Search Export class

In [59]:
import sys
import os

# sys.path.append("..")
from src.export_vdf.vertexai_vector_search_export import ExportVertexAIVectorSearch
from src.names import DBNames

# Export from existing Vector Search Index

In [61]:
# `INDEX_RESOURCE_NAME` is not assigned anywhere in this notebook, so on a
# fresh kernel this cell used to fail with NameError. When it is missing,
# resolve it from the index display name loaded with the notebook config.
if "INDEX_RESOURCE_NAME" not in globals():
    matching_indexes = [
        candidate
        for candidate in aip.MatchingEngineIndex.list()
        if candidate.display_name == INDEX_DISPLAY_NAME
    ]
    if not matching_indexes:
        raise ValueError(
            f"No Vector Search index found with display name {INDEX_DISPLAY_NAME!r}"
        )
    INDEX_RESOURCE_NAME = matching_indexes[0].resource_name

# Load the index to export and show its full description.
target_vs_index = aip.MatchingEngineIndex(INDEX_RESOURCE_NAME)
target_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/7264767993832275968',
 'displayName': 'soverflow_vvs_vectorio_pubv2',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'deployedIndexes': [{'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/5915095480504680448',
   'deployedIndexId': 'deployed_20240130_034024'}],
 'etag': 'AMEw9yP5YxsiQbcvN9Pwh5Orljpl3PazOp8gePAl9ETVPBCmiaPF2kIvz0I7b4kgwok=',
 'labels': {'prefix': 'vvs-vectorio-pubv2'},
 'createTime': '2024-01-30T03:36:56.951497Z',
 'updateTime': '2024-01-30T03:37:10.208162Z',
 'indexStats': {'ve

In [62]:
# Embedding dimensionality and the VDF library version recorded in the export.
DIMENSIONS, VDF_VERSION = 768, "v1"

# Local-time timestamp for this run, formatted as YYYYMMDD-HHMMSS.
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S", time.localtime())

In [63]:
# Arguments handed to the exporter; key order matches the original literal.
my_export_args = dict(
    project_id=PROJECT_ID,
    location=REGION,
    # index=TARGET_INDEX_NAME,
    index=INDEX_DISPLAY_NAME,
    library_version=VDF_VERSION,
    dir=".",
    model_name="textembedding-gecko@001",
)
my_export_args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'soverflow_vvs_vectorio_pubv2',
 'library_version': 'v1',
 'dir': '.',
 'model_name': 'textembedding-gecko@001'}

In [64]:
# Build the Vertex AI Vector Search exporter from the argument dict above.
export_vvs = ExportVertexAIVectorSearch(args=my_export_args)

export_vvs

<src.export_vdf.vertexai_vector_search_export.ExportVertexAIVectorSearch at 0x7ff7b2b9b400>

In [65]:
# Display the arguments stored on the exporter instance.
export_vvs.args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'soverflow_vvs_vectorio_pubv2',
 'library_version': 'v1',
 'dir': '.',
 'model_name': 'textembedding-gecko@001'}

In [66]:
# Path of the exporter's VDF output directory, relative to the working directory.
VDF_EXPORT_DIR_PATH = "./{}".format(export_vvs.vdf_directory)
print("VDF_EXPORT_DIR_PATH: {}".format(VDF_EXPORT_DIR_PATH))

VDF_EXPORT_DIR_PATH: ./vdf_20240130_211022_9437b


In [67]:
# Run the export; the cell displays whatever get_data() returns.
export_vvs.get_data()

Fetching indexes:   0%|          | 0/1 [00:00<?, ?it/s]
Exporting soverflow_vvs_vectorio_pubv2:   0%|          | 0/100 [00:00<?, ?it/s][A
Exporting soverflow_vvs_vectorio_pubv2: 100%|██████████| 100/100 [00:00<00:00, 372.98it/s][A
Fetching indexes: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]

{
    "version": "v1",
    "file_structure": [
        "vdf_20240130_211022_9437b/soverflow_vvs_vectorio_pubv2/1.parquet",
        "vdf_20240130_211022_9437b/VDF_META.json"
    ],
    "author": "jupyter",
    "exported_from": "vertexai_vectorsearch",
    "indexes": {
        "projects/934903580331/locations/us-central1/indexes/7264767993832275968": [
            {
                "index_name": "soverflow_vvs_vectorio_pubv2",
                "namespace": "namespace",
                "total_vector_count": 100,
                "exported_vector_count": 100,
                "metric": "Dot",
                "dimensions": 768,
                "model_name": "textembedding-gecko@001",
                "vector_columns": [
                    "vector"
                ],
                "data_path": "vdf_20240130_211022_9437b/soverflow_vvs_vectorio_pubv2"
            }
        ]
    }
}





True

## Check VDF Metadata json

In [68]:
# Parse the metadata file written into the export directory.
vdf_meta_path = f"{VDF_EXPORT_DIR_PATH}/VDF_META.json"
with open(vdf_meta_path) as vdf_meta_file:
    d = json.load(vdf_meta_file)

In [69]:
# Display the parsed VDF metadata dictionary.
d

{'version': 'v1',
 'file_structure': ['vdf_20240130_211022_9437b/soverflow_vvs_vectorio_pubv2/1.parquet',
  'vdf_20240130_211022_9437b/VDF_META.json'],
 'author': 'jupyter',
 'exported_from': 'vertexai_vectorsearch',
 'indexes': {'projects/934903580331/locations/us-central1/indexes/7264767993832275968': [{'index_name': 'soverflow_vvs_vectorio_pubv2',
    'namespace': 'namespace',
    'total_vector_count': 100,
    'exported_vector_count': 100,
    'metric': 'Dot',
    'dimensions': 768,
    'model_name': 'textembedding-gecko@001',
    'vector_columns': ['vector'],
    'data_path': 'vdf_20240130_211022_9437b/soverflow_vvs_vectorio_pubv2'}]}}

In [70]:
# First parquet shard written for the exported index.
file_path = f"{VDF_EXPORT_DIR_PATH}/{INDEX_DISPLAY_NAME}/1.parquet"

# Load the shard and preview a single row.
test_parquet_df = pd.read_parquet(path=file_path)
test_parquet_df.head(n=1)

Unnamed: 0,id,vector,tags
0,52922128,"[-0.020482279360294342, -0.05063493549823761, ...",[jenkins]


# Validate imported records

In [75]:
# `ENDPOINT_RESOURCE_NAME` is not assigned anywhere in this notebook, so on a
# fresh kernel this cell used to fail with NameError. When it is missing, take
# the endpoint from the first deployment recorded on the exported index.
if "ENDPOINT_RESOURCE_NAME" not in globals():
    index_deployments = target_vs_index.deployed_indexes
    if not index_deployments:
        raise ValueError(
            f"Index {target_vs_index.resource_name} is not deployed to any endpoint"
        )
    ENDPOINT_RESOURCE_NAME = index_deployments[0].index_endpoint

# Load the endpoint that serves the index and display it.
my_index_endpoint = aip.MatchingEngineIndexEndpoint(ENDPOINT_RESOURCE_NAME)
my_index_endpoint

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7ff7b2ba8c70> 
resource name: projects/934903580331/locations/us-central1/indexEndpoints/5915095480504680448

In [76]:
# Use the first two exported datapoint ids as the validation sample.
# To spot-check specific records instead, e.g.:
#   ids_to_check = ['2102980']
#   ids_to_check = ['36062183']
exported_id_column = test_parquet_df['id']
ids_to_check = exported_id_column.iloc[:2].tolist()
ids_to_check

['52922128', '52933161']

In [79]:
# `DEPLOYED_INDEX_ID` is not assigned anywhere in this notebook, so on a fresh
# kernel this cell used to fail with NameError. When it is missing, pick the
# deployment of the exported index that lives on the endpoint being queried.
if "DEPLOYED_INDEX_ID" not in globals():
    deployments_on_endpoint = [
        deployment
        for deployment in target_vs_index.deployed_indexes
        if deployment.index_endpoint == my_index_endpoint.resource_name
    ]
    if not deployments_on_endpoint:
        raise ValueError(
            f"Index {target_vs_index.resource_name} is not deployed on "
            f"{my_index_endpoint.resource_name}"
        )
    DEPLOYED_INDEX_ID = deployments_on_endpoint[0].deployed_index_id

# Read the sampled datapoints back from the deployed index; the displayed
# count should equal the number of ids requested.
read_response = my_index_endpoint.read_index_datapoints(
    deployed_index_id=DEPLOYED_INDEX_ID,
    ids=ids_to_check,
)
len(read_response)

2