# Testing export class for Vertex AI Vector Search

In [1]:
# Print the kernel's starting working directory (the recorded run started in the repo's notebooks/ folder).
!pwd

/home/jupyter/vector-io/notebooks


In [2]:
import os

root_path = ".."

# Remember where the kernel started so that re-running this cell is idempotent:
# a bare `os.chdir("..")` climbs one more directory level on every execution.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Move one level above the starting directory so that later relative paths
# resolve from there; on the first execution this is identical to os.chdir("..").
os.chdir(os.path.join(NOTEBOOK_START_DIR, root_path))
os.getcwd()

'/home/jupyter/vector-io'

In [3]:
# naming convention for all cloud resources
# PREFIX is derived from VERSION, so changing VERSION renames every resource
# that is built from PREFIX further down.
VERSION = "pubv3"  # TODO
PREFIX = f"vvs-vectorio-{VERSION}"  # TODO

# Echo the resulting prefix.
print(f"PREFIX = {PREFIX}")

PREFIX = vvs-vectorio-pubv3


## Load notebook config

> If these were already defined in the quickstart notebook, load the variables with the cell below

In [4]:
# staging GCS
# Ask gcloud for the active project; `!cmd` captures the command's output lines,
# and the first captured line is used as the project id.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]

# GCS bucket and paths
# The bucket name is built from the naming prefix and the project id.
BUCKET_NAME = f"{PREFIX}-{PROJECT_ID}"
BUCKET_URI = f"gs://{BUCKET_NAME}"

# Fetch the saved notebook environment file from the bucket, print it, then
# execute it so that its assignments become notebook variables.
# NOTE(review): `exec` runs whatever the bucket object contains inside this
# kernel — only point this at a bucket you control.
# NOTE(review): `.n` is presumably IPython's newline-joined view of the captured
# lines — confirm. If the gsutil call fails, its error text would presumably be
# passed to `exec` instead of the config.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PREFIX                   = "vvs-vectorio-pubv3"
VERSION                  = "pubv3"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = ""
VPC_NETWORK_FULL         = "None"

USE_PUBLIC_ENDPOINTS     = "True"

BUCKET_NAME              = "vvs-vectorio-pubv3-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-pubv3-hybrid-vertex"

REMOTE_GCS_FOLDER        = "gs://vvs-vectorio-pubv3-hybrid-vertex/vvs-vectorio-pubv3/embedding_indexes/tmpm4k5k6gq/"

SO_PARQUET_GCS_DIR       = "gs://vvs-vectorio-pubv3-hybrid-vertex/emb_vector_parquet/so_2000_5000_1000/tmpsgm4txp8/"

LOCAL_TEST_DIR           = "data/stack_overflow_parquet_pubv3"
LOCAL_TEST_DATA_DIR      = "data/stack_overflow_parquet_pubv3/files"

DIMENSIONS               = "768"

INDEX_DISPLAY_NAME       = "soverflow_vvs

## Imports

In [5]:
# --- standard library ---
import json
import logging
import time
import warnings

# --- third-party ---
import pandas as pd

# --- Google Cloud SDKs ---
from google.cloud import aiplatform as aip
from google.cloud import bigquery
from google.cloud import storage

# Keep the notebook output readable: mute log records at WARNING level and
# below, and hide Python warnings.
logging.disable(logging.WARNING)
warnings.filterwarnings("ignore")

# Report the SDK versions this run used.
print(f"BigQuery SDK version      : {bigquery.__version__}")
print(f"Vertex AI SDK version     : {aip.__version__}")
print(f"Cloud Storage SDK version : {storage.__version__}")

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [6]:
# cloud storage client
# Bound to the project id loaded from the notebook config.
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
# Initialise the Vertex AI SDK with the same project and the configured region.
aip.init(project=PROJECT_ID, location=REGION)

### Vertex AI Vector Search Export class

In [8]:
import os

# sys.path.append("..")
from vdf_io.export_vdf.vertexai_vector_search_export import ExportVertexAIVectorSearch

# Export from existing Vector Search Index

In [9]:
# Look up the existing Vector Search index by its resource name and display
# its description as a plain dict.
target_vs_index = aip.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)
target_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/1081325705452584960',
 'displayName': 'soverflow_vvs_vectorio_pubv3',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'deployedIndexes': [{'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/5739455095037231104',
   'deployedIndexId': 'soverflow_vvs_vectorio_pubv3_20240130131739'}],
 'etag': 'AMEw9yPG4wUYBC8xLJctHEy-yYH-P5PDNlwHbqVfcJCg8zt_LsonGiIjUEPYJWTjYpU=',
 'createTime': '2024-01-30T13:26:08.725251Z',
 'updateTime': '2024-01-30T13:26:17.176312Z',
 'indexStats': {'vectorsCount': '1', 'shardsC

In [10]:
# Embedding dimensionality as an int; this replaces the string "768" that the
# loaded notebook config assigns to the same name.
DIMENSIONS = 768
# Passed to the exporter below as `library_version`.
VDF_VERSION = "v1"
# Timestamp of this run.
# NOTE(review): not referenced by any later cell in this notebook — confirm it is still needed.
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S")

# Echo the index identifiers (expected to come from the loaded notebook config).
print(f"INDEX_DISPLAY_NAME : {INDEX_DISPLAY_NAME}")
print(f"DEPLOYED_INDEX_ID  : {DEPLOYED_INDEX_ID}")

INDEX_DISPLAY_NAME : soverflow_vvs_vectorio_pubv3
DEPLOYED_INDEX_ID  : soverflow_vvs_vectorio_pubv3_20240130131739


In [16]:
# Arguments handed to the exporter in the next cell.
my_export_args = {
    "project_id": PROJECT_ID,
    "location": REGION,
    # "index": DEPLOYED_INDEX_ID,
    # NOTE(review): the recorded output of this cell shows a full index resource
    # name under "index" rather than the display name — confirm which form the
    # exporter expects before relying on this value.
    "index": INDEX_DISPLAY_NAME,
    "library_version": VDF_VERSION,
    "dir": ".",  # presumably the parent folder for the export directory — confirm
    "model_name": "textembedding-gecko@001",
    "max_vectors": 5000,  # presumably caps the number of vectors exported — confirm
}
# Display the resolved arguments.
my_export_args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'projects/934903580331/locations/us-central1/indexes/1081325705452584960',
 'library_version': 'v1',
 'dir': '.',
 'model_name': 'textembedding-gecko@001',
 'max_vectors': 5000}

In [17]:
# Build the exporter from the argument dict defined above.
export_vvs = ExportVertexAIVectorSearch(args=my_export_args)

# Display the instance to confirm it was constructed.
export_vvs

<src.export_vdf.vertexai_vector_search_export.ExportVertexAIVectorSearch at 0x7fc110ae2b00>

In [18]:
# Show the arguments as held by the exporter instance.
export_vvs.args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'projects/934903580331/locations/us-central1/indexes/1081325705452584960',
 'library_version': 'v1',
 'dir': '.',
 'model_name': 'textembedding-gecko@001',
 'max_vectors': 5000}

In [19]:
# Path of the export folder: the exporter's `vdf_directory` attribute, taken
# relative to the current working directory.
VDF_EXPORT_DIR_PATH = f"./{export_vvs.vdf_directory}"
print(f"VDF_EXPORT_DIR_PATH: {VDF_EXPORT_DIR_PATH}")

VDF_EXPORT_DIR_PATH: ./vdf_20240201_181031_a26f5


In [20]:
# Run the export. In the recorded run this printed the generated metadata
# (listing a parquet file and VDF_META.json) and returned True.
export_vvs.get_data()

indexes: ['projects/934903580331/locations/us-central1/indexes/1081325705452584960']


Fetching indexes:   0%|          | 0/1 [00:00<?, ?it/s]
Exporting soverflow_vvs_vectorio_pubv3:   0%|          | 0/5000 [00:00<?, ?it/s][A
Exporting soverflow_vvs_vectorio_pubv3:  90%|█████████ | 4501/5000 [00:01<00:00, 2464.60it/s][A
Fetching indexes: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]

{
    "version": "v1",
    "file_structure": [
        "vdf_20240201_181031_a26f5/soverflow_vvs_vectorio_pubv3/1.parquet",
        "vdf_20240201_181031_a26f5/VDF_META.json"
    ],
    "author": "jupyter",
    "exported_from": "vertexai_vectorsearch",
    "indexes": {
        "projects/934903580331/locations/us-central1/indexes/1081325705452584960": [
            {
                "index_name": "soverflow_vvs_vectorio_pubv3",
                "namespace": "namespace",
                "total_vector_count": 5000,
                "exported_vector_count": 4501,
                "metric": "Dot",
                "dimensions": 768,
                "model_name": "textembedding-gecko@001",
                "vector_columns": [
                    "vector"
                ],
                "data_path": "vdf_20240201_181031_a26f5/soverflow_vvs_vectorio_pubv3"
            }
        ]
    }
}





True

## Check VDF Metadata json

In [24]:
# Load the metadata file from the export folder.
# The encoding is passed explicitly so that decoding does not depend on the
# locale of the machine running the notebook.
with open(f"{VDF_EXPORT_DIR_PATH}/VDF_META.json", encoding="utf-8") as f:
    d = json.load(f)

In [25]:
# Inspect the parsed metadata.
d

{'version': 'v1',
 'file_structure': ['vdf_20240201_162120_4fb82/soverflow_vvs_vectorio_pubv3/1.parquet',
  'vdf_20240201_162120_4fb82/VDF_META.json'],
 'author': 'jupyter',
 'exported_from': 'vertexai_vectorsearch',
 'indexes': {'projects/934903580331/locations/us-central1/indexes/1081325705452584960': [{'index_name': 'soverflow_vvs_vectorio_pubv3',
    'namespace': 'namespace',
    'total_vector_count': 5000,
    'exported_vector_count': 4501,
    'metric': 'Dot',
    'dimensions': 768,
    'model_name': 'textembedding-gecko@001',
    'vector_columns': ['vector'],
    'data_path': 'vdf_20240201_162120_4fb82/soverflow_vvs_vectorio_pubv3'}]}}

In [28]:
# Locate and load the first exported parquet file for the requested index.
# `my_export_args["index"]` may be a single name or a list of names; for a list
# the first entry is used.
index_arg = my_export_args["index"]

if isinstance(index_arg, str):
    index_name = index_arg
elif isinstance(index_arg, list):
    if not index_arg:
        raise ValueError("my_export_args['index'] is an empty list; nothing to read")
    index_name = index_arg[0]
else:
    # With two independent `if`s, neither branch ran for other types, so
    # `file_path` was undefined or left over from an earlier execution.
    raise TypeError(
        f"my_export_args['index'] must be a str or a list, got {type(index_arg).__name__}"
    )

# NOTE(review): the recorded metadata lists the data folder under the index
# display name; if `index` holds a full resource name this path presumably will
# not exist — confirm.
file_path = f"{VDF_EXPORT_DIR_PATH}/{index_name}/1.parquet"
print(f"file_path: {file_path}")

test_parquet_df = pd.read_parquet(file_path)
print(f"df shape: {test_parquet_df.shape}")
test_parquet_df.head(3)

file_path: ./vdf_20240201_162120_4fb82/soverflow_vvs_vectorio_pubv3/1.parquet
df shape: (4501, 3)


Unnamed: 0,id,vector,tag
0,019c30f2-6f90-4bb4-b155-8a3d61c1c6b6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
1,72719042,"[-0.02314985916018486, -0.0009600448538549244,...",[python]
2,72508354,"[-0.01787417009472847, -0.04592113569378853, 0...",[python]


# Validate exported records

In [22]:
# Handle to the index endpoint that is queried below with DEPLOYED_INDEX_ID.
# Evaluate `my_index_endpoint` on its own line to inspect it.
my_index_endpoint = aip.MatchingEngineIndexEndpoint(
    index_endpoint_name=ENDPOINT_RESOURCE_NAME
)

In [23]:
# Spot-check the first two ids from the exported parquet file. To check specific
# datapoints instead, assign a literal list, e.g.
# ids_to_check = ['48876786', '48821717'].
ids_to_check = test_parquet_df["id"].head(2).tolist()

In [24]:
# Read the selected datapoints back from the deployed index.
read_response = my_index_endpoint.read_index_datapoints(
    deployed_index_id=DEPLOYED_INDEX_ID,
    ids=ids_to_check,
)
# Number of datapoints returned; compare with len(ids_to_check).
len(read_response)

2

In [27]:
# read_response

**Finished**