# Testing import and export classes for Vertex AI Vector Search

> TODO: test public and private endpoints

In [45]:
# Show the current working directory; the relative paths used later in this notebook resolve against it.
!pwd

/home/jupyter/vector-io


In [2]:
# Toggle: create new cloud assets (GCS bucket, Vector Search index, etc.)?
CREATE_NEW_ASSETS = False

In [3]:
# Shared naming convention applied to all cloud resources
VERSION = "vpc1"  # TODO
PREFIX = "vvs-vectorio-" + VERSION  # TODO

print("PREFIX = " + PREFIX)

PREFIX = vvs-vectorio-vpc1


## Load notebook config

In [4]:
# staging GCS
# `!cmd` captures the shell command's output lines as a list, so element 0 is
# taken as the active gcloud project id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Download the shared notebook config from the bucket; `.n` is the captured
# output joined into one newline-separated string.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
# NOTE(review): exec() runs whatever Python the bucket object contains, and it
# overwrites PREFIX, VERSION, PROJECT_ID, BUCKET_NAME and BUCKET_URI set above.
# Only use this with a bucket you control.
# NOTE(review): the printed config sets USE_PUBLIC_ENDPOINTS = "False" as a
# string, which is truthy in a plain `if` test -- compare against "True"
# explicitly if it is ever used as a flag.
exec(config.n)


PREFIX                   = "vvs-vectorio-vpc1"
VERSION                  = "vpc1"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

USE_PUBLIC_ENDPOINTS     = "False"

BUCKET_NAME              = "vvs-vectorio-vpc1-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-vpc1-hybrid-vertex"



## Imports

In [5]:
import pandas as pd
import numpy as np
import time 
import json
import uuid

from pprint import pprint

from google.cloud import aiplatform as aip
from google.cloud import storage
from google.cloud import bigquery

# logging
import logging
# Silence every log record of severity WARNING and below for the whole process
logging.disable(logging.WARNING)

# Python warnings
import warnings
# Ignore all Python warnings (including deprecation notices from the SDKs)
warnings.filterwarnings("ignore")

# Report the installed version of each Google Cloud SDK used in this notebook
for sdk_label, sdk_module in (
    ("BigQuery SDK version", bigquery),
    ("Vertex AI SDK version", aip),
    ("Cloud Storage SDK version", storage),
):
    print(f"{sdk_label:<25} : {sdk_module.__version__}")

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [6]:
# Vertex AI SDK: set the project and region used by later aip calls
aip.init(location=REGION, project=PROJECT_ID)

# Cloud Storage client bound to the same project
storage_client = storage.Client(project=PROJECT_ID)

### Vertex AI import & export classes

In [7]:
import sys
import os

# Make the project's `src` package importable.
# NOTE(review): the recorded `!pwd` output is /home/jupyter/vector-io and the
# data paths below are relative to it (notebooks/data/...), so ".." adds the
# parent of the working directory -- confirm this entry is actually what makes
# `src` resolvable.
sys.path.append("..")
from src.import_vdf.vertexai_vector_search_import import ImportVertexAIVectorSearch
from src.export_vdf.vertexai_vector_search_export import ExportVertexAIVectorSearch
from src.names import DBNames

## Create test import data

## VDF Metadata json

In [29]:
# Local test fixtures (alternative data dir: "data/stack_overflow_parquet")
TEST_DATA_DIR = "notebooks/data/1500_2000_100/tmpqxi_pbxv"
TEST_FILE = "/".join([TEST_DATA_DIR, "tmpmee85mkz_1502.parquet"])
TEST_VDF_META = "notebooks/data/1500_2000_100/VDF_META.json"

# Values written into the VDF metadata document
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S")
DIMENSIONS = 768
VDF_VERSION = "v1"

# Echo the settings with aligned labels
for _label, _value in (
    ("TEST_FILE", TEST_FILE),
    ("TEST_VDF_META", TEST_VDF_META),
    ("TIMESTAMP_vdf", TIMESTAMP_vdf),
    ("DIMENSIONS", DIMENSIONS),
):
    print(f"{_label:<13} : {_value}")

TEST_FILE     : notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet
TEST_VDF_META : notebooks/data/1500_2000_100/VDF_META.json
TIMESTAMP_vdf : 20240130-031033
DIMENSIONS    : 768


In [30]:
# Metadata for the single namespace of the test index
test_namespace_meta = {
    "data_path": TEST_DATA_DIR,
    "dimensions": DIMENSIONS,
    "exported_vector_count": 100,
    "metric": "Dot",
    "model_name": "vertex",
    "namespace": "so_questions",
    "total_vector_count": 100,
    "vector_columns": ["embedding"],
}

# Top-level VDF metadata document (key order kept so the dumped JSON is unchanged)
my_vdf = {
    "author": "jordantotten",
    "exported_from": "vertex",
    "file_structure": [TEST_FILE, TEST_VDF_META],
    "version": VDF_VERSION,
    "exported_at": TIMESTAMP_vdf,  # alternative fixed value: '2024-01-28T00:00:00Z'
    "indexes": {"test_tmp_index": [test_namespace_meta]},
}
pprint(my_vdf)

{'author': 'jordantotten',
 'exported_at': '20240130-031033',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet',
                    'notebooks/data/1500_2000_100/VDF_META.json'],
 'indexes': {'test_tmp_index': [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv',
                                 'dimensions': 768,
                                 'exported_vector_count': 100,
                                 'metric': 'Dot',
                                 'model_name': 'vertex',
                                 'namespace': 'so_questions',
                                 'total_vector_count': 100,
                                 'vector_columns': ['embedding']}]},
 'version': 'v1'}


In [31]:
# Print the data directory of every namespace entry under every index
for namespace_entries in my_vdf["indexes"].values():
    for namespace_entry in namespace_entries:
        print(namespace_entry['data_path'])

notebooks/data/1500_2000_100/tmpqxi_pbxv


In [32]:
import json

# Serialize the VDF metadata dict and write it to the metadata file
with open(TEST_VDF_META, mode="w") as meta_file:
    meta_file.write(json.dumps(my_vdf))

In [33]:
# with open('VDF_META.json') as f:
#     d = json.load(f)
#     pprint(d)

# Vertex AI Vector Search Index

In [56]:
# Index that vectors are imported into
TARGET_INDEX_ID = "7264767993832275968"
TARGET_INDEX_DISPLAY_NAME = "soverflow_vvs_vectorio_pubv2"

# Temporary alternative index:
# TARGET_INDEX_DISPLAY_NAME = "vectorstore_thd_v1"
# TARGET_INDEX_ID = "5271278645883699200"

print("TARGET_INDEX_DISPLAY_NAME : " + TARGET_INDEX_DISPLAY_NAME)

TARGET_INDEX_DISPLAY_NAME : soverflow_vvs_vectorio_pubv2


### Existing VDF json

In [57]:
# Directory holding the VDF_META.json to import from
# (alternative: "./vdf_20240129_230804_742d6")
VDF_DIR_PATH = './notebooks/data/1500_2000_100'

In [58]:
# Load the existing VDF metadata, then pretty-print it
with open(VDF_DIR_PATH + "/VDF_META.json") as meta_file:
    d = json.load(meta_file)
pprint(d)

{'author': 'jordantotten',
 'exported_at': '20240130-031033',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet',
                    'notebooks/data/1500_2000_100/VDF_META.json'],
 'indexes': {'test_tmp_index': [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv',
                                 'dimensions': 768,
                                 'exported_vector_count': 100,
                                 'metric': 'Dot',
                                 'model_name': 'vertex',
                                 'namespace': 'so_questions',
                                 'total_vector_count': 100,
                                 'vector_columns': ['embedding']}]},
 'version': 'v1'}


## Initialize import class

> Pass the config for the target index that the vectors will be imported into

In [59]:
# sys.path.append("..")
# from src.import_vdf.vertexai_vector_search_import_v13 import ImportVertexAIVectorSearch

In [60]:
# Arguments handed to the importer: target index identity, batch size, VDF source dir
my_import_args = dict(
    project_id=PROJECT_ID,
    location=REGION,
    project_num=PROJECT_NUM,
    target_index_id=TARGET_INDEX_ID,
    batch_size=50,
    library_version=VDF_VERSION,
    dir=VDF_DIR_PATH,
)
pprint(my_import_args)

{'batch_size': 50,
 'dir': './notebooks/data/1500_2000_100',
 'library_version': 'v1',
 'location': 'us-central1',
 'project_id': 'hybrid-vertex',
 'project_num': '934903580331',
 'target_index_id': '7264767993832275968'}


In [61]:
# Build the importer from the argument dict; the bare name below displays its repr
import_vvs = ImportVertexAIVectorSearch(args=my_import_args)

import_vvs

Importing to index : soverflow_vvs_vectorio_pubv2
Full resource name : projects/934903580331/locations/us-central1/indexes/7264767993832275968
Target index config:
{
    "dimensions": 768.0,
    "approximateNeighborsCount": 150.0,
    "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
    "algorithmConfig": {
        "treeAhConfig": {
            "leafNodeEmbeddingCount": "500",
            "leafNodesToSearchPercent": 80.0
        }
    },
    "shardSize": "SHARD_SIZE_MEDIUM"
}


<src.import_vdf.vertexai_vector_search_import.ImportVertexAIVectorSearch at 0x7fa19e37b580>

In [62]:
# Display the VDF metadata held on the importer's `vdf_meta` attribute
import_vvs.vdf_meta

{'author': 'jordantotten',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet',
  'notebooks/data/1500_2000_100/VDF_META.json'],
 'version': 'v1',
 'exported_at': '20240130-031033',
 'indexes': {'test_tmp_index': [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv',
    'dimensions': 768,
    'exported_vector_count': 100,
    'metric': 'Dot',
    'model_name': 'vertex',
    'namespace': 'so_questions',
    'total_vector_count': 100,
    'vector_columns': ['embedding']}]}}

In [63]:
# Run the import into the target index.
# NOTE(review): the recorded output shows 4 batches and "400 vectors", while the
# VDF metadata declares 100 vectors and batch_size is 50 -- confirm how
# upsert_data_jt counts what it inserted.
import_vvs.upsert_data_jt()

Importing data from: test_tmp_index
index_meta: [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv', 'dimensions': 768, 'exported_vector_count': 100, 'metric': 'Dot', 'model_name': 'vertex', 'namespace': 'so_questions', 'total_vector_count': 100, 'vector_columns': ['embedding']}]
data_path: notebooks/data/1500_2000_100/tmpqxi_pbxv
vector_column_name    : embedding
vector_metadata_names : ['embedding']


Inserting data: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]

Index import complete
Updated soverflow_vvs_vectorio_pubv2 with 400 vectors





In [42]:
# import_vvs.upsert_data(
#     index_names = EXISTING_INDEX_ID,
#     data = [init_embedding]
# )

### check index vector count

In [64]:
# Full resource name of the target index
TARGET_INDEX_NAME = "projects/{}/locations/{}/indexes/{}".format(
    PROJECT_NUM, REGION, TARGET_INDEX_ID
)
print("TARGET_INDEX_NAME : {}".format(TARGET_INDEX_NAME))

TARGET_INDEX_NAME : projects/934903580331/locations/us-central1/indexes/7264767993832275968


In [65]:
# Look up the index by its full resource name and show its description as a dict
my_vs_index = aip.MatchingEngineIndex(index_name=TARGET_INDEX_NAME)

my_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/7264767993832275968',
 'displayName': 'soverflow_vvs_vectorio_pubv2',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'etag': 'AMEw9yMmXtdMcYDbw_cvxb6ePZW1Rq-5PadBCPK4fZb9FwDvahCVYaxFp7WqouqrzJE=',
 'labels': {'prefix': 'vvs-vectorio-pubv2'},
 'createTime': '2024-01-30T03:36:56.951497Z',
 'updateTime': '2024-01-30T03:37:10.208162Z',
 'indexStats': {'vectorsCount': '100', 'shardsCount': 1},
 'indexUpdateMethod': 'STREAM_UPDATE',
 'encryptionSpec': {}}

## Export from existing Vertex AI Vector Search index

In [66]:
# Full resource name of the index to export (same expression as in the vector-count check)
TARGET_INDEX_NAME = "projects/%s/locations/%s/indexes/%s" % (
    PROJECT_NUM,
    REGION,
    TARGET_INDEX_ID,
)
print("TARGET_INDEX_NAME : %s" % TARGET_INDEX_NAME)

TARGET_INDEX_NAME : projects/934903580331/locations/us-central1/indexes/7264767993832275968


In [67]:
# Arguments handed to the exporter; the index is given by display name
# (alternative: pass the full resource name, TARGET_INDEX_NAME)
my_export_args = dict(
    project_id=PROJECT_ID,
    location=REGION,
    index=TARGET_INDEX_DISPLAY_NAME,
    library_version=VDF_VERSION,
    dir=".",
    model_name="jt-emb-model",
)

# Build the exporter; the bare name below displays its repr
export_vvs = ExportVertexAIVectorSearch(args=my_export_args)

export_vvs

<src.export_vdf.vertexai_vector_search_export.ExportVertexAIVectorSearch at 0x7fa19e384f70>

In [68]:
# Display the argument dict stored on the exporter
export_vvs.args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'soverflow_vvs_vectorio_pubv2',
 'library_version': 'v1',
 'dir': '.',
 'model_name': 'jt-emb-model'}

In [69]:
# Display the exporter's `vdf_directory` (the recorded value is a timestamped "vdf_..." name)
export_vvs.vdf_directory

'vdf_20240130_033945_6b1d0'

In [70]:
# Run the export.
# NOTE(review): in the recorded run this raised "Index not deployed to an
# endpoint. Cannot export index data"; on a fresh Run All the cells below will
# not execute unless the index is deployed first.
export_vvs.get_data()

Fetching indexes:   0%|          | 0/1 [00:00<?, ?it/s]


Exception: Index not deployed to an endpoint. Cannot export index data

In [51]:
# Output directory of an earlier export run (not the `vdf_directory` of the exporter created above)
VDF_EXPORT_DIR_PATH = './vdf_20240130_031423_0e2ed'

In [55]:
# Load the metadata written by the export run, then print it unformatted
with open(VDF_EXPORT_DIR_PATH + "/VDF_META.json") as meta_file:
    d = json.load(meta_file)
print(d)

{'version': 'v1', 'file_structure': ['vdf_20240130_031423_0e2ed/vectorstore_thd_v1/1.parquet', 'vdf_20240130_031423_0e2ed/VDF_META.json'], 'author': 'jupyter', 'exported_from': 'vertexai_vectorsearch', 'indexes': {'projects/934903580331/locations/us-central1/indexes/5271278645883699200': [{'index_name': 'vectorstore_thd_v1', 'namespace': 'namespace', 'total_vector_count': 631, 'exported_vector_count': 631, 'metric': 'Dot', 'dimensions': 768, 'model_name': 'jt-emb-model', 'vector_columns': ['vector'], 'data_path': 'vdf_20240130_031423_0e2ed/vectorstore_thd_v1'}]}}


In [53]:
# Pretty-print every namespace entry under every exported index
for namespace_entries in d["indexes"].values():
    for namespace_entry in namespace_entries:
        pprint(namespace_entry)

{'data_path': 'vdf_20240130_031423_0e2ed/vectorstore_thd_v1',
 'dimensions': 768,
 'exported_vector_count': 631,
 'index_name': 'vectorstore_thd_v1',
 'metric': 'Dot',
 'model_name': 'jt-emb-model',
 'namespace': 'namespace',
 'total_vector_count': 631,
 'vector_columns': ['vector']}


In [54]:
# Locate the exported parquet file from the export's own VDF_META.json rather
# than from TARGET_INDEX_DISPLAY_NAME. That variable names the import target
# ("soverflow_vvs_vectorio_pubv2"), while this export directory holds
# "vectorstore_thd_v1", so the old path only resolved when cells were run out
# of order.
with open(f"{VDF_EXPORT_DIR_PATH}/VDF_META.json") as meta_file:
    export_meta = json.load(meta_file)

# Each namespace entry records its data directory; keep only the final path
# component so the result stays anchored at VDF_EXPORT_DIR_PATH.
export_data_dirs = [
    namespace_meta["data_path"].rstrip("/").split("/")[-1]
    for index_meta in export_meta["indexes"].values()
    for namespace_meta in index_meta
]
assert export_data_dirs, f"no index data listed in {VDF_EXPORT_DIR_PATH}/VDF_META.json"

# Read the first exported index's first parquet shard and preview one row
file_path = f"{VDF_EXPORT_DIR_PATH}/{export_data_dirs[0]}/1.parquet"

test_parquet_df = pd.read_parquet(file_path)
test_parquet_df.head(1)

Unnamed: 0,id,vector
0,817b6e36-3529-4d8b-afb8-bbf3ca25c9f5,"[-0.023853905498981476, -0.027847183868288994,..."
