# Testing import and export classes for Vertex AI Vecto Search

> TODO: test public and private endpoints

In [1]:
!pwd

/home/jupyter/vector-io


In [2]:
# create new gcs bucket, vs index, etc.?
CREATE_NEW_ASSETS         = False 

In [3]:
# naming convention for all cloud resources
VERSION        = "pubv2"                     # TODO
PREFIX         = f'vvs-vectorio-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = vvs-vectorio-pubv2


## Load notebook config

In [4]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PREFIX                   = "vvs-vectorio-pubv2"
VERSION                  = "pubv2"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = ""
VPC_NETWORK_FULL         = "projects/934903580331/global/networks/"

USE_PUBLIC_ENDPOINTS     = "True"

BUCKET_NAME              = "vvs-vectorio-pubv2-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-pubv2-hybrid-vertex"



## Imports

In [5]:
import pandas as pd
import numpy as np
import time 
import json
import uuid

from pprint import pprint

from google.cloud import aiplatform as aip
from google.cloud import storage
from google.cloud import bigquery

# logging
import logging
logging.disable(logging.WARNING)

#python warning 
import warnings
warnings.filterwarnings("ignore")

print(f'BigQuery SDK version      : {bigquery.__version__}')
print(f'Vertex AI SDK version     : {aip.__version__}')
print(f'Cloud Storage SDK version : {storage.__version__}')

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [6]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aip.init(project=PROJECT_ID, location=REGION)

### Vertex AI import & export classes

In [7]:
import sys
import os

sys.path.append("..")
from src.import_vdf.vertexai_vector_search_import import ImportVertexAIVectorSearch
from src.export_vdf.vertexai_vector_search_export import ExportVertexAIVectorSearch
from src.names import DBNames

## Create test import data

> TODO

## VDF Metadata json

In [30]:
TEST_DIR = "notebooks/data/1500_2000_100"
TEST_DATA_DIR = f"{TEST_DIR}/tmpqxi_pbxv"

# local vdf json file
TEST_VDF_META = f"{TEST_DIR}/VDF_META.json"

# test parquet files
TEST_FILE_1 = f"{TEST_DATA_DIR}/tmpmee85mkz_1501.parquet"
TEST_FILE_2 = f"{TEST_DATA_DIR}/tmpmee85mkz_1502.parquet"
TEST_FILE_3 = f"{TEST_DATA_DIR}/tmpmee85mkz_1503.parquet"
TEST_FILE_4 = f"{TEST_DATA_DIR}/tmpmee85mkz_1504.parquet"

DIMENSIONS = 768
VDF_VERSION = "v1"
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S")

print(f"TEST_FILE_1   : {TEST_FILE_1}")
print(f"TEST_VDF_META : {TEST_VDF_META}")
print(f"DIMENSIONS    : {DIMENSIONS}")
print(f"VDF_VERSION   : {VDF_VERSION}")
print(f"TIMESTAMP_vdf : {TIMESTAMP_vdf}")

TEST_FILE_1   : notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1501.parquet
TEST_VDF_META : notebooks/data/1500_2000_100/VDF_META.json
DIMENSIONS    : 768
VDF_VERSION   : v1
TIMESTAMP_vdf : 20240130-044708


In [31]:
my_vdf = {
    "author": "jordantotten",
    "exported_from": "vertex",
    "file_structure": [
        # TEST_FILE_1,
        TEST_FILE_2,
        # TEST_FILE_3,
        # TEST_FILE_4,
        TEST_VDF_META
    ],
    "version": VDF_VERSION,
    "exported_at": TIMESTAMP_vdf, #'2024-01-28T00:00:00Z',
    "indexes": {
            "test_tmp_index": [
                {
                    "data_path": TEST_DATA_DIR,
                    "dimensions": DIMENSIONS,
                    "exported_vector_count": 100,
                    "metric": "Dot",
                    "model_name": "vertex",
                    "namespace": "so_questions",
                    "total_vector_count": 100,
                    "vector_columns": ["embedding"]
                }
            ]
    }
}
pprint(my_vdf)

{'author': 'jordantotten',
 'exported_at': '20240130-044708',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet',
                    'notebooks/data/1500_2000_100/VDF_META.json'],
 'indexes': {'test_tmp_index': [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv',
                                 'dimensions': 768,
                                 'exported_vector_count': 100,
                                 'metric': 'Dot',
                                 'model_name': 'vertex',
                                 'namespace': 'so_questions',
                                 'total_vector_count': 100,
                                 'vector_columns': ['embedding']}]},
 'version': 'v1'}


In [32]:
for index_name, index_meta in my_vdf["indexes"].items():
    # print(index_name)
    # print(index_meta)
    for namespace_meta in index_meta:
        # pprint(namespace_meta)
        print(namespace_meta['data_path'])

notebooks/data/1500_2000_100/tmpqxi_pbxv


In [33]:
import json
with open(f"{TEST_VDF_META}", 'w') as fp:
    json.dump(my_vdf, fp)

# Vertex AI Vector Search Index

### Existing VDF json

In [34]:
with open(f"{TEST_VDF_META}") as f:
    d = json.load(f)
    pprint(d)

{'author': 'jordantotten',
 'exported_at': '20240130-044708',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet',
                    'notebooks/data/1500_2000_100/VDF_META.json'],
 'indexes': {'test_tmp_index': [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv',
                                 'dimensions': 768,
                                 'exported_vector_count': 100,
                                 'metric': 'Dot',
                                 'model_name': 'vertex',
                                 'namespace': 'so_questions',
                                 'total_vector_count': 100,
                                 'vector_columns': ['embedding']}]},
 'version': 'v1'}


## Initialize import class

> Pass config for target index to import vectors to

In [35]:
TARGET_INDEX_DISPLAY_NAME = "soverflow_vvs_vectorio_pubv2"
TARGET_INDEX_ID = "7264767993832275968"

print(f"TARGET_INDEX_DISPLAY_NAME : {TARGET_INDEX_DISPLAY_NAME}")

TARGET_INDEX_DISPLAY_NAME : soverflow_vvs_vectorio_pubv2


In [36]:
my_import_args = {
    "project_id": PROJECT_ID,
    "location": REGION,
    "project_num" : PROJECT_NUM,
    "target_index_id": TARGET_INDEX_ID,
    "batch_size": 50,
    "library_version": VDF_VERSION,
    "dir": VDF_DIR_PATH,
}
pprint(my_import_args)

{'batch_size': 50,
 'dir': './notebooks/data/1500_2000_100',
 'library_version': 'v1',
 'location': 'us-central1',
 'project_id': 'hybrid-vertex',
 'project_num': '934903580331',
 'target_index_id': '7264767993832275968'}


In [37]:
import_vvs = ImportVertexAIVectorSearch(
    args=my_import_args 
)

import_vvs

Importing to index : soverflow_vvs_vectorio_pubv2
Full resource name : projects/934903580331/locations/us-central1/indexes/7264767993832275968
Target index config:
{
    "dimensions": 768.0,
    "approximateNeighborsCount": 150.0,
    "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
    "algorithmConfig": {
        "treeAhConfig": {
            "leafNodeEmbeddingCount": "500",
            "leafNodesToSearchPercent": 80.0
        }
    },
    "shardSize": "SHARD_SIZE_MEDIUM"
}


<src.import_vdf.vertexai_vector_search_import.ImportVertexAIVectorSearch at 0x7f10da13b160>

In [38]:
import_vvs.vdf_meta

{'author': 'jordantotten',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/1500_2000_100/tmpqxi_pbxv/tmpmee85mkz_1502.parquet',
  'notebooks/data/1500_2000_100/VDF_META.json'],
 'version': 'v1',
 'exported_at': '20240130-044708',
 'indexes': {'test_tmp_index': [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv',
    'dimensions': 768,
    'exported_vector_count': 100,
    'metric': 'Dot',
    'model_name': 'vertex',
    'namespace': 'so_questions',
    'total_vector_count': 100,
    'vector_columns': ['embedding']}]}}

In [39]:
import_vvs.upsert_data()

Importing data from: test_tmp_index
index_meta: [{'data_path': 'notebooks/data/1500_2000_100/tmpqxi_pbxv', 'dimensions': 768, 'exported_vector_count': 100, 'metric': 'Dot', 'model_name': 'vertex', 'namespace': 'so_questions', 'total_vector_count': 100, 'vector_columns': ['embedding']}]
data_path: notebooks/data/1500_2000_100/tmpqxi_pbxv
vector_column_name    : embedding
vector_metadata_names : ['embedding']


Inserting data: 100%|██████████| 4/4 [00:02<00:00,  1.54it/s]

Index import complete
Updated soverflow_vvs_vectorio_pubv2 with 400 vectors





### check index vector count

In [40]:
# # set index full resource name
TARGET_INDEX_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexes/{TARGET_INDEX_ID}'
print(f"TARGET_INDEX_NAME : {TARGET_INDEX_NAME}")

TARGET_INDEX_NAME : projects/934903580331/locations/us-central1/indexes/7264767993832275968


In [41]:
my_vs_index = aip.MatchingEngineIndex(TARGET_INDEX_NAME)

my_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/7264767993832275968',
 'displayName': 'soverflow_vvs_vectorio_pubv2',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'deployedIndexes': [{'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/5915095480504680448',
   'deployedIndexId': 'deployed_20240130_034024'}],
 'etag': 'AMEw9yN-Ep-j9WByszWVykHkJk0cYb0HKSIbqnOjVE7XljP8lIm9JcQCP2Zbr7jUgtU=',
 'labels': {'prefix': 'vvs-vectorio-pubv2'},
 'createTime': '2024-01-30T03:36:56.951497Z',
 'updateTime': '2024-01-30T03:37:10.208162Z',
 'indexStats': {'ve

## Export from existing Vertex AI Vector Search index

In [42]:
# # set index full resource name
TARGET_INDEX_NAME = f'projects/{PROJECT_NUM}/locations/{REGION}/indexes/{TARGET_INDEX_ID}'
print(f"TARGET_INDEX_NAME : {TARGET_INDEX_NAME}")

TARGET_INDEX_NAME : projects/934903580331/locations/us-central1/indexes/7264767993832275968


In [43]:
my_export_args = {
    "project_id": PROJECT_ID,
    "location": REGION,
    # "index": TARGET_INDEX_NAME,
    "index": TARGET_INDEX_DISPLAY_NAME,
    "library_version": VDF_VERSION,
    "dir": ".",
    "model_name":"jt-emb-model",
}

export_vvs = ExportVertexAIVectorSearch(
    args=my_export_args 
)

export_vvs

<src.export_vdf.vertexai_vector_search_export.ExportVertexAIVectorSearch at 0x7f10da138190>

In [44]:
export_vvs.args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'soverflow_vvs_vectorio_pubv2',
 'library_version': 'v1',
 'dir': '.',
 'model_name': 'jt-emb-model'}

In [45]:
export_vvs.vdf_directory

'vdf_20240130_044740_6b1d0'

In [46]:
export_vvs.get_data()

Fetching indexes:   0%|          | 0/1 [00:00<?, ?it/s]
Exporting soverflow_vvs_vectorio_pubv2:   0%|          | 0/100 [00:00<?, ?it/s][A
Exporting soverflow_vvs_vectorio_pubv2: 100%|██████████| 100/100 [00:00<00:00, 265.93it/s][A
Fetching indexes: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]

{
    "version": "v1",
    "file_structure": [
        "vdf_20240130_044740_6b1d0/soverflow_vvs_vectorio_pubv2/1.parquet",
        "vdf_20240130_044740_6b1d0/VDF_META.json"
    ],
    "author": "jupyter",
    "exported_from": "vertexai_vectorsearch",
    "indexes": {
        "projects/934903580331/locations/us-central1/indexes/7264767993832275968": [
            {
                "index_name": "soverflow_vvs_vectorio_pubv2",
                "namespace": "namespace",
                "total_vector_count": 100,
                "exported_vector_count": 100,
                "metric": "Dot",
                "dimensions": 768,
                "model_name": "jt-emb-model",
                "vector_columns": [
                    "vector"
                ],
                "data_path": "vdf_20240130_044740_6b1d0/soverflow_vvs_vectorio_pubv2"
            }
        ]
    }
}





True

In [47]:
VDF_EXPORT_DIR_PATH = "./vdf_20240130_040709_6b1d0"

In [77]:
with open(f"{VDF_EXPORT_DIR_PATH}/VDF_META.json") as f:
    d = json.load(f)
    print(d)

{'version': 'v1', 'file_structure': ['vdf_20240130_040709_6b1d0/soverflow_vvs_vectorio_pubv2/1.parquet', 'vdf_20240130_040709_6b1d0/VDF_META.json'], 'author': 'jupyter', 'exported_from': 'vertexai_vectorsearch', 'indexes': {'projects/934903580331/locations/us-central1/indexes/7264767993832275968': [{'index_name': 'soverflow_vvs_vectorio_pubv2', 'namespace': 'namespace', 'total_vector_count': 100, 'exported_vector_count': 100, 'metric': 'Dot', 'dimensions': 768, 'model_name': 'jt-emb-model', 'vector_columns': ['vector'], 'data_path': 'vdf_20240130_040709_6b1d0/soverflow_vvs_vectorio_pubv2'}]}}


In [78]:
for index_name, index_meta in d["indexes"].items():
    # print(index_name)
    # print(index_meta)
    for namespace_meta in index_meta:
        pprint(namespace_meta)

{'data_path': 'vdf_20240130_040709_6b1d0/soverflow_vvs_vectorio_pubv2',
 'dimensions': 768,
 'exported_vector_count': 100,
 'index_name': 'soverflow_vvs_vectorio_pubv2',
 'metric': 'Dot',
 'model_name': 'jt-emb-model',
 'namespace': 'namespace',
 'total_vector_count': 100,
 'vector_columns': ['vector']}


In [79]:
file_path = f"{VDF_EXPORT_DIR_PATH}/{TARGET_INDEX_DISPLAY_NAME}/1.parquet"

test_parquet_df = pd.read_parquet(file_path)
test_parquet_df.head(1)

Unnamed: 0,id,vector,tags
0,52922128,"[-0.020482279360294342, -0.05063493549823761, ...",[jenkins]
