# Testing import and export classes for Vertex AI Vector Search

> TODO: test public and private endpoints

In [1]:
# Show the directory the kernel is running in.
# NOTE(review): the `src.*` imports later in this notebook presumably need this to be the repository root — confirm.
!pwd

/home/jupyter/vector-io


In [2]:
# create new gcs bucket, vs index, etc.?
# NOTE(review): no cell visible in this notebook reads this flag — confirm it is still needed.
CREATE_NEW_ASSETS         = False 

In [3]:
# Naming convention for all cloud resources: PREFIX is derived from VERSION.
VERSION = "vpc1"  # TODO
PREFIX = f"vvs-vectorio-{VERSION}"  # TODO

print("PREFIX =", PREFIX)

PREFIX = vvs-vectorio-vpc1


## Load notebook config

In [4]:
# staging GCS
# Resolve the active gcloud project; the `!cmd` assignment captures the command output as a list of lines.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared config file from the bucket, echo it, and execute it in this namespace.
# `config.n` is the captured output joined with newlines.
# In the recorded run the executed file reassigns PREFIX, VERSION, PROJECT_ID, BUCKET_NAME and
# BUCKET_URI, and defines PROJECT_NUM, REGION, BQ_REGION, VERTEX_SA, VPC_NETWORK_NAME,
# VPC_NETWORK_FULL and USE_PUBLIC_ENDPOINTS.
# NOTE(review): USE_PUBLIC_ENDPOINTS is the *string* "False" in that output, which is truthy
# if tested with a bare `if` — compare against the string or convert it before use.
# NOTE(review): `exec` runs whatever is stored at this GCS path, and if `gsutil cat` fails its
# error text is what gets executed. Only use a bucket you control; consider parsing the file
# instead of executing it.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PREFIX                   = "vvs-vectorio-vpc1"
VERSION                  = "vpc1"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"
VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

USE_PUBLIC_ENDPOINTS     = "False"

BUCKET_NAME              = "vvs-vectorio-vpc1-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-vpc1-hybrid-vertex"



## Imports

In [5]:
import pandas as pd
import numpy as np
import time 
import json
import uuid

from pprint import pprint

from google.cloud import aiplatform as aip
from google.cloud import storage
from google.cloud import bigquery

# logging
# Disable every log record at WARNING severity and below for the whole process.
# NOTE(review): this also hides warnings emitted by the Google Cloud SDKs used below.
import logging
logging.disable(logging.WARNING)

# Python warnings
# Ignore all Python warnings (including deprecation notices) for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Report the installed client-library versions, one aligned line per SDK.
print(
    "\n".join(
        f"{label:<26}: {module.__version__}"
        for label, module in (
            ("BigQuery SDK version", bigquery),
            ("Vertex AI SDK version", aip),
            ("Cloud Storage SDK version", storage),
        )
    )
)

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [6]:
# Cloud Storage client bound to the notebook's project
storage_client = storage.Client(project=PROJECT_ID)

# Vertex AI SDK: set the default project and region
aip.init(location=REGION, project=PROJECT_ID)

# BigQuery client bound to the notebook's project
# (location=BQ_REGION is deliberately not passed; it was commented out)
bq_client = bigquery.Client(project=PROJECT_ID)

### Vertex AI import & export classes

In [7]:
# NOTE(review): `sys` and `os` are not used by any cell visible in this notebook.
import sys
import os

# TODO - consolidate names: (vertexai_vector_search) vs (vertexai_vectorsearch)
# NOTE(review): the recorded repr of the export object further down shows the module
# `src.export_vdf.vertexai_vectorsearch_export`, which differs from the module imported here.
# One of the two is stale — confirm these imports resolve on a fresh kernel.
# The `src.*` package is presumably importable only when the kernel runs from the repository root — TODO confirm.
from src.import_vdf.vertexai_vector_search_import import ImportVertexAIVectorSearch
from src.export_vdf.vertexai_vector_search_export import ExportVertexAIVectorSearch
from src.names import DBNames

## VDF Metadata json

In [30]:
# Read an existing VDF metadata file from the working directory, then pretty-print it.
with open('VDF_META_jw.json') as meta_file:
    d = json.load(meta_file)

pprint(d)

{'author': 'jwortz',
 'exported_from': 'vertex vector store',
 'file_structure': '.jsonl',
 'indexes': {'exported_at': '2024-01-28T00:00:00Z',
             'glove100': {'data_path': 'gs://jsw-book-qa/me-json',
                          'dimensions': 100,
                          'exported_vector_count': 0,
                          'metric': 'dot',
                          'model_name': 'vertex_vector_search',
                          'namespace': 'glove100',
                          'total_vector_count': 0,
                          'vector_colum   ns': 'na'}},
 'version': '0.1'}


# Vertex AI Vector Search Index

In [8]:
# Target index. Alternative index kept for reference:
#   INDEX_DISPLAY_NAME = "soverflow_vvs_vectorio_vpc1"
#   EXISTING_INDEX_ID = "7417608906186162176"
INDEX_DISPLAY_NAME = "vectorstore_thd_v1"
EXISTING_INDEX_ID = "5271278645883699200"

# Fully qualified resource name of the existing index
EXISTING_INDEX_NAME = f"projects/{PROJECT_NUM}/locations/{REGION}/indexes/{EXISTING_INDEX_ID}"

# Timestamp taken when this cell runs; it names the export folder below
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S")

# Seed embedding file and its staging location in the bucket
INIT_EMB_FILENAME = "embeddings_0.json"
EMBEDDING_DIR_BUCKET_URI = f"{BUCKET_URI}/init_index"
EMBEDDING_INIT_FILE_URI = f"{EMBEDDING_DIR_BUCKET_URI}/{INIT_EMB_FILENAME}"

# Import source and export destination in GCS
GCS_IMPORT_JSON = f"{BUCKET_URI}/vvs-vectorio-vpc1/embedding_indexes/tmp8ewwgaaq"
INDEX_EXPORT_DATA_PATH = f"{BUCKET_URI}/export/{TIMESTAMP_vdf}"

DIMENSIONS = 768
VDF_VERSION = "0.0.1.dev"

# Echo the resolved settings, one aligned "NAME : value" line each
print(
    "\n".join(
        f"{name:<25}: {value}"
        for name, value in (
            ("INDEX_DISPLAY_NAME", INDEX_DISPLAY_NAME),
            ("TIMESTAMP_vdf", TIMESTAMP_vdf),
            ("EMBEDDING_DIR_BUCKET_URI", EMBEDDING_DIR_BUCKET_URI),
            ("EMBEDDING_INIT_FILE_URI", EMBEDDING_INIT_FILE_URI),
            ("GCS_IMPORT_JSON", GCS_IMPORT_JSON),
            ("INDEX_EXPORT_DATA_PATH", INDEX_EXPORT_DATA_PATH),
            ("DIMENSIONS", DIMENSIONS),
            ("EXISTING_INDEX_NAME", EXISTING_INDEX_NAME),
        )
    )
)

INDEX_DISPLAY_NAME       : vectorstore_thd_v1
TIMESTAMP_vdf            : 20240129-192338
EMBEDDING_DIR_BUCKET_URI : gs://vvs-vectorio-vpc1-hybrid-vertex/init_index
EMBEDDING_INIT_FILE_URI  : gs://vvs-vectorio-vpc1-hybrid-vertex/init_index/embeddings_0.json
GCS_IMPORT_JSON          : gs://vvs-vectorio-vpc1-hybrid-vertex/vvs-vectorio-vpc1/embedding_indexes/tmp8ewwgaaq
INDEX_EXPORT_DATA_PATH   : gs://vvs-vectorio-vpc1-hybrid-vertex/export/20240129-192338
DIMENSIONS               : 768
EXISTING_INDEX_NAME      : projects/934903580331/locations/us-central1/indexes/5271278645883699200


### Create dummy file to import

In [46]:
# dummy embedding
init_embedding = {
    "datapointId": str(uuid.uuid4()),
    "featureVector": list(np.zeros(DIMENSIONS))
}

# dump embedding to a local file
with open(f"{INIT_EMB_FILENAME}", "w") as f:
    json.dump(init_embedding, f)

# write embedding to Cloud Storage
! gsutil cp $INIT_EMB_FILENAME $EMBEDDING_INIT_FILE_URI

Copying file://embeddings_0.json [Content-Type=application/json]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


In [33]:
# List the objects under the embedding staging prefix to confirm the upload.
!gsutil ls $EMBEDDING_DIR_BUCKET_URI

gs://vvs-vectorio-vpc1-hybrid-vertex/init_index/embeddings_0.json


### Create VDF json

In [55]:
# Hand-written VDF metadata: top-level fields first, then the per-index entry.
my_vdf = {
    'author': 'jordantotten',
    'exported_from': 'vertex vector store',
    'file_structure': '.jsonl',
    'version': VDF_VERSION,
    # previously the literal '2024-01-28T00:00:00Z'
    'indexes': {'exported_at': TIMESTAMP_vdf},
}

# Entry for the target index, keyed by its display name.
# NOTE(review): 'metric' is 'Euclid' here, while the `to_dict()` output recorded later in this
# notebook reports distanceMeasureType DOT_PRODUCT_DISTANCE for the same index — confirm which is intended.
my_vdf['indexes'][INDEX_DISPLAY_NAME] = {
    'data_path': EMBEDDING_DIR_BUCKET_URI,
    'dimensions': DIMENSIONS,
    'exported_vector_count': 1,
    'metric': 'Euclid',
    'model_name': 'vertex_vector_search',
    'namespace': 'so_questions',
    'total_vector_count': 1,
    'vector_columns': 'embedding',
}

pprint(my_vdf)

{'author': 'jordantotten',
 'exported_from': 'vertex vector store',
 'file_structure': '.jsonl',
 'indexes': {'exported_at': '20240129-192026',
             'vectorstore_thd_v1': {'data_path': 'gs://vvs-vectorio-vpc1-hybrid-vertex/init_index',
                                    'dimensions': 768,
                                    'exported_vector_count': 1,
                                    'metric': 'Euclid',
                                    'model_name': 'vertex_vector_search',
                                    'namespace': 'so_questions',
                                    'total_vector_count': 1,
                                    'vector_columns': 'embedding'}},
 'version': '0.0.1.dev'}


In [56]:
import json

# Serialize the metadata dict and write it to VDF_META.json in the working directory.
with open('VDF_META.json', 'w') as meta_file:
    meta_file.write(json.dumps(my_vdf))

## Initialize existing or create new index

In [57]:
# Arguments handed to the importer; "dir" points at the working directory.
my_import_args = dict(
    project_id=PROJECT_ID,
    location=REGION,
    index=INDEX_DISPLAY_NAME,
    library_version=VDF_VERSION,
    dir=".",
)

import_vvs = ImportVertexAIVectorSearch(args=my_import_args)

# Show the importer object as the cell result
import_vvs

<src.import_vdf.vertexai_vector_search_import.ImportVertexAIVectorSearch at 0x7f885c706ec0>

In [58]:
# Upsert the single placeholder datapoint into the existing index.
# NOTE(review): `index_names` (plural) is given one bare index ID string — confirm that is the expected form.
import_vvs.upsert_data(index_names=EXISTING_INDEX_ID, data=[init_embedding])

Upserted datapoints


### check index vector count

In [59]:
# Look up the existing index by its full resource name.
my_vs_index = aip.MatchingEngineIndex(EXISTING_INDEX_NAME)

# Display the index resource as a dict (config, deployed indexes, ...).
# NOTE(review): the recorded output is truncated, so the vector count this section is meant to check is not visible in it.
my_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/5271278645883699200',
 'displayName': 'vectorstore_thd_v1',
 'description': 'Index for LangChain demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 7.0}},
   'shardSize': 'SHARD_SIZE_SMALL'}},
 'deployedIndexes': [{'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/3260843624727838720',
   'deployedIndexId': 'vectorstore_thd_v1_20230606140136',
   'displayName': 'vectorstore_thd_v1_20230606140136'},
  {'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/3260843624727838720',
   'deployedIndexId': 'vectorstore_thd_v1_20230606143229',
   'displayName': 'vectorstore_

## Export from existing Vertex AI Vector Search index

In [13]:
# Index to export. Alternative kept for reference: "soverflow_vvs_vectorio_vpc1"
INDEX_DISPLAY_NAME = "vectorstore_thd_v1"

# Arguments handed to the exporter; "dir" points at the working directory.
my_export_args = dict(
    project_id=PROJECT_ID,
    location=REGION,
    index=INDEX_DISPLAY_NAME,
    library_version=VDF_VERSION,
    dir=".",
    model_name="jt-emb-model",
)

export_vvs = ExportVertexAIVectorSearch(args=my_export_args)

# Show the exporter object as the cell result
export_vvs

<src.export_vdf.vertexai_vectorsearch_export.ExportVertexAIVectorSearch at 0x7fb2541c38b0>

In [14]:
# Inspect the arguments stored on the exporter.
export_vvs.args

{'project_id': 'hybrid-vertex',
 'location': 'us-central1',
 'index': 'vectorstore_thd_v1',
 'library_version': '0.0.1.dev',
 'dir': '.',
 'model_name': 'jt-emb-model'}

In [15]:
# Inspect the exporter's VDF directory name; in the recorded run the exported file paths start with it.
export_vvs.vdf_directory

'vdf_20240129_192550_cd2e0'

In [16]:
# Run the export. In the recorded run this exported 631 vectors to a parquet file,
# printed the resulting VDF metadata, and returned True.
export_vvs.get_data()

Fetching indexes:   0%|          | 0/1 [00:00<?, ?it/s]

Exporting vectorstore_thd_v1:   0%|          | 0/631 [00:00<?, ?it/s][A[A

Exporting vectorstore_thd_v1: 100%|██████████| 631/631 [00:00<00:00, 2058.19it/s][A[A
Fetching indexes: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]

[A                                                                              
Exporting vectorstore_thd_v1: 100%|██████████| 631/631 [01:37<00:00, 1622.75it/s][A

{
    "version": "0.0.1.dev",
    "file_structure": [
        "vdf_20240129_192550_cd2e0/vectorstore_thd_v1/1.parquet",
        "vdf_20240129_192550_cd2e0/VDF_META.json"
    ],
    "author": "jupyter",
    "exported_from": "vertexai_vectorsearch",
    "indexes": {
        "projects/934903580331/locations/us-central1/indexes/5271278645883699200": {
            "": [
                {
                    "index_name": "vectorstore_thd_v1",
                    "namespace": "",
                    "total_vector_count": 631,
                    "exported_vector_count": 631,
                    "metric": "Dot",
                    "dimensions": 768,
                    "model_name": "jt-emb-model",
                    "vector_columns": [
                        "vector"
                    ],
                    "data_path": "vdf_20240129_192550_cd2e0/vectorstore_thd_v1"
                }
            ]
        }
    }
}


True