# Testing import class for Vertex AI Vector Search

In [1]:
import os

# Step the kernel's working directory up one level so later relative paths
# resolve from the parent folder.
# NOTE: the path is relative, so every re-run of this cell moves up one more level.
root_path = ".."
os.chdir(root_path)

# Last expression: display the resulting working directory.
os.getcwd()

'/home/jupyter/vector-io'

In [2]:
# Print the working directory as seen by the shell.
!pwd

/home/jupyter/vector-io


In [3]:
# Shared naming scheme: every cloud resource name in this notebook starts with PREFIX.
VERSION = "pubv3"  # TODO
PREFIX = "vvs-vectorio-" + VERSION  # TODO

print("PREFIX = " + PREFIX)

PREFIX = vvs-vectorio-pubv3


## Load notebook config

In [4]:
# staging GCS
# `var = !cmd` captures the command's output lines; the first line is taken as the project id.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME = f"{PREFIX}-{PROJECT_ID}"
BUCKET_URI = f"gs://{BUCKET_NAME}"

# Download the notebook config file from the bucket, show it, then execute it as
# Python so its assignments become variables in this kernel.
# `config.n` is presumably the captured output joined with newlines (IPython SList) — confirm.
# NOTE(review): exec() runs whatever text is stored at that GCS path; only use with a trusted bucket.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PREFIX                   = "vvs-vectorio-pubv3"
VERSION                  = "pubv3"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = ""
VPC_NETWORK_FULL         = "None"

USE_PUBLIC_ENDPOINTS     = "True"

BUCKET_NAME              = "vvs-vectorio-pubv3-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-pubv3-hybrid-vertex"

REMOTE_GCS_FOLDER        = "gs://vvs-vectorio-pubv3-hybrid-vertex/vvs-vectorio-pubv3/embedding_indexes/tmpm4k5k6gq/"

SO_PARQUET_GCS_DIR       = "gs://vvs-vectorio-pubv3-hybrid-vertex/emb_vector_parquet/so_2000_5000_1000/tmpsgm4txp8/"

LOCAL_TEST_DIR           = "data/stack_overflow_parquet_pubv3"
LOCAL_TEST_DATA_DIR      = "data/stack_overflow_parquet_pubv3/files"

DIMENSIONS               = "768"

INDEX_DISPLAY_NAME       = "soverflow_vvs

In [5]:
# Convert the "|"-delimited string of parquet paths into a list.
# The isinstance guard keeps this cell idempotent: on a re-run the name already
# holds a list, and calling .split on it would raise AttributeError.
if isinstance(LOCAL_PARQUEST_FILE_LIST, str):
    LOCAL_PARQUEST_FILE_LIST = LOCAL_PARQUEST_FILE_LIST.split("|")
LOCAL_PARQUEST_FILE_LIST

['data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
 'data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
 'data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet']

## Imports

In [6]:
# Standard library
import json
import logging
import time
import warnings
from pprint import pprint

# Third-party
import pandas as pd
from google.cloud import aiplatform as aip
from google.cloud import bigquery
from google.cloud import storage

# Keep cell output quiet: drop log records at WARNING level and below,
# and ignore all Python warnings.
logging.disable(logging.WARNING)
warnings.filterwarnings("ignore")

# Record the SDK versions this notebook ran against.
print(f"BigQuery SDK version      : {bigquery.__version__}")
print(f"Vertex AI SDK version     : {aip.__version__}")
print(f"Cloud Storage SDK version : {storage.__version__}")

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [7]:
# Initialise the Vertex AI SDK for this project and region.
aip.init(project=PROJECT_ID, location=REGION)

# Cloud Storage client for the same project.
storage_client = storage.Client(project=PROJECT_ID)

### Vertex AI Vector Search Import class

In [8]:
import os

# sys.path.append("..")
# from vdf_io.import_vdf.vertexai_vector_search_import import ImportVertexAIVectorSearch # TODO
# from vdf_io.names import DBNames

## Get test data

In [9]:
# !tree notebooks/data

In [36]:
# Config values used here, with example values:
#   LOCAL_TEST_DIR       -> 'data/stack_overflow_parquet'
#   LOCAL_TEST_DATA_DIR  -> 'data/stack_overflow_parquet/files'

# Location of the local VDF metadata json file.
TEST_VDF_META_DIR = "notebooks/" + LOCAL_TEST_DIR
TEST_VDF_META = TEST_VDF_META_DIR + "/VDF_META.json"

VDF_VERSION = "v1"
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S")

print(f"TEST_VDF_META : {TEST_VDF_META}")
print(f"DIMENSIONS    : {DIMENSIONS}")
print(f"VDF_VERSION   : {VDF_VERSION}")
print(f"TIMESTAMP_vdf : {TIMESTAMP_vdf}\n")

# Metadata file first, followed by each local parquet file prefixed with "notebooks/".
FILE_STRUCTURE_V1 = [TEST_VDF_META] + [
    "notebooks/" + parquet_path for parquet_path in LOCAL_PARQUEST_FILE_LIST
]

FILE_STRUCTURE_V1

TEST_VDF_META : notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json
DIMENSIONS    : 768
VDF_VERSION   : v1
TIMESTAMP_vdf : 20240205-142539



['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
 'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
 'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
 'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet']

In [11]:
# validate parquet files
# Index 1 is the first parquet path; index 0 of FILE_STRUCTURE_V1 holds the VDF_META.json path.
df_from_pq = pd.read_parquet(FILE_STRUCTURE_V1[1])

# Show (rows, columns), then preview the first three rows.
print(df_from_pq.shape)
df_from_pq.head(3)

(1000, 5)


Unnamed: 0,id,embedding,tag,score,crowding_tag
0,43154170,"[-0.024622129276394844, -0.005234652664512396,...",security,16,cors
1,43441856,"[0.01128651574254036, -0.0018839503172785044, ...",javascript,387,ecmascript-6
2,43460880,"[-0.04847663268446922, -0.01450541615486145, 0...",unit-testing,20,xunit


### Create VDF Metadata json

In [12]:
# target_vs_index = aip.MatchingEngineIndex(INDEX_RESOURCE_NAME)
# target_vs_index.to_dict()

In [41]:
# Folder name used as the root of each index's `data_path` in the metadata.
DATA_PATH = "vdf_20240201_181031_a26f5"

# VDF metadata describing the exported dataset: who/where/when it was exported,
# the files that make it up, and one entry per index namespace.
my_vdf = {
    "author": "jordantotten",
    "exported_from": "vertex",
    "file_structure": FILE_STRUCTURE_V1,
    "version": VDF_VERSION,
    "exported_at": TIMESTAMP_vdf,
    "indexes": {
        "soverflow_vvs_vectorio_pubv3": [
            {
                "data_path": f"{DATA_PATH}/soverflow_vvs_vectorio_pubv3",
                # DIMENSIONS comes from the config as a string; store it as an int.
                "dimensions": int(DIMENSIONS),
                "exported_vector_count": 1000,
                "metric": "Dot",
                "model_name": "textembedding-gecko@001",
                "namespace": "",
                "total_vector_count": 1000,
                # Must name the parquet column that holds the vectors; the test
                # parquet files store them in the "embedding" column.
                "vector_columns": ["embedding"],
            }
        ]
    },
}
pprint(my_vdf)

{'author': 'jordantotten',
 'exported_at': '20240205-142539',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet'],
 'indexes': {'soverflow_vvs_vectorio_pubv3': [{'data_path': 'vdf_20240201_181031_a26f5/soverflow_vvs_vectorio_pubv3',
                                               'dimensions': 768,
                                               'exported_vector_count': 1000,
                                               'metric': 'Dot',
                                               'model_name': 'textembedding-gecko@001',
                                               'namespace': '',
                                               

*Write local VDF metadata json file*

In [42]:
# Serialise the metadata dict and write it to the local VDF_META.json path.
with open(TEST_VDF_META, "w") as meta_file:
    meta_file.write(json.dumps(my_vdf))

*Validate json file*

In [43]:
# Load the metadata file back from disk and pretty-print it to confirm the write.
with open(TEST_VDF_META) as meta_file:
    d = json.load(meta_file)
pprint(d)

{'author': 'jordantotten',
 'exported_at': '20240205-142539',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet'],
 'indexes': {'soverflow_vvs_vectorio_pubv3': [{'data_path': 'vdf_20240201_181031_a26f5/soverflow_vvs_vectorio_pubv3',
                                               'dimensions': 768,
                                               'exported_vector_count': 1000,
                                               'metric': 'Dot',
                                               'model_name': 'textembedding-gecko@001',
                                               'namespace': '',
                                               

### run via Command Line

In [50]:
# Flags noted by the author but not included in the command below:
#   --create_new
#   --deploy_new_index

# Command-line invocation of the import script for this test directory
# (leading and trailing newlines kept so the printed command stands on its own line).
CL_STRING = (
    "\n"
    f"src/import_vdf.py --dir {TEST_VDF_META_DIR} vertexai_vectorsearch -p {PROJECT_ID} -l {REGION}"
    "\n"
)
print(CL_STRING)


src/import_vdf.py --dir notebooks/data/stack_overflow_parquet_pubv3 vertexai_vectorsearch -p hybrid-vertex -l us-central1



# Initialize import class

> Pass the config for the target index to import vectors to

In [22]:
# Show the index display name and deployed index id currently in scope.
print(
    f"INDEX_DISPLAY_NAME : {INDEX_DISPLAY_NAME}",
    f"DEPLOYED_INDEX_ID  : {DEPLOYED_INDEX_ID}",
    sep="\n",
)

INDEX_DISPLAY_NAME : soverflow_vvs_vectorio_pubv3
DEPLOYED_INDEX_ID  : soverflow_vvs_vectorio_pubv3_20240130131739


In [23]:
# Target index identifier: INDEX_DISPLAY_NAME | DEPLOYED_INDEX_ID | INDEX_RESOURCE_NAME
# NOTE(review): this value is not referenced in `my_import_args` below — confirm
# how the import class is expected to receive it.
TARGET_INDEX_ARG = "new_index_vv4"

# Arguments handed to the import class. Each key appears exactly once: the
# original literal listed "machine_type" twice, and in a dict literal a repeated
# key silently overrides the earlier entry.
my_import_args = {
    # project / library settings
    "project_id": PROJECT_ID,
    "location": REGION,
    "batch_size": 100,
    "library_version": VDF_VERSION,
    # NOTE(review): the recorded output of this cell shows a 'notebooks/...' path
    # for "dir" rather than DATA_PATH — confirm which directory the importer expects.
    "dir": DATA_PATH,
    # restricts / crowding: map Vertex namespaces to parquet column names
    "filter_restricts": [
        {
            "namespace": "tag",  # vertex VS namespace
            "allow_list": ["tag"],  # col name
        },
    ],
    "numeric_restricts": [{"namespace": "score", "data_type": "value_int"}],
    "crowding_tag": "crowding_tag",
    # index creation settings
    "create_new_index": False,
    "gcs_bucket": BUCKET_NAME,
    "machine_type": "e2-standard-16",
    "approx_nn_count": 150,
    "leaf_node_emb_count": 1000,
    "leaf_nodes_percent": 7,
    "distance_measure": "DOT_PRODUCT_DISTANCE",
    # index deployment settings
    "deploy_new_index": False,
    "min_replicas": 1,
    "max_replicas": 1,
}
pprint(my_import_args)

{'approx_nn_count': 150,
 'batch_size': 100,
 'create_new_index': False,
 'crowding_tag': 'crowding_tag',
 'deploy_new_index': False,
 'dir': 'notebooks/data/stack_overflow_parquet_pubv3/files',
 'distance_measure': 'DOT_PRODUCT_DISTANCE',
 'filter_restricts': [{'allow_list': ['tag'], 'namespace': 'tag'}],
 'gcs_bucket': 'vvs-vectorio-pubv3-hybrid-vertex',
 'leaf_node_emb_count': 1000,
 'leaf_nodes_percent': 7,
 'library_version': 'v1',
 'location': 'us-central1',
 'machine_type': 'e2-standard-16',
 'max_replicas': 1,
 'min_replicas': 1,
 'numeric_restricts': [{'data_type': 'value_int', 'namespace': 'score'}],
 'project_id': 'hybrid-vertex',
 'project_num': '934903580331'}


In [19]:
# Build the importer from the argument dict.
# NOTE(review): the import of ImportVertexAIVectorSearch is commented out earlier in
# this notebook, so on a fresh kernel this name is undefined — confirm the import path.
import_vvs = ImportVertexAIVectorSearch(args=my_import_args)

# Last expression: display the importer object.
import_vvs

list_restrict_entries : [{'namespace': 'tag', 'allow_list': ['tag']}]
list_of_numeric_entries : [{'namespace': 'score', 'data_type': 'value_int'}]
checking if new_index_vv4 already exists...
checking existing display_names and resource_names
found existing index: projects/934903580331/locations/us-central1/indexes/2486448789192179712
Importing to index : new_index_vv4
Full resource name : projects/934903580331/locations/us-central1/indexes/2486448789192179712
Target index config:
{
    "dimensions": 768.0,
    "approximateNeighborsCount": 150.0,
    "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
    "algorithmConfig": {
        "treeAhConfig": {
            "leafNodeEmbeddingCount": "1000",
            "leafNodesToSearchPercent": 10.0
        }
    },
    "shardSize": "SHARD_SIZE_MEDIUM"
}


<src.import_vdf.jt_vertexai_vector_search_import.ImportVertexAIVectorSearch at 0x7f003c82b550>

In [20]:
# Display the VDF metadata held by the importer.
import_vvs.vdf_meta

{'author': 'jordantotten',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
  'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
  'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
  'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet'],
 'version': 'v1',
 'exported_at': '20240203-144741',
 'indexes': {'test_so_index': [{'data_path': 'notebooks/data/stack_overflow_parquet_pubv3/files',
    'dimensions': '768',
    'exported_vector_count': 1000,
    'metric': 'Dot',
    'model_name': 'textembedding-gecko@001',
    'namespace': 'so_questions',
    'total_vector_count': 1000,
    'vector_columns': ['embedding']}]}}

In [21]:
# Run the importer's upsert step against the target index.
import_vvs.upsert_data()

Importing data from: test_so_index
index_meta: [{'data_path': 'notebooks/data/stack_overflow_parquet_pubv3/files', 'dimensions': '768', 'exported_vector_count': 1000, 'metric': 'Dot', 'model_name': 'textembedding-gecko@001', 'namespace': 'so_questions', 'total_vector_count': 1000, 'vector_columns': ['embedding']}]
data_path: notebooks/data/stack_overflow_parquet_pubv3/files
vector_column_name    : embedding
vector_metadata_names : ['embedding']


Inserting data: 100%|██████████| 3/3 [00:11<00:00,  3.94s/it]

Index import complete
Updated new_index_vv4 with 3000 vectors





In [22]:
# import_vvs.list_index_endpoints

## Validate imported vectors

Vectors upserted via streaming updates are immediately available for querying, but they will not be reflected in the index's `vectorsCount` until the index is compacted (see details on [index compaction here](https://cloud.google.com/vertex-ai/docs/vector-search/update-rebuild-index#compaction)).

> Index compaction occurs when the uncompacted data size is > 1 GB or the oldest uncompacted data is at least three days old. 

Below, note the `indexStats`. Then verify that the newly upserted vector IDs can be queried.

In [23]:
# Show which index resource the importer targeted.
print(f"target_index_resource_name:\n {import_vvs.target_index_resource_name}")

# Look up that index with the Vertex AI SDK and display its description as a dict.
target_vs_index = aip.MatchingEngineIndex(import_vvs.target_index_resource_name)
target_vs_index.to_dict()

target_index_resource_name:
 projects/934903580331/locations/us-central1/indexes/2486448789192179712


{'name': 'projects/934903580331/locations/us-central1/indexes/2486448789192179712',
 'displayName': 'new_index_vv4',
 'description': 'created during vectorio import at 20240204_222701',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '1000',
     'leafNodesToSearchPercent': 10.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'etag': 'AMEw9yNrFJej2PV6Vw34fz0RE2voRmjkD_LQe_mmn5J2AgTyUWw2JbgCynnIEoBVjCjM',
 'createTime': '2024-02-04T22:27:01.899233Z',
 'updateTime': '2024-02-04T22:27:08.611341Z',
 'indexStats': {'vectorsCount': '1', 'shardsCount': 1},
 'indexUpdateMethod': 'STREAM_UPDATE',
 'encryptionSpec': {}}

In [24]:
# Handle to the index endpoint named by ENDPOINT_RESOURCE_NAME.
# NOTE(review): ENDPOINT_RESOURCE_NAME is not assigned in any visible cell — presumably set by the exec'd config; confirm.
my_index_endpoint = aip.MatchingEngineIndexEndpoint(ENDPOINT_RESOURCE_NAME)
my_index_endpoint

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7effdcbaf820> 
resource name: projects/934903580331/locations/us-central1/indexEndpoints/5739455095037231104

In [25]:
# Take the first three ids from the validated parquet frame as a plain Python list.
ids_to_check = df_from_pq["id"].iloc[:3].tolist()

ids_to_check

['43154170', '43441856', '43460880']

In [26]:
# Read the selected ids back from the deployed index through the endpoint.
read_response = my_index_endpoint.read_index_datapoints(
    deployed_index_id=DEPLOYED_INDEX_ID,
    ids=ids_to_check,
)
# Number of datapoints returned; compare against len(ids_to_check).
len(read_response)

3

# Notes

### Format vectors

```
        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [
                        str(value) for value in embedding
                    ],
                    "restricts": [
                        {"namespace": "tags", "allow": [str(tag_r)]}
                    ],
                    "numeric_restricts": [
                        {"namespace": "score", "value_int": int(score)}
                    ],
                    "crowding_tag": str(tag_crowd)
                }
            )
            + "\n"
            # for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
            for id, embedding, tag_r, score, tag_crowd in zip(
                id_chunk[is_successful], 
                question_chunk_embeddings, 
                tags_restrict, 
                scores_chunk, 
                tags_crowd
            )
        ]
        f.writelines(embeddings_formatted)
```

**Finished**