# Testing import class for Vertex AI Vector Search

In [1]:
import os

root_path = '..'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/vector-io'

In [2]:
!pwd

/home/jupyter/vector-io


In [3]:
# naming convention for all cloud resources
VERSION        = "pubv3"                     # TODO
PREFIX         = f'vvs-vectorio-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = vvs-vectorio-pubv3


## Load notebook config

In [4]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PREFIX                   = "vvs-vectorio-pubv3"
VERSION                  = "pubv3"

PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"

REGION                   = "us-central1"
BQ_REGION                = "US"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

VPC_NETWORK_NAME         = ""
VPC_NETWORK_FULL         = "None"

USE_PUBLIC_ENDPOINTS     = "True"

BUCKET_NAME              = "vvs-vectorio-pubv3-hybrid-vertex"
BUCKET_URI               = "gs://vvs-vectorio-pubv3-hybrid-vertex"

REMOTE_GCS_FOLDER        = "gs://vvs-vectorio-pubv3-hybrid-vertex/vvs-vectorio-pubv3/embedding_indexes/tmpm4k5k6gq/"

SO_PARQUET_GCS_DIR       = "gs://vvs-vectorio-pubv3-hybrid-vertex/emb_vector_parquet/so_2000_5000_1000/tmpsgm4txp8/"

LOCAL_TEST_DIR           = "data/stack_overflow_parquet_pubv3"
LOCAL_TEST_DATA_DIR      = "data/stack_overflow_parquet_pubv3/files"

DIMENSIONS               = "768"

INDEX_DISPLAY_NAME       = "soverflow_vvs

In [5]:
LOCAL_PARQUEST_FILE_LIST = LOCAL_PARQUEST_FILE_LIST.split("|")
LOCAL_PARQUEST_FILE_LIST

['data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
 'data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
 'data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet']

## Imports

In [6]:
import pandas as pd
import numpy as np
import itertools
import time 
import json
import uuid

from pprint import pprint

from google.cloud import aiplatform as aip
from google.cloud import storage
from google.cloud import bigquery

# logging
import logging
logging.disable(logging.WARNING)

#python warning 
import warnings
warnings.filterwarnings("ignore")

print(f'BigQuery SDK version      : {bigquery.__version__}')
print(f'Vertex AI SDK version     : {aip.__version__}')
print(f'Cloud Storage SDK version : {storage.__version__}')

BigQuery SDK version      : 3.15.0
Vertex AI SDK version     : 1.39.0
Cloud Storage SDK version : 2.14.0


### init Google Cloud SDK clients

In [7]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
aip.init(project=PROJECT_ID, location=REGION)

### Vertex AI Vector Search Import class

In [8]:
import sys
import os

# sys.path.append("..")
from src.import_vdf.vertexai_vector_search_import import ImportVertexAIVectorSearch
from src.names import DBNames

## Get test data

In [9]:
# !tree notebooks/data

In [10]:
# LOCAL_TEST_DIR            # 'data/stack_overflow_parquet'
# LOCAL_TEST_DATA_DIR       # 'data/stack_overflow_parquet/files'

# local vdf json file
TEST_VDF_META = f"notebooks/{LOCAL_TEST_DIR}/VDF_META.json"

VDF_VERSION = "v1"
TIMESTAMP_vdf = time.strftime("%Y%m%d-%H%M%S")

print(f"TEST_VDF_META : {TEST_VDF_META}")
print(f"DIMENSIONS    : {DIMENSIONS}")
print(f"VDF_VERSION   : {VDF_VERSION}")
print(f"TIMESTAMP_vdf : {TIMESTAMP_vdf}\n")

FILE_STRUCTURE_V1 = []
FILE_STRUCTURE_V1.append(f"{TEST_VDF_META}")
for element in LOCAL_PARQUEST_FILE_LIST:
    FILE_STRUCTURE_V1.append(f"notebooks/{element}")

FILE_STRUCTURE_V1

TEST_VDF_META : notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json
DIMENSIONS    : 768
VDF_VERSION   : v1
TIMESTAMP_vdf : 20240201-140843



['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
 'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
 'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
 'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet']

In [11]:
# validate parquet files
df_from_pq = pd.read_parquet(FILE_STRUCTURE_V1[1])

print(df_from_pq.shape)
df_from_pq.head(3)

(1000, 5)


Unnamed: 0,id,embedding,tag,score,crowding_tag
0,43154170,"[-0.024622129276394844, -0.005234652664512396,...",security,16,cors
1,43441856,"[0.01128651574254036, -0.0018839503172785044, ...",javascript,387,ecmascript-6
2,43460880,"[-0.04847663268446922, -0.01450541615486145, 0...",unit-testing,20,xunit


### Create VDF Metadata json

In [12]:
target_vs_index = aip.MatchingEngineIndex(INDEX_RESOURCE_NAME)
target_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/1081325705452584960',
 'displayName': 'soverflow_vvs_vectorio_pubv3',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'deployedIndexes': [{'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/5739455095037231104',
   'deployedIndexId': 'soverflow_vvs_vectorio_pubv3_20240130131739'}],
 'etag': 'AMEw9yP976g4WQR_AzYx_UpBnsUsaRsBxhPGi_MH7faQGG-WrTSCpnKxhmVq7oMPlBo=',
 'createTime': '2024-01-30T13:26:08.725251Z',
 'updateTime': '2024-01-30T13:26:17.176312Z',
 'indexStats': {'vectorsCount': '1', 'shardsC

In [13]:
my_vdf = {
    "author": "jordantotten",
    "exported_from": "vertex",
    "file_structure": FILE_STRUCTURE_V1,
    "version": VDF_VERSION,
    "exported_at": TIMESTAMP_vdf,
    "indexes": {
            "test_so_index": [
                {
                    "data_path": f"notebooks/{LOCAL_TEST_DATA_DIR}",
                    "dimensions": DIMENSIONS,
                    "exported_vector_count": 1000,
                    "metric": "Dot",
                    "model_name": "textembedding-gecko@001",
                    "namespace": "so_questions",
                    "total_vector_count": 1000,
                    "vector_columns": ["embedding"]
                }
            ]
    }
}
pprint(my_vdf)

{'author': 'jordantotten',
 'exported_at': '20240201-140843',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet'],
 'indexes': {'test_so_index': [{'data_path': 'notebooks/data/stack_overflow_parquet_pubv3/files',
                                'dimensions': '768',
                                'exported_vector_count': 1000,
                                'metric': 'Dot',
                                'model_name': 'textembedding-gecko@001',
                                'namespace': 'so_questions',
                                'total_vector_count': 1000,
                                'vector_columns': ['embedding']}]},


*Write local VDF metadata json file*

In [14]:
with open(f"{TEST_VDF_META}", 'w') as fp:
    json.dump(my_vdf, fp)

*Validate json file*

In [15]:
with open(f"{TEST_VDF_META}") as f:
    d = json.load(f)
    pprint(d)

{'author': 'jordantotten',
 'exported_at': '20240201-140843',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
                    'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet'],
 'indexes': {'test_so_index': [{'data_path': 'notebooks/data/stack_overflow_parquet_pubv3/files',
                                'dimensions': '768',
                                'exported_vector_count': 1000,
                                'metric': 'Dot',
                                'model_name': 'textembedding-gecko@001',
                                'namespace': 'so_questions',
                                'total_vector_count': 1000,
                                'vector_columns': ['embedding']}]},


# Initialize import class

> Pass config for target index to import vectors t

In [21]:
print(f"INDEX_DISPLAY_NAME : {INDEX_DISPLAY_NAME}")
print(f"DEPLOYED_INDEX_ID  : {DEPLOYED_INDEX_ID}")

INDEX_DISPLAY_NAME : soverflow_vvs_vectorio_pubv3
DEPLOYED_INDEX_ID  : soverflow_vvs_vectorio_pubv3_20240130131739


In [19]:
my_import_args = {
    "project_id": PROJECT_ID,
    "location": REGION,
    "project_num" : PROJECT_NUM,
    "target_index_id": DEPLOYED_INDEX_ID, # INDEX_DISPLAY_NAME | DEPLOYED_INDEX_ID
    "batch_size": 100,
    "library_version": VDF_VERSION,
    "dir": f"notebooks/{LOCAL_TEST_DIR}",
    "filter_restricts": [
        {
            "namespace": "tag",     # vertex VS namespace
            "allow_list": ["tag"],  # col name
        },
    ],
    "numeric_restricts" : [
        {
            "namespace": "score", 
            "data_type": "value_int"
        }
    ],
    "crowding_tag" : "crowding_tag"
}
pprint(my_import_args)

{'batch_size': 100,
 'crowding_tag': 'crowding_tag',
 'dir': 'notebooks/data/stack_overflow_parquet_pubv3',
 'filter_restricts': [{'allow_list': ['tag'], 'namespace': 'tag'}],
 'library_version': 'v1',
 'location': 'us-central1',
 'numeric_restricts': [{'data_type': 'value_int', 'namespace': 'score'}],
 'project_id': 'hybrid-vertex',
 'project_num': '934903580331',
 'target_index_id': 'soverflow_vvs_vectorio_pubv3_20240130131739'}


In [20]:
import_vvs = ImportVertexAIVectorSearch(
    args=my_import_args 
)

import_vvs

Checking undeployed indexes...

No undeployed indexes named: soverflow_vvs_vectorio_pubv3_20240130131739

Checking deployed indexes...

Found target_index: soverflow_vvs_vectorio_pubv3
currently deployed to soverflow_vvs_vectorio_pubv3_endpoint
list_of_ns_restrict_entries : [{'namespace': 'tag', 'allow_list': ['tag']}]
list_of_numeric_entries : [{'namespace': 'score', 'data_type': 'value_int'}]
Importing to index : soverflow_vvs_vectorio_pubv3
Full resource name : projects/934903580331/locations/us-central1/indexes/1081325705452584960
Target index config:
{
    "dimensions": 768.0,
    "approximateNeighborsCount": 150.0,
    "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
    "algorithmConfig": {
        "treeAhConfig": {
            "leafNodeEmbeddingCount": "500",
            "leafNodesToSearchPercent": 80.0
        }
    },
    "shardSize": "SHARD_SIZE_MEDIUM"
}


<src.import_vdf.vertexai_vector_search_import.ImportVertexAIVectorSearch at 0x7faee76de410>

In [35]:
import_vvs.vdf_meta

{'author': 'jordantotten',
 'exported_from': 'vertex',
 'file_structure': ['notebooks/data/stack_overflow_parquet_pubv3/VDF_META.json',
  'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2001.parquet',
  'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2000.parquet',
  'notebooks/data/stack_overflow_parquet_pubv3/files/so_tmpm4k5k6gq_2002.parquet'],
 'version': 'v1',
 'exported_at': '20240201-134348',
 'indexes': {'test_so_index': [{'data_path': 'notebooks/data/stack_overflow_parquet_pubv3/files',
    'dimensions': '768',
    'exported_vector_count': 1000,
    'metric': 'Dot',
    'model_name': 'textembedding-gecko@001',
    'namespace': 'so_questions',
    'total_vector_count': 1000,
    'vector_columns': ['embedding']}]}}

In [36]:
import_vvs.upsert_data()

Importing data from: test_so_index
index_meta: [{'data_path': 'notebooks/data/stack_overflow_parquet_pubv3/files', 'dimensions': '768', 'exported_vector_count': 1000, 'metric': 'Dot', 'model_name': 'textembedding-gecko@001', 'namespace': 'so_questions', 'total_vector_count': 1000, 'vector_columns': ['embedding']}]
data_path: notebooks/data/stack_overflow_parquet_pubv3/files
vector_column_name    : embedding
vector_metadata_names : ['embedding']


Inserting data: 100%|██████████| 3/3 [00:11<00:00,  3.93s/it]

Index import complete
Updated soverflow_vvs_vectorio_pubv3 with 3000 vectors





## Validate imported vectors

Vectors upserted via streaming updates are immeadetaley available for querying, but they will not be reflected in the indexes `vectorsCount` until the index is compacted (see details on [index compaction here](https://cloud.google.com/vertex-ai/docs/vector-search/update-rebuild-index#compaction))

> Index compaction occurs when the uncompacted data size is > 1 GB or the oldest uncompacted data is at least three days old. 

Below, note the `indexStats`. Then verify ability to query newly upserted vector ID 

In [28]:
target_vs_index.to_dict()

{'name': 'projects/934903580331/locations/us-central1/indexes/1081325705452584960',
 'displayName': 'soverflow_vvs_vectorio_pubv3',
 'description': 'sample index for vectorio demo',
 'metadataSchemaUri': 'gs://google-cloud-aiplatform/schema/matchingengine/metadata/nearest_neighbor_search_1.0.0.yaml',
 'metadata': {'config': {'dimensions': 768.0,
   'approximateNeighborsCount': 150.0,
   'distanceMeasureType': 'DOT_PRODUCT_DISTANCE',
   'algorithmConfig': {'treeAhConfig': {'leafNodeEmbeddingCount': '500',
     'leafNodesToSearchPercent': 80.0}},
   'shardSize': 'SHARD_SIZE_MEDIUM'}},
 'deployedIndexes': [{'indexEndpoint': 'projects/934903580331/locations/us-central1/indexEndpoints/5739455095037231104',
   'deployedIndexId': 'soverflow_vvs_vectorio_pubv3_20240130131739'}],
 'etag': 'AMEw9yMTiBrKPnTl5ruVnJGogw4HCNKEl1GlQbweMdFp_wiCeGK9jXkO3oJ1C9MQ4gg=',
 'createTime': '2024-01-30T13:26:08.725251Z',
 'updateTime': '2024-01-30T13:26:17.176312Z',
 'indexStats': {'vectorsCount': '1', 'shardsC

In [21]:
my_index_endpoint = aip.MatchingEngineIndexEndpoint(ENDPOINT_RESOURCE_NAME)
my_index_endpoint

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7f42c1ecf640> 
resource name: projects/934903580331/locations/us-central1/indexEndpoints/5739455095037231104

In [22]:
ids_to_check = df_from_pq['id'][:3].to_list()
# ids_to_check = ['2102980']
# ids_to_check = ['36062183']

ids_to_check

['43154170', '43441856', '43460880']

In [23]:
read_response = my_index_endpoint.read_index_datapoints(
    deployed_index_id=DEPLOYED_INDEX_ID, 
    ids=ids_to_check,
)
len(read_response)

3

# Notes

### Format vectors

```
        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [
                        str(value) for value in embedding
                    ],
                    "restricts": [
                        {"namespace": "tags", "allow": [str(tag_r)]}
                    ],
                    "numeric_restricts": [
                        {"namespace": "score", "value_int": int(score)}
                    ],
                    "crowding_tag": str(tag_crowd)
                }
            )
            + "\n"
            # for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
            for id, embedding, tag_r, score, tag_crowd in zip(
                id_chunk[is_successful], 
                question_chunk_embeddings, 
                tags_restrict, 
                scores_chunk, 
                tags_crowd
            )
        ]
        f.writelines(embeddings_formatted)
```

**Finished**