# Set variables

In [59]:
PROJECT_ID = 'image-similarity-393315'
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [60]:
REGION = 'us-central1'

In [71]:
BUCKET_URI = "gs://xray_data_us_central1"  # @param {type:"string"}

In [72]:
BUCKET = 'xray_data_us_central1'

In [63]:
import pickle
from google.cloud import storage
import numpy as np

In [64]:
SERVICE_ACCOUNT = 'default-service-account@image-similarity-393315.iam.gserviceaccount.com' # change to your service account

# Download data from bucket

In [73]:
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.get_bucket(BUCKET)
blobs = bucket.list_blobs()

In [8]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The ID of your GCS object
    # source_blob_name = "storage-object-name"

    # The path to which the file should be downloaded
    # destination_file_name = "local/path/to/file"

    storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)

    # Construct a client side representation of a blob.
    # Note `Bucket.blob` differs from `Bucket.get_blob` as it doesn't retrieve
    # any content from Google Cloud Storage. As we don't need additional data,
    # using `Bucket.blob` is preferred here.
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

    print(
        "Downloaded storage object {} from bucket {} to local file {}.".format(
            source_blob_name, bucket_name, destination_file_name
        )
    )


In [9]:
download_blob(BUCKET, 'embeddings/efficientnet_b0_vectors.pkl', 'matching_engine/embeddings.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'matching_engine/embeddings.pkl'

In [10]:
#function to load the dictionary from a pickle file
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [407]:
embeddings = load_obj('embeddings')

# pandas

In [415]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [423]:
df = pd.DataFrame(embeddings.items(), columns=['img_name', 'array'])

In [424]:
df.head(1)

Unnamed: 0,img_name,array
0,IM-0001-0001.jpeg,"[-0.24213864, 2.1718588, -0.12323886, -0.16982..."


In [422]:
df.info

<bound method DataFrame.info of                              0  \
0            IM-0001-0001.jpeg   
1            IM-0003-0001.jpeg   
2            IM-0005-0001.jpeg   
3            IM-0006-0001.jpeg   
4            IM-0007-0001.jpeg   
...                        ...   
5851   person99_virus_183.jpeg   
5852  person9_bacteria_38.jpeg   
5853  person9_bacteria_39.jpeg   
5854  person9_bacteria_40.jpeg   
5855  person9_bacteria_41.jpeg   

                                                      1  
0     [-0.24213864, 2.1718588, -0.12323886, -0.16982...  
1     [-0.12668636, 0.388102, -0.16648227, -0.206382...  
2     [-0.25819594, 1.5601717, -0.09210002, 1.401620...  
3     [-0.10521202, 1.0930141, -0.21762829, -0.04048...  
4     [-0.27805325, 1.6878662, -0.20222458, 4.357917...  
...                                                 ...  
5851  [-0.25678584, -0.1522446, -0.0336709, -0.15572...  
5852  [-0.21303566, 1.8309299, 0.5107095, -0.2414375...  
5853  [-0.14925578, 1.7872566, -0.209

In [425]:
train, test = train_test_split(df, test_size=0.1, random_state=1)

In [431]:
train.shape

(5270, 2)

In [430]:
test.shape

(586, 2)

In [433]:
embeddings = np.array(list(embeddings.values()))

In [434]:
embeddings

array([[-0.24213864,  2.1718588 , -0.12323886, ..., -0.24300715,
        -0.0537712 , -0.18774644],
       [-0.12668636,  0.388102  , -0.16648227, ..., -0.23803757,
        -0.06480425,  2.0608761 ],
       [-0.25819594,  1.5601717 , -0.09210002, ..., -0.1997578 ,
        -0.09321513, -0.02120378],
       ...,
       [-0.14925578,  1.7872566 , -0.20936623, ..., -0.2784421 ,
        -0.27832803,  2.2654784 ],
       [-0.2712467 ,  0.7748953 , -0.15583466, ..., -0.18580486,
        -0.23280355,  1.3082702 ],
       [-0.24619427, -0.03436108,  0.7781509 , ..., -0.0994812 ,
        -0.18152872, -0.19130498]], dtype=float32)

In [446]:
train['array'] = train['array'].to_numpy()

In [449]:
test['array'] = test['array'].to_numpy()

In [448]:
train['array'].values

array([array([ 0.5584762 ,  1.7270546 ,  0.9511839 , ...,  0.5438247 ,
              -0.18814966, -0.27733767], dtype=float32)               ,
       array([-0.08214644, -0.21455409,  0.5339077 , ..., -0.27389807,
              -0.02904631,  2.8180537 ], dtype=float32)               ,
       array([-0.2696867 , -0.26122242,  0.05249351, ..., -0.20173565,
              -0.11204528,  0.59696203], dtype=float32)               ,
       ...,
       array([-0.25587174,  2.8557713 ,  1.1684427 , ..., -0.26464173,
              -0.15293182, -0.15197457], dtype=float32)               ,
       array([-0.18286645, -0.27846202,  0.7890189 , ..., -0.25566828,
              -0.11964542, -0.16736534], dtype=float32)               ,
       array([-0.2235914 ,  2.449529  , -0.12374002, ..., -0.27680856,
              -0.11873614, -0.22669679], dtype=float32)               ],
      dtype=object)

In [445]:
train['img_name'].values

array(['person547_virus_1086.jpeg', 'person520_bacteria_2205.jpeg',
       'NORMAL2-IM-1234-0001.jpeg', ..., 'person348_bacteria_1604.jpeg',
       'IM-0353-0001.jpeg', 'person653_virus_1235.jpeg'], dtype=object)

In [356]:
train.shape

(5766, 1280)

In [358]:
test.shape

(90, 1280)

In [453]:
import json

with open('image_embeddings.json', 'w') as f:
    embeddings_formatted = [
        json.dumps(
            {
                'id': str(index),
                'embedding': [str(value) for value in embedding],
                    }
        )
        + '\n'
        for index, embedding in train.values
    ]
    f.writelines(embeddings_formatted)

In [454]:
EMBEDDINGS_INITIAL_URI = f"{BUCKET_URI}/embeddings/json"
! gsutil cp image_embeddings.json {EMBEDDINGS_INITIAL_URI}

Copying file://image_embeddings.json [Content-Type=application/json]...
\ [1 files][ 94.3 MiB/ 94.3 MiB]                                                
Operation completed over 1 objects/94.3 MiB.                                     


# Create index

In [457]:
train['array'][0].shape

(1280,)

In [458]:
DIMENSIONS = train['array'][0].shape
DISPLAY_NAME = 'image_similarity_ann'

In [461]:
import os
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [462]:
tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=EMBEDDINGS_INITIAL_URI,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=7,
    description='Image Similarity ANN index',
    labels={'label_name': 'label_value'},
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/327163206482/locations/us-central1/indexes/5492482793206185984/operations/6681458589757865984
MatchingEngineIndex created. Resource name: projects/327163206482/locations/us-central1/indexes/5492482793206185984
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/327163206482/locations/us-central1/indexes/5492482793206185984')


In [463]:
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

'projects/327163206482/locations/us-central1/indexes/5492482793206185984'

In [464]:
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

# Update index

In [311]:
with open('image_embeddings_incremental.json', 'w') as f:
    index = 0
    f.write(
        json.dumps(
            {
                'id': str(index),
                'embedding': [str(0) for _ in train[index]],
            }
        )
        + '\n'
    )

In [91]:
EMBEDDINGS_UPDATE_URI = f"{BUCKET_URI}/embeddings/incremental/"

! gsutil cp image_embeddings_incremental.json {EMBEDDINGS_UPDATE_URI}

In [None]:
tree_ah_index = tree_ah_index.update_embeddings(
    contents_delta_uri=EMBEDDINGS_UPDATE_URI,
)

In [None]:
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

# Create IndexEndpoint with VPC network

In [465]:
# Retrieve the project number
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]

VPC_NETWORK = 'imagesimilarityvpc'
VPC_NETWORK_FULL = "projects/{}/global/networks/{}".format(PROJECT_NUMBER, VPC_NETWORK)
VPC_NETWORK_FULL

'projects/327163206482/global/networks/imagesimilarityvpc'

In [466]:
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="ann_index_endpoint",
    description="index endpoint description",
    network=VPC_NETWORK_FULL,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248/operations/5364155698751995904
MatchingEngineIndexEndpoint created. Resource name: projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248')


In [467]:
INDEX_ENDPOINT_NAME = my_index_endpoint.resource_name
INDEX_ENDPOINT_NAME

'projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248'

# Deploy Index

In [468]:
DEPLOYED_INDEX_ID = "tree_ah_image_similarity_deployed_named"

In [469]:
my_index_endpoint = my_index_endpoint.deploy_index(
    index=tree_ah_index, deployed_index_id=DEPLOYED_INDEX_ID
)

my_index_endpoint.deployed_indexes

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248/operations/7188113547837046784
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/327163206482/locations/us-central1/indexEndpoints/7798888752373301248


[id: "tree_ah_image_similarity_deployed_named"
index: "projects/327163206482/locations/us-central1/indexes/5492482793206185984"
create_time {
  seconds: 1690139123
  nanos: 767995000
}
private_endpoints {
  match_grpc_address: "172.24.0.5"
}
index_sync_time {
  seconds: 1690139123
  nanos: 767995000
}
automatic_resources {
  min_replica_count: 2
  max_replica_count: 2
}
deployment_group: "default"
]

# Create Queries

In [470]:
NUM_NEIGHBOURS = 10

In [492]:
# Test query
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import \
    Namespace

# Test query
response = my_index_endpoint.match(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test['array'].iloc[11:12],
    num_neighbors=NUM_NEIGHBOURS,
)

response

[[MatchNeighbor(id='NORMAL2-IM-0859-0001.jpeg', distance=1575.8304443359375),
  MatchNeighbor(id='person934_bacteria_2859.jpeg', distance=1443.3421630859375),
  MatchNeighbor(id='IM-0700-0001.jpeg', distance=1385.1085205078125),
  MatchNeighbor(id='person1411_bacteria_3598.jpeg', distance=1363.507568359375),
  MatchNeighbor(id='IM-0502-0001.jpeg', distance=1280.161376953125),
  MatchNeighbor(id='IM-0501-0001.jpeg', distance=1263.3941650390625),
  MatchNeighbor(id='person359_bacteria_1646.jpeg', distance=1249.193603515625),
  MatchNeighbor(id='person38_virus_84.jpeg', distance=1233.211669921875),
  MatchNeighbor(id='person1716_bacteria_4533.jpeg', distance=1222.0478515625),
  MatchNeighbor(id='person443_bacteria_1923.jpeg', distance=1208.154541015625)]]

In [491]:
test.iloc[11:12]

Unnamed: 0,img_name,array
169,IM-0274-0001.jpeg,"[0.21890673, -0.2678706, 1.7154611, -0.2759268..."
