# Create index endpoint for hybrid search

## Setup

In [None]:
%pip install --upgrade --quiet --user google-cloud-aiplatform google-cloud-storage

Note: you may need to restart the kernel to use updated packages.




In [None]:
# Inbuild python packages
import os
from datetime import datetime
import json

# Data manipulation packages
import pandas as pd
import numpy as np

# GCP imports
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import HybridQuery

# Embeddings methods
from movie_bot import SparseVectorizer, DenseVectorizer

In [None]:
# Set Google Cloud project information
UID = datetime.now().strftime("%m%d%H%M")                       # timestamp to create Unique IDs (UID)
PROJECT_ID = 'synthetic-diode-459014-v9'                        # ID of project that has already been created
LOCATION = "europe-west1"
BUCKET_URI = f"gs://{PROJECT_ID}"                               # URI for new bucket to store embeddings

INDEX_NAME = "WhatMovie-1-index"                                # name for the index we will create
INDEX_ENDPOINT_NAME = "WhatMovie-1-index-endpoint"              # name for the index endpoint we will create
DEPLOYED_INDEX_NAME = "WhatMovie-1-deployed-index"              # name for the index we will deploy

# Initialize Vertex AI SDK
aiplatform.init(project=PROJECT_ID, location=LOCATION)

print(f"UID: {UID}")
print(f"Project ID: {PROJECT_ID}")
print(f"Region: {LOCATION}")
print(f"Bucket URI: {BUCKET_URI}")
print(f"Deployed Hybrid Index ID: {DEPLOYED_HYBRID_INDEX_ID}")

Project ID: synthetic-diode-459014-v9
Region: europe-west1
Bucket URI: gs://synthetic-diode-459014-v9
Deployed Hybrid Index ID: vs_hybridsearch_deployed_07041616


In [None]:
# Saving setup information to config.json
setup_info = {
    "UID": UID,
    "project_id": PROJECT_ID,
    "location": LOCATION,
}

## Get dataset

In [4]:
# load in dataset
df = pd.read_csv(
    'data/processed_dataset.csv',
    index_col = 'id'
)
df.head()

Unnamed: 0_level_0,title,vote_average,vote_count,release_date,overview,popularity,tagline,genres,production_companies,keywords,movie_details
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
27205,Inception,8.364,34495,2010-07-15,"Cobb, a skilled thief who commits corporate es...",83.952,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","rescue, mission, dream, airplane, paris, franc...",title: Inception release_date: 2010-07-15 genr...
157336,Interstellar,8.417,32571,2014-11-05,The adventures of a group of explorers who mak...,140.241,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","rescue, future, spacecraft, race against time,...",title: Interstellar release_date: 2014-11-05 g...
155,The Dark Knight,8.512,30619,2008-07-16,Batman raises the stakes in his war on crime. ...,130.643,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","joker, sadism, chaos, secret identity, crime f...",title: The Dark Knight release_date: 2008-07-1...
19995,Avatar,7.573,29815,2009-12-15,"In the 22nd century, a paraplegic Marine is di...",79.932,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","future, society, culture clash, space travel, ...",title: Avatar release_date: 2009-12-15 genres:...
24428,The Avengers,7.71,29166,2012-04-25,When an unexpected enemy emerges and threatens...,98.082,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,"new york city, superhero, shield, based on com...",title: The Avengers release_date: 2012-04-25 g...


In [5]:
sample_movie_details = df['movie_details'].sample(500).reset_index()
sample_movie_details

Unnamed: 0,id,movie_details
0,834586,title: Astralis release_date: 1972-05-08 genre...
1,1431329,title: David Hurn: A Life in Pictures release_...
2,316340,"title: Two, Mississippi release_date: 2015-07-..."
3,1021328,title: THE SILENT VOYAGE release_date: 2003-01...
4,849757,title: God Knows I Tried release_date: 2019-08...
...,...,...
495,1342122,title: Beyond the Binary release_date: 2024-04...
496,195859,title: Hunters Since the Beginning of Time rel...
497,215877,title: Dolls release_date: 2013-03-13 genres: ...
498,681767,title: Free Your Mind release_date: 2003-01-01...


In [6]:
sample_movie_details['id'].is_unique

True

## Generate embeddings

In [20]:
# Set vectorisers
sparse_vectorizer = TfidfVectorizer()
sparse_vectorizer.fit_transform(sample_movie_details['movie_details'])

dense_vectorizer = TextEmbeddingModel.from_pretrained("text-embedding-005")

# Embedding methods
def get_sparse_embedding(text):
    '''Vectorises movie details using TF-IDF encoding'''
    # Transform Text into TF-IDF Sparse Vector
    tfidf_vector = sparse_vectorizer.transform([text])
    # Convert to google embedding format
    return {"values": list(tfidf_vector.data), "dimensions": list(tfidf_vector.indices)}

def get_dense_embedding(text):
    '''Wrapper for dense embedding method'''
    return dense_vectorizer.get_embeddings([text])[0].values



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

class SparseVectorSet():
    vectorizer = TfidfVectorizer()
    
    def __init__(self, corpus):
        '''Vectorizes movie details using TF-IDF encoding'''
        self.vectors = self.vectorizer.fit_transform(corpus)
    
    def similarity(self, other_vectors):
        '''Calculates dot product similarity between two texts'''
        return self.v.dot(other_vectors.vectors.T).toarray()[0][0]
    
    def to_json(self, vector):
        '''Converts sparse vector to JSON format'''
        return {"values": list(vector.data), "dimensions": list(vector.indices)}
    
    def __repr__(self):
        return f'{self.vectors[0]}'
        #SparseVector({len(self.values)} values: [{self.values[0:5].join(', ')}...], {len(self.dimensions)} dimensions: [{self.dimensions[0:5].join(', ')}...]


In [26]:
vectors = SparseVectorSet(sample_movie_details['movie_details'])
f'{vectors.vectors[0]}'

'  (0, 6113)\t0.05024573002612159\n  (0, 543)\t0.32777595575718965\n  (0, 5033)\t0.05024573002612159\n  (0, 86)\t0.2929482696544073\n  (0, 6)\t0.16915016490372892\n  (0, 9)\t0.1724642711464672\n  (0, 2584)\t0.05024573002612159\n  (0, 431)\t0.18254711673481253\n  (0, 5938)\t0.05024573002612159\n  (0, 4130)\t0.10950925385921373\n  (0, 3395)\t0.05024573002612159\n  (0, 4413)\t0.05024573002612159\n  (0, 3774)\t0.15945757937458863\n  (0, 3228)\t0.10224114453819809\n  (0, 3665)\t0.3074030654001747\n  (0, 3069)\t0.08020249732256808\n  (0, 2902)\t0.11576371075977117\n  (0, 6176)\t0.2929482696544073\n  (0, 357)\t0.32777595575718965\n  (0, 2817)\t0.13763658504368487\n  (0, 1010)\t0.2929482696544073\n  (0, 706)\t0.22024676906811827\n  (0, 1242)\t0.258120583551625\n  (0, 6624)\t0.10635713015823627\n  (0, 443)\t0.23372588891428564\n  (0, 4559)\t0.24211964335203198'

In [None]:
# Get vectorizers
sparse_vectorizer = SparseVectorizer(sample_movie_details['movie_details'])
dense_vectorizer = DenseVectorizer()

# Generate embeddings
sample_movie_details['sparse_embedding'] = sample_movie_details["movie_details"].map(sparse_vectorizer.embed)
sample_movie_details['embedding'] = sample_movie_details["movie_details"].map(dense_vectorizer.embed)
sample_movie_details.to_json("embeddings.json", orient="records", lines = True, force_ascii = False)

(dict_keys(['id', 'movie_details', 'sparse_embedding', 'embedding']),
 768,
 98,
 98)

## Save embeddings to cloud object storage

In [21]:
# Create cloud bucket
!gsutil mb -c standard -l europe-west1 $BUCKET_URI

Creating gs://synthetic-diode-459014-v9/...


In [45]:
# Upload file to bucket
!gsutil cp embeddings.json $BUCKET_URI

Copying file://embeddings.json [Content-Type=application/json]...
/ [0 files][    0.0 B/  5.6 MiB]                                                
-
- [0 files][  2.8 MiB/  5.6 MiB]                                                
\
\ [1 files][  5.6 MiB/  5.6 MiB]                                                
|

Operation completed over 1 objects/5.6 MiB.                                      


## Create and deploy index

In [None]:
# Create index endpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = INDEX_ENDPOINT_NAME, 
    public_endpoint_enabled = True
)

In [None]:
# Create index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name = INDEX_NAME,
    contents_delta_uri = BUCKET_URI,
    dimensions = 768,
    approximate_neighbors_count = 10,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/128273250653/locations/europe-west1/indexes/4006963419243610112/operations/5719318364208234496
MatchingEngineIndex created. Resource name: projects/128273250653/locations/europe-west1/indexes/4006963419243610112
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/128273250653/locations/europe-west1/indexes/4006963419243610112')


In [None]:
# Deploy index to endpoint
my_index_endpoint.deploy_index(
    index = my_index,
    deployed_index_display_name = DEPLOYED_INDEX_NAME,
    deployed_index_id = f"{DEPLOYED_INDEX_NAME}-{UID}"
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/128273250653/locations/europe-west1/indexEndpoints/5309946270809325568


Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/128273250653/locations/europe-west1/indexEndpoints/5309946270809325568/operations/8448921950859821056
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/128273250653/locations/europe-west1/indexEndpoints/5309946270809325568


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x0000025803111370> 
resource name: projects/128273250653/locations/europe-west1/indexEndpoints/5309946270809325568

## Query index endpoint

In [74]:
sample = pd.read_csv('movie_dataset_sample.csv', index_col = 'id')
sample

Unnamed: 0_level_0,title,vote_average,vote_count,release_date,overview,popularity,tagline,genres,production_companies,keywords,movie_details
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
294176,Zwei Seiten der Liebe,5.00,1,2002-09-13,Carola and Dieter Martin have been married for...,0.600,,"Romance, Drama, TV Movie",ARD,,title: Zwei Seiten der Liebe release_date: 200...
493621,Blood Circus,6.00,2,2017-12-19,A retired MMA world champion gets caught up in...,4.325,,"Action, Thriller","DTLA Entertainment Group, LLC, Octane Entertai...",,title: Blood Circus release_date: 2017-12-19 g...
812289,A Fistful of Barrels,0.00,0,2006-01-02,A FISTFUL OF BARRELS is the first surf film of...,0.859,,,Seven Films,,title: A Fistful of Barrels release_date: 2006...
225175,Aquila Nera,6.55,10,1946-09-21,An evil Czarist landowner had mistreated the f...,1.301,,"Romance, Adventure",Lux Film,,title: Aquila Nera release_date: 1946-09-21 ge...
652722,In the Arms of an Assassin,7.98,348,2019-12-06,Victor (William Levy) is one of the world’s mo...,17.049,,"Romance, Thriller",,,title: In the Arms of an Assassin release_date...
...,...,...,...,...,...,...,...,...,...,...,...
640450,Brouillard #13,0.00,0,2013-01-01,A colour reversal film shot on 35mm.,0.600,,,,,title: Brouillard #13 release_date: 2013-01-01...
1327857,"Среди мертвых, среди живых",0.00,0,2024-09-06,Not far from the Great Patriotic War burial si...,0.000,,Documentary,,,"title: Среди мертвых, среди живых release_date..."
385690,Lord Tokugawa Ieyasu,8.50,2,1965-01-03,"In turbulent 16th-century Japan, the leaders o...",1.487,,Drama,Toei Company,"samurai, hostage, feudal japan",title: Lord Tokugawa Ieyasu release_date: 1965...
729250,Bananarama: Live At The London Eventim Hammers...,9.00,1,2018-09-14,They were the girl group that defined the 80s....,0.600,,Music,PledgeMusic,,title: Bananarama: Live At The London Eventim ...


In [None]:
'''Victor (William Levy) is one of the world’s most handsome  men, but he has a deep secret – he is a cold blooded assassin. 
Smooth talking and seductive, Victor was raised to do one thing only, which is to kill for money. 
When he is sent to the home of a brutal drug lord to collect payment for his most recent hit, he encounters the beautiful Sarai (Alicia Sanz), 
who has been forced to spend the last 9 years of her life with the drug lord.'''

In [None]:
def query_movie_index(query):
    '''Queries movie index endpoint given movie query'''
    # Create HybridQuery
    query_dense_emb = get_dense_embedding(query)
    query_sparse_emb = get_sparse_embedding(query)
    query = HybridQuery(
        dense_embedding = query_dense_emb,
        sparse_embedding_values = query_sparse_emb["values"],
        sparse_embedding_dimensions = query_sparse_emb["dimensions"],
        rrf_ranking_alpha = 0.5,
    )
    
    # Run hybrid query
    return my_index_endpoint.find_neighbors(
        deployed_index_id = 5309946270809325568,
        queries = [query],
        num_neighbors = 10,
    )

def display_query_response(response):
    for idx, neighbor in enumerate(response[0]):
        title = df.title[int(neighbor.id)]
        dense_dist = neighbor.distance if neighbor.distance else 0.0
        sparse_dist = neighbor.sparse_distance if neighbor.sparse_distance else 0.0
        print(f"{title:<40}: dense_dist: {dense_dist:.3f}, sparse_dist: {sparse_dist:.3f}")

In [None]:
queries = "In the Arms of an Assassin"
search_results = query_movie_index(query)
display_query_response(search_results)

NameError: name 'query_movie_index' is not defined

: 