In [18]:
import random
import numpy as np
import pandas as pd
import time
from redis import Redis
import redisearch

class color:
   """ANSI escape sequences used to style text printed by the notebook."""
   # Foreground colours
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   # Text attributes
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   # Emitted after a styled label to switch styling off again
   END = '\033[0m'


# Load Amazon Product Data

Truncate selected text fields on load.  

The Max Length supported by the pre-trained sentence embedding generator is 512

In [19]:
# Upper bound on how much of each long text field is kept when loading.
MAX_TEXT_LENGTH=512

def auto_truncate(val):
    """Return the leading part of ``val``, at most ``MAX_TEXT_LENGTH`` items long.

    Values that are already short enough come back unchanged.
    """
    leading_part = slice(None, MAX_TEXT_LENGTH)
    return val[leading_part]

# Load the product data, truncating the long free-text columns while parsing
truncated_columns = ['bullet_point', 'item_keywords', 'item_name']
all_prods_df = (
    pd.read_csv("data/product_data.csv", converters=dict.fromkeys(truncated_columns, auto_truncate))
    # primary_key = item id + '-' + marketplace domain; it is used to build the Redis hash key
    .assign(primary_key=lambda frame: frame['item_id'] + '-' + frame['domain_name'])
    # replace any remaining missing values with empty strings
    .fillna('')
)



In [20]:
# Preview the first five rows of the loaded product frame
all_prods_df.head(5)

Unnamed: 0,item_id,marketplace,country,main_image_id,domain_name,bullet_point,item_keywords,material,brand,color,item_name,model_name,model_number,product_type,primary_key
0,B07T6RZ2CM,Amazon,IN,71dZhpsferL,amazon.in,3D Printed Hard Back Case Mobile Cover for Len...,mobile cover back cover mobile case phone case...,,Amazon Brand - Solimo,Others,Amazon Brand - Solimo Designer Couples Sitting...,Lenovo K4 Note,gz8115-SL40423,CELLULAR_PHONE_CASE,B07T6RZ2CM-amazon.in
1,B07T2JY31Y,Amazon,IN,71vX7qIEAIL,amazon.in,3D Printed Hard Back Case Mobile Cover for Son...,mobile cover back cover mobile case phone case...,Wood,Amazon Brand - Solimo,others,Amazon Brand - Solimo Designer Leaf on Wood 3D...,Sony Xperia Z1 L39H,gz8056-SL40528,CELLULAR_PHONE_CASE,B07T2JY31Y-amazon.in
2,B0849YGSCZ,Amazon,AE,A1EZF-2mB5L,amazon.ae,,small de fur rooms navidad woven girls shag pa...,,Stone & Beam,,Stone & Beam Contemporary Doily Wool Farmhouse...,,I59I8044IVYGRYC00-Parent,HOME_FURNITURE_AND_DECOR,B0849YGSCZ-amazon.ae
3,B081K6TCML,Amazon,IN,81o9EyZ-fAL,amazon.in,Solimo Plastic Multipurpose Modular Drawer; sm...,drawer modular drawer 3 rack modular drawer ki...,Plastic,Amazon Brand - Solimo,Multicolor,Amazon Brand - Solimo Plastic Multipurpose Mod...,,sol_cujo_13,HOME,B081K6TCML-amazon.in
4,B0854774X5,Amazon,IN,81xaJCVnl3L,amazon.in,"Snug fit for Nokia 8.1, with perfect cut-outs ...",Back Cover Designer Case Designer Take It Easy...,Silicon,Amazon Brand - Solimo,Multicolor,Amazon Brand - Solimo Designer Take It Easy UV...,Nokia 8.1,UV10714-SL40617,CELLULAR_PHONE_CASE,B0854774X5-amazon.in


# Connect to Redis

In [21]:
# Connection settings for the Redis server that will hold the product hashes and indexes
host = 'localhost'
port = 6379
redis_conn = Redis(host = host, port = port)
# Constructing the client does not contact the server; ping() forces a round trip
# (and raises if the server is unreachable) so the message below is only printed
# when a connection really exists.
redis_conn.ping()
print ('Connected to redis')

Connected to redis


# Generate Embeddings

We will use a pre-trained sentence embedding generator from

https://huggingface.co/sentence-transformers/all-distilroberta-v1

In [22]:
# Load the pre-trained sentence embedding model; it is reused below both for the
# product keywords and for encoding the search queries.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')


Generate vector representations of the "item_keywords"  field for 1000 products

The distilroberta-v1 generates a 768-float vector for a given sentence of up to 512 characters

In [23]:
%%time
NUMBER_PRODUCTS=1000

subset_df = all_prods_df.head(NUMBER_PRODUCTS)
item_keywords_vectors = [ model.encode(sentence) for sentence in subset_df['item_keywords']]




CPU times: user 1min 18s, sys: 3.31 s, total: 1min 21s
Wall time: 1min 19s


### Check the dimensions of one of the vectors generated

In [25]:
# Shape of the first generated embedding
item_keywords_vectors[0].shape

(768,)

# Utility Functions to Load Product Data
Each product will be stored in a redis hash
* **Hash Key** = **'product:'** + **primary_key**
* **Hash Fields:** 
    * Item Id
    * Item Name
    * Item Keywords (text)
    * Item Keywords vector - 768-float vector
 

In [26]:
def load_vectors(client: "Redis", product_df, vector_data, vector_field_name):
    """Store every product of ``product_df`` as a Redis hash, including its vector.

    Each row is written to the key ``'product:' + primary_key`` with the fields
    ``item_id``, ``item_keywords``, ``item_name`` and ``vector_field_name``.
    All writes are queued on a non-transactional pipeline and sent with a single
    ``execute()`` call.

    Args:
        client: Redis client used to create the pipeline.
        product_df: DataFrame with ``primary_key``, ``item_id``, ``item_keywords``
            and ``item_name`` columns.
        vector_data: Sequence of numpy vectors aligned with ``product_df`` by row
            position (the n-th vector belongs to the n-th row).
        vector_field_name: Name of the hash field that receives the vector bytes.
    """
    p = client.pipeline(transaction=False)
    # Pair rows with vectors by position. The labels yielded by iterrows() are not
    # positions, so they must not be used to index .iloc or vector_data; they only
    # coincide for a default RangeIndex.
    for position, (_, row) in enumerate(product_df.iterrows()):
        # hash key
        key = 'product:' + row['primary_key']
        # the vector is stored as raw float32 bytes
        item_keywords_vector = vector_data[position].astype(np.float32).tobytes()
        # hash fields
        product_data_values = {
            'item_id': row['item_id'],
            'item_keywords': row['item_keywords'],
            'item_name': row['item_name'],
            vector_field_name: item_keywords_vector,
        }
        p.hset(key, mapping=product_data_values)
    p.execute()
    
        
def delete_data(client: Redis):
    """Remove the stored data by calling ``flushall()`` on ``client``.

    NOTE(review): per the Redis documentation FLUSHALL clears every key in every
    database of the server, not only the ``product:`` hashes written by this
    notebook - confirm the server is dedicated to this demo before running.
    """
    client.flushall()

# Utility Functions to Create Indexes on Vector field

In [27]:
def create_bf_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2'):
    """Create a "BF" vector index on ``vector_field_name`` and return its search client.

    Args:
        redis_conn: Redis connection handed to the ``redisearch.Client``.
        index_name: Name of the index to create.
        vector_field_name: Hash field that holds the FLOAT32 vector.
        number_of_vectors: Value sent as the index's ``INITIAL_CAP``.
        vector_dimensions: Dimension sent with the vector field definition.
        distance_metric: Distance metric sent with the vector field definition.

    Returns:
        The ``redisearch.Client`` bound to ``index_name``.
    """
    search_client = redisearch.Client(index_name, conn=redis_conn)
    # The command is issued raw through the client's underlying redis connection
    create_command = (
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name, "VECTOR", "FLOAT32", vector_dimensions, distance_metric,
        "BF", "INITIAL_CAP", number_of_vectors,
    )
    search_client.redis.execute_command(*create_command)
    return search_client

def create_hnsw_index (redis_conn,index_name,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2',M=40,EF=200):
    """Create an "HNSW" vector index on ``vector_field_name`` and return its search client.

    Args:
        redis_conn: Redis connection handed to the ``redisearch.Client``.
        index_name: Name of the index to create.
        vector_field_name: Hash field that holds the FLOAT32 vector.
        number_of_vectors: Value sent as the index's ``INITIAL_CAP``.
        vector_dimensions: Dimension sent with the vector field definition.
        distance_metric: Distance metric sent with the vector field definition.
        M: Value sent as the index's ``M`` argument.
        EF: Value sent as the index's ``EF`` argument.

    Returns:
        The ``redisearch.Client`` bound to ``index_name``.
    """
    search_client = redisearch.Client(index_name, conn=redis_conn)
    # The command is issued raw through the client's underlying redis connection
    create_command = (
        "FT.CREATE", index_name,
        "SCHEMA", vector_field_name, "VECTOR", "FLOAT32", vector_dimensions, distance_metric,
        "HNSW", "INITIAL_CAP", number_of_vectors, "M", M, "EF", EF,
    )
    search_client.redis.execute_command(*create_command)
    return search_client

def delete_index(vector_index):
    """Clean up after ``vector_index`` by passing its redis connection to ``delete_data``."""
    backing_connection = vector_index.redis
    delete_data(backing_connection)
    

# Utility Functions to Perform Similarity Search 
Using different indexing methods

In [28]:
def find_similar_products_bf(product_query, query_encoder, vector_index,vector_field_name, topK=5):
    """Search ``vector_index`` for the ``topK`` products closest to ``product_query``.

    Args:
        product_query: Free-text query to encode.
        query_encoder: Object whose ``encode`` method turns the query into a numpy vector.
        vector_index: Search client exposing ``search``.
        vector_field_name: Name of the indexed vector field.
        topK: Number of neighbours requested and page size of the result.

    Returns:
        Whatever ``vector_index.search`` returns for the prepared query.
    """
    # Encode the query text and serialise it as raw float32 bytes
    embedding = query_encoder.encode(product_query)
    vec_bytes = embedding.astype(np.float32).tobytes()

    # Build the query step by step: TOPK clause, ordering, paging, returned fields
    score_field = f'{vector_field_name}_score'
    query = redisearch.Query(f'@{vector_field_name}:[$vec_param TOPK {topK}]')
    query = query.sort_by(score_field)
    query = query.paging(0, topK)
    query = query.return_fields(score_field, 'item_name', 'item_keywords')

    # Run it, binding the vector bytes to the $vec_param placeholder
    return vector_index.search(query, query_params={'vec_param': vec_bytes})

def find_similar_products_hnsw(product_query, query_encoder, vector_index,vector_field_name, topK=5,EF=5):
    """Search ``vector_index`` for the ``topK`` products closest to ``product_query``.

    Same flow as the brute-force variant, with an ``$EFRUNTIME`` attribute added
    to the query string.

    Args:
        product_query: Free-text query to encode.
        query_encoder: Object whose ``encode`` method turns the query into a numpy vector.
        vector_index: Search client exposing ``search``.
        vector_field_name: Name of the indexed vector field.
        topK: Number of neighbours requested and page size of the result.
        EF: Value placed in the query's ``$EFRUNTIME`` attribute.

    Returns:
        Whatever ``vector_index.search`` returns for the prepared query.
    """
    # Encode the query text and serialise it as raw float32 bytes
    embedding = query_encoder.encode(product_query)
    vec_bytes = embedding.astype(np.float32).tobytes()

    # Build the query step by step: TOPK clause with runtime EF, ordering, paging, returned fields
    score_field = f'{vector_field_name}_score'
    query = redisearch.Query(f'@{vector_field_name}:[$vec_param TOPK {topK}]  => {{$EFRUNTIME : {EF}}}')
    query = query.sort_by(score_field)
    query = query.paging(0, topK)
    query = query.return_fields(score_field, 'item_name', 'item_keywords')

    # Run it, binding the vector bytes to the $vec_param placeholder
    return vector_index.search(query, query_params={'vec_param': vec_bytes})



# Brute-Force - Load and Index Product Data
Load and index product data using a brute-force Index on the 'item_keywords_vector' field.
This index is used to calculate Top K Exact Nearest Neighbors of a given vector

In [30]:
%%time
print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')
#my_bf_index = create_bf_index(redis_conn,'my_bf_index','item_keywords_vector',NUMBER_PRODUCTS,768,'L2')
load_vectors(my_bf_index.redis,subset_df,item_keywords_vectors,'item_keywords_vector')

Loading and Indexing + 1000 products
CPU times: user 403 ms, sys: 18.3 ms, total: 422 ms
Wall time: 537 ms


# Brute-Force - Query The Top 5 Similar Products
Let's use the brute-force index to find the exact top k nearest neighbors of a given text query

Check the output for 2 very different queries:
* Query 1 = 'Fantastic piece of handmade jewllery for a special occasion'
* Query 2 = 'Ultra modern cool way to pimp up my phone'



In [32]:
%%time
# Pick one of the two sample queries by (un)commenting
#product_query='Fantastic piece of handmade jewllery for a special occasion'
product_query='cool way to pimp up my cell'

# Top-5 search against the brute-force index
results = find_similar_products_bf (product_query,model,my_bf_index,'item_keywords_vector',5)
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    # NOTE(review): the label says 'Item Keywords' but the attribute printed is
    # item_keywords_vector_score; in the Document shown under "Check one of the
    # Search Results" that attribute holds the keywords text - confirm against the
    # installed RediSearch/redisearch-py version whether item_keywords should be used.
    print (color.YELLOW + 'Item Keywords = ' +  color.END  + product.item_keywords_vector_score)

***************Product  found ************
[1mhash key = [0mproduct:B07GSKNLT4-amazon.com
[93mItem Name = [0mAmazonBasics 4.8 Amp 24W Dual USB Car Charger for Apple and Android Devices, Black and Red, 4-Pack
[93mItem Keywords = [0mcell phone car chargers Car Charger
***************Product  found ************
[1mhash key = [0mproduct:B07GSKNQP6-amazon.com.au
[93mItem Name = [0mAmazonBasics 4.8 Amp 24W Dual USB Car Charger for Apple and Android Devices, Black, 10-Pack
[93mItem Keywords = [0mcell phone car chargers Car Charger
***************Product  found ************
[1mhash key = [0mproduct:B07P3F764N-amazon.in
[93mItem Name = [0mAmazon Brand - Solimo Protective Mobile Cover (Soft & Flexible Back case) for Samsung Galaxy M10
[93mItem Keywords = [0msamsung mobile phone
***************Product  found ************
[1mhash key = [0mproduct:B07PZGSM67-amazon.in
[93mItem Name = [0mAmazon Brand - Solimo Mobile Cover for Realme 1 (Soft & Flexible Back Case), Transparent
[

## Check one of the Search Results

In [13]:
# Display the first document of the latest search result
results.docs[0]

Document {'id': 'product:B01MTY0Q3P-amazon.com', 'payload': None, 'item_keywords_vector_score': 'Hearts fake diamond imitation diamond cheap diamond simulated diamond jewelry for women anniversary gifts for her jewlery cz ring^ imitation diamond ring^ cheap diamond rings^ cheap diamond ring^ simulated diamond ring swarovski crystal^ swarovski elements fake diamond imitation diamond cheap diamond simulated diamond jewelry for women anniversary gifts for her jewlery cz ring^ imitation diamond ring^ cheap diamond rings^ cheap diamond ring^ simulated diamond ring swarovski crystal^ swarovski elements fake ', 'item_name': 'Platinum-Plated Sterling Silver Swarovski Zirconia Red White Heart All-Around Band Ring, Size 7'}

In [14]:
#Delete index and underlying data
# delete_data() calls flushall() on the connection, so this clears the Redis data
# before the HNSW section loads the products again.
delete_data(my_bf_index.redis)



# HNSW - Load and Index Product Data
Load and index product data using an HNSW Index on the 'item_keywords_vector' field.
This index is used to calculate Top K Approximate Nearest Neighbors of a given vector

In [15]:
%%time
print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')
my_hnsw_index = create_hnsw_index(redis_conn,'my_hnsw_index','item_keywords_vector',NUMBER_PRODUCTS,768,'L2',M=40,EF=200)
load_vectors(my_hnsw_index.redis,subset_df,item_keywords_vectors,'item_keywords_vector')

Loading and Indexing + 1000 products
CPU times: user 327 ms, sys: 11.4 ms, total: 338 ms
Wall time: 566 ms


# HNSW - Query The Top 5 Similar Products
Let's repeat the similarity search but this time using the HNSW index

Check the output for 2 very different queries:
* Query 1 = 'Fantastic piece of handmade jewllery for a special occasion'
* Query 2 = 'Ultra modern cool way to pimp up my phone'



In [16]:
%%time
# Pick one of the two sample queries by (un)commenting
product_query='Fantastic piece of handmade jewllery for a special occasion'
#product_query='Ultra modern cool way to pimp my cell'

# Top-5 search against the HNSW index, with EF=5 passed as the query's runtime parameter
results = find_similar_products_hnsw (product_query,model,my_hnsw_index,'item_keywords_vector',5,EF=5)
for product in results.docs:
    print ('***************Product  found ************')
    print (color.BOLD + 'hash key = ' +  color.END + product.id)
    print (color.YELLOW + 'Item Name = ' +  color.END  + product.item_name)
    # NOTE(review): the label says 'Item Keywords' but the attribute printed is
    # item_keywords_vector_score; in the Document shown under "Check one of the
    # Search Results" that attribute holds the keywords text - confirm against the
    # installed RediSearch/redisearch-py version whether item_keywords should be used.
    print (color.YELLOW + 'Item Keywords = ' +  color.END  + product.item_keywords_vector_score)

***************Product  found ************
[1mhash key = [0mproduct:B01MTY0Q3P-amazon.com
[93mItem Name = [0mPlatinum-Plated Sterling Silver Swarovski Zirconia Red White Heart All-Around Band Ring, Size 7
[93mItem Keywords = [0mHearts fake diamond imitation diamond cheap diamond simulated diamond jewelry for women anniversary gifts for her jewlery cz ring^ imitation diamond ring^ cheap diamond rings^ cheap diamond ring^ simulated diamond ring swarovski crystal^ swarovski elements fake diamond imitation diamond cheap diamond simulated diamond jewelry for women anniversary gifts for her jewlery cz ring^ imitation diamond ring^ cheap diamond rings^ cheap diamond ring^ simulated diamond ring swarovski crystal^ swarovski elements fake 
***************Product  found ************
[1mhash key = [0mproduct:B07QB8JXQX-amazon.com
[93mItem Name = [0mAmazon Brand – Rivet Modern Handtufted Cotton and Wool Area Rug, 5' x 8', Distressed Blue and Ivory
[93mItem Keywords = [0msmall foundry n

In [18]:
#cleanup
# delete_index() forwards the index's redis connection to delete_data()
delete_index(my_hnsw_index)

In [16]:
%%time
# Same cleanup through the brute-force index's connection; requires my_bf_index to
# have been created earlier in the session.
delete_index(my_bf_index)

CPU times: user 1.07 ms, sys: 1.64 ms, total: 2.7 ms
Wall time: 6.48 ms
