# Vector Similarity for RediSearch - Hybrid queries


## Python examples

### Packages

In [1]:
import numpy as np
from redis import Redis
from redis.commands.search.field import VectorField, TagField, NumericField, TextField
from redis.commands.search.query import Query

### Create redis client

In [2]:
import os

# Connection settings can be overridden through the environment so that real
# credentials never have to be written into the notebook; the fallbacks keep
# the original local-demo values.
host = os.environ.get("REDIS_HOST", "localhost")
port = int(os.environ.get("REDIS_PORT", "6379"))
password = os.environ.get("REDIS_PASSWORD", "p@$$w0rdw!th0ut")

redis_conn = Redis(host=host, port=port, password=password)

In [3]:
# Index fields and configurations

# Number of documents to load and the dimensionality of each vector.
n_vec, dim = 10_000, 128

# Presumably HNSW tuning parameters; the visible cells never reference them
# -- TODO confirm where (or whether) they are consumed.
M, EF = 40, 200

# Names of the hash fields that make up every document.
vector_field_name = "vector"
title_field_name = "title"
genre_field_name = "genre"
rating_field_name = "rating"

# Presumably the number of nearest neighbours to fetch; it matches the 10
# requested by the KNN queries in this notebook.
k = 10

In [4]:
def load_docs(client : Redis, n, d):
    """Populate Redis with ``n`` hash documents keyed by the integers 1..n.

    Every document stores a random ``d``-dimensional float64 vector (as raw
    bytes), a rating, a genre and a title. Documents whose id is a multiple
    of 5 are written as "spiderman" with genre "action, drama"; all others
    are "matrix" with genre "action".

    Writes are queued on a non-transactional pipeline and flushed in batches,
    so the load does not pay one network round trip per document.

    Args:
        client: Redis connection to write to.
        n: Number of documents to create.
        d: Dimensionality of each generated vector.
    """
    batch_size = 1000
    pipe = client.pipeline(transaction=False)
    for i in range(1, n + 1):
        # One RNG draw per document, in id order, keeps the data reproducible
        # under a fixed numpy seed.
        np_vector = np.random.rand(1, d).astype(np.float64)
        if i % 5 != 0:
            genre, title = "action", "matrix"
        else:
            genre, title = "action, drama", "spiderman"
        pipe.hset(i, mapping={vector_field_name: np_vector.tobytes(),
                              # rating grows linearly with the doc id, up to 10 for id == n
                              rating_field_name: 10*(i/n),
                              genre_field_name: genre,
                              title_field_name: title})
        if i % batch_size == 0:
            pipe.execute()
    # Flush whatever is left from the last partial batch.
    pipe.execute()
        
def delete_data(client: Redis):
    """Remove all data from the Redis server behind ``client``.

    Issues ``FLUSHALL`` through ``client.flushall()``, so every key in every
    database on that server is dropped, not only this notebook's documents.
    """
    client.flushall()
    
def print_results(res):
    """Print the document ids and distances contained in a search result.

    Ids are converted to ``int`` and distances to ``float``; a document that
    carries no ``dist`` attribute is shown as ``'-'``.

    Args:
        res: Search result exposing a ``docs`` iterable whose items have an
            ``id`` attribute and, optionally, a ``dist`` attribute.
    """
    doc_ids = []
    distances = []
    for hit in res.docs:
        doc_ids.append(int(hit.id))
        if hasattr(hit, 'dist'):
            distances.append(float(hit.dist))
        else:
            distances.append('-')
    print(f"got {len(doc_ids)} doc ids: ", doc_ids)
    print("\ndistances: ", distances)
        

### Create HNSW index with meta-data

#### Every document in the index represents a movie review. The vector field is a text embedding of the review, while the other fields hold some of the review's metadata.

In [5]:
# Build the HNSW index.
# Start from an empty server so data from earlier runs cannot leak in.
delete_data(redis_conn)

# NOTE(review): M and EF are defined in the configuration cell but are not
# passed here, so the index presumably uses the server defaults -- confirm
# that this is intended.
hnsw_attributes = {"TYPE": "FLOAT64", "DIM": dim, "DISTANCE_METRIC": "L2"}
schema = (
    VectorField(vector_field_name, "HNSW", hnsw_attributes),
    NumericField(rating_field_name),
    TagField(genre_field_name),
    TextField(title_field_name),
)
redis_conn.ft().create_index(schema)
# The KNN / $param query syntax used by the queries requires dialect 2.
redis_conn.ft().config_set("default_dialect", 2)

# Load vectors with metadata; seeding first makes the generated data (and the
# query vector drawn afterwards) reproducible.
np.random.seed(42)
load_docs(redis_conn, n_vec, dim)

num_docs = redis_conn.ft().info()['num_docs']
print("index size: ", num_docs)

query_vector = np.random.rand(1, dim).astype(np.float64)

index size:  10000


## Hybrid queries examples

In [31]:
# Give me the top k (= 10) reviews on action movies similar to mine.

# The tag filter restricts the KNN search to documents tagged "action"; the
# vector distance is returned as `dist` and used to order the results.
q = Query(f'(@{genre_field_name}:{{action}})=>[KNN {k} @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft().search(q, query_params = {'vec_param': query_vector.tobytes()})

# Debug aid: dump the raw bytes that are sent as $vec_param.
print(query_vector.tobytes())

print_results(res)

b'DPm\x8ekB\xc7?\xe0\xac\xf5mN\xe0\xd5?\x98\x1f\x01\xca\xf1y\xe4?L\x18\x98\x95\xe0\xb7\xd2?8\xf5\x91?\xdcL\xdc?\xe2B\xe8\x10\xc6i\xe4?\xf4\x9e\xf8\x18\x9a\xd5\xc8?\xf0\x8e\xdf\x7fZ\xb3\xc6?j\xecB/|\xb8\xd0?\xecx\x9axI\xea\xd8? \xebbf\xb3\x03\x99?\xaecj\x0e\x820\xe7?\xf3Ac\xa5\xee/\xed? \xb2\xc4\xf9Y\xcb\xae?v\xc0\xaa\xde\x9e\xe8\xda?\xabe\xd0\xd3\xfe\x14\xe2?#\xdb\xaai\xd9\xa7\xef?\xe0\xed\xb0\x9f\x9c!\xd0?\xd4^\x16\xfc\xbe\x11\xc0?\x1c<\xe7\xcc\x00\xd9\xd6?\x97\xbc+\x02\xc3\xf6\xe6?\xfe\xefb\xb2W\x1a\xe4?\xc6\xceW\xdc\x15\xf8\xe0?\xdc+1}\xb9U\xc8?\xdbu\xc4W\x12\xcc\xea?T\x80\x16*\xba<\xed?\x80Wn)\x8c)\xe4?\xe4\xc33\xc3<\n\xcc?&\xe6\xe3\xf4\xe6J\xd6?\xe0\x05Y\xd8\xb4U\xe3?\xfd\xcax3Z\xab\xe1?\xc4\x96\xfa^\xac\xa7\xc3?\xaey\xff.\xae\x8d\xde?\x90\x86\x90\xaa_m\xb3?\xce\x14@K\xb2h\xdd?L\x94"\xbd\xfb\xe7\xdd?\xb4\xdb\x08:|\x1c\xec?B\xf0\xdc\x8d\x1e\x04\xd0?\xda\x99\xa0O\x19\x82\xea?\xc1\xb6\x89\xef\x18\xce\xe8?\x9510f\xfa\xe8\xe6?\xc9.\xda\xa0\x03R\xe6?\x88$\x07\xc3\xb5\xaa\xcf?\x88\xacOz&

In [13]:
# Give me the top k (= 10) reviews on action movies similar to mine that got
# ratings between 5 and 7.
# (ids 5000-7000)

# Both filters must match (tag AND numeric range) before a document becomes a
# KNN candidate; results are ordered by the returned `dist`.
q = Query(f'(@{genre_field_name}:{{action}} @{rating_field_name}:[5 7])=>[KNN {k} @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft().search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [5126, 6268, 5390, 5085, 6741, 6251, 5239, 5487, 5194, 5595]

distances:  [14.7560760961, 15.2173650941, 15.7935849617, 15.8196561477, 15.8495724098, 15.8533288875, 16.0165456443, 16.0417755318, 16.0750138339, 16.2356399735]


In [14]:
# Give me the top k (= 10) reviews on a Spiderman movie that are similar to
# mine and got ratings between 5 and 7.
# (ids in 5000-7000 that are divisible by 5)

# Full-text match on the title combined with the numeric rating range.
q = Query(f'(@{title_field_name}:spiderman @{rating_field_name}:[5 7])=>[KNN {k} @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft().search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [5390, 5085, 5595, 6695, 6285, 5765, 6595, 5795, 5790, 5550]

distances:  [15.7935849617, 15.8196561477, 16.2356399735, 16.4198694829, 16.4199152798, 16.4874357724, 16.59035834, 16.657459116, 16.6816978817, 16.786226798]


In [15]:
# Give me the top k (= 10) reviews on movies that aren't Spiderman that are
# similar to mine.
# (all ids which are not divisible by 5)

# The leading '-' negates the prefix match on the title, excluding every
# title that starts with "spider".
q = Query(f'(@{genre_field_name}:{{action}} -@{title_field_name}:spider*)=>[KNN {k} @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft().search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [9386, 83, 5126, 9572, 3492, 6268, 3949, 4437, 1057, 557]

distances:  [14.5484484676, 14.7082952384, 14.7560760961, 14.8371418493, 14.9124708649, 15.2173650941, 15.3307324484, 15.3791827069, 15.488778035, 15.4977867241]


In [16]:
# Give me the top k (= 10) reviews on a "spiderman" movie or movies with at
# least a 9 rating.
# (ids which are divisible by 5 or above 9000)

# '|' is a union: a document qualifies if either the title matches or the
# rating falls in [9, inf].
q = Query(f'((@{title_field_name}:spiderman) | (@{rating_field_name}:[9 inf]))=>[KNN {k} @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft().search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [8770, 9386, 9572, 8400, 9396, 3655, 9526, 9353, 5390, 5085]

distances:  [13.346961015, 14.5484484676, 14.8371418493, 15.4953163405, 15.6169647311, 15.6970910686, 15.722931204, 15.7777110313, 15.7935849617, 15.8196561477]
