# Vector Similarity for RediSearch - Hybrid queries


https://github.com/RediSearch/RediSearch/blob/master/docs/docs/vecsim-hybrid_queries_examples.ipynb

## Python examples

### Packages

In [4]:
import numpy as np
from redis import Redis
from redis.commands.search.field import VectorField, TagField, NumericField, TextField
from redis.commands.search.query import Query

### Create redis client

In [5]:
host = "localhost"
port = 6379

redis_conn = Redis(host = host, port = port, password = "2022db@Qwer")

In [7]:
# Index fields and configurations

n_vec = 200000
dim = 1536
M = 40
EF = 200
vector_field_name = "vector"
title_field_name = "title"
genre_field_name = "genre"
rating_field_name = "rating"
k = 10

index_name = "hnsw-cosine-index-001"

distance_metric = "cosine"

# delete_data(redis_conn)

In [8]:
def load_docs(client : Redis, n, d):
    for i in range(1, n+1):
        np_vector = np.random.rand(1, d).astype(np.float32)
        if i%5 != 0:
            client.hset(i, mapping = {vector_field_name: np_vector.tobytes(),
                                      rating_field_name: 10*(i/n),  # ratings ranges from 0-10, proportional the doc id
                                      genre_field_name: "action",
                                      title_field_name: "matrix"})
        else:
            client.hset(i, mapping = {vector_field_name: np_vector.tobytes(),
                                      rating_field_name: 10*(i/n),
                                      genre_field_name: "action, drama",
                                      title_field_name: "spiderman"})
        
def delete_data(client: Redis):
    client.flushall()
    
def print_results(res):
    docs = [int(doc.id) for doc in res.docs]
    dists = [float(doc.dist) if hasattr(doc, 'dist') else '-' for doc in res.docs]
    print(f"got {len(docs)} doc ids: ", docs)
    print("\ndistances: ", dists)
        

### Create HNSW index with meta-data

#### Every document in the index represent a movie review. The vector field is a text embedding of the review, while the other fields are some of the movie review metadata.

In [9]:
# build HNSW index
# delete_data(redis_conn)

schema = (VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": dim, "DISTANCE_METRIC": distance_metric}),
          NumericField(rating_field_name), TagField(genre_field_name), TextField(title_field_name))
redis_conn.ft(index_name).create_index(schema)
redis_conn.ft(index_name).config_set("default_dialect", 2)

# load vectors with meta-data
np.random.seed(42)
load_docs(redis_conn, n_vec, dim)

print("index size: ", redis_conn.ft(index_name).info()['num_docs'])

query_vector = np.random.rand(1, dim).astype(np.float32)

index size:  225000


## Hybrid queries examples

In [44]:
# Give me the top 10 reviews on action movies similar to mine

q = Query(f'(@{genre_field_name}:{{action}})=>[KNN 2 @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft(index_name).search(q, query_params = {'vec_param': query_vector.tobytes()})

print(query_vector.tobytes())

# zzz = np.array([[0.1, 0.2]])

# print(zzz.tobytes())

print_results(res)

b'\xca\x7fY?<\xeb\xa4>l\x98\xb4>U\xcd6?\xe6\xb8\x0e?\x19\x87\xba>\xd0\xf6x>\xc4\xffW>w\t7?\xf5`R>\xd4/\xd0>\x9a\'T?_\x80S?S\x0e\x94>\x01\xdb6?x\xdb\x1d?\xfd\xe2g?\x14\x84\xdb>\x8a;y?x\xd7\x04?:x\x88>\x9fn\xbb=\xdd\x1f\x87>gJ\x94>12:>s\xc2"?\xa1}r?p\x1d\x12?X\xab\xbd>|\xff\xdf>#\x02\xc0>\x04\xda\x85=\n\x8b\xcc=\xda\xbfy=\x9e\xb0\x18?d\x8d\x13?\xf4xA?e\xbb0?\xcc}\x12?\xb1\xfdB>R{i?\r\x9b\xf6>\x1b\xa3\xb9>\'\x80Y?\xa7\x1e-?\xb2\x07 ?@\xa0r>o5\x08?\x08z\xb6>\xbb!\x0e?x\xdc.?\xa1\x91\xbc9L\xe5L>\xf2\xbex?\xf74\x9a>\xcbSt?@iE?\xb2#@?\xda\x01\x9a>f5s?\x82\xe1D>\x18D]>\xb5E\x14?\xb2\xa0A?\xe8E>>\x7f\xa0Q?m\x8f\xa6<o6,?m\xed^?uU\xbb>\x19\x03u?\xe562?g\tZ?\xe81\xdc>z2u=\xd9\x8b\xd4>\xf7\\\xf4>\xfcHR>\xd2-P?x\\\xd9>\xe6Lt?\x96m\x95=z\xba\x9b>/\xb3E>w@\xf6>\xb7\xbc\xcb>\xa4\xd4{>\xa3\xdbn?\xf7\xfeg?\x13\xfe0?+\x9d\xda>\xf0"\x10>\x01I\x08>\x1d\tL?\xfe3\xe2=\xfd\xcd~?\xdf\t\xfe;v\xfb4?\xf7\xbb\x81>q\xffL?\x8f\x00F?"\xa6\xd2>\xa5\x97\xa8=U\xcaG>\xe8\xf46=:\x85\xd6>\xcf\xce(?\xce\xbc\x1a?\xf3\x0ek?\x1

In [13]:
# Give me the top 10 reviews on action movies similar to mine that got ratings between 5 and 7.
# (ids 5000-7000)

q = Query(f'(@{genre_field_name}:{{action}} @{rating_field_name}:[5 7])=>[KNN 10 @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft(index_name).search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [5126, 6268, 5390, 5085, 6741, 6251, 5239, 5487, 5194, 5595]

distances:  [14.7560760961, 15.2173650941, 15.7935849617, 15.8196561477, 15.8495724098, 15.8533288875, 16.0165456443, 16.0417755318, 16.0750138339, 16.2356399735]


In [14]:
# Give me the top 10 reviews on a Spiderman movie that are similar to mine and got ratings between 5 and 7.
#(ids 5000-7000 divided by 5)

q = Query(f'(@{title_field_name}:spiderman @{rating_field_name}:[5 7])=>[KNN 10 @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft(index_name).search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [5390, 5085, 5595, 6695, 6285, 5765, 6595, 5795, 5790, 5550]

distances:  [15.7935849617, 15.8196561477, 16.2356399735, 16.4198694829, 16.4199152798, 16.4874357724, 16.59035834, 16.657459116, 16.6816978817, 16.786226798]


In [15]:
# Give me the top 10 reviews on movies that aren't Spiderman that are similar to mine.
#(all ids which are not divided by 5)

q = Query(f'(@{genre_field_name}:{{action}} -@{title_field_name}:spider*)=>[KNN 10 @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft(index_name).search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [9386, 83, 5126, 9572, 3492, 6268, 3949, 4437, 1057, 557]

distances:  [14.5484484676, 14.7082952384, 14.7560760961, 14.8371418493, 14.9124708649, 15.2173650941, 15.3307324484, 15.3791827069, 15.488778035, 15.4977867241]


In [16]:
# Give me the top 10 reviews on a "spiderman" movie or movies with at least a 9 rating.
#(ids which are divided by 5 or above 9000)

q = Query(f'((@{title_field_name}:spiderman) | (@{rating_field_name}:[9 inf]))=>[KNN 10 @{vector_field_name} $vec_param AS dist]').sort_by('dist')
res = redis_conn.ft(index_name).search(q, query_params = {'vec_param': query_vector.tobytes()})

print_results(res)

got 10 doc ids:  [8770, 9386, 9572, 8400, 9396, 3655, 9526, 9353, 5390, 5085]

distances:  [13.346961015, 14.5484484676, 14.8371418493, 15.4953163405, 15.6169647311, 15.6970910686, 15.722931204, 15.7777110313, 15.7935849617, 15.8196561477]
