# Query Bilka Handbook using Redis vector database

For libraries and basic setup of the Redis vector database see `Redis_Query_Bilka_Handbook.ipynb`



In [2]:
{
    "Bilka_handbook": 
    [
         {  "chapter": "Velkommen",
            "description": "Det, vi arbejder sammen om, er at gøre hverdagslivet bedre. Vi elsker hverdagen og ved om nogen, hvad vores kunder har brug for i det daglige... "
         },
         {
            "chapter": "Vores værdier",
            "description": "Kulturen i Salling Group kan være lidt forskellig fra den ene kæde eller afdeling til den anden. Men langt mere binder os sammen."
         },

    ]
}


{'Bilka_handbook': [{'Chapter': 'Velkommen',
   'Description': 'Det, vi arbejder sammen om, er at gøre hverdagslivet bedre. Vi elsker hverdagen og ved om nogen, hvad vores kunder har brug for i det daglige... '},
  {'Chapter': 'Vores værdier',
   'Description': 'Kulturen i Salling Group kan være lidt forskellig fra den ene kæde eller afdeling til den anden. Men langt mere binder os sammen.'}]}

### `Import libraries`

In [4]:
import json
import time

import numpy as np
import pandas as pd
import tabulate

import redis
import requests
from redis.commands.search.field import (
    NumericField,
    TagField,
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
client = redis.Redis(host="localhost", port=6379, decode_responses=True)
embedder = SentenceTransformer('msmarco-distilbert-base-v4')

### `Import data`

In [8]:
with open('./../Bilka_handbook_mini.json') as jsonHandbook:
    handbook = json.load(jsonHandbook)

    for iLine in handbook['Bilka_handbook']:
        print(iLine)



{'Chapter': 'Velkommen', 'Description': 'Det, vi arbejder sammen om, er at gøre hverdagslivet bedre. Vi elsker hverdagen og ved om nogen, hvad vores kunder har brug for i det daglige... '}
{'Chapter': 'Vores værdier', 'Description': 'Kulturen i Salling Group kan være lidt forskellig fra den ene kæde eller afdeling til den anden. Men langt mere binder os sammen.'}


The data used in this example follows the format below:

In [4]:
# {
#   "model": "Jigger",
#   "brand": "Velorim",
#   "price": 270,
#   "type": "Kids bikes",
#   "specs": {
#     "material": "aluminium",
#     "weight": "10"
#   },
#   "description": "Small and powerful, the Jigger is the best ride for the smallest of tikes! ...
# }

In [11]:
json.dumps(handbook['Bilka_handbook'], indent=2)

'[\n  {\n    "Chapter": "Velkommen",\n    "Description": "Det, vi arbejder sammen om, er at g\\u00f8re hverdagslivet bedre. Vi elsker hverdagen og ved om nogen, hvad vores kunder har brug for i det daglige... "\n  },\n  {\n    "Chapter": "Vores v\\u00e6rdier",\n    "Description": "Kulturen i Salling Group kan v\\u00e6re lidt forskellig fra den ene k\\u00e6de eller afdeling til den anden. Men langt mere binder os sammen."\n  }\n]'

In [35]:
pipeline = client.pipeline()
for i, section in enumerate(handbook['Bilka_handbook'], start=1):
    redis_key = f"sections:{i:03}"
    pipeline.json().set(redis_key, "$", section)
    print(redis_key)
    print(section)


sections:001
{'Chapter': 'Velkommen', 'Description': 'Det, vi arbejder sammen om, er at gøre hverdagslivet bedre. Vi elsker hverdagen og ved om nogen, hvad vores kunder har brug for i det daglige... '}
sections:002
{'Chapter': 'Vores værdier', 'Description': 'Kulturen i Salling Group kan være lidt forskellig fra den ene kæde eller afdeling til den anden. Men langt mere binder os sammen.'}


In [36]:
res = pipeline.execute()

In [26]:
keys = sorted(client.keys("sections:*"))

In [31]:
keys

['sections:001', 'sections:002']

In [34]:
descriptions = client.json().mget(keys, "$.description")
descriptions

[[], []]

In [33]:

descriptions = [item for sublist in descriptions for item in sublist]


[]

In [30]:
descriptions = client.json().mget(keys, "$.description")
descriptions = [item for sublist in descriptions for item in sublist]
embeddings = embedder.encode(descriptions).astype(np.float32).tolist()
VECTOR_DIMENSION = len(embeddings[0])

IndexError: list index out of range

In [29]:
embeddings

[]

To retrieve a specific value from one of the JSON documents in Redis use a JSONPath expression:

In [8]:
res = client.json().get("bikes:010", "$.model")

Iterating over all the Redis keys with the prefix `bikes:`:

In [10]:
keys = sorted(client.keys("bikes:*"))

Use the keys as a parameter to the JSON.MGET command, along with the JSONPath expression `$.description` to collect the descriptions in a list. Then, pass the list to the encode method to get a list of vectorized embeddings:

In [11]:
descriptions = client.json().mget(keys, "$.description")
descriptions = [item for sublist in descriptions for item in sublist]
embedder = SentenceTransformer("msmarco-distilbert-base-v4")
embeddings = embedder.encode(descriptions).astype(np.float32).tolist()
VECTOR_DIMENSION = len(embeddings[0])

Adding a vectorized descriptions to the JSON documents in Redis using the JSON.SET command. The following command inserts a new field in each of the documents under the JSONPath `$.description_embeddings`. Using a pipeline for this:

In [12]:
pipeline = client.pipeline()
for key, embedding in zip(keys, embeddings):
    pipeline.json().set(key, "$.description_embeddings", embedding)
pipeline.execute()

[True, True, True, True, True, True, True, True, True, True, True]

Inspecting one of the vectorized bike documents using the JSON.GET command:

In [13]:
res = client.json().get("bikes:010")
# >>>
# {
#   "model": "Summit",
#   "brand": "nHill",
#   "price": 1200,
#   "type": "Mountain Bike",
#   "specs": {
#     "material": "alloy",
#     "weight": "11.3"
#   },
#   "description": "This budget mountain bike from nHill performs well..."
#   "description_embeddings": [
#     -0.538114607334137,
#     -0.49465855956077576,
#     -0.025176964700222015,
#     ...
#   ]
# }

## Vector search

Creating an index to query based on vector metadata or perform vector searches. Using the FT.CREATE command:

In [None]:
schema = (
    TextField("$.model", no_stem=True, as_name="model"),
    TextField("$.brand", no_stem=True, as_name="brand"),
    NumericField("$.price", as_name="price"),
    TagField("$.type", as_name="type"),
    TextField("$.description", as_name="description"),
    VectorField(
        "$.description_embeddings",
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": VECTOR_DIMENSION,
            "DISTANCE_METRIC": "COSINE",
        },
        as_name="vector",
    ),
)
definition = IndexDefinition(prefix=["bikes:"], index_type=IndexType.JSON)
res = client.ft("idx:bikes_vss").create_index(
    fields=schema, definition=definition
)

In [15]:
info = client.ft("idx:bikes_vss").info()
num_docs = info["num_docs"]
indexing_failures = info["hash_indexing_failures"]

In [16]:
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]

In [17]:
encoded_queries = embedder.encode(queries)
len(encoded_queries)

11

#### KNN query

In [18]:
query = (
    Query('(*)=>[KNN 3 @vector $query_vector AS vector_score]')
     .sort_by('vector_score')
     .return_fields('vector_score', 'id', 'brand', 'model', 'description')
     .dialect(2)
)

In [24]:
# client.ft(INDEX_NAME).search(query, { 'query_vector': np.array(encoded_query, dtype=np.float32).tobytes() }).docs


In [28]:
def create_query_table(query, queries, encoded_queries, extra_params={}):
    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        result_docs = (
            client.ft("idx:bikes_vss")
            .search(
                query,
                {
                    "query_vector": np.array(
                        encoded_query, dtype=np.float32
                    ).tobytes()
                }
                | extra_params,
            )
            .docs
        )
        for doc in result_docs:
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "id": doc.id,
                    "brand": doc.brand,
                    "model": doc.model,
                    "description": doc.description,
                }
            )

    # Optional: convert the table to Markdown using Pandas
    queries_table = pd.DataFrame(results_list)
    queries_table.sort_values(
        by=["query", "score"], ascending=[True, False], inplace=True
    )
    queries_table["query"] = queries_table.groupby("query")["query"].transform(
        lambda x: [x.iloc[0]] + [""] * (len(x) - 1)
    )
    queries_table["description"] = queries_table["description"].apply(
        lambda x: (x[:497] + "...") if len(x) > 500 else x
    )
    queries_table.to_markdown(index=False)

In [29]:
create_query_table(query, queries, encoded_queries)