##  Redis vector search - Py Notebook 🐍📑

    Created by: Domen Žukovec

### Imports and const values for OpenAI 🤖🏗️

In [1]:
import os
import json
import tiktoken
import openai
import numpy as np
import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt


# Load environment variables
load_dotenv()

# Configure Azure OpenAI Service API
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = "" # put yours here
openai.api_key = "" # put yours here

# Define embedding model and encoding
EMBEDDING_MODEL = 'TextEmbeddingAda002' # put yours here
EMBEDDING_ENCODING = 'cl100k_base'
EMBEDDING_CHUNK_SIZE = 8000
COMPLETION_MODEL = 'TextDavinci003' # put yours here

REDIS_INDEX_NAME = 'domtistestindex' # put yours here
VECTOR_FIELD_IN_REDIS='item_vector'
NUMBER_PRODUCTS_INDEX=1000
CHOSEN_EMB_MODEL = 'TextEmbeddingAda002' # put yours here
REDIS_ADDR = '' # put yours here
REDIS_PORT = 10000 # put yours here
REDIS_PASSWORD = '' # put yours here


# initialize tiktoken for encoding text
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)

ModuleNotFoundError: No module named 'tiktoken'

### Load data into notebook 📒📑

In [3]:
def create_array_from_lines(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Create an empty list to hold the lines
    array_from_lines = []

    for line in lines:
        # Add each line to the list
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        array_from_lines.append(line)

    return array_from_lines

# Use the function and print the result
Questions_array = create_array_from_lines('Questions.txt')
Nivo3A_array = create_array_from_lines('Nivo3A.txt')
Nivo3_array = create_array_from_lines('Nivo3.txt')

In [4]:
# print some stats about the questions
print(f"Loaded {len(Questions_array)} documents")
for doc in Questions_array[:3]:
    num_tokens = len(encoding.encode(doc))
    print(f"Content: {doc[:80]}... \n---> Tokens: {num_tokens}\n")

Loaded 37 documents
Content: Kaj je Triglav komplet? ... 
---> Tokens: 10

Content: Kje lahko dobim dodatne informacije o zavarovanju? ... 
---> Tokens: 19

Content: Kakšne so možnosti dodatnih vplačil v zavarovanje? ... 
---> Tokens: 22

Content: Kje lahko dobim dodatne informacije o zavarovanju? ... 
---> Tokens: 19

Content: Kakšne so možnosti dodatnih vplačil v zavarovanje? ... 
---> Tokens: 22

Content: Kakšen je učinek davčne olajšave? ... 
---> Tokens: 18



### Create embeddings for all the questions ⚙️💡

In [15]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text):
    return openai.Embedding.create(input=text, engine=EMBEDDING_MODEL)["data"][0]["embedding"]

# Create embeddings for all questions
embeddings = [get_embedding(doc) for doc in Questions_array]

# print some stats about the embeddings (first 3)
for e in embeddings[:3]:
    print(e)

[0.01647069863975048, 0.0015815539518371224, 0.005138239823281765, -0.014603417366743088, -0.006850986275821924, 0.016187386587262154, -0.029490146785974503, -0.010237845592200756, -0.0047841002233326435, -0.01465492881834507, 0.026708543300628662, 0.021763470023870468, -0.01187332533299923, -0.008737582713365555, 0.0005943906726315618, -0.00027123853215016425, 0.025279108434915543, -0.023476216942071915, 0.011448358185589314, -0.012356243096292019, 0.0067865969613194466, 0.004004993941634893, -0.005382917821407318, -0.012491459958255291, 0.0006056587444618344, 0.00589480996131897, 0.020655980333685875, -0.017359266057610512, -0.0037635350599884987, -0.014268594793975353, 0.00038593137287534773, 0.0003875411057379097, -0.027996322140097618, -0.008035742677748203, 0.009529567323625088, -0.002664093626663089, -0.00022254437499213964, 0.013766361400485039, 0.008125887252390385, 0.021493038162589073, 0.03368830680847168, 0.01478370651602745, -0.0045297639444470406, 0.0013231933116912842, -

### Working with Redis 📲📮

For more info you can visit: https://redis-py.readthedocs.io/en/stable/examples/search_vector_similarity_examples.html

In [6]:
# Connect to Redis - here change the host, port, password and ssl parameters to match your Redis Enterprise setup
r = redis.StrictRedis(host='redisdbtriguc3.westeurope.redisenterprise.cache.azure.net',
    port=10000, db=0, password='hB+FWOy6MDlsL21bmaD1plSNwTa6Vd70TnID02P0r+0=', ssl=True)

INDEX_NAME = "domzis_index"                       # Vector Index Name
DOC_PREFIX = "doc:"                               # RediSearch Key Prefix for the Index

# function to create the index
def create_index(vector_dimensions: int):
    try:
        # check to see if index exists
        r.ft(INDEX_NAME).info()
        print("Index already exists!")
    except:
        # schema
        schema = (
            TagField("tag"),                       # Tag Field Name
            VectorField("vector",                  # Vector Field Name
                "FLAT", {                          # Vector Index Type: FLAT or HNSW
                    "TYPE": "FLOAT32",             # FLOAT32 or FLOAT64
                    "DIM": vector_dimensions,      # Number of Vector Dimensions
                    "DISTANCE_METRIC": "COSINE",   # Vector Search Distance Metric
                }
            ),
        )

        # index Definition
        definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

        # create Index
        r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)


In [7]:
# Test the Redis connection
r.ping()

True

#### Index set-up ⚙️

In [8]:
# Drop the index
r.ft(INDEX_NAME).dropindex(delete_documents=True)

b'OK'

In [9]:
# define vector dimensions
VECTOR_DIMENSIONS = 1536

# Create a new index
create_index(vector_dimensions=VECTOR_DIMENSIONS)

#### Write data to Redis 📝

In [10]:
np_embedding = np.array(embeddings, dtype=np.float32)

# Write to Redis
pipe = r.pipeline()
ct = 0
for i, embedding in enumerate(np_embedding):
    by_em = embedding.tobytes()
    pipe.hset(f"doc:{i}", mapping = {
        "vector": by_em,
        "content": Questions_array[i],
        "nivo3": Nivo3_array[i],
        "nivo3A": Nivo3A_array[i],
        "tag": "openai"
    })
    res = pipe.execute()

#### Now you can query the database by embedding a question and findig the closest one in the Redis DB 🛢️📮

In [11]:
# expected found question: Kaj pomeni odprta zavarovalna doba?

q_em = get_embedding("Dobil sem mail, da potrebujem čas odprte zavarovalne dobe. Kaj to pomeni?")
q_em = np.array(q_em, dtype=np.float32)

In [12]:
query = (
    Query("(@tag:{ openai })=>[KNN 2 @vector $vec as score]")
     .sort_by("score")
     .return_fields("content", "nivo3", "nivo3A", "tag", "score")
     .paging(0, 1)
     .dialect(2)
)

query_params = {"vec": q_em.tobytes()}
temp_json = r.ft(INDEX_NAME).search(query, query_params).docs

In [14]:
for doc in temp_json:
    print("Taxonomy:", doc['content'], "\nNivo3:", doc['nivo3'], "\nNivo3A:", doc['nivo3A'], "\nScore:", doc['score'])

Taxonomy: Kaj pomeni odprta zavarovalna doba?  
Nivo3: Naložbena in investicijska zavarovanja  
Nivo3A: Naložbeno življenjsko zavarovanje  
Score: 0.0750132203102


### This is the end of the notebook 😊