# Setup 

Install the required packages for this notebook to run.

In [None]:
# We'll need to install the Redis client
#!pip install redis

#Install wget to pull zip file
#!pip install wget

#Install required packages
#!pip install openai
#!pip install pandas
#!pip install numpy
#!pip install python-dotenv

In [1]:
import openai

from typing import List, Iterator
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
from ast import literal_eval

# Redis client library for Python
import redis

# Ignore unclosed SSL socket warnings - optional in case you get these errors
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Loading the env from the .env file
load_dotenv()

True

## Setup Redis connection

In [28]:
import redis
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField
)
import os

REDIS_HOST =  os.environ.get("REDIS_HOST")
REDIS_PORT = os.environ.get("REDIS_PORT")
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD")
# Connect to Redis
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    ssl=True,
    password=REDIS_PASSWORD
)
# should return True
redis_client.ping()

True

## Load documents into Dataframe

In [22]:
products_df = pd.read_json('../data/text/product_docs_embeddings.json')
products_df.head()

Unnamed: 0,id,title,content,category,title_vector,content_vector
0,1,Azure App Service,Azure App Service is a fully managed platform ...,Web,"[-0.0106361033, -0.021644678, 0.0019778875, -0...","[0.007650437800000001, -0.0236263517, 0.012058..."
1,2,Azure Functions,Azure Functions is a serverless compute servic...,Compute,"[-0.0094142705, -0.0047656302, -0.005457249, -...","[-0.014505187000000001, -0.0127168763, -0.0093..."
2,3,Azure Cognitive Services,Azure Cognitive Services are a set of AI servi...,AI + Machine Learning,"[-0.012958467000000001, -0.0041865818, -0.0088...","[-0.0162250493, -0.0012716976, 0.0024742542, -..."
3,4,Azure Storage,"Azure Storage is a scalable, durable, and high...",Storage,"[-0.0216679014, -0.0159060247, 0.0107527971000...","[-0.019703323000000002, -0.0143261477, 0.00300..."
4,5,Azure SQL Database,Azure SQL Database is a fully managed relation...,Databases,"[-0.0105225565, -0.0080567617, -0.0060751964, ...","[-0.0062997062000000005, -0.0119180158, 0.0005..."


In [23]:
employees_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')
employees_df.head()

Unnamed: 0,id,chunk_content,chunk_content_vector
0,0,Contoso Electronics \nEmployee Handbook \n \n...,"[-0.0134241888, 0.0083369836, 0.00018061460000..."
1,1,edge systems that are both reliable and effici...,"[-0.0078642182, 0.0030302808, -0.0163918491, -..."
2,2,edge systems that are both reliable and effici...,"[-0.0107993353, 0.0036727316, -0.009540895, -0..."
3,3,customers. \n \nCompany Values: \n1. Quality...,"[-0.018283184600000002, -0.0022870835000000003..."
4,4,we work and live. \nPerformance Reviews \n \...,"[-0.016625782500000002, -6.20042e-05, 0.031033..."


## Setup schema in Redis

In [29]:
# Define RediSearch fields for each of the columns in the dataset
DISTANCE_METRIC = "COSINE"                      # distance metric for the vectors (ex. COSINE, IP, L2)
VECTOR_DIM = len(products_df['title_vector'][0]) # length of the vectors
VECTOR_NUMBER = len(products_df)                 # initial number of vectors

title = TextField(name="title")
category = TextField(name="category")
content = TextField(name="content")
title_embedding = VectorField("title_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)
content_embedding = VectorField("content_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)
product_fields = [title, category, content, title_embedding, content_embedding]

In [30]:
# Define RediSearch fields for each of the columns in the dataset
DISTANCE_METRIC = "COSINE"                      # distance metric for the vectors (ex. COSINE, IP, L2)
VECTOR_DIM = len(employees_df['chunk_content_vector'][0]) # length of the vectors
VECTOR_NUMBER = len(employees_df)                 # initial number of vectors

chunk_content = TextField(name="chunk_content")

chunk_content_embedding = VectorField("chunk_content_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)
employee_fields = [chunk_content, chunk_content_embedding]

## Create indexes

In [31]:
def create_index(index_name, doc_prefix, fields):
    try:
        redis_client.ft(index_name).info()
        print("Index already exists")
    except:
        # Create RediSearch Index
        redis_client.ft(index_name).create_index(
            fields = fields,
            definition = IndexDefinition(prefix=[doc_prefix], index_type=IndexType.HASH)
    )

In [32]:
PRODUCT_TEXT_INDEX_NAME="product-text-index"
PRODUCT_DOC_PREFIX="product"

create_index(PRODUCT_TEXT_INDEX_NAME, PRODUCT_DOC_PREFIX, product_fields)

Index already exists


In [33]:
EMP_TEXT_INDEX_NAME="employee-text-index"
EMP_DOC_PREFIX="employee"

create_index(EMP_TEXT_INDEX_NAME, EMP_DOC_PREFIX, employee_fields)

## Ingest documents into Redis index

In [34]:
# index the products
def index_products(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    records = documents.to_dict("records")
    for doc in records:
        key = f"{prefix}:{str(doc['id'])}"

        # create byte vectors for title and content
        title_embedding = np.array(doc["title_vector"], dtype=np.float32).tobytes()
        content_embedding = np.array(doc["content_vector"], dtype=np.float32).tobytes()

        # replace list of floats with byte vectors
        doc["title_vector"] = title_embedding
        doc["content_vector"] = content_embedding

        client.hset(key, mapping = doc)

index_products(redis_client, PRODUCT_DOC_PREFIX, products_df)
print(f"Loaded {redis_client.info()['db0']['keys']} documents in Redis search index with name: {PRODUCT_TEXT_INDEX_NAME}")

Loaded 92 documents in Redis search index with name: product-text-index


In [35]:
# index the employee doc
def index_employee(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    records = documents.to_dict("records")
    for doc in records:
        key = f"{prefix}:{str(doc['id'])}"

        # create byte vectors for title and content
        chunk_content_embedding = np.array(doc["chunk_content_vector"], dtype=np.float32).tobytes()

        # replace list of floats with byte vectors
        doc["chunk_content_vector"] = chunk_content_embedding

        client.hset(key, mapping = doc)
        
index_employee(redis_client, EMP_DOC_PREFIX, employees_df)
print(f"Loaded {redis_client.info()['db0']['keys']} documents in Redis search index with name: {EMP_TEXT_INDEX_NAME}")

Loaded 108 documents in Redis search index with name: employee-text-index


## Setup OpenAI to create embeddings for the user query

In [36]:
#setup azure open ai
openai.api_type = "azure"
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_base = os.environ.get("OPENAI_BASE_URL")
openai.api_version = "2023-06-01-preview"

# This is the deployment name, running the ada model
EMBEDDING_MODEL = "ada-deployment"

def create_embedding(user_query):
    return openai.Embedding.create(input=user_query,engine=EMBEDDING_MODEL)['data'][0]['embedding']
    

## Search using K-nearest neighbors (KNN) in Redis

In [47]:
def search_redis(
    redis_client: redis.Redis,
    user_query: str,
    index_name: str,
    vector_field: str, 
    return_fields: list = ["title", "category", "content", "vector_score"],
    hybrid_fields = "*",
    k: int = 20,
) -> List[dict]:

    # Creates embedding vector from user query
    embedded_query = create_embedding(user_query=user_query)

    # Prepare the Query
    base_query = f'{hybrid_fields}=>[KNN {k} @{vector_field} $vector AS vector_score]'
    query = (
        Query(base_query)
         .return_fields(*return_fields)
         .sort_by("vector_score")
         .paging(0, k)
         .dialect(2)
    )
    params_dict = {"vector": np.array(embedded_query).astype(dtype=np.float32).tobytes()}

    # perform vector search
    results = redis_client.ft(index_name).search(query, params_dict)
    return results
    

In [49]:
# product search
PRODUCT_SEARCH_FIELD = "content_vector"
PRODUCT_RETURN_FIELDS = ["title", "category", "content", "vector_score"]
results = search_redis(redis_client, 'show some database related products', PRODUCT_TEXT_INDEX_NAME, PRODUCT_SEARCH_FIELD, PRODUCT_RETURN_FIELDS, k=10)
for i, article in enumerate(results.docs):
        score = 1 - float(article.vector_score)
        print(f"{i}. {article.title} (Score: {round(score ,3) })")

0. Azure Database for MySQL (Score: 0.768)
1. Azure SQL Database (Score: 0.763)
2. Azure Database for MariaDB (Score: 0.763)
3. Azure Database for PostgreSQL (Score: 0.76)
4. Azure Data Catalog (Score: 0.754)
5. Azure SQL Data Warehouse (Score: 0.75)
6. Azure Cosmos DB (Score: 0.747)
7. Azure Cosmos DB (Score: 0.744)
8. Azure Database Migration Service (Score: 0.744)
9. Azure Cognitive Search (Score: 0.742)


In [48]:
# employee search
EMP_SEARCH_FIELD = "chunk_content_vector"
EMP_RETURN_FIELDS = ["chunk_content"]
results = search_redis(redis_client, 'company values', EMP_TEXT_INDEX_NAME, EMP_SEARCH_FIELD, EMP_RETURN_FIELDS, k=2)
for i, article in enumerate(results.docs):
        print(f"{i}. {article.chunk_content}")

0. customers.  
 
Company Values:  
1. Quality: We strive to provide the highest quality products and services to our customers.  
2. Integrity: We value honesty, respect, and t rustworthiness in all our interactions.  
3. Innovation: We encourage creativity and support new ideas and approaches to our 
business.  
4. Teamwork: We believe that by working together, we can achieve greater success.  
5. Respect: We treat all our employees, c ustomers, and partners with respect and dignity.  
6. Excellence: We strive to exceed expectations and provide excellent service.   7. Accountability: We take responsibility for our actions and hold ourselves and others 
accountable for their performance.  
8. Co mmunity: We are committed to making a positive impact in the communities in which 
we work and live.  
Performance Reviews  
 
Performance Reviews at Contoso Electronics  
 
At Contoso Electronics, we strive to ensure our employees are getting the feedback they
1. edge systems that are both re