## Ingestion to Azure Cache for Redis

### Set environment variables

In [1]:
import openai

from typing import List, Iterator
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
from ast import literal_eval

# Redis client library for Python
import redis

# Ignore unclosed SSL socket warnings - optional in case you get these errors
import warnings

# Silence two kinds of noise: resource warnings whose message starts with
# "unclosed", and every deprecation warning.
warnings.filterwarnings(
    action="ignore",
    message="unclosed",
    category=ResourceWarning,
)
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Load the variables defined in the .env file; kept as the last expression
# of the cell so its result is displayed.
load_dotenv()

True

#### Set up Redis connection

In [2]:
import redis
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField
)
import os

# Connection settings are read from the environment (e.g. loaded from the .env file).
REDIS_HOST = os.environ.get("REDIS_HOST")
REDIS_PORT = os.environ.get("REDIS_PORT")
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD")

# Fail fast with a readable message; otherwise a missing value only surfaces
# later as an opaque error from inside the Redis client.
if not REDIS_HOST or not REDIS_PORT:
    raise ValueError(
        "REDIS_HOST and REDIS_PORT must be set in the environment (e.g. in the .env file)"
    )

# Connect to Redis over TLS
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=int(REDIS_PORT),  # environment values are strings; the port is numeric
    ssl=True,
    password=REDIS_PASSWORD,
)
# should return True
redis_client.ping()

True

#### Load data and set up schema for text sample (products)

In [3]:
# Read the product documents and their embedding columns into a DataFrame
products_df = pd.read_json('../data/text/product_docs_embeddings.json')

# Parameters shared by the vector fields of the RediSearch schema
DISTANCE_METRIC = "COSINE"  # distance metric for the vectors (ex. COSINE, IP, L2)
VECTOR_DIM = len(products_df['title_vector'][0])  # length of the vectors
VECTOR_NUMBER = len(products_df)  # initial number of vectors

# Plain-text fields
title = TextField(name="title")
category = TextField(name="category")
content = TextField(name="content")

# Vector fields. Both use the dimension measured on title_vector, which
# assumes content_vector has the same length - TODO confirm.
title_embedding = VectorField(
    "title_vector",
    "FLAT",
    dict(
        TYPE="FLOAT32",
        DIM=VECTOR_DIM,
        DISTANCE_METRIC=DISTANCE_METRIC,
        INITIAL_CAP=VECTOR_NUMBER,
    ),
)
content_embedding = VectorField(
    "content_vector",
    "FLAT",
    dict(
        TYPE="FLOAT32",
        DIM=VECTOR_DIM,
        DISTANCE_METRIC=DISTANCE_METRIC,
        INITIAL_CAP=VECTOR_NUMBER,
    ),
)

# Complete schema for the product index
product_fields = [title, category, content, title_embedding, content_embedding]

In [4]:
# Read the employee handbook chunks and their embedding column into a DataFrame
employees_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')

# Parameters of the vector field in the RediSearch schema.
# NOTE: this rebinds DISTANCE_METRIC, VECTOR_DIM and VECTOR_NUMBER, which the
# product schema cell also assigns.
DISTANCE_METRIC = "COSINE"  # distance metric for the vectors (ex. COSINE, IP, L2)
VECTOR_DIM = len(employees_df['chunk_content_vector'][0])  # length of the vectors
VECTOR_NUMBER = len(employees_df)  # initial number of vectors

# Plain-text field holding the chunk itself
chunk_content = TextField(name="chunk_content")

# Vector field holding the chunk's embedding
chunk_content_embedding = VectorField(
    "chunk_content_vector",
    "FLAT",
    dict(
        TYPE="FLOAT32",
        DIM=VECTOR_DIM,
        DISTANCE_METRIC=DISTANCE_METRIC,
        INITIAL_CAP=VECTOR_NUMBER,
    ),
)

# Complete schema for the employee index
employee_fields = [chunk_content, chunk_content_embedding]

### Create indexes

In [5]:
def create_index(index_name, doc_prefix, fields):
    """Create a RediSearch index over hashes unless `index_name` already exists.

    Uses the module-level `redis_client`.

    Args:
        index_name: Name of the search index to look up and, if absent, create.
        doc_prefix: Key prefix of the documents to index. It is wrapped in
            literal braces, so the index covers keys starting with
            ``{doc_prefix}:``.
        fields: Schema fields handed to ``create_index``.

    Raises:
        redis.exceptions.RedisError: Connection or authentication failures
            propagate to the caller instead of being mistaken for a missing
            index. Errors raised while creating the index propagate as well.
    """
    index = redis_client.ft(index_name)
    try:
        index.info()
    except redis.exceptions.ResponseError:
        # The server rejected the info request, which is how a missing index
        # is reported; only in that case is the index created.
        index.create_index(
            fields=fields,
            definition=IndexDefinition(
                prefix=[f'{{{doc_prefix}}}:'],
                index_type=IndexType.HASH,
            ),
        )
    else:
        print("Index already exists")

In [6]:
# Index name and key prefix for the product documents (text search)
PRODUCT_TEXT_INDEX_NAME = "product-text-index"
PRODUCT_DOC_PREFIX = "product"

create_index(
    index_name=PRODUCT_TEXT_INDEX_NAME,
    doc_prefix=PRODUCT_DOC_PREFIX,
    fields=product_fields,
)

In [7]:
# Index name and key prefix for the employee documents (doc search)
EMP_TEXT_INDEX_NAME = "employee-text-index"
EMP_DOC_PREFIX = "employee"

create_index(
    index_name=EMP_TEXT_INDEX_NAME,
    doc_prefix=EMP_DOC_PREFIX,
    fields=employee_fields,
)

#### Ingest text sample with embeddings

In [None]:
# ingest the products data
import traceback
def ingest_products(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    """Write every product row to Redis as a hash, queued on a single pipeline.

    Args:
        client: Redis client used to open the pipeline.
        prefix: Key prefix; each key has the form ``{prefix}:<id>`` with
            literal braces around the prefix.
        documents: Rows providing at least the ``id``, ``title_vector`` and
            ``content_vector`` columns.

    An exception raised while executing the pipeline is printed as a
    traceback and not re-raised.
    """
    rows = documents.to_dict("records")
    pipeline = client.pipeline()
    for row in rows:
        redis_key = f"{{{prefix}}}:{str(row['id'])}"

        # Serialize both embeddings to raw float32 bytes before touching the row
        title_bytes, content_bytes = (
            np.array(row[column], dtype=np.float32).tobytes()
            for column in ("title_vector", "content_vector")
        )

        # Swap the lists of floats for their byte representation
        row["title_vector"] = title_bytes
        row["content_vector"] = content_bytes
        pipeline.hset(redis_key, mapping=row)
    try:
        pipeline.execute()
    except Exception:
        traceback.print_exc()
        

# Load the product rows into Redis under the product key prefix
ingest_products(
    client=redis_client,
    prefix=PRODUCT_DOC_PREFIX,
    documents=products_df,
)

#### Ingest doc sample with embeddings

In [13]:
# ingest the employee doc
def ingest_employees(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    """Write every employee-handbook row to Redis as a hash, queued on a single pipeline.

    Args:
        client: Redis client used to open the pipeline.
        prefix: Key prefix; each key has the form ``{prefix}:<id>`` with
            literal braces around the prefix.
        documents: Rows providing at least the ``id`` and
            ``chunk_content_vector`` columns.

    An exception raised while executing the pipeline is printed as a
    traceback and not re-raised.
    """
    # Imported locally so the error handler below works even when the
    # product-ingestion cell (the only other place that imports traceback)
    # has not been run in this kernel.
    import traceback

    records = documents.to_dict("records")
    pipe = client.pipeline()
    for doc in records:
        key = f"{{{prefix}}}:{str(doc['id'])}"

        # create the byte vector for the chunk content embedding
        chunk_content_embedding = np.array(doc["chunk_content_vector"], dtype=np.float32).tobytes()

        # replace list of floats with the byte vector
        doc["chunk_content_vector"] = chunk_content_embedding

        pipe.hset(key, mapping=doc)
    try:
        pipe.execute()
    except Exception:
        traceback.print_exc()
                
# Load the employee handbook rows into Redis under the employee key prefix
ingest_employees(
    client=redis_client,
    prefix=EMP_DOC_PREFIX,
    documents=employees_df,
)

Loaded 108 documents in Redis search index with name: employee-text-index


#### Ingest image sample with embeddings

In [None]:
# TODO: ingest the image sample with embeddings (not implemented yet)