## Data Pipeline - Azure Cache for Redis

### Prerequisites

- Generate the embeddings first by running [generate_embeddings.ipynb](../common/generate_embeddings.ipynb); the cells below read the resulting `*_embeddings.json` files.

#### Set environment variables

In [1]:
from dotenv import load_dotenv
import os
import warnings

# Suppress "unclosed ..." ResourceWarnings and all DeprecationWarnings for the rest of the session
warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Raising (instead of
            calling ``exit()``) stops the current cell and a "Run All" with a
            visible error, without asking the notebook kernel to shut down.
    """
    value = os.getenv(name)
    if not value:  # covers both None (unset) and "" (set but empty)
        raise RuntimeError(f"{name} environment variable not set.")
    return value


# Connection settings for the Redis instance; all three are required
redis_host = _require_env("REDIS_HOST")
redis_port = _require_env("REDIS_PORT")
redis_password = _require_env("REDIS_PASSWORD")

#### Setup Redis connection

In [2]:
import redis
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField
)
import os

# Open an SSL connection to Redis with the host, port and password read earlier
redis_client = redis.Redis(host=redis_host, port=redis_port, password=redis_password, ssl=True)

# Connectivity check - the cell output should be True
redis_client.ping()

True

#### Helper method

In [3]:
def create_index(index_name, doc_prefix, fields):
    """Create a RediSearch index over hashes unless `index_name` already exists.

    Args:
        index_name: Name of the index to look up and, if missing, create.
        doc_prefix: Key prefix of the documents to index. It is wrapped in
            literal braces, so the indexed keys start with ``{doc_prefix}:``.
        fields: Field definitions passed through to ``create_index``.

    Raises:
        Any exception other than ``redis.exceptions.ResponseError`` raised by
        the existence check (for example a connection failure) propagates
        unchanged, as does any error raised while creating the index.
    """
    try:
        redis_client.ft(index_name).info()
    except redis.exceptions.ResponseError:
        # The server rejected the info request; treat that as "index missing"
        # and create it. Other failures are deliberately not caught here so
        # they are not mistaken for a missing index.
        redis_client.ft(index_name).create_index(
            fields=fields,
            definition=IndexDefinition(prefix=[f'{{{doc_prefix}}}:'], index_type=IndexType.HASH),
        )
    else:
        print("Index already exists")

#### Create index - text_sample

In [6]:
import pandas as pd
import numpy as np

# Index name and key prefix for the text sample
text_sample_index_name = "text_sample"
text_sample_prefix = "text"

# Load the product-docs dataset (JSON that includes the embedding columns)
text_df = pd.read_json('../data/text/product_docs_embeddings.json')

# Vector settings shared by both embedding fields
distance_metric = "COSINE"                     # vector distance metric (ex. COSINE, IP, L2)
vector_dim = len(text_df['title_vector'][0])   # vector length, taken from the first title vector
vector_number = len(text_df)                   # number of rows, used as the initial capacity

# Text fields
title = TextField("title")
category = TextField("category")
content = TextField("content")

# HNSW vector fields - one per embedding column, configured identically
title_embedding = VectorField(
    "title_vector",
    "HNSW",
    dict(
        TYPE="FLOAT32",
        DIM=vector_dim,
        DISTANCE_METRIC=distance_metric,
        INITIAL_CAP=vector_number,
        M=4,
        EF_CONSTRUCTION=400,
        EF_RUNTIME=500,
    ),
)
content_embedding = VectorField(
    "content_vector",
    "HNSW",
    dict(
        TYPE="FLOAT32",
        DIM=vector_dim,
        DISTANCE_METRIC=distance_metric,
        INITIAL_CAP=vector_number,
        M=4,
        EF_CONSTRUCTION=400,
        EF_RUNTIME=500,
    ),
)

product_fields = [title, category, content, title_embedding, content_embedding]

create_index(text_sample_index_name, text_sample_prefix, product_fields)

#### Create index - doc_sample

In [7]:
# Index name and key prefix for the doc sample
doc_sample_index_name = "doc_sample"
doc_sample_prefix = "doc"

# Load the employee-handbook dataset (JSON that includes the chunk embedding column)
employees_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')

# Vector settings for the chunk embedding field
distance_metric = "COSINE"                                  # vector distance metric (ex. COSINE, IP, L2)
vector_dim = len(employees_df['chunk_content_vector'][0])   # vector length, taken from the first chunk vector
vector_number = len(employees_df)                           # number of rows, used as the initial capacity

# One text field plus its HNSW vector field
chunk_content = TextField("chunk_content")

chunk_content_embedding = VectorField(
    "chunk_content_vector",
    "HNSW",
    dict(
        TYPE="FLOAT32",
        DIM=vector_dim,
        DISTANCE_METRIC=distance_metric,
        INITIAL_CAP=vector_number,
        M=4,
        EF_CONSTRUCTION=400,
        EF_RUNTIME=500,
    ),
)

employee_fields = [chunk_content, chunk_content_embedding]

create_index(doc_sample_index_name, doc_sample_prefix, employee_fields)

#### Create index - image_sample

In [None]:
# Index name and key prefix for the image sample
image_sample_index_name = "image_sample"
image_sample_prefix = "image"

# Load the image dataset (JSON that includes the image embedding column)
image_df = pd.read_json('../data/images/images_embeddings.json')

# Vector settings for the image embedding field
distance_metric = "COSINE"                      # vector distance metric (ex. COSINE, IP, L2)
vector_dim = len(image_df['image_vector'][0])   # vector length, taken from the first image vector
vector_number = len(image_df)                   # number of rows, used as the initial capacity

# One text field plus its HNSW vector field (no M / EF_* overrides are passed here)
image = TextField("image")

image_embedding = VectorField(
    "image_vector",
    "HNSW",
    dict(
        TYPE="FLOAT32",
        DIM=vector_dim,
        DISTANCE_METRIC=distance_metric,
        INITIAL_CAP=vector_number,
    ),
)

image_fields = [image, image_embedding]

create_index(image_sample_index_name, image_sample_prefix, image_fields)

#### Ingest text sample with embeddings

In [None]:
import traceback

def ingest_products(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    """Write every row of `documents` to Redis as a hash through a single pipeline.

    Each row is stored under the key ``{prefix}:<id>`` (the braces are literal).
    The ``title_vector`` and ``content_vector`` values are replaced by their
    float32 byte representation; the remaining columns are handed to ``hset``
    unchanged.

    Errors raised while executing the pipeline are printed with
    ``traceback.print_exc()`` and not re-raised.
    """
    rows = documents.to_dict("records")
    pipeline = client.pipeline()
    for row in rows:
        redis_key = "{{{}}}:{}".format(prefix, str(row["id"]))

        # swap each list of floats for its packed float32 bytes
        for vector_field in ("title_vector", "content_vector"):
            row[vector_field] = np.array(row[vector_field], dtype=np.float32).tobytes()

        pipeline.hset(redis_key, mapping=row)

    try:
        pipeline.execute()
    except Exception:
        traceback.print_exc()
        
# Load the text sample rows into Redis under the text prefix
ingest_products(client=redis_client, prefix=text_sample_prefix, documents=text_df)

#### Ingest doc sample with embeddings

In [None]:
def ingest_employees(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    """Write every row of `documents` to Redis as a hash through a single pipeline.

    Each row is stored under the key ``{prefix}:<id>`` (the braces are literal).
    The ``chunk_content_vector`` value is replaced by its float32 byte
    representation; the remaining columns are handed to ``hset`` unchanged.

    Errors raised while executing the pipeline are printed with
    ``traceback.print_exc()`` and not re-raised.
    """
    rows = documents.to_dict("records")
    pipeline = client.pipeline()
    for row in rows:
        redis_key = "{{{}}}:{}".format(prefix, str(row["id"]))

        # swap the list of floats for its packed float32 bytes
        row["chunk_content_vector"] = np.array(
            row["chunk_content_vector"], dtype=np.float32
        ).tobytes()

        pipeline.hset(redis_key, mapping=row)

    try:
        pipeline.execute()
    except Exception:
        traceback.print_exc()
                
# Load the employee handbook rows into Redis under the doc prefix
ingest_employees(client=redis_client, prefix=doc_sample_prefix, documents=employees_df)

#### Ingest image sample with embeddings

In [None]:
def ingest_images(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    """Write every row of `documents` to Redis as a hash through a single pipeline.

    Each row is stored under the key ``{prefix}:<id>`` (the braces are literal).
    The ``image_vector`` value is replaced by its float32 byte representation;
    the remaining columns are handed to ``hset`` unchanged.

    Errors raised while executing the pipeline are printed with
    ``traceback.print_exc()`` and not re-raised.
    """
    rows = documents.to_dict("records")
    pipeline = client.pipeline()
    for row in rows:
        redis_key = "{{{}}}:{}".format(prefix, str(row["id"]))

        # swap the list of floats for its packed float32 bytes
        row["image_vector"] = np.array(row["image_vector"], dtype=np.float32).tobytes()

        pipeline.hset(redis_key, mapping=row)

    try:
        pipeline.execute()
    except Exception:
        traceback.print_exc()
                
# Load the image sample rows into Redis under the image prefix
ingest_images(client=redis_client, prefix=image_sample_prefix, documents=image_df)