In [25]:
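# Uncomment to install the dependencies into the kernel's environment.
# (`json` is part of the Python standard library and needs no install.)
# %pip install python-dotenv

# %pip install openai

# %pip install gradio

# %pip install azure-cosmos pandas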

In [149]:
# Import the required libraries
import time
import os
import json
import uuid
from dotenv import dotenv_values
from openai import OpenAI, AzureOpenAI
import gradio as gr


#Cosmos DB imports
from azure.cosmos import CosmosClient

## Load configs

In [153]:
# Variables
# Name of the .env file that holds the Cosmos DB and Azure OpenAI settings.
env_name = "nosql_vanilla.env" # follow the example.env template and change this to your own .env file name
# Every setting below is looked up by key with `config[...]`, so each key must be present in the file.
config = dotenv_values(env_name)

# Azure Cosmos DB settings.
# NOTE: despite its name, `cosmos_conn` is passed to CosmosClient as `url=` below.
cosmos_conn = config['cosmos_connection_string']
cosmos_key = config['cosmos_key']
cosmos_database = config['cosmos_database_name']
cosmos_container = config['cosmos_container_name']

# Azure OpenAI settings.
# NOTE(review): `openai_embeddings_model` and `openai_embeddings_dimensions` are read here but
# are not referenced by the cells visible in this section - confirm whether they are still needed.
openai_endpoint = config['openai_endpoint']
openai_key = config['openai_key']
openai_api_version = config['openai_version']
openai_embeddings_deployment = config['openai_embeddings_deployment']
openai_embeddings_model = config['openai_embeddings_model']
openai_embeddings_dimensions = int(config['openai_embeddings_dimensions'])

# Create the Azure Cosmos DB for NoSQL client
cosmos_client = CosmosClient(url=cosmos_conn, credential=cosmos_key)

# NOTE(review): this overwrites any OPENAI_API_KEY already set in the environment with an
# empty string before the plain OpenAI client is constructed. It looks like a redacted
# placeholder - confirm, and prefer loading the key from the .env file instead of pasting it here.
os.environ["OPENAI_API_KEY"] = ""
client = OpenAI()

# Azure OpenAI client; `generate_embeddings` uses it to request embeddings.
azure_openai_embeddings = AzureOpenAI(
    api_version=openai_api_version,
    api_key= openai_key,
    azure_endpoint= openai_endpoint,
)

## Create database and container

In [151]:
def get_vector_indexing_policy(embedding_path, embedding_type):
    """Build a container indexing policy with one vector index per embedding path.

    Args:
        embedding_path: Sequence of document paths that hold embedding vectors
            (e.g. ``["/EmbeddingName"]``).
        embedding_type: Sequence of vector index type names, parallel to
            ``embedding_path`` (e.g. ``["quantizedFlat"]``).

    Returns:
        A dict with ``indexingMode``, ``includedPaths``, ``excludedPaths`` and
        ``vectorIndexes``. ``vectorIndexes`` holds one ``{"path", "type"}``
        entry per (path, type) pair; it is empty when the inputs are empty.
        If the sequences differ in length, the extra entries of the longer one
        are ignored.
    """
    # Pair each path with its own index type. The list is built once, outside
    # any loop, so every pair is kept instead of only the first one.
    vector_indexes = [
        {"path": path, "type": f"{index_type}"}
        for path, index_type in zip(embedding_path, embedding_type)
    ]

    return {
        "indexingMode": "consistent",
        "includedPaths": [{"path": "/*"}],
        'excludedPaths': [{'path': '/"_etag"/?'}],
        "vectorIndexes": vector_indexes
    }

In [152]:
def get_vector_embedding_policy(embedding_path, distance_function, data_type, dimensions):
    """Build a vector embedding policy with one entry per embedding path.

    Args:
        embedding_path: Sequence of document paths that hold embedding vectors.
        distance_function: Sequence of distance function names (e.g. ``"cosine"``),
            parallel to ``embedding_path``.
        data_type: Sequence of vector element type names (e.g. ``"float32"``),
            parallel to ``embedding_path``.
        dimensions: Sequence of vector lengths, parallel to ``embedding_path``.

    Returns:
        ``{"vectorEmbeddings": [...]}`` with one
        ``{"path", "dataType", "dimensions", "distanceFunction"}`` dict per
        position; the list is empty when the inputs are empty. If the sequences
        differ in length, positions beyond the shortest one are ignored.
    """
    # Walk the four parallel sequences together. The list is built once, so
    # every path gets its own entry instead of only the first one.
    vector_embeddings = [
        {
            "path": path,
            "dataType": f"{dtype}",
            "dimensions": dims,
            "distanceFunction": f"{distance}",
        }
        for path, dtype, dims, distance in zip(
            embedding_path, data_type, dimensions, distance_function
        )
    ]

    return {
        "vectorEmbeddings": vector_embeddings
    }

In [154]:
# Get a handle to the configured database (created first if it is missing,
# going by the SDK method name), then report its id.
database = cosmos_client.create_database_if_not_exists(cosmos_database)
print(f"Database with id '{cosmos_database}' created")

Database with id 'electronicsDB' created


In [155]:
from azure.cosmos.partition_key import PartitionKey

# Document fields that hold embedding vectors; both policies are built from this list.
embedding_paths = ["/EmbeddingCategory", "/EmbeddingName"]

indexing_policy = get_vector_indexing_policy(
    embedding_path=embedding_paths,
    embedding_type=["quantizedFlat" for _ in embedding_paths],
)

vector_embedding_policy = get_vector_embedding_policy(
    embedding_path=embedding_paths,
    distance_function=["cosine" for _ in embedding_paths],
    data_type=["float32" for _ in embedding_paths],
    dimensions=[1536 for _ in embedding_paths],
)

# Items are partitioned on their own id.
# NOTE(review): 30000 is passed as `offer_throughput` - presumably RU/s; confirm this
# (potentially costly) value is intended.
container = database.create_container_if_not_exists(
    id=cosmos_container,
    partition_key=PartitionKey(path="/id"),
    indexing_policy=indexing_policy,
    vector_embedding_policy=vector_embedding_policy,
    offer_throughput=30000,
)

## Load data and insert it into Cosmos DB

In [120]:
import pandas as pd

# Read the sales CSV (the path is relative to the current working directory)
# and preview the first rows.
sales_csv = "Online Sales Data.csv"
df = pd.read_csv(sales_csv)
df.head()

Unnamed: 0,Transaction ID,Date,Product Category,Product Name,Units Sold,Unit Price,Total Revenue,Region,Payment Method
0,10001,2024-01-01,Electronics,iPhone 14 Pro,2,999.99,1999.98,North America,Credit Card
1,10002,2024-01-02,Home Appliances,Dyson V11 Vacuum,1,499.99,499.99,Europe,PayPal
2,10003,2024-01-03,Clothing,Levi's 501 Jeans,3,69.99,209.97,Asia,Debit Card
3,10004,2024-01-04,Books,The Da Vinci Code,4,15.99,63.96,North America,Credit Card
4,10005,2024-01-05,Beauty Products,Neutrogena Skincare Set,1,89.99,89.99,Europe,PayPal


In [110]:
# Write one Cosmos DB item per CSV row, with embeddings for the product
# category and the product name.
# NOTE: `generate_embeddings` is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this one.
for _, row in df.iterrows():
    item = {
        'id': str(row['Transaction ID']),  # Ensure 'id' is a string (it is also the partition key)
        'Name': row['Product Name'],
        'Category': row['Product Category'],
        'Price': row['Total Revenue'],  # note: stores the row's Total Revenue, not Unit Price
        'PaymentMethod': row["Payment Method"],
        'EmbeddingCategory': generate_embeddings(row['Product Category']),
        'EmbeddingName': generate_embeddings(row['Product Name']),
    }
    # Upsert instead of create so that re-running this cell replaces items whose
    # ids already exist rather than failing on them.
    container.upsert_item(body=item)

In [156]:
# generate openai embeddings
def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the module-level ``azure_openai_embeddings`` client using
    the deployment named by ``openai_embeddings_deployment``, waits half a
    second, and returns the ``embedding`` field of the first entry in the
    response's ``data`` list.
    """
    payload = azure_openai_embeddings.embeddings.create(
        model=openai_embeddings_deployment,
        input=text,
    ).model_dump()
    # Fixed pause after every request - presumably crude rate limiting.
    time.sleep(0.5)
    first_result = payload['data'][0]
    return first_result['embedding']

## Perform vector search

In [159]:
# Embed the question, then fetch the 5 items ordered by the vector distance
# between their name embedding and the question embedding.
embeddings = generate_embeddings("Tell me some shoes that I can buy?")

# The vector is bound as the @embedding query parameter instead of being
# formatted into the SQL text.
query = (
    "SELECT TOP 5 c.Name, c.Category, c.Price, "
    "VectorDistance(c.EmbeddingName, @embedding) AS SimilarityScore "
    "FROM c ORDER BY VectorDistance(c.EmbeddingName, @embedding)"
)
items = list(
    container.query_items(
        query=query,
        parameters=[{"name": "@embedding", "value": embeddings}],
        enable_cross_partition_query=True,
    )
)

# Reshape each returned row into the lower-case keys shown in the output.
docs_and_scores = [
    {
        "name": item["Name"],
        "category": item["Category"],
        "price": item["Price"],
        "score": item["SimilarityScore"],
    }
    for item in items
]
docs_and_scores

[{'name': 'Adidas Ultraboost Shoes',
  'category': 'Clothing',
  'price': 359.98,
  'score': 0.4304136592895864},
 {'name': 'Adidas Ultraboost Running Shoes',
  'category': 'Clothing',
  'price': 359.98,
  'score': 0.40716167688055194},
 {'name': 'On Running Cloud Shoes',
  'category': 'Sports',
  'price': 259.98,
  'score': 0.3865027981056846},
 {'name': 'Nike Air Force 1 Sneakers',
  'category': 'Clothing',
  'price': 270,
  'score': 0.38127115146806323},
 {'name': 'Puma Suede Classic Sneakers',
  'category': 'Clothing',
  'price': 239.96,
  'score': 0.37861581307732417}]

## Filtered Vector Search

In [161]:
# Same vector search as above, restricted to items whose Price is below a
# threshold and limited to the first 2 results.
embeddings = generate_embeddings("Tell me some shoes that I can buy?")
max_price = 400

# Both the vector and the price threshold are bound as query parameters
# instead of being formatted into the SQL text.
query = (
    "SELECT c.Name, c.Category, c.Price, "
    "VectorDistance(c.EmbeddingName, @embedding) AS SimilarityScore "
    "FROM c WHERE c.Price < @max_price "
    "ORDER BY VectorDistance(c.EmbeddingName, @embedding) OFFSET 0 LIMIT 2"
)
items = list(
    container.query_items(
        query=query,
        parameters=[
            {"name": "@embedding", "value": embeddings},
            {"name": "@max_price", "value": max_price},
        ],
        enable_cross_partition_query=True,
    )
)

# Reshape each returned row into the lower-case keys shown in the output.
docs_and_scores = [
    {
        "name": item["Name"],
        "category": item["Category"],
        "price": item["Price"],
        "score": item["SimilarityScore"],
    }
    for item in items
]
docs_and_scores

[{'name': 'Adidas Ultraboost Shoes',
  'category': 'Clothing',
  'price': 359.98,
  'score': 0.4304136592895864},
 {'name': 'Adidas Ultraboost Running Shoes',
  'category': 'Clothing',
  'price': 359.98,
  'score': 0.40716167688055194}]