
### Ingestion to COSMOSDB 


#### Libraries

In [None]:
! pip install numpy
! pip install openai
! pip install python-dotenv
! pip install azure-core
! pip install azure-cosmos
! pip install tenacity

#### Enviroment Variables

In [1]:
import json
import datetime
import time

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import exceptions, CosmosClient, PartitionKey
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    IndexingSchedule,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SimpleField,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SearchIndexerDataSourceConnection
)

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

import os
from dotenv import load_dotenv


In [None]:

load_dotenv()

cosmos_db_api_endpoint  = os.getenv("cosmos_db_api_endpoint")
if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == "":
    print("cosmos_db_api_endpoint environment variable not set.")
    exit()

cosmos_db_api_key  = os.getenv("cosmos_db_api_key")
if cosmos_db_api_key is None or cosmos_db_api_key == "":
    print("cosmos_db_api_key environment variable not set.")
    exit()

cog_search_endpoint  = os.getenv("cog_search_endpoint")
if cog_search_endpoint is None or cog_search_endpoint == "":
    print("cog_search_endpoint environment variable not set.")
    exit()

cog_search_key  = os.getenv("cog_search_key")
if cog_search_key is None or cog_search_key == "":
    print("cog_search_key environment variable not set.")
    exit()

aoai_embedding_deployed_model  = os.getenv("aoai_embedding_deployed_model")
if aoai_embedding_deployed_model is None or aoai_embedding_deployed_model == "":
    print("aoai_embedding_deployed_model environment variable not set.")
    exit()   


aoai_key  = os.getenv("aoai_key")
if aoai_key is None or aoai_key == "":
    print("aoai_key environment variable not set.")
    exit()   



text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'





### Cosmos ( Nosql)

#### Opening the connections.


In [68]:
from azure.cosmos import CosmosClient

# Your Cosmos DB connection parameters
database_name = "Vector_DB" #####Replace here the name you want to use for your Database####

# Initialize the Cosmos DB client
client = CosmosClient(cosmos_db_api_endpoint, credential=cosmos_db_api_key)

# Create or get a reference to the database
database = client.create_database_if_not_exists(id=database_name)

print(f"Database {database_name} created or retrieved.")



Database Vector_DB created or retrieved.


####  Initialize the Cosmos DB client - Creating containers


In [69]:

def create_or_update_container(container_name, partition_key_path):
    container_definition = {
        "id": container_name,
        "partitionKey": {"paths": [f"/{partition_key_path}"], "kind": "Hash"},
    }
    ###Update here the througput as needed
    options = {"offerThroughput": 400}
    
    # Create or update container
    container = database.create_container_if_not_exists(
        container_definition, partition_key=PartitionKey(path=container_definition["partitionKey"]["paths"][0]), offer_throughput=options["offerThroughput"]
    )

    print(f"Container {container_name} created with partition key: {partition_key_path}")



try:
    result = create_or_update_container("text_sample", "id")
    print(f"Index created or updated successfully: {result}")
    result = create_or_update_container("doc_sample", "id")
    print(f"Index created or updated successfully: {result}")
    result = create_or_update_container("image_sample", "id")
    print(f"Index created or updated successfully: {result}")
except Exception as e:
    print(f"Error creating or updating index: {e}")




Container text_sample created with partition key: id
Index created or updated successfully: None
Container doc_sample created with partition key: id
Index created or updated successfully: None
Container image_sample created with partition key: id
Index created or updated successfully: None


#### Ingest text sample with embeddings

##### Creating function for ingestion of data

In [71]:

## this adaptation, the script uses the Cosmos DB Python SDK to create items in the Cosmos DB container. 
# The to_dict(orient='records') method is used to convert the Pandas DataFrame to a list of dictionaries, where each dictionary represents a record
# Function to insert data into Cosmos DB
def insert_data(container, records):
    try:
        for record in records:
            # Convert the 'id' attribute to a string
            record['id'] = str(record['id'])
            container.create_item(body=record)
        print(f"Document {container} inserted successfully")

    except exceptions.CosmosResourceExistsError as e:
            # Handle conflict error
            print(f"Document {container}  with ID {record['id']} already exists...")
            print(f"Error: {e}")

            # Implement your logic to update the existing document or take appropriate action

    except Exception as e:
            # Handle other exceptions
            print(f"Error: {e}")


   

#####  Text ingestion

In [None]:
import pandas as pd
from azure.cosmos import CosmosClient


cosmosdb_container_name = text_table_name
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
text_df = pd.read_json('../data/text/product_docs_embeddings.json')
records = text_df.to_dict(orient='records')


# Insert data into text_sample container
insert_data(container, records)




#####  Doc ingestion

In [None]:
cosmosdb_container_name = doc_table_name
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
doc_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')
records = doc_df.to_dict(orient='records')


# Insert data into doc_sample container
insert_data(container, records)




#####  Image ingestion

In [None]:
cosmosdb_container_name = image_table_name
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
doc_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')
records = doc_df.to_dict(orient='records')


# Insert data into doc_sample container
insert_data(container, records)




#### Checking the data inserted (optional)


In [None]:

# Specify the container name
Table_name = "text_sample"
container = database.get_container_client(Table_name)

#Number of rows - Top 10 for example
top_x_rows = 10

print(f"Quality test  top ( {top_x_rows} )")

query = f"SELECT TOP {top_x_rows} * FROM c"

# Execute the query
query_result = container.query_items(query, enable_cross_partition_query=True)

# Process the query result
for item in query_result:
    print(item)


#### Create HSNW Index



In [None]:
from azure.core.credentials import AzureKeyCredential  
from azure.core.exceptions import HttpResponseError

cog_search_cred = AzureKeyCredential(cog_search_key)  # https://learn.microsoft.com/en-gb/azure/search/search-create-service-portal

try:
    index_client = SearchIndexClient(endpoint=cog_search_endpoint, credential=cog_search_cred)


    # Define index fields for text_sample
    text_index_name = "hsnw_sample_index_vector_search"
    text_fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchableField(name="content", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchableField(name="category", type=SearchFieldDataType.String,
                        filterable=True, searchable=True, retrievable=True),
        SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),
        SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),
    ]

    # Define index fields for doc_sample
    doc_index_name = "doc_sample_index"
    doc_fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="chunk_content", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchField(name="chunk_content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),        
    ]

    # Define index fields for image_sample
    image_index_name = "image_sample_index"
    image_fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="image", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchField(name="image_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),        
    ]
    

    # Your existing vector search configuration
    vector_search_config = VectorSearchAlgorithmConfiguration(
        name="vector-cosmos-config",
        kind="hnsw",
        hnsw_parameters={
            "m": 4,
            "efConstruction": 400,
            "efSearch": 1000,
            "metric": "cosine"
        }
    )
    # Create indexes
    text_index = SearchIndex(name=text_index_name, fields=text_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]))
    doc_index = SearchIndex(name=doc_index_name, fields=doc_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]))
    image_index = SearchIndex(name=image_index_name, fields=image_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]))


    # Create or update indexes
    #index_client.create_or_update_index(text_index)
    print(f'Indexes created or updated: {text_index_name}')
    
    index_client.create_or_update_index(doc_index)
    print(f'Indexes created or updated: {doc_index_name}')

    index_client.create_or_update_index(image_index)
    print(f'Indexes created or updated: {image_index_name}')

    
    
except HttpResponseError as e:
    print(f"HTTP Error: {e}")
    print(f"Status Code: {e.status_code}")
    print(f"Error Message: {e.error.message}")



