
## Ingestion to Azure Cosmos DB NoSQL


#### Pip 
##### (Optional: you can install the dependencies from requirements.txt or a YAML environment file instead of running the pip install cell)

In [None]:
##You can also use the requirements.txt or yml file.
! pip install numpy
! pip install python-dotenv
! pip install azure-core 
! pip install azure-cosmos
! pip install tenacity
! pip install azure-search-documents===11.4.0
! pip install pandas
! pip install openai==0.28.1
! pip install matplotlib
! pip install plotly
! pip install plotly
! pip install scikit-learn
! pip install scipy
! pip install Pyarrow 

#### Libraries

In [3]:
import json
import datetime
import time

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import exceptions, CosmosClient, PartitionKey
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    IndexingSchedule,
    SemanticSearch, 
    SemanticPrioritizedFields,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SimpleField,
    SemanticField,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SearchIndexerDataSourceConnection,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchProfile,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters
)

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
import pandas as pd

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

import pandas as pd
from azure.cosmos import CosmosClient, partition_key, exceptions

import os
from dotenv import load_dotenv


#### Environment Variables

In [None]:

load_dotenv()


def _require_env(var_name):
    """Return the value of the environment variable ``var_name``.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The non-empty string value of the variable.

    Raises:
        RuntimeError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(var_name)
    if not value:
        # Raise instead of print + exit(): the error stops this cell (and a
        # "Run All") immediately and names the missing variable.
        raise RuntimeError(f"{var_name} environment variable not set.")
    return value


## Cosmos DB endpoint. Format: https://<nameofyourcosmosservice>.documents.azure.com
cosmos_db_api_endpoint = _require_env("cosmos_db_api_endpoint")

## Cosmos DB API key
cosmos_db_api_key = _require_env("cosmos_db_api_key")

## Cosmos DB connection string. Format:
## AccountEndpoint=https://<nameoftheservice>.documents.azure.com;AccountKey=<value of the key>;Database=<name of the database, suggested here Vector_DB>;
cosmos_db_connection_string = _require_env("cosmos_db_connection_string")

## Cognitive Search service endpoint; the service must already be deployed.
## Format: https://<nameoftheservice>.search.windows.net
cog_search_endpoint = _require_env("cog_search_endpoint")

## Cognitive Search service key
cog_search_key = _require_env("cog_search_key")

## Azure OpenAI endpoint (api_base); the service must already be deployed.
## Format: https://nameoftheservice.azure.com/
aoai_endpoint = _require_env("AOAI_ENDPOINT")

## Azure OpenAI API version. This notebook was built with "2023-05-15".
aoai_api_version = _require_env("AOAI_API_VERSION")

## Deployed Azure OpenAI embedding model, e.g. "text-embedding-ada-002"
aoai_embedding_deployed_model = _require_env("AOAI_EMBEDDING_DEPLOYED_MODEL")

## Azure OpenAI service key
azure_openai_key = _require_env("AZURE_OPENAI_KEY")

## Container names for the Cosmos DB database
text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'

database_name = "Vector_DB"

## Credential used by the Cognitive Search clients
credential = AzureKeyCredential(cog_search_key)
openai.api_type = "azure"


### Cosmos DB (NoSQL)

#### Opening the connections.


In [None]:
from azure.cosmos import CosmosClient

# Connect to the Cosmos DB account with the endpoint and key read from the environment.
client = CosmosClient(cosmos_db_api_endpoint, credential=cosmos_db_api_key)

# Get a handle to the database, creating it when it is not there yet.
database = client.create_database_if_not_exists(id=database_name)
print(f"Database {database_name} created or retrieved.")



####  Initialize the Cosmos DB client - Creating containers


##### Function for the New Container

In [5]:

## Uses the Cosmos DB Python SDK to make sure a container exists before items are written to it.
def new_container(container):
    """Create the Cosmos DB container named ``container`` if it does not exist.

    The container is created in the module-level ``database`` and is
    partitioned on ``/id``.

    Args:
        container: Name (id) of the container to create or retrieve.

    Returns:
        The object returned by ``database.create_container_if_not_exists``,
        or ``None`` when an error was caught and reported.
    """
    try:
        # Keep the result in its own name so the ``container`` argument (the
        # container name) is still available for the message below.
        container_client = database.create_container_if_not_exists(
            id=container,
            partition_key=PartitionKey(path="/id"),
        )
        print(f"Container {container} created or already exists")
        return container_client

    except exceptions.CosmosResourceExistsError:
        print("Container already exists.")

    except Exception as e:
        # Best effort: report any other failure and let the notebook continue.
        print(f"Error: {e}")

    return None


   

#####  Text ingestion

In [None]:
import pandas as pd

cosmosdb_container_name = text_table_name

# Make sure the container exists, then get a client for it
new_container(cosmosdb_container_name)
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
text_df = pd.read_json('../data/text/product_docs_embeddings.json')
records = text_df.to_dict(orient='records')

# Insert every record (including its embedding vectors) into the container.
# Conflicts are handled per item so that re-running this cell skips documents
# that already exist instead of aborting at the first duplicate.
inserted_count = 0
skipped_count = 0
try:
    for item in records:
        # NOTE(review): '@search.action' is stored as an ordinary property of
        # the Cosmos DB document; kept so the stored documents stay unchanged.
        item['@search.action'] = 'upload'

        # Cosmos DB document ids are strings
        item['id'] = str(item['id'])

        try:
            container.create_item(body=item)
            inserted_count += 1
        except exceptions.CosmosResourceExistsError:
            skipped_count += 1

    print(
        f"{inserted_count} data items inserted into the Cosmos DB {cosmosdb_container_name} "
        f"({skipped_count} already existed and were skipped)"
    )

except Exception as e:
    # Any other failure (connectivity, authorization, ...) stops the ingestion
    print(f"Error: {e}")



#####  Doc ingestion

In [None]:
cosmosdb_container_name = doc_table_name

# Make sure the container exists, then get a client for it
new_container(cosmosdb_container_name)
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
doc_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')
records = doc_df.to_dict(orient='records')

# Insert every record (including its embedding vector) into the container.
# Conflicts are handled per item so that re-running this cell skips documents
# that already exist instead of aborting at the first duplicate.
inserted_count = 0
skipped_count = 0
try:
    for item in records:
        # NOTE(review): '@search.action' is stored as an ordinary property of
        # the Cosmos DB document; kept so the stored documents stay unchanged.
        item['@search.action'] = 'upload'

        # Cosmos DB document ids are strings
        item['id'] = str(item['id'])

        try:
            container.create_item(body=item)
            inserted_count += 1
        except exceptions.CosmosResourceExistsError:
            skipped_count += 1

    print(
        f"{inserted_count} data items inserted into the Cosmos DB {cosmosdb_container_name} "
        f"({skipped_count} already existed and were skipped)"
    )

except Exception as e:
    # Any other failure (connectivity, authorization, ...) stops the ingestion
    print(f"Error: {e}")


#### Checking the data inserted (optional)


In [None]:

# Container to sample from
Table_name = "text_sample"
container = database.get_container_client(Table_name)

# How many rows to fetch (TOP N), e.g. 10
top_x_rows = 10
query = f"SELECT TOP {top_x_rows} * FROM c"

print(f"Quality test  top ( {top_x_rows} )")

# Run the query across partitions and print each returned document
query_result = container.query_items(query, enable_cross_partition_query=True)
for item in query_result:
    print(item)


#### Create HNSW Index



In [None]:

# Vector search configuration (written against azure-search-documents 11.4.0).
# Each vector field references a profile by name, and each profile references
# one of the algorithm configurations below by name.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="hsnw_config",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="ExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="HnswProfile",
            algorithm_configuration_name="hsnw_config",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="ExhaustiveKnn",
        ),
    ],
)

# Index names are defined outside the try block because the indexer cell
# further down reads them even if index creation reported an HTTP error.
text_index_name = "text_sample_index"
doc_index_name = "doc_sample_index"

# Index fields for text_sample
text_fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String, searchable=True, retrievable=True),
    SearchableField(name="content", type=SearchFieldDataType.String, searchable=True, retrievable=True),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True, searchable=True, retrievable=True),
    # Vector fields are Collection(Edm.Single) and need both a dimension
    # count and a vector search profile.
    SearchField(
        name="title_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,  # must match the embedding size; adjust as needed
        vector_search_profile_name="HnswProfile",
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,  # must match the embedding size; adjust as needed
        vector_search_profile_name="HnswProfile",
    ),
]

# Index fields for doc_sample
doc_fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    # chunk_content holds text, so it is declared as a string field
    # (it was previously declared as Edm.Single).
    SearchableField(name="chunk_content", type=SearchFieldDataType.String,
                    searchable=True, retrievable=True),
    SearchField(
        name="chunk_content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,  # must match the embedding size; adjust as needed
        vector_search_profile_name="HnswProfile",
    ),
]

# Semantic search configuration
config_text = SemanticConfiguration(
    name="ConfigSemantictext",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[SemanticField(field_name="category")],
        content_fields=[SemanticField(field_name="content")],
    ),
)

config_doc = SemanticConfiguration(
    name="ConfigSemanticdoc",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="chunk_content"),
    ),
)

# Wrap each semantic configuration in the index-level semantic settings
settings_text = SemanticSearch(configurations=[config_text])
settings_doc = SemanticSearch(configurations=[config_doc])

# Index definitions
text_index = SearchIndex(name=text_index_name, fields=text_fields, vector_search=vector_search, semantic_search=settings_text)
doc_index = SearchIndex(name=doc_index_name, fields=doc_fields, vector_search=vector_search, semantic_search=settings_doc)

try:
    index_client = SearchIndexClient(endpoint=cog_search_endpoint, credential=credential)

    # Create or update indexes
    index_client.create_or_update_index(text_index)
    print(f'Indexes created or updated: {text_index_name}')
    index_client.create_or_update_index(doc_index)
    print(f'Indexes created or updated: {doc_index_name}')

except HttpResponseError as e:
    print(f"HTTP Error: {e}")
    print(f"Status Code: {e.status_code}")
    # e.error can be None; fall back to the exception's own message in that
    # case instead of raising AttributeError.
    error_message = e.error.message if e.error is not None else e.message
    print(f"Error Message: {error_message}")




### Create the Azure Cognitive Search index
##### Data source Function
##### Indexer Function


In [10]:

def create_datasource(data_source_name, cosmos_db_connection_string, table_name, indexer_client):
    """Create or update a Cosmos DB data source connection for a search indexer.

    Args:
        data_source_name: Name given to the data source connection.
        cosmos_db_connection_string: Cosmos DB connection string used by the
            data source.
        table_name: Name of the Cosmos DB container to read from.
        indexer_client: Client whose
            ``create_or_update_data_source_connection`` method is called.

    Returns:
        The value returned by ``create_or_update_data_source_connection``, or
        ``None`` when an ``HttpResponseError`` was caught and printed.
    """
    try:
        # Incremental query ordered by the _ts timestamp. ">=" follows the
        # form shown in the Azure Cosmos DB indexer documentation; a strict
        # ">" could skip documents that share the checkpoint timestamp.
        container_cosmos = SearchIndexerDataContainer(
            name=table_name,
            query=f"SELECT * FROM {table_name} c WHERE c._ts >= @HighWaterMark ORDER BY c._ts"
        )

        # Define the data source connection
        data_source_connection = SearchIndexerDataSourceConnection(
            name=data_source_name,
            type="cosmosdb",
            connection_string=cosmos_db_connection_string,
            container=container_cosmos
        )

        # Create or update the data source connection
        return indexer_client.create_or_update_data_source_connection(data_source_connection)

    except HttpResponseError as ex:
        print(f"Error: {ex}")
        return None



In [11]:
def create_indexer_if_not_exists(
    indexer_name, target_index_name, data_source_name, indexer_client
):
    """Create or update a search indexer and start a run of it.

    Args:
        indexer_name: Name of the indexer to create or update, and then run.
        target_index_name: Name of the index the indexer writes to.
        data_source_name: Name of the data source the indexer reads from.
        indexer_client: Client providing ``create_or_update_indexer`` and
            ``run_indexer``.

    An ``HttpResponseError`` raised along the way is printed, not re-raised.
    """
    try:
        # Describe the indexer: which data source feeds which index
        indexer_definition = SearchIndexer(
            name=indexer_name,
            data_source_name=data_source_name,
            target_index_name=target_index_name,
        )

        # Register (or update) the indexer, then trigger a run
        indexer_client.create_or_update_indexer(indexer_definition)
        indexer_client.run_indexer(indexer_name)

    except HttpResponseError as ex:
        print(f"Error: {ex}")




### Creating the datasource and index from the functions already defined
##### Indexers to crawl data from the data source and insert them into the indexes
##### Data Source that connect Azure Cognitive Search to Cosmos NoSQL

In [14]:
## A Cognitive Search service must already exist for this client to connect to
indexer_client = SearchIndexerClient(cog_search_endpoint, credential)

## One (data source name, Cosmos DB container, target index) triple per sample set.
## The indexer reuses the data source name.
for data_source_name, table_name, target_index_name_ in (
    ('textsample', text_table_name, text_index_name),
    ('docsample', doc_table_name, doc_index_name),
):
    indexer_name = data_source_name

    create_datasource(data_source_name, cosmos_db_connection_string, table_name, indexer_client)
    create_indexer_if_not_exists(indexer_name, target_index_name_, data_source_name, indexer_client)

