
### Ingestion to COSMOSDB 


#### Libraries

In [None]:
! pip install numpy
! pip install openai
! pip install python-dotenv
! pip install azure-core 
! pip install azure-cosmos
! pip install tenacity
! pip install azure-search-documents


#### Libraries

In [44]:
import json
import datetime
import time

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import exceptions, CosmosClient, PartitionKey
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    IndexingSchedule,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SimpleField,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SearchIndexerDataSourceConnection
)

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

import pandas as pd
from azure.cosmos import CosmosClient, partition_key, exceptions

import os
from dotenv import load_dotenv


#### Environment Variables

In [None]:

load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`, stopping if it is unset.

    When the variable is missing or empty this prints
    "<name> environment variable not set." and calls exit(), so the message
    always names the variable that is actually missing.
    """
    value = os.getenv(name)
    if value is None or value == "":
        print(f"{name} environment variable not set.")
        exit()
    return value


# Cosmos DB endpoint and key
cosmos_db_api_endpoint = _require_env("cosmos_db_api_endpoint")
cosmos_db_api_key = _require_env("cosmos_db_api_key")

# Cognitive Search endpoint and key
cog_search_endpoint = _require_env("cog_search_endpoint")
cog_search_key = _require_env("cog_search_key")

# Cosmos DB connection string (previously reported under the wrong variable name)
cosmos_db_connection_string = _require_env("cosmos_db_connection_string")

# Names of the two Cosmos DB containers used in this notebook
text_table_name = 'text_sample'
doc_table_name = 'doc_sample'

database_name = "Vector_DB"  # Replace with the name you want to use for your database
credential = AzureKeyCredential(str(cog_search_key))


### Cosmos DB (NoSQL)

#### Opening the connections.


In [None]:
from azure.cosmos import CosmosClient

# Connect to the Cosmos DB account using the endpoint and key read from the environment
client = CosmosClient(cosmos_db_api_endpoint, credential=cosmos_db_api_key)

# Fetch the database, creating it first when it does not exist yet
database = client.create_database_if_not_exists(database_name)

status_message = f"Database {database_name} created or retrieved."
print(status_message)



####  Initialize the Cosmos DB client - Creating containers


##### Function for new Container

In [66]:

# Helper that makes sure a Cosmos DB container exists before items are written to it.
def new_container(container):
    """Create the Cosmos DB container named `container` if it does not exist.

    The container is created in the module-level `database` and is
    partitioned on "/id".

    Parameters
    ----------
    container : str
        Name (id) of the container to create or retrieve.

    Returns
    -------
    The object returned by `database.create_container_if_not_exists`, or
    None when the call failed (the error is printed, not raised).
    """
    try:
        # Keep the result in its own variable so the `container` name is not
        # overwritten and the message below can report it.
        container_client = database.create_container_if_not_exists(
            id=container,
            partition_key=PartitionKey(path="/id"),
        )
    except exceptions.CosmosResourceExistsError:
        print("Container already exists.")
        return None
    except Exception as e:
        # Best effort: report the problem instead of raising
        print(f"Error: {e}")
        return None

    print(f"Container {container} created or retrieved.")
    return container_client


   

#####  Text ingestion

In [None]:


cosmosdb_container_name = text_table_name

# Make sure the container exists before requesting a client for it
new_container(cosmosdb_container_name)
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
text_df = pd.read_json('../data/text/product_docs_embeddings.json')
records = text_df.to_dict(orient='records')

# Write every record (with its embeddings) into the container.
# upsert_item keeps the cell re-runnable: an item whose id already exists is
# replaced instead of raising a conflict. Errors are handled per item so one
# bad record no longer stops the remaining records from being written.
written_count = 0
failed_ids = []
for item in records:
    try:
        # Also store the embeddings under camelCase names
        item['titleVector'] = item['title_vector']
        item['contentVector'] = item['content_vector']
        item['@search.action'] = 'upload'

        # Convert the 'id' attribute to a string
        item['id'] = str(item['id'])

        container.upsert_item(body=item)
        written_count += 1

    except Exception as e:
        # Report the failing record and continue with the next one
        failed_ids.append(item.get('id'))
        print(f"Error writing item with ID {item.get('id')} to {cosmosdb_container_name}: {e}")

print(f"{written_count} of {len(records)} data items written into the Cosmos DB {cosmosdb_container_name}")
if failed_ids:
    print(f"Items that could not be written: {failed_ids}")



#####  Doc ingestion

In [None]:
cosmosdb_container_name = doc_table_name

# Make sure the container exists before requesting a client for it
new_container(cosmosdb_container_name)
container = database.get_container_client(cosmosdb_container_name)

# Read data from the JSON file
doc_df = pd.read_json('../data/docs/employee_handbook_embeddings.json')
records = doc_df.to_dict(orient='records')

# Write every record (with its embeddings) into the container.
# upsert_item keeps the cell re-runnable: an item whose id already exists is
# replaced instead of raising a conflict. Errors are handled per item so one
# bad record no longer stops the remaining records from being written.
written_count = 0
failed_ids = []
for item in records:
    try:
        # 'chunk_content_vector' is already present in the record under the
        # name used here, so it needs no copying.
        item['@search.action'] = 'upload'

        # Convert the 'id' attribute to a string
        item['id'] = str(item['id'])

        container.upsert_item(body=item)
        written_count += 1

    except Exception as e:
        # Report the failing record and continue with the next one
        failed_ids.append(item.get('id'))
        print(f"Error writing item with ID {item.get('id')} to {cosmosdb_container_name}: {e}")

print(f"{written_count} of {len(records)} data items written into the Cosmos DB {cosmosdb_container_name}")
if failed_ids:
    print(f"Items that could not be written: {failed_ids}")







#### Checking the data inserted (optional)


In [None]:

# Container whose contents are inspected
Table_name = "text_sample"
container = database.get_container_client(Table_name)

# How many rows to fetch (10 as an example)
top_x_rows = 10

print(f"Quality test  top ( {top_x_rows} )")

# Build the TOP-N query and run it across all partitions
query = f"SELECT TOP {top_x_rows} * FROM c"
query_result = container.query_items(
    query,
    enable_cross_partition_query=True,
)

# Print each returned document
for item in query_result:
    print(item)


#### Create HNSW Index



In [None]:

# https://learn.microsoft.com/en-gb/azure/search/search-create-service-portal

# Semantic configuration for the text index: title, keywords and content fields
config_text = SemanticConfiguration(
    name="ConfigSemantictext",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="title"),
        prioritized_keywords_fields=[SemanticField(field_name="category")],
        prioritized_content_fields=[SemanticField(field_name="content")]
    )
)

# Semantic configuration for the doc index: only the chunk text is available
config_doc = SemanticConfiguration(
    name="ConfigSemanticdoc",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="chunk_content")
    )
)

# Wrap each configuration in the settings object expected by SearchIndex
settings_text = SemanticSettings(configurations=[config_text])
settings_doc = SemanticSettings(configurations=[config_doc])

# Vector search configuration (HNSW, cosine metric); the vector fields below
# refer to it by name.
vector_search_config = VectorSearchAlgorithmConfiguration(
    name="vector-cosmos-config",
    kind="hnsw",
    hnsw_parameters={
        "m": 4,
        "efConstruction": 400,
        "efSearch": 1000,
        "metric": "cosine"
    }
)

try:
    # Use the AzureKeyCredential built in the environment cell (`credential`);
    # the name previously used here was never defined and raised a NameError.
    index_client = SearchIndexClient(endpoint=cog_search_endpoint, credential=credential)

    # Define index fields for text_sample
    text_index_name = "text_sample_index"
    text_fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchableField(name="content", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchableField(name="category", type=SearchFieldDataType.String,
                        filterable=True, searchable=True, retrievable=True),
        SearchField(name="title_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),
        SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),
    ]

    # Define index fields for doc_sample
    doc_index_name = "doc_sample_index"
    doc_fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="chunk_content", type=SearchFieldDataType.String,
                        searchable=True, retrievable=True),
        SearchField(name="chunk_content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, dimensions=1536, vector_search_configuration="vector-cosmos-config"),
    ]

    # Build the index definitions
    text_index = SearchIndex(name=text_index_name, fields=text_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]), semantic_settings=settings_text)
    doc_index = SearchIndex(name=doc_index_name, fields=doc_fields, vector_search=VectorSearch(algorithm_configurations=[vector_search_config]), semantic_settings=settings_doc)

    # Create or update indexes
    index_client.create_or_update_index(text_index)
    print(f'Indexes created or updated: {text_index_name}')

    index_client.create_or_update_index(doc_index)
    print(f'Indexes created or updated: {doc_index_name}')

except HttpResponseError as e:
    print(f"HTTP Error: {e}")
    print(f"Status Code: {e.status_code}")
    # Guard against e.error being None (or lacking a message) so the handler
    # itself cannot fail; fall back to the exception's own message.
    error_message = getattr(e.error, "message", None) or e.message
    print(f"Error Message: {error_message}")





### Create the Azure Cognitive Search index
##### Data source function
##### Indexer Function


In [71]:


def create_datasource(data_source_name, cosmos_db_connection_string, table_name, indexer_client):
    """Create or update a Cosmos DB data source connection for a search indexer.

    Parameters
    ----------
    data_source_name : str
        Name given to the data source connection.
    cosmos_db_connection_string : str
        Connection string passed to the data source connection.
    table_name : str
        Name of the Cosmos DB container to read from.
    indexer_client
        Client exposing `create_or_update_data_source_connection`
        (a SearchIndexerClient in this notebook).

    Returns
    -------
    The object returned by `create_or_update_data_source_connection`, or None
    when the service rejected the request (the error is printed, not raised).
    """
    try:
        container_cosmos = SearchIndexerDataContainer(
            name=table_name,
            # The container is already selected by `name`, so the query only
            # needs an alias. `>=` matches the documented incremental-indexing
            # query, so items sharing the high-water-mark timestamp are kept.
            query="SELECT * FROM c WHERE c._ts >= @HighWaterMark ORDER BY c._ts"
        )

        # Define the data source connection
        data_source_connection = SearchIndexerDataSourceConnection(
            name=data_source_name,
            type="cosmosdb",
            connection_string=cosmos_db_connection_string,
            container=container_cosmos
        )

        # Create or update the data source connection and hand it back to the caller
        return indexer_client.create_or_update_data_source_connection(data_source_connection)

    except HttpResponseError as ex:
        print(f"Error: {ex}")
        return None



In [72]:
def create_indexer_if_not_exists(
    indexer_name, target_index_name, data_source_name, indexer_client
):
    """Create or update a search indexer and then start a run of it.

    The indexer named `indexer_name` reads from `data_source_name` and writes
    into `target_index_name`. An HttpResponseError raised by either client
    call is printed instead of being propagated.
    """
    try:
        # Describe the indexer, register it, then trigger it by name
        indexer_definition = SearchIndexer(
            name=indexer_name,
            data_source_name=data_source_name,
            target_index_name=target_index_name,
        )
        indexer_client.create_or_update_indexer(indexer_definition)
        indexer_client.run_indexer(indexer_name)
    except HttpResponseError as ex:
        print(f"Error: {ex}")




### Creating the datasource and index from the functions already defined
##### Indexers to crawl data from various data sources and insert them into indexes
##### Data Sources that connect Azure Cognitive Search to Cosmos

In [73]:
# A Cognitive Search service must already exist for this client to connect to
indexer_client = SearchIndexerClient(cog_search_endpoint, credential)

# One entry per pipeline: (data source name, Cosmos DB container, target search index)
for data_source_name, table_name, target_index_name_ in (
    ('textsample', text_table_name, text_index_name),
    ('docsample', doc_table_name, doc_index_name),
):
    # Each indexer shares its name with the data source it reads from
    indexer_name = data_source_name

    create_datasource(data_source_name, cosmos_db_connection_string, table_name, indexer_client)
    create_indexer_if_not_exists(indexer_name, target_index_name_, data_source_name, indexer_client)

