# Creation of an Azure Vector Database

A vector database is needed, together with its integration with the Azure Search service.

* Data is vectorized with the OpenAI ada embeddings model
* Selected fields from an existing JSON file are vectorized for the search
* The created index is uploaded to Azure

In [2]:
import os

import json  
import openai      # 1.3.3
from openai import AzureOpenAI
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  

# Blob
from azure.storage.blob import BlobServiceClient, ContentSettings

from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticField,  
    SearchField,  
    VectorSearch,  
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)  

from dotenv import load_dotenv  
import os
import logging
import json
import pandas as pd

In [3]:
# Configure environment variables  

In [4]:
# Load the settings file first so that every lookup below can see its values.
load_dotenv("pfd_conf.env")

# Blob storage connection string, and the endpoint handed to the search
# clients later in the notebook.
connect_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
service_endpoint = os.environ.get("AZURE_STORAGE_SERVICE_ENDPOINT")

# Target search index and the API key used to authenticate against it.
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_KEY")

# Embeddings model name (default model of generate_embeddings) and the
# key-based credential shared by the search clients.
model = os.environ.get("AZURE_EMBEDDINGS_MODEL")
credential = AzureKeyCredential(key)
# NOTE(review): endpoint and deployment are not referenced in the visible
# part of this notebook - confirm whether they are still needed.
endpoint = os.environ.get("AZUREOPENAI_ENDPOINT")
deployment = os.environ.get("AZUREOPENAI_DEPLOYMENT")


# Container holding the data, the blob with the source documents, and the
# blob that receives the documents once the vector fields are added.
container_name = "containername"
blob_name = "original.json"
blob_vectorname = "combinedVector.json"


In [6]:
# Azure OpenAI client used for embedding requests; the endpoint, API version
# and key are all taken from the environment.
client_embeddings = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_EMBEDDINGS_ENDPOINT"),
    api_version=os.environ.get("OPENAI_EMBEDDINGS_VERSION"),
    api_key=os.environ.get("OPENAI_EMBEDDINGS_KEY"),
)

In [7]:
# Function to manage azure blob storage 

def update_json_blob(data, name_blob):
    """Serialize ``data`` as JSON and upload it to the blob ``name_blob``.

    The blob lives in the container named by the module-level
    ``container_name`` and is reached through the ``connect_str``
    connection string. An existing blob with the same name is overwritten
    and the content type is set to ``application/json``.

    Args:
        data: JSON-serializable object to store.
        name_blob: Name of the destination blob.
    """
    service = BlobServiceClient.from_connection_string(connect_str)
    container = service.get_container_client(container_name)

    # ensure_ascii=False keeps non-ASCII characters unescaped; the text is
    # then turned into UTF-8 bytes for the upload.
    payload = json.dumps(data, ensure_ascii=False).encode("utf-8")

    target = container.get_blob_client(name_blob)
    target.upload_blob(
        payload,
        overwrite=True,
        content_settings=ContentSettings(content_type="application/json"),
    )


def read_json_blob(name_blob):
    """Download the blob ``name_blob`` and return its parsed JSON content.

    The blob is looked up in the container named by the module-level
    ``container_name`` through the ``connect_str`` connection string.

    Args:
        name_blob: Name of the blob to read.

    Returns:
        The decoded JSON value, or an empty list when the blob cannot be
        downloaded, decoded as UTF-8 or parsed as JSON.
    """
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(name_blob)

    try:
        downloaded_blob = blob_client.download_blob()
        return json.loads(downloaded_blob.readall().decode('utf-8'))
    except Exception as exc:
        # Still best-effort (callers get an empty list), but the cause is
        # reported, and KeyboardInterrupt/SystemExit are no longer swallowed
        # the way a bare ``except:`` would.
        logging.warning("Could not read JSON blob %r: %s", name_blob, exc)
        return []


In [None]:
# Generate embeddings for a given text

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, model=model):
    """Return the embedding of ``text`` computed by ``client_embeddings``.

    The call is wrapped by tenacity's ``retry`` with a randomized
    exponential wait (min=1, max=20) and stops after 6 attempts.

    Args:
        text: Text to embed; it is sent as a single-element input list.
        model: Embeddings model name; defaults to the module-level ``model``
            as it was when this function was defined.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    response = client_embeddings.embeddings.create(input=[text], model=model)
    first_entry = response.data[0]
    return first_entry.embedding

# Generate Embeddings Fields

These fields are generated from an already existing JSON file which contains:

* filename of the file
* Content of the file
* folder of the file



In [50]:
# NOTE(review): the result of this first read is discarded; it looks like a
# leftover or a connectivity check - confirm whether it is still needed.
_ = read_json_blob(blob_vectorname)
# Source documents; read_json_blob returns an empty list if the blob cannot be read.
input_data = read_json_blob(blob_name)


In [None]:
# Add the vector fields to every document in place. A document that cannot be
# processed is reported and skipped so that one bad record does not stop the
# whole run.
for item in input_data:
    try:
        # 'id' is not used for the embeddings, but a document without one is
        # still treated as failed, exactly as before.
        if 'id' not in item:
            raise KeyError('id')
        area = item['Area']
        # Collapse the content into one string and store it back; ''.join
        # leaves a plain string unchanged.
        content = ''.join(item['content'])
        item['content'] = content
        filename = item['filename']

        item['AreaVector'] = generate_embeddings(area, model)
        item['contentVector'] = generate_embeddings(content, model)
        item['filenameVector'] = generate_embeddings(filename, model)
    except Exception as exc:
        # .get() keeps the report itself from raising when 'filename' is the
        # missing key; the exception is included so the cause is visible.
        print(f"Failed: {item.get('filename')}: {exc!r}")


In [53]:
# Store the documents, including the vector fields added to them, in the vector blob.
update_json_blob(input_data, blob_vectorname)

# Creation of the vectorized index on Azure Search

In [8]:
# Client used to create and update index definitions on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: a string key, three text fields, and one vector field for each
# of the text fields. The vector fields differ only by name, so they are built
# from a single template.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(
        name="Area",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True,
        sortable=True,
    ),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(
        name="filename",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True,
        sortable=True,
    ),
    *[
        SearchField(
            name=vector_field,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile",
        )
        for vector_field in ("AreaVector", "contentVector", "filenameVector")
    ],
]

# Two algorithm configurations, each exposed through its own profile. The
# vector fields above all reference the HNSW profile.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=10,
                ef_construction=1000,
                ef_search=900,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.DOT_PRODUCT,
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
        ),
    ],
)

# Semantic configuration: "Area" is the title field, "content" and "filename"
# are the content fields.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="Area"),
        content_fields=[
            SemanticField(field_name="content"),
            SemanticField(field_name="filename"),
        ],
    ),
)

# Semantic settings wrapping the single configuration defined above.
semantic_search = SemanticSearch(configurations=[semantic_config])



In [9]:
# Put the schema and the vector/semantic settings together into one index
# definition, send it to the service, and print the name it reports back.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 ghcvindex created


# Uploading the documents to the Azure Search index

In [10]:
# Read the documents to upload back from the vector blob.
documents = read_json_blob(blob_vectorname)

In [57]:
# The index declares "id" as a string field, so every id is converted to str.
# Documents that have no id get the key set to None.
for item in documents:
    if 'id' in item:
        item['id'] = str(item['id'])
    else:
        item['id'] = None

In [59]:
# Index upload

# search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
#result = search_client.upload_documents(documents)
#print(f"Uploaded {len(documents)} documents") 


In [None]:
from azure.core.exceptions import AzureError


In [13]:
# Index upload by batch
#
# Documents are sent in slices of `batch_size`. When the service rejects a
# slice as too large the size is halved and the same slice is retried. Any
# other Azure error, or a single document that is still too large, is reported
# and re-raised instead of being retried forever.

batch_size = 100  # Initial batch size; adjust according to your needs

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
total_documents = len(documents)
uploaded_documents = 0

while uploaded_documents < total_documents:
    batch = documents[uploaded_documents:uploaded_documents + batch_size]
    try:
        result = search_client.upload_documents(batch)
    except AzureError as e:
        # Halving is only possible above 1; going down to 0 would produce an
        # empty batch and the loop would never advance.
        if "Request Entity Too Large" in str(e) and batch_size > 1:
            batch_size //= 2
            print(f"Batch size reduced to {batch_size} due to 'RequestEntityTooLargeError'")
            continue
        # Retrying the same batch without changing anything would loop
        # endlessly, so report the error and let it propagate.
        print(f"An Azure error occurred: {e}")
        raise

    # A successful request can still contain documents that were not indexed;
    # report their keys so the failure does not go unnoticed.
    rejected = [r.key for r in result if not r.succeeded]
    if rejected:
        print(f"{len(rejected)} documents in this batch were not indexed: {rejected}")

    uploaded_documents += len(batch)
    print(f"Uploaded {uploaded_documents} documents out of {total_documents}")

Uploaded 100 documents out of 4855
Uploaded 200 documents out of 4855
Uploaded 300 documents out of 4855
Uploaded 400 documents out of 4855
Uploaded 500 documents out of 4855
Uploaded 600 documents out of 4855
Uploaded 700 documents out of 4855
Uploaded 800 documents out of 4855
Uploaded 900 documents out of 4855
Uploaded 1000 documents out of 4855
Uploaded 1100 documents out of 4855
Uploaded 1200 documents out of 4855
Uploaded 1300 documents out of 4855
Uploaded 1400 documents out of 4855
Uploaded 1500 documents out of 4855
Uploaded 1600 documents out of 4855
Uploaded 1700 documents out of 4855
Uploaded 1800 documents out of 4855
Uploaded 1900 documents out of 4855
Uploaded 2000 documents out of 4855
Uploaded 2100 documents out of 4855
Uploaded 2200 documents out of 4855
Uploaded 2300 documents out of 4855
Uploaded 2400 documents out of 4855
Uploaded 2500 documents out of 4855
Uploaded 2600 documents out of 4855
Uploaded 2700 documents out of 4855
Uploaded 2800 documents out of 4855
U