# RAG with Azure AI Search
#### IMPORTANT!! Embeddings Creation - Run this only once !!!
You only need to run this notebook once: it creates the embeddings and saves them to Azure AI Search.


In [1]:
# Import required libraries
import os
import json
from dotenv import load_dotenv

from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load environment variables from the .env file so the os.getenv() calls
# in the configuration cell below can read them
load_dotenv()

True

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)

In [3]:
# Azure AI Search connection settings, read from the environment
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI deployment names. Only the Ada embedding deployment is used by the
# cells shown in this section; the others are presumably for later steps - TODO confirm.
OPENAI_GPT35_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT35_DEPLOYMENT_NAME")
OPENAI_GPT4_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT4_DEPLOYMENT_NAME")
OPENAI_GPT4V_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT4V_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_DALLE_DEPLOYMENT_NAME = os.getenv("OPENAI_DALLE_DEPLOYMENT_NAME")

# Azure OpenAI endpoint and API key
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message: os.getenv() returns None for unset variables,
# which would otherwise only surface later as an opaque error from the Azure SDK.
_missing_search_settings = [
    env_name
    for env_name, env_value in (
        ("AZURE_SEARCH_SERVICE_ENDPOINT", service_endpoint),
        ("AZURE_SEARCH_ADMIN_KEY", key),
    )
    if not env_value
]
if _missing_search_settings:
    raise ValueError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_search_settings)
        + ". Set them in your environment or .env file and re-run this cell."
    )

# Key-based credential shared by the index client and the search client
credential = AzureKeyCredential(key)

In [4]:
# Build the Azure OpenAI embeddings client that generate_embeddings() calls.
# The Ada deployment name is passed as both the deployment and the model.
embeddingmodel = AzureOpenAIEmbeddings(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    chunk_size=1,  # LangChain batch size: embed one text per request
)

In [5]:
# Embedding helper backed by the OpenAI Ada model.
# A failing call is attempted up to 6 times, with a randomized exponential wait
# (bounded between 1 and 20 seconds) between attempts.
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Used for the title and content fields of each document, and also for
    query embeddings.
    """
    return embeddingmodel.embed_query(text)

In [None]:
doc_title = "Semantic Kernel"
# Load the PDF and split it into chunks (called "pages" throughout this cell)
fileName = "../data/semantic-kernel.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()
print("Number of pages: ", len(pages))

# Every record carries the same title, so embed it once instead of issuing an
# identical embedding request for each page.
title_vector = generate_embeddings(doc_title)

# Build one record per page: sequential string id, title/content text and their vectors
doc_with_vector_list = []
for doc_id, page in enumerate(pages):
    doc_with_vector_list.append({
        'id': str(doc_id),
        'title': doc_title,
        'titleVector': title_vector,
        'content': page.page_content,
        'contentVector': generate_embeddings(page.page_content),
    })

# Save the records (text + vectors) to semantic-kernel_Vectors.json for the upload step
with open("../data/semantic-kernel_Vectors.json", "w") as f:
    json.dump(doc_with_vector_list, f)

In [None]:
# Define the search index (text fields, vector fields, HNSW vector search and a
# semantic configuration) and create or update it on the service.
# Prerequisite: the Cognitive Search resource must already exist, with its endpoint
# and key available.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

vector_config_name = "sk-vector-config"  # links both vector fields to the HNSW configuration
embedding_dimensions = 1536              # vector length declared for both vector fields
vector_field_type = SearchFieldDataType.Collection(SearchFieldDataType.Single)

# Document key - mandatory field
id_field = SimpleField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    sortable=True,
    filterable=True,
    facetable=True,
)

# Title text and its embedding
title_field = SearchableField(name="title", type=SearchFieldDataType.String, filterable=True)
title_vector_field = SearchField(
    name="titleVector",
    type=vector_field_type,
    searchable=True,
    vector_search_dimensions=embedding_dimensions,
    vector_search_configuration=vector_config_name,
)

# Content text and its embedding
content_field = SearchableField(name="content", type=SearchFieldDataType.String)
content_vector_field = SearchField(
    name="contentVector",
    type=vector_field_type,
    searchable=True,
    vector_search_dimensions=embedding_dimensions,
    vector_search_configuration=vector_config_name,
)

fields = [id_field, title_field, title_vector_field, content_field, content_vector_field]

# HNSW (Hierarchical Navigable Small World) is a graph-based algorithm for
# approximate nearest-neighbour search in high-dimensional spaces.
hnsw_settings = {
    # Number of bi-directional links created for each new element during graph construction.
    "m": 4,
    # Index-build setting: larger values usually give a better graph (better recall)
    # at the cost of slower indexing.
    "efConstruction": 400,
    # Query-time setting: larger values usually improve recall but slow down searches.
    "efSearch": 500,
    # Similarity metric used to compare vectors.
    "metric": "cosine",
}
hnsw_config = VectorSearchAlgorithmConfiguration(
    name=vector_config_name,
    kind="hnsw",
    hnsw_parameters=hnsw_settings,
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_config])

# Semantic configuration: "title" is the title field, "content" the prioritized content field
semantic_fields = PrioritizedFields(
    title_field=SemanticField(field_name="title"),
    prioritized_content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="sk-semantic-config",
    prioritized_fields=semantic_fields,
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition and send it to the service
index = SearchIndex(
    name="sk-cogsrch-vector-index-2",
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

In [None]:
# Upload the saved documents (text + embedding vectors) to the search index
with open('../data/semantic-kernel_Vectors.json', 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name="sk-cogsrch-vector-index-2", credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one IndexingResult per document. Individual documents can
# be rejected without the call raising, so report what actually succeeded rather
# than the number of documents that were sent.
failed_uploads = [item for item in result if not item.succeeded]
uploaded_count = sum(1 for item in result if item.succeeded)
print(f"Uploaded {uploaded_count} documents")
for item in failed_uploads:
    print(f"Failed to upload document {item.key}: {item.error_message}")