## Creating a Search Index using the Azure AI Search SDK to support RAG-based scenarios

#### Prerequisites  
  
Before starting, ensure you have:  
  
- An Azure account with an Azure AI Search service created.
- The service name, admin API key, and endpoint URL.
- Installed the Azure AI Search SDK: `azure-search-documents`. See below to install.

#### Install SDK

In [None]:
# Install the packages this notebook imports: the Azure AI Search, Blob Storage
# and Identity SDKs, plus the OpenAI client.
%pip install azure-search-documents --quiet
%pip install azure-storage-blob --quiet
%pip install azure-identity --quiet
%pip install openai --quiet

#### Import Libraries

In [None]:
import os
import random
import requests

from typing import Dict
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

from azure.storage.blob import BlobServiceClient

from azure.search.documents import SearchClient, SearchItemPaged
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery, QueryType
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    SearchFieldDataType,
    SearchField,
    SemanticConfiguration,
    SemanticField,
    VectorSearch,
    SemanticSearch,
    SemanticPrioritizedFields,
    SearchableField,
    HnswAlgorithmConfiguration,
    VectorSearchProfile
)

#### Define Search Index Schema

Put the name of your search index in the placeholder below. If you have already created a search index, just update the name in the placeholder below and jump to [Data Ingestion](#data-ingestion). Otherwise, follow the steps below to create your index.

In [None]:
# Name of the Azure AI Search index used throughout this notebook: it names the
# index schema, is sent in the ingestion payload, and is queried by the search client.
index_name = '<your_index_name>'

In [None]:
# Index schema: a string key, the searchable text content, its embedding
# vector, the section headings, and four filterable/facetable metadata fields.
financial_index = SearchIndex(
    name=index_name,
    fields=[
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(
            name="content",
            type=SearchFieldDataType.String,
            analyzer_name="en.microsoft",
        ),
        SearchField(
            name="contentVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            # Refers to a vector search profile by name; the profile itself is
            # attached to the index separately.
            vector_search_profile_name="myHnswProfile",
        ),
        SearchableField(
            name="headings",
            type=SearchFieldDataType.Collection(SearchFieldDataType.String),
            collection=True,
            filterable=True,
            facetable=True,
        ),
        # The remaining metadata fields share one definition and differ only by name.
        *[
            SearchableField(
                name=metadata_field,
                type=SearchFieldDataType.String,
                filterable=True,
                facetable=True,
            )
            for metadata_field in ("sourcePage", "sourceFile", "reportedYear", "subsidiary")
        ],
    ],
)

#### Introduce Vector Search: Define Vector Search Parameters

In [None]:
# Vector search settings: a single HNSW algorithm configuration (only its name
# is set) and a profile named "myHnswProfile" that points at that algorithm.
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[
        VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw"),
    ],
)

# Attach the vector search settings to the index definition.
financial_index.vector_search = vector_search

#### Introduce Semantic Search: Define Semantic configuration based on Index schema

In [None]:
# Semantic configuration: "headings" is the title field, "reportedYear" and
# "subsidiary" are the keyword fields, and "content" is the content field.
semantic_config = SemanticConfiguration(
    name="microsoft-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="headings"),
        keywords_fields=[
            SemanticField(field_name=keyword_field)
            for keyword_field in ("reportedYear", "subsidiary")
        ],
        content_fields=[SemanticField(field_name="content")],
    ),
)

# Register the configuration as the index's only semantic configuration.
financial_index.semantic_search = SemanticSearch(configurations=[semantic_config])

#### Create Index in Azure AI Search

In [None]:
# Replace the placeholder with the name of your Azure AI Search service.
# (A specific service name was previously committed here; a placeholder keeps
# the notebook reusable and matches the instruction above.)
AZURE_AI_SEARCH_RESOURCE_NAME = '<your_search_service_name>'  # e.g., 'my-search-service'

# Create a SearchIndexClient for index management
index_client = SearchIndexClient(
    endpoint=f"https://{AZURE_AI_SEARCH_RESOURCE_NAME}.search.windows.net",
    credential=DefaultAzureCredential(),
)

# If create_index raises, the cell stops here and the success message below
# is never printed.
result = index_client.create_index(financial_index)
print(f"Successfully created index '{index_name}'.")

# Check if the Index Exists by looking for its name among the service's indexes,
# and report both outcomes so a missing index is not silent.
index_list = index_client.list_index_names()
if index_name in index_list:
    print(f"Index '{index_name}' exists.")
else:
    print(f"Index '{index_name}' was not found on the search service.")

## Data Ingestion

#### Load Microsoft Financial Data

In [None]:
# Setup Storage Account details
AZURE_STORAGE_ACCOUNT_NAME = ""
AZURE_STORAGE_BLOB_CONTAINER_NAME = ""

blob_client = BlobServiceClient(
    account_url=f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net",
    credential=DefaultAzureCredential()
)

# Create the target container on first use.
blob_container_client = blob_client.get_container_client(AZURE_STORAGE_BLOB_CONTAINER_NAME)
if not blob_container_client.exists():
    blob_container_client.create_container()
    print(f"Created container {AZURE_STORAGE_BLOB_CONTAINER_NAME} in {AZURE_STORAGE_ACCOUNT_NAME}.")

# Upload every PDF found under the data directory and remember its name for ingestion.
directory_path = "data"
microsoft_financial_pdfs = []

for root, _dirs, files in os.walk(directory_path):
    for filename in files:
        if not filename.lower().endswith('.pdf'):
            continue

        with open(os.path.join(root, filename), 'rb') as pdf_stream:
            # Pass the open stream rather than pdf_stream.read() so the whole
            # file is not loaded into memory first. The return value is not
            # bound, so `blob_client` keeps referring to the BlobServiceClient.
            # NOTE: the blob is named by the bare filename, so PDFs with the
            # same name in different sub-directories overwrite each other.
            blob_container_client.upload_blob(name=filename, data=pdf_stream, overwrite=True)

        # Track files to ingest
        microsoft_financial_pdfs.append(filename)

print(f"Microsoft Financial data found. Total files to be ingested: {len(microsoft_financial_pdfs)}")

#### Ingest data into Azure AI Search

In [None]:
# Replace the placeholders with Ingestion service URL (local or remote).
INGESTION_SERVICE_URL = ""

# Create a map for filenames to reported year and subsidiary
microsoft_financial_report_to_metadata_map = {
    "2022_Annual_Report.pdf" : { "reported_year": "2022", "subsidiary": "Microsoft" },
    "2023_Annual_Report.pdf" : { "reported_year": "2023", "subsidiary": "Microsoft" },
    "2024_Annual_Report.pdf" : { "reported_year": "2024", "subsidiary": "Microsoft" },
    "MSFT_FY22Q4_10K.pdf" : { "reported_year": "2022", "subsidiary": "Microsoft" },
    "MSFT_FY23Q4_10K.pdf" : { "reported_year": "2023", "subsidiary": "Microsoft" },
    "MSFT_FY24Q4_10K.pdf" : { "reported_year": "2024", "subsidiary": "Microsoft" },
    "2022-Q1.pdf" : { "reported_year": "2022", "subsidiary": "Microsoft" },
    "2022-Q2.pdf" : { "reported_year": "2022", "subsidiary": "Microsoft" },
    "2022-Q3.pdf" : { "reported_year": "2022", "subsidiary": "Microsoft" },
    "2023-Q1.pdf" : { "reported_year": "2023", "subsidiary": "Microsoft" },
    "2023-Q2.pdf" : { "reported_year": "2023", "subsidiary": "Microsoft" },
    "2023-Q3.pdf" : { "reported_year": "2023", "subsidiary": "Microsoft" },
    "2024-Q1.pdf" : { "reported_year": "2024", "subsidiary": "Microsoft" },
    "2024-Q2.pdf" : { "reported_year": "2024", "subsidiary": "Microsoft" },
    "2024-Q3.pdf" : { "reported_year": "2024", "subsidiary": "Microsoft" },
}

# Setup HTTP request (URL and headers)
CONVERSATION_ID = "" # Required
USER_ID = "" # Required
DIALOG_ID = "" # Required

url = f'{INGESTION_SERVICE_URL}/indexer/index'
headers = {'conversation_id': CONVERSATION_ID, 'user_id': USER_ID, 'dialog_id': DIALOG_ID}

# Upper bound on how many of the uploaded files are submitted in one run.
# The default of 1 submits only the first file; set to None to submit all of them.
MAX_FILES_TO_INGEST = 1

# Create and submit one payload per selected file
for pdf_file in microsoft_financial_pdfs[:MAX_FILES_TO_INGEST]:
    # Files without a metadata entry are skipped instead of aborting the loop with a KeyError.
    report_metadata = microsoft_financial_report_to_metadata_map.get(pdf_file)
    if report_metadata is None:
        print(f"Skipping {pdf_file}: no reported year/subsidiary metadata is mapped for this file.")
        continue

    file_payload = {
        # Point the ingestion service at the same container the PDFs were uploaded to.
        "storage_container_name": AZURE_STORAGE_BLOB_CONTAINER_NAME,
        "index_name": index_name,
        "payload": {
            "filename": pdf_file,
            "reported_year": report_metadata["reported_year"],
            "subsidiary": report_metadata["subsidiary"]
        },
        "enrichment": "NONE" # can be NONE | TABLE_AS_LIST | IMAGE_DESCRIPTION
    }

    response = requests.post(url, json=file_payload, headers=headers)
    if response.status_code == 201:
        print(f"Report {pdf_file} submitted for indexing successfully.")
    else:
        # Surface rejected submissions; previously any non-201 status was silent.
        print(f"Report {pdf_file} was not accepted for indexing (HTTP {response.status_code}): {response.text}")

### Search

This part is for testing your ingested data and is not needed for ingesting data directly.

#### Setup Search Client

In [None]:
# Name of the Azure AI Search service to query; the endpoint URL is derived from it.
AZURE_SEARCH_SERVICE = ""
AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"

# One credential object is shared by both clients.
credential = DefaultAzureCredential()

# Index-management client and the query client bound to `index_name`.
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=credential,
)
search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=credential,
)

#### Setup Utility Functions

In [None]:
# Replace the placeholders with your Azure OpenAI Service and Model Deployment.
AZURE_OPENAI_SERVICE = ""
AZURE_OPENAI_ADA_DEPLOYMENT = ""

# Bearer-token provider built from DefaultAzureCredential for the Cognitive
# Services scope; the OpenAI client uses it in place of an API key.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)

openai_client = AzureOpenAI(
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider,
    api_version="2024-08-01-preview",
)

def get_embedding(text):
    """Return the embedding of *text* from the configured Azure OpenAI deployment.

    Sends *text* to the embeddings endpoint of ``openai_client`` using the
    ``AZURE_OPENAI_ADA_DEPLOYMENT`` deployment and returns the ``embedding``
    of the first item in the response data.
    """
    embedding_items = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    ).data
    return embedding_items[0].embedding

def search_results_cleanup(result_dict: dict) -> dict:
    """Return a copy of a search result without empty or score entries.

    Entries whose value is ``None`` are dropped, and so is the
    ``"@search.score"`` entry regardless of its value. All other entries are
    kept in their original order; falsy values other than ``None`` are kept.
    """
    cleaned = {}
    for field_name, field_value in result_dict.items():
        if field_name == "@search.score" or field_value is None:
            continue
        cleaned[field_name] = field_value
    return cleaned

def unpack_search_results(results: SearchItemPaged[Dict]):
    """Materialize search results into a list of cleaned result dicts.

    *results* is an iterator of search hits; each hit is passed through
    ``search_results_cleanup`` and the cleaned hits are returned in iteration
    order.
    """
    # Iterating here consumes the result iterator before returning.
    return [search_results_cleanup(hit) for hit in results]

#### Search using vector similarity (hybrid + semantic search)

In [None]:
# NOTE: These queries are specifically tailored for Microsoft Financial documents.
microsoft_sample_queries = [
    "What was Microsoft's revenue in fiscal year 2022?",
    "What was Microsoft's revenue in Q3 of fiscal year 2023?",
    "How did Microsoft's operating income trend from 2022 to 2024?",
    "Can you analyze the compound quarterly growth rate (CQGR) for Microsoft's Intelligent Cloud segment from Q3 2022 to Q3 2024?",
    "What were the most significant changes in Microsoft's financial performance in fiscal year 2024 compared to previous years?"
]

# Name of the semantic configuration to rank with. It is spelled out here
# rather than read from `semantic_config`, because that object only exists if
# the index-creation cells were run, and they are skipped when the index
# already exists. Keep it in sync with the configuration name on your index.
SEMANTIC_CONFIGURATION_NAME = "microsoft-semantic-config"

# Pick one sample query at random and embed it for the vector part of the search.
search_query = random.choice(microsoft_sample_queries)

search_query_vector = get_embedding(search_query)

# Hybrid query: the text and the vector over "contentVector" are sent together,
# with the semantic query type and the named semantic configuration.
results = search_client.search(
    search_text=search_query,
    top=5,
    vector_queries=[
        VectorizedQuery(vector=search_query_vector,
                        k_nearest_neighbors=5,
                        fields="contentVector")
    ],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name=SEMANTIC_CONFIGURATION_NAME)

final_result_set = unpack_search_results(results)
print(f"Successfully retrieved {len(final_result_set)} results from Search.")
print(f"Successfully retrieved {len(final_result_set)} results from Search.")