In [1]:
import os

# Define the target directory (the local checkout that later cells import
# `src...` from and read `utils\...` data from).
target_directory = r"C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth"  # change your directory here

# Only switch when the path is an existing *directory*: os.path.exists() is
# also true for a regular file, in which case os.chdir() would raise.
if os.path.isdir(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth


## Instantiate the PolicyIndexingPipeline Class

Settings for running the pipeline are in `src\pipeline\policyIndexer\settings.yaml`.

The PolicyIndexingPipeline automates the process of indexing policy documents into Azure AI Search.

In [2]:
from src.pipeline.policyIndexer.run import PolicyIndexingPipeline

In [3]:
# Pipeline object whose methods (upload_documents, create_data_source, create_index,
# create_skillset, create_indexer) are called one by one in the following cells.
indexer = PolicyIndexingPipeline()

## Upload Document to Landing Zone Blob Storage

In [4]:
## TODO: ALLOW KEY BASED AUTHENTICATION

In [5]:
# Raw string: in a regular literal "\d", "\c" and "\p" are invalid escape sequences
# (a warning on current Python versions, slated to become an error). The path value
# passed to upload_documents is unchanged.
indexer.upload_documents(local_path=r"utils\data\cases\policies")

2024-11-10 14:18:21,255 - micro - MainProcess - INFO     Uploaded utils\data\cases\policies\001.pdf to policies_ocr\001.pdf (run.py:upload_documents:140)
2024-11-10 14:18:21,341 - micro - MainProcess - INFO     Uploaded utils\data\cases\policies\002.pdf to policies_ocr\002.pdf (run.py:upload_documents:140)
2024-11-10 14:18:21,407 - micro - MainProcess - INFO     Uploaded utils\data\cases\policies\003.pdf to policies_ocr\003.pdf (run.py:upload_documents:140)
2024-11-10 14:18:21,501 - micro - MainProcess - INFO     Uploaded utils\data\cases\policies\004.pdf to policies_ocr\004.pdf (run.py:upload_documents:140)
2024-11-10 14:18:21,579 - micro - MainProcess - INFO     Uploaded utils\data\cases\policies\005.pdf to policies_ocr\005.pdf (run.py:upload_documents:140)


## Create Data Source (Connect Blob)

In [6]:
# Creates or updates the search data source; the log below reports it as 'ai-policies-blob'.
indexer.create_data_source()

2024-11-10 14:18:28,538 - micro - MainProcess - INFO     Data source 'ai-policies-blob' created or updated (run.py:create_data_source:163)


## Create Index 

In [7]:
# Creates or updates the search index; the log below reports it as 'ai-policies-index'.
indexer.create_index()

2024-11-10 14:18:32,476 - micro - MainProcess - INFO     Index 'ai-policies-index' created or updated successfully. (run.py:create_index:277)


## Create Skillset

In [8]:
# Creates or updates the skillset; the log below reports it as 'ai-policies-skillset'.
indexer.create_skillset()

2024-11-10 14:18:39,935 - micro - MainProcess - INFO     Skillset 'ai-policies-skillset' created or updated (run.py:create_skillset:429)


## Create Indexer

In [9]:
# Creates or updates the indexer; the log below reports it as 'ai-policies-indexer'.
indexer.create_indexer()

2024-11-10 14:18:53,615 - micro - MainProcess - INFO     Indexer 'ai-policies-indexer' created or updated (run.py:create_indexer:464)


## Run and Monitor the Indexer

In [10]:
from src.pipeline.policyIndexer.run import IndexerRunner

In [11]:
# NOTE: this rebinds `indexer`, which until now referred to the PolicyIndexingPipeline
# instance; from here on it is an IndexerRunner for the 'ai-policies-indexer' indexer.
indexer = IndexerRunner(indexer_name="ai-policies-indexer")

In [18]:
# Logs the indexer status, last run time and execution status (see the output below).
indexer.monitor_indexer_status()

2024-11-10 14:19:17,112 - micro - MainProcess - INFO     Indexer 'ai-policies-indexer' has been started. (run.py:run_indexer:533)


2024-11-10 14:19:17,168 - micro - MainProcess - INFO     Indexer Status: running (run.py:monitor_indexer_status:575)
2024-11-10 14:19:17,169 - micro - MainProcess - INFO     Last Run Time: 2024-11-10 20:19:12.790000+00:00 (run.py:monitor_indexer_status:576)
2024-11-10 14:19:17,171 - micro - MainProcess - INFO     Execution Status: success (run.py:monitor_indexer_status:577)
2024-11-10 14:19:17,173 - micro - MainProcess - INFO     Indexer 'ai-policies-indexer' completed successfully. (run.py:monitor_indexer_status:583)


## Test Search 

In [1]:
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Prefer key-based auth when an admin key is configured; otherwise fall back to
# DefaultAzureCredential (which was previously referenced without being imported).
search_admin_key = os.getenv("AZURE_AI_SEARCH_ADMIN_KEY")
credential = (
    AzureKeyCredential(search_admin_key)
    if search_admin_key
    else DefaultAzureCredential()
)
index_name = os.getenv("AZURE_AI_SEARCH_INDEX_NAME", "ai-policies-index")


# Pass the credential selected above. Building a second AzureKeyCredential from
# os.environ[...] here made the fallback dead code and raised KeyError whenever
# the admin key was not set.
search_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"],
    index_name=index_name,
    credential=credential,
)

In [26]:
# NOTE(review): "afiniitor" looks like a misspelling of "Afinitor" (the drug named in the
# search result further down) — confirm whether the typo is intentional, e.g. to show
# that the query still matches despite the misspelling.
SEARCH_QUERY = "afiniitor therapy"

In [27]:
# Vector part of the query: the same text as the keyword search, to be vectorized
# and matched against the index's "vector" field.
vector_query = VectorizableTextQuery(
    text=SEARCH_QUERY,
    k_nearest_neighbors=5,  # number of nearest neighbours requested
    fields="vector",
    weight=0.5,
)

In [28]:
def format_azure_search_results(results: list, truncate: int = 1000) -> str:
    """
    Formats Azure AI Search results into a structured, readable string.

    Each result contributes one section containing:
    - Chunk ID (``chunk_id``)
    - Reranker Score (``@search.reranker_score``)
    - Source Document Path (``parent_path``)
    - Content (``chunk``, cut to ``truncate`` characters and suffixed with "..." if longer)
    - Caption (highlights of the first caption if available, otherwise its plain text)

    A field that is missing from a result, or present with a ``None`` value, is
    rendered as "N/A" instead of raising.

    :param results: Iterable of dict-like results from the Azure AI Search API
        (a list, or any iterable that yields one mapping per hit).
    :param truncate: Maximum number of characters to include in the content before truncating.
    :return: Formatted string representation of the search results. Sections are
        separated by a blank line; an empty input yields an empty string.
    """

    def _field(result, key):
        """Return ``result[key]``, or 'N/A' when the key is absent or its value is None."""
        value = result.get(key)
        return 'N/A' if value is None else value

    formatted_results = []

    for result in results:
        chunk_id = _field(result, 'chunk_id')
        reranker_score = _field(result, '@search.reranker_score')
        source_doc_path = _field(result, 'parent_path')

        # str() keeps len()/slicing safe; a None chunk has already become 'N/A'.
        content = str(_field(result, 'chunk'))

        # Truncate content to specified number of characters
        if len(content) > truncate:
            content = content[:truncate] + "..."

        # Extract caption from the first caption object: prefer the highlighted
        # text, then the plain text, then a fixed placeholder.
        captions = result.get('@search.captions') or []
        caption = "Caption not available"
        if captions:
            first_caption = captions[0]
            caption = first_caption.highlights or first_caption.text or caption

        # Format each result section
        result_string = (
            f"========================================\n"
            f"🆔 ID: {chunk_id}\n"
            f"📈 Reranker Score: {reranker_score}\n"
            f"📂 Source Doc Path: {source_doc_path}\n"
            f"📜 Content: {content}\n"
            f"💡 Caption: {caption}\n"
            f"========================================"
        )

        formatted_results.append(result_string)

    # Join all the formatted results into a single string
    return "\n\n".join(formatted_results)

In [29]:
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

# Hybrid query: keyword search on SEARCH_QUERY combined with the vector query, using
# semantic query type with extractive captions and answers, limited to the top 5 hits.
# NOTE(review): "my-semantic-config" is hard-coded — confirm it matches the semantic
# configuration name defined on the index.
results = search_client.search(
    search_text=SEARCH_QUERY,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name="my-semantic-config",
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=5,
)

In [30]:
# Render the hits as text and print them.
# NOTE(review): `results` is presumably a one-shot paged iterator — if this cell prints
# nothing on a re-run, re-run the search cell first; confirm.
result = format_azure_search_results(results)
print(result)

🆔 ID: 677854350e06_aHR0cHM6Ly9zdG9yYWdlYWVhc3R1c2ZhY3RvcnkuYmxvYi5jb3JlLndpbmRvd3MubmV0L3ByZS1hdXRoLXBvbGljaWVzL3BvbGljaWVzX29jci8wMDUucGRm0_normalized_images_10_pages_0
📈 Reranker Score: 2.7419040203094482
📂 Source Doc Path: https://storageaeastusfactory.blob.core.windows.net/pre-auth-policies/policies_ocr/005.pdf
📜 Content: UnitedHealthcare® (2) Presence of phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha (PIK3CA) mutation Authorization will be issued for 12 months. 2. Reauthorization a. Afinitor will be approved based on the following criterion: (1) Patient does not show evidence of progressive disease while on Afinitor therapy Authorization will be issued for 12 months. Q. Gastrointestingal Stromal Tumor (GIST) 1. Initial Authorization a. Afinitor will be approved based on all of the following criteria: (1) Diagnosis of Gastrointestinal Stromal Tumor (GIST) -AND- (2) Disease is one of the following: (a) Unresectable (b) Progressive (c) Metastatic (d) Gross res

In [25]:
# Debug check: confirms the formatter returned a str (see the output below).
type(result)

str