# Test indexing process

## Setup

In [None]:
import logging
import os
import uuid
import base64

import azure.functions as func
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores import AzureSearch
from langchain_core.documents import Document

In [None]:
# load_dotenv() returns False when it did not find any variables to load,
# so only report success when something was actually read.
if load_dotenv():
    print("Environment variables loaded")
else:
    print("No variables loaded from a .env file; using the existing environment")

In [None]:
# --- Chat model -------------------------------------------------------------
chat_deployment = os.getenv("AZURE_DEPLOYMENT_MODEL")
llm = AzureChatOpenAI(
    azure_deployment=chat_deployment,
    api_version="2024-02-01",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# --- Document analysis client -----------------------------------------------
doc_int_endpoint = os.getenv("AZURE_DOC_INT_ENDPOINT")
doc_int_credential = AzureKeyCredential(os.getenv("AZURE_DOC_INT_API_KEY"))
document_client = DocumentAnalysisClient(
    endpoint=doc_int_endpoint,
    credential=doc_int_credential,
)

# --- Embedding model --------------------------------------------------------
embed_endpoint = os.getenv("AZURE_OPENAI_EMBED_ENDPOINT")
embed_api_key = os.getenv("AZURE_OPENAI_EMBED_API_KEY")
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-small",
    azure_endpoint=embed_endpoint,
    api_key=embed_api_key,
)

# --- Vector store -----------------------------------------------------------
search_endpoint = os.getenv("AZURE_SEARCH_SERVICE")
# Unlike the os.getenv() reads above, this lookup raises KeyError when the
# variable is missing.
search_key = os.environ["AZURE_SEARCH_API_KEY"]

result_field_options = {
    "select": "id,content,metadata",  # Specify which fields to return
    "vector_fields": None,  # Don't include vector fields in results
}
search_client_options = {
    "retry_total": 4,
    "connection_timeout": 5,
    "read_timeout": 30,
}

vector_store = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_key,
    index_name="dataroots-guidelines-vector-index",
    embedding_function=embeddings.embed_query,
    search_options=result_field_options,
    additional_search_client_options=search_client_options,
)

## Define pipeline

In [None]:
def process_blob_document(container_name: str, blob_name: str):
    """
    Download a blob and analyze it with the "prebuilt-document" model.

    Simulates the blob trigger function so the document processing logic can
    be tested without needing Azure Functions.

    Args:
        container_name: Name of the storage container that holds the blob.
        blob_name: Name of the blob to download and analyze.

    Returns:
        The full analysis result produced by ``begin_analyze_document``
        (not just the extracted text).

    Raises:
        ValueError: If AZURE_STORAGE_CONNECTION_STRING is unset or empty.
    """
    connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
    if not connection_string:
        # Fail fast with an actionable message instead of handing None to the
        # storage SDK.
        raise ValueError(
            "AZURE_STORAGE_CONNECTION_STRING is not set; cannot download "
            f"blob {blob_name!r} from container {container_name!r}"
        )

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(blob_name)
    blob_content = blob_client.download_blob().readall()

    # begin_analyze_document is used instead of
    # begin_analyze_document_from_url because the blob content has already
    # been downloaded and is passed in directly.
    result = document_client.begin_analyze_document(
        "prebuilt-document",
        blob_content,
    ).result()
    print(f"Processed document: {blob_name}")

    return result

In [None]:
def chunk_blob_document(container_name: str, blob_name: str, target_chunk_size: int = 1000):
    """
    Analyze a blob and group its paragraphs into text chunks.

    Paragraphs are appended to the current chunk until adding the next one
    would push the chunk past ``target_chunk_size`` characters; the chunk is
    then closed and a new one started. The size is approximate: the spaces
    used to join paragraphs are not counted, and a single paragraph longer
    than the target becomes a chunk of its own.

    Args:
        container_name: Name of the storage container that holds the blob.
        blob_name: Name of the blob to analyze.
        target_chunk_size: Soft upper bound, in characters, for each chunk.

    Returns:
        A ``(document_chunks, result)`` tuple: the list of chunk strings and
        the analysis result they were built from.
    """
    result = process_blob_document(container_name, blob_name)

    document_chunks = []
    current_chunk = []
    current_length = 0

    # ``paragraphs`` can be None when the analysis found none; treat that as
    # an empty document instead of failing on iteration.
    for paragraph in result.paragraphs or []:
        paragraph_text = paragraph.content

        # Close the current chunk if this paragraph would push it over the
        # target; never emit an empty chunk.
        if current_chunk and current_length + len(paragraph_text) > target_chunk_size:
            document_chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(paragraph_text)
        current_length += len(paragraph_text)

    # Add the last, partially filled chunk.
    if current_chunk:
        document_chunks.append(" ".join(current_chunk))

    return document_chunks, result

In [None]:
def embed_and_upload_blob_document(container_name: str, blob_name: str, batch_size: int = 5):
    """
    Chunk a blob document and upload the chunks to the vector store.

    Each chunk is wrapped in a ``Document`` together with the metadata from
    ``get_metadata`` and sent to ``vector_store.add_texts`` in batches.
    ``get_metadata`` must be defined before this function is called.

    Args:
        container_name: Name of the storage container that holds the blob.
        blob_name: Name of the blob to process.
        batch_size: Number of chunks sent per ``add_texts`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def _upload(batch):
        # Send one batch of documents (texts plus their metadata) to the store.
        vector_store.add_texts(
            texts=[document.page_content for document in batch],
            metadatas=[document.metadata for document in batch],
        )

    chunks, result = chunk_blob_document(container_name, blob_name)
    logging.info("Document analysis result attributes: %s", dir(result))

    pending = []
    for i, chunk in enumerate(chunks):
        metadata = get_metadata(result, blob_name, i, chunks)
        pending.append(Document(page_content=chunk, metadata=metadata))

        # Upload as soon as a batch is full so batches stay small.
        if len(pending) >= batch_size:
            _upload(pending)
            pending = []

    # Upload whatever is left over after the last full batch.
    if pending:
        _upload(pending)

## Run pipeline

### Testing metadata available from Form Recognizer

In [None]:
# Storage container that holds the source PDFs.
container_name = "st-dataroots-guiden-pdfstorage"

# Candidate documents; `i` selects which one the following cells work on.
blob_names = [
    "Development of a RAG-Chatbot for Rule and Guideline Retrieval.pdf",
    "xmas_project_2.pdf",
    "m1-generative-ai-engineering-with-databricks.pdf",
]
i = 2
blob_name = blob_names[i]

In [None]:
# Analyze the selected blob; `result` is inspected by the cells below.
result = process_blob_document(container_name, blob_name)

In [None]:
# Dump every paragraph of the analysis result, one dict per line.
paragraph_dicts = result.to_dict()['paragraphs']
for elem in paragraph_dicts:
    print(elem)

In [None]:
# Keep the content of every paragraph whose role is one of the heading roles.
heading_roles = ('sectionHeading', 'title', 'heading')
section_titles = [
    elem['content']
    for elem in result.to_dict()['paragraphs']
    if elem['role'] in heading_roles
]

section_titles

In [None]:
# Use the first collected heading as the document title.
# Raises IndexError when section_titles is empty.
title = section_titles[0]
title

In [None]:
# Text after the last dot of the blob name (the whole name if it has no dot).
file_type = blob_name.rsplit('.', 1)[-1]
file_type

In [None]:
def get_metadata(result, blob_name, i, chunks) -> dict:
    """
    Build the metadata dict stored alongside one chunk of a document.

    Args:
        result: Analysis result; must provide ``to_dict()`` returning a dict
            with a ``'paragraphs'`` list of dicts that have ``'content'`` and
            (optionally) ``'role'`` entries.
        blob_name: Name of the source blob.
        i: Zero-based index of the chunk within ``chunks``.
        chunks: All chunks of the document.

    Returns:
        A dict with the keys ``section_titles`` (contents of all heading
        paragraphs), ``title`` (first heading, or ``blob_name`` when the
        document has no headings), ``file_type`` (text after the last dot of
        ``blob_name``, or ``''`` when there is no dot), ``file_name``,
        ``page`` (one-based chunk number) and ``total_pages`` (number of
        chunks).
    """
    heading_roles = ('sectionHeading', 'title', 'heading')
    paragraphs = result.to_dict().get('paragraphs') or []
    section_titles = [
        elem['content']
        for elem in paragraphs
        if elem.get('role') in heading_roles
    ]

    # rpartition reports whether a dot was found at all, so a name without an
    # extension yields an empty file type instead of the whole name.
    _, dot, extension = blob_name.rpartition('.')

    return {
        'section_titles': section_titles,
        # Fall back to the blob name so documents without headings still work.
        'title': section_titles[0] if section_titles else blob_name,
        'file_type': extension if dot else '',
        'file_name': blob_name,
        # "page" counts chunks, not physical pages of the source document.
        'page': i + 1,
        'total_pages': len(chunks),
    }

In [None]:
# Chunk the selected blob and wrap every chunk, with its metadata, in a Document.
chunks, result = chunk_blob_document(container_name, blob_name)

documents_to_upload = []
# Use a dedicated loop variable: `i` is the blob selector set in an earlier
# cell, and overwriting it here would change which blob that cell picks when
# it is re-run.
for chunk_index, chunk in enumerate(chunks):
    metadata = get_metadata(result, blob_name, chunk_index, chunks)
    print(metadata)

    doc = Document(
        page_content=chunk,
        metadata=metadata,
    )
    documents_to_upload.append(doc)

In [None]:
# Show the first prepared document as a sanity check.
# Raises IndexError when documents_to_upload is empty.
print(documents_to_upload[0])