# Azure AI Search PDF Uploader
This notebook loads files from a folder, extracts text, chunks it, generates embeddings with Azure OpenAI, and uploads into an Azure AI Search index with vector search enabled.

To use:
1. Create a `.env` file from `sample.env` and fill in your variables
2. Set the path to the local folder containing your PDFs
3. Create a search index (optional after initial creation)
4. Upload documents with embeddings
5. Test with a query

Notes:
- To avoid issues with different SDK versions, this notebook creates the search index from the `schema.json` file and fills in values directly from your .env file. If you want to change your index (e.g., rename fields), update the `schema.json` file.
- This notebook uses a simple chunking strategy of one page per chunk. Depending on your documents, you may want to use a different strategy.


In [None]:
import json
import os
import requests
import uuid
from pathlib import Path
import fitz  # PyMuPDF
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from openai import AzureOpenAI

In [None]:
# ---------------------------------------------------------
# 1. Load environment variables
# ---------------------------------------------------------
# override=True lets values from the .env file replace variables that are
# already set in the current process environment.
load_dotenv(override=True)

# Azure AI Search settings (all required; a missing key raises KeyError)
SEARCH_ENDPOINT = os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"]
SEARCH_API_KEY = os.environ["AZURE_AI_SEARCH_ADMIN_KEY"]
SEARCH_INDEX = os.environ["SEARCH_INDEX_NAME"]
INDEX_SCHEMA_PATH = os.environ["INDEX_SCHEMA_PATH"]
MODEL_DIMENSIONS = int(os.environ["MODEL_DIMENSIONS"])

# Azure OpenAI settings (all required)
AOAI_ENDPOINT = os.environ["AZURE_OPEN_AI_ENDPOINT"]
EMBED_MODEL = os.environ["AZURE_OPEN_AI_DEPLOYMENT_NAME"]  # embedding deployment name
AOAI_API_VERSION = os.environ["AZURE_OPEN_AI_API_VERSION"]
AOAI_KEY = os.environ["AZURE_OPEN_AI_API_KEY"]

# Path to folder containing PDFs to upload.
# An optional PDF_FOLDER environment variable takes precedence so the notebook
# can run on another machine without editing this cell; the literal below is
# only the fallback used when that variable is not set.
PDF_FOLDER = Path(
    os.environ.get(
        "PDF_FOLDER",
        r"C:\Users\annaquincy\Desktop\Code\bofa-demo\art-voice-agent-accelerator\utils\data\creditcardsProducts",
    )
)

# ---------------------------------------------------------
# 2. Initialize Azure OpenAI client (embeddings)
# ---------------------------------------------------------
# Token-based credential object; the clients built in this cell authenticate
# with API keys and do not receive it.
credential = DefaultAzureCredential()

# Key-authenticated Azure OpenAI client used for embedding requests
aoai_client = AzureOpenAI(
    azure_endpoint=AOAI_ENDPOINT,
    api_version=AOAI_API_VERSION,
    api_key=AOAI_KEY,
)

# ---------------------------------------------------------
# 3. Initialize Azure AI Search clients
# ---------------------------------------------------------
# Index-management client (service level, no index name)
search_index_client = SearchIndexClient(
    credential=AzureKeyCredential(SEARCH_API_KEY),
    endpoint=SEARCH_ENDPOINT,
)

# Document client bound to the target index (upload + query)
search_client = SearchClient(
    credential=AzureKeyCredential(SEARCH_API_KEY),
    index_name=SEARCH_INDEX,
    endpoint=SEARCH_ENDPOINT,
)


### Optional: Create Index

In [None]:
# Load the index definition template; JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
index_schema_file = Path(INDEX_SCHEMA_PATH)
with open(index_schema_file, "r", encoding="utf-8") as f:
    index_schema = json.load(f)

# Fill in the environment-specific values
index_schema["name"] = SEARCH_INDEX

# The first vectorizer in the schema is pointed at the embedding deployment
aoai_params = index_schema["vectorSearch"]["vectorizers"][0]["azureOpenAIParameters"]
aoai_params["resourceUri"] = AOAI_ENDPOINT
aoai_params["deploymentId"] = EMBED_MODEL
aoai_params["apiKey"] = AOAI_KEY
aoai_params["modelName"] = EMBED_MODEL

# For the vector field, set dimensions
for field in index_schema["fields"]:
    if field.get("name") == "vector":
        field["dimensions"] = MODEL_DIMENSIONS

search_headers = {
    "Content-Type": "application/json",
    "api-key": SEARCH_API_KEY,
}

# Strip a trailing slash so the URL never contains "//indexes"
create_index_url = f"{SEARCH_ENDPOINT.rstrip('/')}/indexes?api-version=2025-09-01"
print(create_index_url)

# An explicit timeout keeps the cell from hanging forever if the service
# is unreachable (requests waits indefinitely by default).
response = requests.post(
    create_index_url,
    headers=search_headers,
    json=index_schema,
    timeout=60,
)

if response.status_code == 201:
    print(f"Index '{SEARCH_INDEX}' created successfully.")
elif response.status_code in (204, 409):
    # 204 is kept from the original check.
    # NOTE(review): a POST against an existing index name is expected to come
    # back as 409 Conflict - confirm against the Create Index REST reference.
    print(f"Index '{SEARCH_INDEX}' already exists.")
else:
    print(f"Failed to create index '{SEARCH_INDEX}'. Status code: {response.status_code}, Response: {response.text}")


### Upload Embedded Documents

In [None]:
# ---------------------------------------------------------
# 4. PDF per-page extraction
# ---------------------------------------------------------
def extract_pdf_pages(pdf_path: Path):
    """Return a list of (page_number, text) tuples for each non-empty page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One ``(page_number, text)`` tuple per page that still has text after
        cleaning. Page numbers are 1-based; pages whose text is empty are
        skipped, so the numbers may have gaps.
    """
    pages = []

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Drop NUL characters and surrounding whitespace from the page text
            text = page.get_text("text").replace("\x00", "").strip()

            if text:
                pages.append((page_num, text))

    return pages

# ---------------------------------------------------------
# 5. Embed a single chunk using Azure AI Foundry
# ---------------------------------------------------------
def generate_embedding(text):
    """Generate an embedding vector for ``text`` with the configured deployment.

    The requested vector size is ``MODEL_DIMENSIONS`` - the same value the
    index's ``vector`` field is configured with - instead of a hard-coded
    number, so the embeddings and the index cannot drift apart.

    Args:
        text: The text to embed.

    Returns:
        The embedding of the first (only) input, as returned by the service.

    Raises:
        Exception: Any error from the embeddings call is printed and re-raised.
    """
    try:
        response = aoai_client.embeddings.create(
            model=EMBED_MODEL,
            input=text,
            dimensions=MODEL_DIMENSIONS,
        )
        return response.data[0].embedding
    except Exception as e:
        # Surface the failure in the notebook output, then let it propagate
        print(f"‚ùå Error generating embedding: {e}")
        raise


# ---------------------------------------------------------
# 6. Build a document entry for AI Search
# ---------------------------------------------------------
def build_search_doc(pdf_path: Path, page_num: int, content: str, vector):
    """Assemble the dictionary uploaded to the search index for one PDF page.

    Args:
        pdf_path: Path of the source PDF; its stem becomes the title.
        page_num: Page number appended to the file name as ``#page=N``.
        content: Text of the page.
        vector: Embedding stored alongside the text.

    Returns:
        A dict with the keys ``id``, ``title``, ``content``, ``file_name``
        and ``vector``. ``id`` is a fresh random UUID string on every call.
    """
    doc_id = str(uuid.uuid4())
    page_ref = f"{pdf_path.name}#page={page_num}"
    return dict(
        id=doc_id,
        title=pdf_path.stem,
        content=content,
        file_name=page_ref,
        vector=vector,
    )





In [None]:
# ---------------------------------------------------------
# 7. Main ingestion loop (read PDFs -> embed -> upload)
# ---------------------------------------------------------
all_documents = []

for pdf_file in PDF_FOLDER.glob("*.pdf"):
    print(f"üìÑ Processing {pdf_file.name}...")

    pages = extract_pdf_pages(pdf_file)

    for page_num, text in pages:
        # Per-page embedding (one page == one chunk)
        vector = generate_embedding(text)

        # Build search document
        doc = build_search_doc(pdf_file, page_num, text, vector)
        all_documents.append(doc)

# ---------------------------------------------------------
# 8. Upload to Azure AI Search in batches
# ---------------------------------------------------------
# Each document carries a full embedding vector, so requests are kept small
# rather than sending every chunk in a single call.
UPLOAD_BATCH_SIZE = 100

# Per-document indexing results collected across all batches
result = []

if not all_documents:
    # Nothing was extracted; skip the service call entirely
    print(f"No PDF pages with text found in {PDF_FOLDER}; nothing to upload.")
else:
    print(f"üöÄ Uploading {len(all_documents)} chunks to Azure AI Search...")

    for start in range(0, len(all_documents), UPLOAD_BATCH_SIZE):
        batch = all_documents[start:start + UPLOAD_BATCH_SIZE]
        result.extend(search_client.upload_documents(documents=batch))

    # The call can return normally while individual documents were rejected,
    # so inspect every per-document result before reporting success.
    failed = [item for item in result if not item.succeeded]
    if failed:
        for item in failed:
            print(f"Failed to index document {item.key}: {item.error_message}")
        print(f"Upload finished with {len(failed)} failed document(s) out of {len(result)}.")
    else:
        print("‚úÖ Upload complete!")

### Test Query

In [None]:
#query AI Search for relevant documents
def query_ai_search(query: str, top_k: int = 3):
    """Run a hybrid (keyword + vector) query against the search index.

    Args:
        query: Natural-language query; used as the keyword search text and
            embedded for the vector part of the query.
        top_k: Number of nearest neighbours requested from the vector field
            and the maximum number of results returned overall.

    Returns:
        The iterable of results returned by ``search_client.search``.
    """
    embedding = generate_embedding(query)
    search_results = search_client.search(
        search_text=query,
        vector_queries=[{
            "kind": "vector",
            "vector": embedding,
            "fields": "vector",
            "k": top_k,
        }],
        # "k" only limits the vector leg; without `top` the merged result
        # set can contain more than top_k documents.
        top=top_k,
    )
    return search_results

# Example usage: run one hybrid query and print a short summary of each hit
query = "What are the best options for a travel credit card?"
results = query_ai_search(query, top_k=3)

for result in results:
    # One print call per hit; sep="\n" puts each item on its own line
    print(
        f"Title: {result['title']}",
        f"File Name: {result['file_name']}",
        f"Content Snippet: {result['content'][:200]}...",
        "-----",
        sep="\n",
    )