# Azure AI Search PDF Uploader
This notebook loads files from a folder, extracts text, chunks it, generates embeddings with Azure OpenAI, and uploads into an Azure AI Search index with vector search enabled.

To use:
1. Create an .env file from sample.env and input your variables
2. Include path to local folder of PDF's
3. Create a search index (optional after initial creation)
4. Upload documents with embeddings
5. Test with a query

Notes:
- To avoid issues with different SDK versions, this notebook creates the search index using the `schema.json` file and inputs variables directly from your .env file. If you want to make changes to your index (ie: change field names), update the `schema.json` file.
- This is utilizing the simple chunking strategy of 1 page per chunk. Depending on your documents, you may want to utilize a different strategy


In [1]:
import json
import os
import requests
import uuid
from pathlib import Path
import fitz  # PyMuPDF
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from openai import AzureOpenAI

In [3]:
import asyncio
import datetime
import sys
import os
from typing import Dict, List, Literal, Optional, TypedDict

import logging

# Set the directory to the repository root
# Navigate to repo root from samples/labs/dev/
repo_root = "/Users/pablo/Desktop/dev/art-voice-agent-accelerator"

# Alternatively, auto-detect by walking up from current working directory
if not os.path.exists(repo_root):
    current = os.getcwd()
    while current != os.path.dirname(current):
        if os.path.exists(os.path.join(current, "src")) and os.path.exists(os.path.join(current, "pyproject.toml")):
            repo_root = current
            break
        current = os.path.dirname(current)

os.chdir(repo_root)
print(f"‚úÖ Changed directory to repo root: {os.getcwd()}")

# Add src to path if needed
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

# Now import from src
from src.cosmosdb.manager import CosmosDBMongoCoreManager
from utils.ml_logging import get_logger
from pymongo.errors import NetworkTimeout, DuplicateKeyError

logger = get_logger("ai_search_indexer")

‚úÖ Changed directory to repo root: /Users/pablo/Desktop/dev/art-voice-agent-accelerator



In [4]:
# ---------------------------------------------------------
# 1. Load environment variables
# ---------------------------------------------------------
# Load from samples/labs/dev/.env (or .env.ai_search)
labs_env_path = Path("samples/labs/dev/.env")
labs_env_ai_search_path = Path("samples/labs/dev/.env.ai_search")

if labs_env_path.exists():
    load_dotenv(labs_env_path, override=True)
    print(f"‚úÖ Loaded env from: {labs_env_path}")
elif labs_env_ai_search_path.exists():
    load_dotenv(labs_env_ai_search_path, override=True)
    print(f"‚úÖ Loaded env from: {labs_env_ai_search_path}")
else:
    load_dotenv(override=True)
    print("‚ö†Ô∏è Using default .env file")

# Azure AI Search Configuration
SEARCH_ENDPOINT = os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"]
SEARCH_API_KEY = os.environ["AZURE_AI_SEARCH_ADMIN_KEY"]
SEARCH_INDEX = os.environ["AZURE_SEARCH_INDEX_NAME"]

# Index Schema Configuration
INDEX_SCHEMA_PATH = os.environ.get("INDEX_SCHEMA_PATH", "samples/labs/dev/schema.json")
MODEL_DIMENSIONS = int(os.environ.get("MODEL_DIMENSIONS", "3072"))

# Azure OpenAI Configuration (for embeddings)
AOAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
EMBED_MODEL = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
AOAI_API_VERSION = os.environ["AZURE_OPENAI_API_VERSION"]
AOAI_KEY = os.environ["AZURE_OPENAI_KEY"]

# Path to folder containing PDFs to upload
# Update this to your local path or set PDF_FOLDER_PATH in .env
PDF_FOLDER = Path(os.environ.get("PDF_FOLDER_PATH", "utils/data/creditcardsProducts"))

print(f"‚úÖ Configuration loaded:")
print(f"   Search Endpoint: {SEARCH_ENDPOINT}")
print(f"   Search Index: {SEARCH_INDEX}")
print(f"   AOAI Endpoint: {AOAI_ENDPOINT}")
print(f"   Embedding Model: {EMBED_MODEL}")
print(f"   Model Dimensions: {MODEL_DIMENSIONS}")
print(f"   PDF Folder: {PDF_FOLDER}")
print(f"   PDF Folder exists: {PDF_FOLDER.exists()}")

# ---------------------------------------------------------
# 2. Initialize Azure OpenAI client (embeddings)
# ---------------------------------------------------------
credential = DefaultAzureCredential()

# Initialize Azure OpenAI client
aoai_client = AzureOpenAI(
    api_key=AOAI_KEY,
    api_version=AOAI_API_VERSION,
    azure_endpoint=AOAI_ENDPOINT
)

# ---------------------------------------------------------
# 3. Initialize Azure AI Search client
# ---------------------------------------------------------
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=SEARCH_INDEX,
    credential=AzureKeyCredential(SEARCH_API_KEY)
)

search_index_client = SearchIndexClient(
    endpoint=SEARCH_ENDPOINT,
    credential=AzureKeyCredential(SEARCH_API_KEY)
)

print("‚úÖ Clients initialized successfully!")

‚úÖ Loaded env from: samples/labs/dev/.env
‚úÖ Configuration loaded:
   Search Endpoint: https://search-ai-factory-centralus.search.windows.net
   Search Index: banking-assistants-ai-index
   AOAI Endpoint: https://aoai-ai-factory-eus-dev.openai.azure.com/
   Embedding Model: text-embedding-3-large
   Model Dimensions: 3072
   PDF Folder: utils/data/creditcardsProducts
   PDF Folder exists: True

‚úÖ Configuration loaded:
   Search Endpoint: https://search-ai-factory-centralus.search.windows.net
   Search Index: banking-assistants-ai-index
   AOAI Endpoint: https://aoai-ai-factory-eus-dev.openai.azure.com/
   Embedding Model: text-embedding-3-large
   Model Dimensions: 3072
   PDF Folder: utils/data/creditcardsProducts
   PDF Folder exists: True
‚úÖ Clients initialized successfully!
‚úÖ Clients initialized successfully!


### Create Index (with card_name field for filtering)

In [6]:
index_schema_file = Path(INDEX_SCHEMA_PATH)
with open(index_schema_file, "r") as f:
    index_schema = json.loads(f.read())
    index_schema["name"] = SEARCH_INDEX
    index_schema['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']['resourceUri'] = AOAI_ENDPOINT
    index_schema['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']['deploymentId'] = EMBED_MODEL
    index_schema['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']['apiKey'] = AOAI_KEY
    index_schema['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']['modelName'] = EMBED_MODEL
    #for vector field, set dimensions
    for field in index_schema['fields']:
        if field.get('name') == 'vector':
            field['dimensions'] = MODEL_DIMENSIONS 

search_headers = {
    "Content-Type": "application/json",
    "api-key": SEARCH_API_KEY
}

# #print index schema for debugging
# print(json.dumps(index_schema, indent=2))

create_index_url = f"{SEARCH_ENDPOINT}/indexes?api-version=2025-09-01"
print(create_index_url)
response = requests.post(create_index_url, headers=search_headers, json=index_schema)
if response.status_code == 201:
    print(f"Index '{SEARCH_INDEX}' created successfully.")
elif response.status_code == 204:
    print(f"Index '{SEARCH_INDEX}' already exists.")
else:
    print(f"Failed to create index '{SEARCH_INDEX}'. Status code: {response.status_code}, Response: {response.text}")


https://search-ai-factory-centralus.search.windows.net/indexes?api-version=2025-09-01

Index 'banking-assistants-ai-index' created successfully.
Index 'banking-assistants-ai-index' created successfully.


### Upload Embedded Documents

In [7]:
# ---------------------------------------------------------
# 4. PDF per-page extraction
# ---------------------------------------------------------
def extract_pdf_pages(pdf_path: Path):
    """Return list of (page_number, text) for each non-empty page."""
    doc = fitz.open(pdf_path)
    pages = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text("text").replace("\x00", "").strip()

        if text:
            pages.append((page_num + 1, text))

    return pages

# ---------------------------------------------------------
# 5. Embed a single chunk using Azure OpenAI
# ---------------------------------------------------------
def generate_embedding(text):
    """Generate embedding using text-embedding-3-large"""
    try:
        response = aoai_client.embeddings.create(
            model=EMBED_MODEL,
            input=text,
            dimensions=MODEL_DIMENSIONS
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"‚ùå Error generating embedding: {e}")
        raise


# ---------------------------------------------------------
# 6. Extract card name from PDF filename
# ---------------------------------------------------------
def extract_card_name(pdf_path: Path) -> str:
    """
    Extract clean card name from PDF filename.
    Examples:
      'Premium Rewards.pdf' -> 'Premium Rewards'
      'BankAmericard¬Æ.pdf' -> 'BankAmericard'
      'Customized Cash Rewards.pdf' -> 'Customized Cash Rewards'
    """
    card_name = pdf_path.stem  # Remove .pdf extension
    # Clean up special characters but keep spaces
    card_name = card_name.replace("¬Æ", "").replace("‚Ñ¢", "").strip()
    return card_name


# ---------------------------------------------------------
# 7. Build a document entry for AI Search
# ---------------------------------------------------------
def build_search_doc(pdf_path: Path, page_num: int, content: str, vector):
    """
    Build document with card_name for filtering.
    """
    card_name = extract_card_name(pdf_path)
    return {
        "id": str(uuid.uuid4()),
        "title": pdf_path.stem,
        "card_name": card_name,  # NEW: Filterable card name field
        "content": content,
        "file_name": f"{pdf_path.name}#page={page_num}",
        "vector": vector,
    }


# Show what card names will be extracted
print("üìã Card names that will be indexed:")
for pdf_file in PDF_FOLDER.glob("*.pdf"):
    print(f"   ‚Ä¢ {extract_card_name(pdf_file)}")

üìã Card names that will be indexed:
   ‚Ä¢ Elite
   ‚Ä¢ Unlimited Cash Rewards
   ‚Ä¢ BankAmericard
   ‚Ä¢ Travel Rewards
   ‚Ä¢ Premium Rewards
   ‚Ä¢ Customized Cash Rewards

   ‚Ä¢ Elite
   ‚Ä¢ Unlimited Cash Rewards
   ‚Ä¢ BankAmericard
   ‚Ä¢ Travel Rewards
   ‚Ä¢ Premium Rewards
   ‚Ä¢ Customized Cash Rewards


In [8]:
# ---------------------------------------------------------
# 7. Main ingestion loop (read PDFs ‚Üí embed ‚Üí upload)
# ---------------------------------------------------------
all_documents = []

for pdf_file in PDF_FOLDER.glob("*.pdf"):
    print(f"üìÑ Processing {pdf_file.name}...")

    pages = extract_pdf_pages(pdf_file)

    for page_num, text in pages:
        # Per-page embedding
        vector = generate_embedding(text)

        # Build search document
        doc = build_search_doc(pdf_file, page_num, text, vector)
        all_documents.append(doc)

# ---------------------------------------------------------
# 8. Upload to Azure AI Search in batches
# ---------------------------------------------------------
print(f"üöÄ Uploading {len(all_documents)} chunks to Azure AI Search...")

result = search_client.upload_documents(all_documents)

print("‚úÖ Upload complete!")

üìÑ Processing Elite.pdf...

üìÑ Processing Unlimited Cash Rewards.pdf...
üìÑ Processing Unlimited Cash Rewards.pdf...
üìÑ Processing BankAmericard¬Æ.pdf...
üìÑ Processing BankAmericard¬Æ.pdf...
üìÑ Processing Travel Rewards.pdf...
üìÑ Processing Travel Rewards.pdf...
üìÑ Processing Premium Rewards.pdf...
üìÑ Processing Premium Rewards.pdf...
üìÑ Processing Customized Cash Rewards.pdf...
üìÑ Processing Customized Cash Rewards.pdf...
üöÄ Uploading 24 chunks to Azure AI Search...üöÄ Uploading 24 chunks to Azure AI Search...

‚úÖ Upload complete!
‚úÖ Upload complete!


### Test Query

In [9]:
# ---------------------------------------------------------
# Query AI Search for relevant documents
# ---------------------------------------------------------
def query_ai_search(query: str, card_name: str = None, top_k: int = 3):
    """
    Hybrid vector + keyword search with optional card_name filter.
    
    Args:
        query: Natural language question
        card_name: Optional filter to search only within a specific card's documents
        top_k: Number of results to return
    """
    embedding = generate_embedding(query)
    
    # Build search parameters
    search_params = {
        "search_text": query,
        "vector_queries": [{
            "kind": "vector",
            "vector": embedding,
            "fields": "vector",
            "k": top_k
        }],
        "select": ["title", "card_name", "content", "file_name"],
        "top": top_k
    }
    
    # Add filter if card_name provided
    if card_name:
        search_params["filter"] = f"card_name eq '{card_name}'"
        print(f"üîç Filtering by card_name: '{card_name}'")
    
    search_results = search_client.search(**search_params)
    return search_results


# ---------------------------------------------------------
# Test 1: General query (no filter) - might mix cards
# ---------------------------------------------------------
print("=" * 60)
print("TEST 1: General query WITHOUT filter")
print("=" * 60)
query = "What are the foreign transaction fees?"
results = query_ai_search(query, top_k=3)

for result in results:
    print(f"üìÑ Card: {result.get('card_name', 'Unknown')}")
    print(f"   Title: {result['title']}")
    print(f"   Content: {result['content'][:150]}...")
    print("-----")


# ---------------------------------------------------------
# Test 2: Filtered query - only Premium Rewards
# ---------------------------------------------------------
print("\n" + "=" * 60)
print("TEST 2: Query WITH filter (Premium Rewards only)")
print("=" * 60)
query = "What are the foreign transaction fees?"
results = query_ai_search(query, card_name="Premium Rewards", top_k=3)

for result in results:
    print(f"üìÑ Card: {result.get('card_name', 'Unknown')}")
    print(f"   Title: {result['title']}")
    print(f"   Content: {result['content'][:150]}...")
    print("-----")


# ---------------------------------------------------------
# Test 3: Filtered query - Travel Rewards
# ---------------------------------------------------------
print("\n" + "=" * 60)
print("TEST 3: Query WITH filter (Travel Rewards only)")
print("=" * 60)
query = "What is the annual fee?"
results = query_ai_search(query, card_name="Travel Rewards", top_k=3)

for result in results:
    print(f"üìÑ Card: {result.get('card_name', 'Unknown')}")
    print(f"   Title: {result['title']}")
    print(f"   Content: {result['content'][:150]}...")
    print("-----")

TEST 1: General query WITHOUT filter

TEST 1: General query WITHOUT filter
üìÑ Card: Unlimited Cash Rewards
   Title: Unlimited Cash Rewards
   Content: Fees
Annual Fee
None
Transaction Fees
 
  ‚Ä¢ Balance Transfer
Introductory Fee of 
 of the amount of each transaction, for transactions made 
3%
within...
-----
üìÑ Card: Travel Rewards
   Title: Travel Rewards
   Content: Fees
Annual Fee
None
Transaction Fees
 
  ‚Ä¢ Balance Transfer
Introductory Fee of 
 of the amount of each transaction, for transactions made 
3%
within...
-----
üìÑ Card: Customized Cash Rewards
   Title: Customized Cash Rewards
   Content: Fees
Annual Fee
None
Transaction Fees
 
  ‚Ä¢ Balance Transfer
Introductory Fee of 
 of the amount of each transaction, for transactions made 
3%
within...
-----

TEST 2: Query WITH filter (Premium Rewards only)
üìÑ Card: Unlimited Cash Rewards
   Title: Unlimited Cash Rewards
   Content: Fees
Annual Fee
None
Transaction Fees
 
  ‚Ä¢ Balance Transfer
Introductory Fee of 
 of

In [11]:
query = "What is the annual fee?"
results = query_ai_search(query, card_name="Travel Rewards", top_k=3)

for result in results:
    print(f"üìÑ Card: {result.get('card_name', 'Unknown')}")
    print(f"   Title: {result['title']}")
    print(f"   Content: {result['content'][:1000]}...")
    print("-----")

üîç Filtering by card_name: 'Travel Rewards'

üìÑ Card: Travel Rewards
   Title: Travel Rewards
   Content: Fees
Annual Fee
None
Transaction Fees
 
  ‚Ä¢ Balance Transfer
Introductory Fee of 
 of the amount of each transaction, for transactions made 
3%
within 60 days of opening your account.
After that, your fee will be:
 of the amount of each transaction.
4%
  ‚Ä¢ Cash Advance
Direct Deposit and Check Cash Advances: 
 of the amount of each transaction.
4%
ATM, Over the Counter, Same-Day Online and Cash Equivalent Cash Advances:
 of the amount of each transaction.
5%
  ‚Ä¢ Foreign Transaction
None
Penalty Fee
 
  ‚Ä¢ Late Payment
Up to 
. 
 
.
$40 See footnote 3 for explanation
 We use a method called ‚Äúaverage daily balance (including new purchases).‚Äù
How We Will Calculate Your Balance:
Payments are allocated to posted balances. We will first allocate the amount of your payment equal to the Total 
Minimum Payment Due to any Custom Pay Plan Payment due, then to the lowest APR bal

In [12]:
query = "What is the annual fee?"
results = query_ai_search(query, top_k=3)

for result in results:
    print(f"üìÑ Card: {result.get('card_name', 'Unknown')}")
    print(f"   Title: {result['title']}")
    print(f"   Content: {result['content'][:1000]}...")
    print("-----")

üìÑ Card: Unlimited Cash Rewards
   Title: Unlimited Cash Rewards
   Content: Fees
Annual Fee
None
Transaction Fees
 
  ‚Ä¢ Balance Transfer
Introductory Fee of 
 of the amount of each transaction, for transactions made 
3%
within 60 days of opening your account.
After that, your fee will be:
 of the amount of each transaction.
4%
  ‚Ä¢ Cash Advance
Direct Deposit and Check Cash Advances: 
 of the amount of each transaction.
4%
ATM, Over the Counter, Same-Day Online and Cash Equivalent Cash Advances:
 of the amount of each transaction.
5%
  ‚Ä¢ Foreign Transaction
 of the U.S. dollar amount of each transaction made in a foreign currency. This 
3%
fee will be in addition to any other applicable fee.
Penalty Fee
 
  ‚Ä¢ Late Payment
Up to 
. 
 
.
$40 See footnote 3 for explanation
 We use a method called ‚Äúaverage daily balance (including new purchases).‚Äù
How We Will Calculate Your Balance:
Payments are allocated to posted balances. We will first allocate the amount of your payment e