In [12]:
import os
import re
import base64
from dotenv import load_dotenv
from azure.storage.blob import BlobSasPermissions, generate_blob_sas, BlobServiceClient, BlobClient
from datetime import datetime, timedelta
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SimpleField, SearchableField, VectorSearch, VectorSearchAlgorithmConfiguration, VectorSearchProfile, HnswParameters
)
from azure.identity import DefaultAzureCredential
from openai import AzureOpenAI


In [13]:
# Load environment variables
load_dotenv()

# Fetch API credentials from .env
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
EMBEDDING_MODEL = "text-embedding-ada-002"

STORAGE_ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
STORAGE_ACCOUNT_KEY = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")
BLOB_CONTAINER_NAME = "pdfcont"
blob_name = "employee_handbook.pdf"

SEARCH_SERVICE_ENDPOINT = os.getenv("SEARCH_SERVICE_ENDPOINT")
SEARCH_ADMIN_KEY = os.getenv("SEARCH_ADMIN_KEY")
SEARCH_INDEX_NAME = os.getenv("SEARCH_INDEX_NAME", "index3")

DOC_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
DOC_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")


In [14]:
# Azure clients for Blob, Form Recognizer, OpenAI, and Search Index
blob_service_client = BlobServiceClient(
    account_url=f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net",
    credential=DefaultAzureCredential()
)

form_recognizer_client = DocumentAnalysisClient(
    endpoint=DOC_INTELLIGENCE_ENDPOINT,
    credential=AzureKeyCredential(DOC_INTELLIGENCE_KEY)
)

openai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VERSION
)

search_client = SearchClient(
    endpoint=SEARCH_SERVICE_ENDPOINT,
    index_name=SEARCH_INDEX_NAME,
    credential=AzureKeyCredential(SEARCH_ADMIN_KEY)
)


In [15]:
def sanitize_and_encode_document_key(doc_key):
    # Remove invalid characters and replace with underscores
    sanitized_key = re.sub(r'[^a-zA-Z0-9_\-=]', '_', doc_key)
    
    # Encode the sanitized key in Base64 (URL-safe)
    base64_key = base64.urlsafe_b64encode(sanitized_key.encode()).decode().rstrip("=")
    
    return base64_key


In [16]:
# Generate SAS Token for Blob Access
def generate_sas_token(account_name, container_name, blob_name, account_key, expiration_minutes=60):
    sas_token = generate_blob_sas(
        account_name=account_name,
        container_name=container_name,
        blob_name=blob_name,
        account_key=account_key,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.utcnow() + timedelta(minutes=expiration_minutes)
    )
    return sas_token

# Construct the Blob URL with SAS Token
def get_blob_url_with_sas(account_name, container_name, blob_name, sas_token):
    return f"https://{account_name}.blob.core.windows.net/{container_name}/{blob_name}?{sas_token}"


In [17]:
def download_blob_as_base64(blob_url):
    blob_client = BlobClient.from_blob_url(blob_url)
    blob_data = blob_client.download_blob().readall()
    
    # Ensure valid Base64 encoding
    base64_encoded_data = base64.b64encode(blob_data).decode()

    return base64_encoded_data


In [18]:
def extract_text_from_pdf(base64_pdf):
    # Convert Base64 string back to bytes
    pdf_bytes = base64.b64decode(base64_pdf)
    
    # Send to Azure Form Recognizer as raw bytes
    poller = form_recognizer_client.begin_analyze_document(
        "prebuilt-document", 
        document=pdf_bytes  # Pass as bytes, without content_type
    )
    
    result = poller.result()
    
    extracted_text = []
    for page in result.pages:
        for line in page.lines:
            extracted_text.append(line.content)
    
    return "\n".join(extracted_text)


In [19]:
# Function to generate embeddings using OpenAI
def generate_embeddings(text):
    response = openai_client.embeddings.create(
        input=text,
        model=EMBEDDING_MODEL
    )
    return response.data[0].embedding


In [20]:
def store_in_search_index(doc_id, page_num, pdf_name, text, embedding):
    # Ensure embedding is a list of floats (array of numbers)
    if isinstance(embedding, list):
        # Convert all elements to string (previously it was converting to float)
        embedding = [str(x) for x in embedding]
    else:
        # If embedding is a single float value, wrap it into a list and convert to string
        embedding = [str(embedding)]
        
    # Check that embedding is now a list of strings
    print(f"Embedding is: {embedding}")

    document = {
        "id": doc_id,
        "page_num": str(page_num),
        "text": text,
        "pdf_name": pdf_name,
        "text_vector": embedding  # Directly store the embedding as an array of strings
    }

    try:
        # Upload the document to Azure Search
        search_client.upload_documents(documents=[document])
        print("Document uploaded successfully.")
    except Exception as e:
        print(f"Error uploading document: {e}")


In [21]:
def process_pdf_and_store_in_search(local_file_path):
    # Extract the blob name from the local file path
    blob_name = os.path.basename(local_file_path)
    
    # Sanitize and encode the blob name in Base64
    encoded_doc_key = sanitize_and_encode_document_key(blob_name)
    
    # Generate SAS token for the blob
    sas_token = generate_sas_token(
        STORAGE_ACCOUNT_NAME,
        BLOB_CONTAINER_NAME,
        blob_name,
        STORAGE_ACCOUNT_KEY
    )
    
    # Construct the blob URL with the SAS token
    blob_url = get_blob_url_with_sas(STORAGE_ACCOUNT_NAME, BLOB_CONTAINER_NAME, blob_name, sas_token)
    
    # Download the Blob and encode as Base64
    base64_pdf = download_blob_as_base64(blob_url)
    
    # Extract text from the Base64 encoded PDF
    extracted_text = extract_text_from_pdf(base64_pdf)
    
    # Generate embeddings for the extracted text
    embedding = generate_embeddings(extracted_text)
    
    # Store the data in the Azure Search Index
    store_in_search_index(
        doc_id=encoded_doc_key,
        page_num=1,  # You may want to change this for handling multi-page PDFs
        text=extracted_text,
        pdf_name=blob_name,
        embedding=embedding
    )
    
    return {"message": "PDF processed and stored successfully!"}


In [22]:

# Test the process
local_file_path = "/Users/aryan_zingade/Desktop/employee_handbook.pdf"  # Update with your local file path
result = process_pdf_and_store_in_search(local_file_path)
print(result)

Embedding is: ['-0.019856768', '0.0010852821', '0.008038876', '-0.027766967', '-0.013118198', '0.02654793', '-0.0019877085', '0.00045798565', '-0.023148168', '-0.0324264', '0.011018745', '0.0017862286', '-0.036733665', '0.012549315', '-0.0051775235', '0.010585309', '0.039848983', '-0.019382697', '-0.015698494', '-0.0114318635', '-0.014723265', '0.03226386', '-0.030205041', '0.030069593', '-0.01473681', '-0.015603681', '0.03250767', '-0.02549143', '0.018393923', '-0.009887749', '0.031017734', '-0.0071110525', '0.009271458', '0.008485856', '-0.008587442', '-0.014222105', '0.01668727', '-0.0025582858', '0.01955878', '-0.003159339', '0.040309507', '0.03640859', '-0.009142782', '0.004666205', '-0.014845168', '0.034999922', '-0.030177953', '-0.02058819', '0.020236023', '0.03253476', '0.017757313', '0.034918655', '-0.020804908', '0.007293908', '0.014506547', '-0.010090922', '0.005455193', '0.0292569', '-0.004134569', '-0.015373418', '0.028146222', '0.00024190276', '-0.028227491', '0.03128863'