First, install the dependencies.

In [9]:
%pip install azure-ai-documentintelligence==1.0.0b1 azure-search-documents==11.4.0 unidecode==1.3.8 nltk==3.8.1

Note: you may need to restart the kernel to use updated packages.


Plus, install NLTK data.

In [11]:
from nltk import download

download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/clemlesne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Then, initialize the clients to Document Intelligence and AI Search.

In [15]:
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.search.documents import SearchClient
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from textwrap import dedent
from typing import Dict, Tuple
from unidecode import unidecode
import asyncio
import glob
import re

source = "car-peugeot-206.pdf"

doc_endpoint = "https://claim-ai.cognitiveservices.azure.com"
doc_credential = AzureKeyCredential("xxx")
doc_client = DocumentIntelligenceClient(
  endpoint=doc_endpoint,
  credential=doc_credential,
)

search_endpoint = "https://claim-ai.search.windows.net"
search_credential = AzureKeyCredential("xxx")
search_client = SearchClient(
  endpoint=search_endpoint,
  index_name="trainings",
  credential=search_credential,
)

Next, transform PDF in Markdown text. We are using Document Intelligence for that.

Be warned that this step can take a few minutes, depending on the size of the PDF. From a minute for a few pages to 20 minutes for a 1000 pages PDF.

In [17]:
async def pdf_to_markdown(source: str) -> Tuple[str, AnalyzeResult]:
    with open(source, "rb") as file:
        doc_poller = await doc_client.begin_analyze_document(
            analyze_request=file,
            content_type="application/octet-stream",
            locale="fr-FR",  # We only have French documents in this dataset
            model_id="prebuilt-layout",
            output_content_format="markdown",
        )
        doc_result = await doc_poller.result()
        return source, doc_result


doc_results: Dict[str, AnalyzeResult] = {}
doc_tasks = []

for source in glob.glob("dataset/*.pdf"):
    print(f"Starting {source}")
    doc_tasks.append(asyncio.create_task(pdf_to_markdown(source)))

print("Waiting for results...")
for doc_task in asyncio.as_completed(doc_tasks):
    try:
        source, doc_result = await doc_task
        print(f"Ended {source}")
        doc_results[source] = doc_result
    except HttpResponseError as e:
        print(f"Failed {source}: {e}")

# doc_results

Starting dataset/20210300_CG_GH_3350-56221.pdf
Starting dataset/CYBER_SECURITE_3350-93501-032023-Web.pdf
Starting dataset/DISPOSITIONS GENERALES GROUPAMA COHESION.pdf
Starting dataset/CG-groupama-habitation 56221-062018.pdf
Starting dataset/conditions_generales_multirisque_climatique_2022.pdf
Starting dataset/Code des assurances 2024-1.pdf
Starting dataset/conditions_generales_groupama.pdf
Starting dataset/Conditions Générales Chasse-01.pdf
Starting dataset/Groupama-URD2021-FR-ecobook.pdf
Starting dataset/Code des assurances 2024-2.pdf
Starting dataset/Code des assurances 2024-3.pdf
Starting dataset/CG MRH GROUPAMA.pdf
Starting dataset/Groupama-Offre.pdf
Starting dataset/conditions-generales-assurance-auto-groupama.pdf
Starting dataset/GROUPAMA_Document-dEnregistrement-Universel_2019.pdf
Starting dataset/Groupama_Sante_Prevoyance_CG-2.pdf
Waiting for results...
Ended dataset/Groupama-Offre.pdf
Ended dataset/CYBER_SECURITE_3350-93501-032023-Web.pdf
Ended dataset/Conditions Générales Cha

Split the Markdown text into smaller blocks, we'll call chuncks. Each block is divided, at a minimum, by level 4 headers. Each block content is minified with a stemmer.

In [19]:
lang = "french"
stemmer = SnowballStemmer(lang)


def compress(text: str) -> str:
    tokenized = word_tokenize(text, language=lang)
    tokens = [stemmer.stem(token) for token in tokenized]
    prompt = " ".join(tokens)
    return prompt


def data(content: str, source_uri: str, title: str, iterator: int) -> dict[str, str]:
    return {
        "content": content,
        "id": f"{'_'.join(re.sub('[^a-z0-9]', ' ', unidecode(source_uri).lower()).split())}-{iterator}",  # Use deterministic ID to avoid duplicates after a new run
        "source_uri": unidecode(source_uri).lower(),
        "title": ' '.join(re.sub('[^a-z0-9]', ' ', unidecode(title).lower()).split()),  # Remove all special characters
    }


documents = []
iterator = 0

for source, doc_result in doc_results.items():
  for h1_section in (doc_result.content or "").split("\n#"):
    h1_title = h1_section.split("\n")[0].strip()
    for h2_section in h1_section.split("\n##"):
      h2_title = h2_section.split("\n")[0].strip()
      for h3_section in h2_section.split("\n###"):
        h3_title = h3_section.split("\n")[0].strip()
        for h4_section in h3_section.split("\n###"):
          h4_title = h4_section.split("\n")[0].strip()
          h4_content = compress(" ".join(h4_section.split("\n")[1:]))
          content = dedent(f"""
            # {h1_title}
            ## {h2_title}
            ### {h3_title}
            #### {h4_title}
            {h4_content}
          """)
          documents.append(data(content, source, h4_title, iterator))
          iterator += 1

        else: # No H4
          h3_content = compress(" ".join(h3_section.split("\n")[1:]))
          content = dedent(f"""
            # {h1_title}
            ## {h2_title}
            ### {h3_title}
            {h3_content}
          """)
          documents.append(data(content, source, h3_title, iterator))
          iterator += 1

      else:  # No H3
        h2_content = compress(" ".join(h2_section.split("\n")[1:]))
        content = dedent(f"""
          # {h1_title}
          ## {h2_title}
          {h2_content}
        """)
        documents.append(data(content, source, h2_title, iterator))
        iterator += 1

    else:  # No H2
      h1_content = compress(" ".join(h1_section.split("\n")[1:]))
      content = dedent(f"""
        # {h1_title}
        {h1_content}
      """)
      documents.append(data(content, source, h1_title, iterator))
      iterator += 1

print(f"Created {len(documents)} chunks")
# documents

Created 8768 chunks


Finally, upload the chuncks to AI Search.

In [20]:
print(f"Uploading {len(documents)} documents to Azure Search")
search_client.merge_or_upload_documents(documents)
print(f"There are {search_client.get_document_count()} documents in the index")

Uploading 8768 documents to Azure Search
There are 13628 documents in the index


Congratulations! 😎