In [1]:
import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# with unstructured

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
import os
import time

from utils.markdown_utils import unstructured_elements_to_markdown


In [2]:
# Patch asyncio so async clients can run inside the notebook's already-running
# event loop (NOTE(review): presumably needed by LlamaParse — confirm).
nest_asyncio.apply()
# Load settings from a .env file into the environment; later cells read the
# Unstructured API credentials from os.environ. This call is kept last, so its
# return value is what the cell displays.
load_dotenv()


True

In [9]:
# Directory layout: source PDFs are read from origin_path and every generated
# artefact is written somewhere below out_path.
out_path = "../data/out/"
origin_path = "../data/pdf/"

# Collect the PDF file names found in origin_path, one {"path": name} record per
# file. The list is sorted because later cells number their output files by
# position in this list. Anything that is not a PDF (hidden files,
# sub-directories, ...) is skipped: the following cells open every entry as a
# PDF and derive output names by replacing the ".pdf" suffix.
file_names = [
    {"path": name}
    for name in sorted(os.listdir(origin_path))
    if name.lower().endswith(".pdf")
]

print("len:", len(file_names))


len: 329


## Converting PDF data to TXT


In [10]:
# Credentials for the Unstructured API come from the environment; a missing
# variable raises KeyError here rather than failing later on the first request.
unstructured_api_key = os.environ["UNSTRUCTURED_API_KEY"]
unstructured_server_url = os.environ["UNSTRUCTURED_SERVER_URL"]

# Client shared by the partitioning loop below.
s = UnstructuredClient(
    api_key_auth=unstructured_api_key,
    server_url=unstructured_server_url,
)


In [11]:
# Make sure the TXT output directory exists before spending API calls; without
# it the first write below fails after the document was already partitioned.
os.makedirs(out_path + "txt", exist_ok=True)

# Send every PDF to the Unstructured API, flatten the returned elements into
# simple markdown text and store it as "<position>-<name>.txt" under out_path/txt.
for idx, file in enumerate(file_names):
    file_path = origin_path + file["path"]
    with open(file_path, "rb") as f:
        # Note that this currently only supports a single file
        files = shared.Files(
            content=f.read(),
            file_name=file_path,
        )

    req = shared.PartitionParameters(
        files=files,
        # Other partition params
        strategy="fast",
        languages=["eng"],
        # Optional parameters that are intentionally left disabled:
        # split_pdf_allow_failed=True
        # pdf_infer_table_structure=True,
        # skip_infer_table_types=[],
    )

    try:
        resp = s.general.partition(req)
    except SDKError as e:
        print(e)
        # Keep the message as text so the record stays JSON-serialisable.
        # Note that a failed file gets no "size" key and no TXT output.
        file["unstructured_error"] = str(e)
        continue

    simple_md = unstructured_elements_to_markdown(resp.elements)

    # Record the length of the extracted text; later cells use it to spot
    # near-empty extractions and to validate the LlamaParse result.
    file_names[idx] = {"path": file["path"], "size": len(simple_md)}

    file_out = out_path + f"txt/{str(idx+1)}-" + file["path"].replace(".pdf", ".txt")

    # Explicit encoding: the extracted text may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to be able to store them.
    with open(file_out, "w", encoding="utf-8") as f:
        f.write(simple_md)

    # Pause between requests (NOTE(review): presumably to stay under the API
    # rate limit — confirm the required delay).
    time.sleep(2)


INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 1
INFO: Concurrency level set to 5
INFO: Splitting pages 1 to 9 (9 total)
INFO: Determined optimal split size of 2 pages.
INFO: Partitioning 4 files with 2 page(s) each.
INFO: Partitioning 1 file with 1 page(s).
INFO: Partitioning set #1 (pages 1-2).
INFO: Partitioning set #2 (pages 3-4).
INFO: Partitioning set #3 (pages 5-6).
INFO: Partitioning set #4 (pages 7-8).
INFO: Partitioning set #5 (pages 9-9).
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: Successfully partitioned set #1, elements added to the final result.
INFO: Successfully partitioned set 

In [17]:
import json

# List the files whose extracted text is shorter than 400 characters so they can
# be inspected by hand. Records without a "size" key (their partitioning failed,
# so no text was extracted) are skipped instead of raising KeyError.
smallest = [
    file for file in file_names if "size" in file and file["size"] < 400
]

print(json.dumps(smallest, indent=4))


[
    {
        "path": "Email-Data-Privacypdf.pdf",
        "size": 334
    },
    {
        "path": "Finding-Your-Location-in-PATH-ECpdf.pdf",
        "size": 140
    },
    {
        "path": "Finding-Your-Location-in-PATHpdf.pdf",
        "size": 119
    },
    {
        "path": "Finding-the-Correct-BYU-Pathway-Applicationpdf.pdf",
        "size": 0
    },
    {
        "path": "First-Contact-Messages-for-Learnerspdf.pdf",
        "size": 0
    },
    {
        "path": "How-Students-Choose-Their-Certificatepdf.pdf",
        "size": 0
    },
    {
        "path": "How-a-Student-Changes-Their-Church-Account-Namepdf.pdf",
        "size": 323
    },
    {
        "path": "How-to-Log-into-PATHpdf.pdf",
        "size": 356
    },
    {
        "path": "Impersonate-a-Learner-to-Accesspdf.pdf",
        "size": 0
    },
    {
        "path": "Log-in-to-the-Zoom-Websitepdf.pdf",
        "size": 377
    },
    {
        "path": "Merge-Your-Zoom-Accountpdf.pdf",
        "size": 330
    },
    {

# Convert the TXT info to Markdown with Llama-Parse


In [None]:
# LlamaParse client used to turn the extracted TXT files into Markdown.
# The instruction is assembled from adjacent string literals, which Python joins
# without any separator — every fragment must therefore end with ". " so the
# sentences are not fused together in the prompt that is sent to the service.
parser = LlamaParse(
    result_type="markdown",  # "markdown" and "text" are available
    parsing_instruction=(
        "Convert the provided text into accurate and well-structured Markdown format, closely resembling the original PDF structure. "
        "Use headers from H1 to H3, with H1 for main titles, H2 for sections, and H3 for subsections. "
        "Detect any bold, large, or all-uppercase text as headers. "
        "Preserve bullet points and numbered lists with proper indentation to reflect nested lists. "
        "if it is not a header, ensure that bold and italic text is properly formatted using double **asterisks** for bold and single *asterisks* for italic. "
        "Detect and correctly format blockquotes using the '>' symbol for any quoted text. "
        "When processing text, pay attention to line breaks that may incorrectly join or split words. "
        "Automatically correct common errors, such as wrongly concatenated words or broken lines, to ensure the text reads naturally. "
        "If code snippets or technical commands are found, enclose them in triple backticks ``` for proper formatting. "
        "If any tables are detected, parse them as a title (bold header) followed by list items. "
        "If you see the same header multiple times, merge them into one. "
        "If images contain important text, transcribe only the highlighted or boxed text and ignore general background text. "
        "The final output should be a clean, concise Markdown document closely reflecting the original PDF's intent and structure without adding any extra text."
    ),
)


In [None]:
# Route every ".txt" file through LlamaParse when SimpleDirectoryReader loads it.
file_extractor = {".txt": parser}

# Maximum number of parse attempts per file before it is flagged for manual review.
MAX_PARSE_TRIES = 3
# A parse is accepted unless it is more than this many characters shorter than
# the text that was extracted from the PDF.
SIZE_TOLERANCE = 300

# Make sure the Markdown output directory exists before parsing anything.
os.makedirs(out_path + "md", exist_ok=True)

for idx, file in enumerate(file_names):
    # Must mirror the name used when the TXT file was written by the
    # partitioning cell: "<position>-<name with .pdf replaced by .txt>".
    load_file = f"{out_path}txt/{str(idx+1)}-{file['path'].replace('.pdf', '.txt')}"

    # Drop a flag left over from a previous run of this cell so the error
    # report below only reflects the current run.
    file.pop("error", None)

    # Files whose partitioning failed have no TXT output; flag and skip them
    # instead of letting the reader raise and abort the whole loop.
    if not os.path.exists(load_file):
        file["error"] = "TXT file not found."
        print(f"File {file} has no TXT output and was skipped")
        continue

    # Records without a "size" are treated as size 0, i.e. any result is accepted.
    expected_size = file.get("size", 0)
    documents = []
    limit_try = MAX_PARSE_TRIES

    # Re-parse until the result is roughly as long as the source text, giving up
    # after MAX_PARSE_TRIES attempts (NOTE(review): presumably guards against
    # truncated LlamaParse responses — confirm).
    while limit_try > 0:
        documents = SimpleDirectoryReader(
            input_files=[load_file], file_extractor=file_extractor
        ).load_data()

        size = sum(len(doc.text) for doc in documents)
        if size >= expected_size - SIZE_TOLERANCE:
            break
        limit_try -= 1

    if limit_try == 0:
        file["error"] = "Not loaded."
        print(f"File {file} could not be loaded")

    # The last result is written even when every attempt came back short, so a
    # flagged file still has a best-effort Markdown version to review.
    out_name = f"{out_path}md/{str(idx+1)}-{file['path'].replace('.pdf', '.md')}"
    with open(out_name, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(doc.text)
            f.write("\n\n")
    print(out_name, "saved.")


In [None]:
# Show, one per line, every record that carries an "error" key
# (i.e. the files that could not be loaded).
flagged_entries = (entry for entry in file_names if "error" in entry)
for entry in flagged_entries:
    print(entry)


Check whether any files were flagged with an error so that they can be reviewed manually.


In [None]:
# Gather the records that carry an "error" key into a single list and print it.
files_with_error = list(filter(lambda entry: "error" in entry, file_names))

print(files_with_error)


### Pros

- Cheapest, because it makes fewer requests to the API
- Most accurate because it uses the text directly
- Well structured result

### Cons

- Images are ignored completely; only the text is used. This could be a problem if an image contains important information
- Sometimes the order of the text is not the same as in the PDF because of the document's structure (e.g. 18)

### Warnings

- In some PDFs there is text inside the images. This could be a problem when that text carries important information, for example a specific price
