In [1]:
import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# with unstructured

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
import os
import time

from utils.markdown_utils import unstructured_elements_to_markdown

import logging

# Set the logging level to WARNING or higher to suppress INFO messages
logging.basicConfig(level=logging.WARNING)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/isaiaszc/pathway/pathway-
[nltk_data]     indexer/.venv/lib/python3.12/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Patch asyncio so that an event loop can be re-entered from inside the
# notebook (presumably required by the async Llama-Parse client used below).
nest_asyncio.apply()
# Load variables from a local .env file into the process environment; the
# Unstructured client below reads its credentials from os.environ.
load_dotenv()


True

In [3]:
# Source PDFs are read from `origin_path`; converted files go to `out_path`.
origin_path = "../data/data_16_09_24/crawl/pdf/"
out_path = "../data/data_16_09_24/out/from_pdf/"

# One {"path": <file name>} record per directory entry, in sorted name order.
file_names = [{"path": entry} for entry in sorted(os.listdir(origin_path))]

print("len:", len(file_names))


len: 294


## Converting PDF data to TXT


In [4]:
# Client used by parse_pdf_to_txt for the partition requests. Both the API key
# and the server URL come from environment variables; a missing variable
# raises KeyError here rather than failing later on the first request.
s = UnstructuredClient(
    api_key_auth=os.environ["UNSTRUCTURED_API_KEY"],
    server_url=os.environ["UNSTRUCTURED_SERVER_URL"],
)


In [5]:
def parse_pdf_to_txt(file):
    """Partition one PDF with the Unstructured API and save the result as .txt.

    Args:
        file: dict with a "path" key holding the PDF file name relative to
            ``origin_path``. On success a "size" key (number of characters of
            the generated text) is added to the dict in place.

    Returns:
        bool: ``True`` if the partition request failed (nothing is written and
        "size" is not set), ``False`` if the text file was written.
    """
    file_path = origin_path + file["path"]
    print("Processing file:", file_path)
    with open(file_path, "rb") as f:
        # Note that this currently only supports a single file
        files = shared.Files(
            content=f.read(),
            file_name=file_path,
        )

    req = shared.PartitionParameters(
        files=files,
        # Other partition params
        strategy="fast",
        languages=["eng"],
        encoding="utf-8",
        # split_pdf_allow_failed=True
        # pdf_infer_table_structure=True,
        # skip_infer_table_types=[],
    )

    try:
        resp = s.general.partition(req)
    except SDKError as e:
        print(e)
        return True
    except Exception as e:  # anything the SDK did not wrap in SDKError
        print("Another exception", e)
        return True
    simple_md = unstructured_elements_to_markdown(resp.elements)

    # Recorded so the Markdown conversion step can detect truncated results.
    file["size"] = len(simple_md)

    file_out = out_path + file["path"].replace(".pdf", ".txt")

    # Make sure the destination exists so a fresh checkout does not fail here.
    out_dir = os.path.dirname(file_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8: the platform default encoding may not be able to encode
    # every character extracted from the PDF.
    with open(file_out, "w", encoding="utf-8") as f:
        f.write(simple_md)

    return False


In [6]:
# Convert every PDF, retrying failed requests up to 3 more times.
for file in file_names:
    with_error = parse_pdf_to_txt(file)
    try_limit = 3
    while with_error and try_limit > 0:
        # Back off *before* retrying; sleeping after a successful retry only
        # wasted time.
        time.sleep(4)
        try_limit -= 1
        with_error = parse_pdf_to_txt(file)

    if with_error:
        # Flag on the actual outcome: checking `try_limit == 0` also flagged
        # files whose last retry had succeeded.
        print("Error processing file:", file["path"])
        file["unstructured_error"] = True
    else:
        # Drop a stale flag left by a previous run of this cell.
        file.pop("unstructured_error", None)

    # Small pause between files to stay gentle on the API.
    time.sleep(2)


Processing file: ../data/data_16_09_24/crawl/pdf/110f0051.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/12180468.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/123bdf7c.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/12a1fef2.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/130b6f2b.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/13c81944.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/143661b8.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/144be1f1.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/1784dc2f.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/196f7c22.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/1975d8f7.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/19db036c.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/1bdba2b7.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/1d506ff.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/1e4bc734.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/1e85ce7b.pdf
Processin

ERROR:unstructured-client:Failed to partition the document.
  self.coroutines_to_execute[operation_id] = []


Processing file: ../data/data_16_09_24/crawl/pdf/bfe1c205.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c145f4ad.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c35c4d18.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c3bc8397.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c3bdfc5e.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c3e9d44b.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c5838cfb.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c6672c90.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c66a1208.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c79965c3.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c7d23ae3.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c9481527.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/c9b5896.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/cb9db7cd.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/cc28eacc.pdf
Processing file: ../data/data_16_09_24/crawl/pdf/cce5165c.pdf
Processin

In [7]:
# Records that still carry the "unstructured_error" flag after all retries.
file_names_with_errors = list(
    filter(lambda entry: "unstructured_error" in entry, file_names)
)
print(file_names_with_errors)


[]


Manual verification of the files with the smallest size


IMPORTANT: The next piece of code is normally unnecessary. It was written only because the original `file_names` variable was empty after running the previous function (probably due to a bug in the function or to running out of RAM).


In [8]:
# get the files names from the out path
# file_names_out = os.listdir(out_path)
# file_names_out.sort()

# file_names_out = [{"path": name} for name in file_names_out]

# get the size of each file
# for idx, file in enumerate(file_names):
#     file_path = out_path + file["path"].replace(".pdf", ".txt")
#     with open(file_path, "r") as f:
#         file["size"] = len(f.read())


# Convert the TXT info to Markdown with Llama-Parse


In [9]:
# Llama-Parse client used to turn the extracted text into Markdown.
# NOTE: the instruction is built from adjacent string literals, so every
# fragment must end with its own sentence separator (". ") or consecutive
# sentences get glued together (e.g. "italicDetect ...").
parser = LlamaParse(
    result_type="markdown",  # "markdown" and "text" are available
    parsing_instruction=(
        "Convert the provided text into accurate and well-structured Markdown format, closely resembling the original PDF structure. "
        "Use headers from H1 to H3, with H1 for main titles, H2 for sections, and H3 for subsections. "
        "Detect any bold, large, or all-uppercase text as headers. "
        "Preserve bullet points and numbered lists with proper indentation to reflect nested lists. "
        "if it is not a header, ensure that bold and italic text is properly formatted using double **asterisks** for bold and single *asterisks* for italic. "
        "Detect and correctly format blockquotes using the '>' symbol for any quoted text. "
        "When processing text, pay attention to line breaks that may incorrectly join or split words. "
        "Automatically correct common errors, such as wrongly concatenated words or broken lines, to ensure the text reads naturally. "
        "If code snippets or technical commands are found, enclose them in triple backticks ``` for proper formatting. "
        "If any tables are detected, parse them as a title (bold header) followed by list items. "
        "If you see the same header multiple times, merge them into one. "
        "If images contain important text, transcribe only the highlighted or boxed text and ignore general background text. "
        "Do not enclose fragments of code/Markdown or any other content in triple backticks unless they are explicitly formatted as code blocks in the original text. "
        "The final output should be a clean, concise Markdown document closely reflecting the original PDF's intent and structure without adding any extra text."
    ),
)


In [10]:
# Route .txt files through the Llama-Parse parser when loading them.
file_extractor = {".txt": parser}


def parse_txt_to_md(file):
    """Convert one extracted .txt file to Markdown with Llama-Parse.

    Args:
        file: PDF file name (e.g. "abc.pdf"). The matching "abc.txt" is read
            from ``out_path`` and "abc.md" is written next to it.

    Returns:
        int: number of characters of parsed text written to the .md file, or
        0 when nothing was written -- either because the .md file already
        exists (skipped) or because the parser returned no text.
    """
    load_file = out_path + file.replace(".pdf", ".txt")
    # Single source of truth for the output path: the existence check and the
    # write below must refer to the same file.
    out_name = out_path + file.replace(".pdf", ".md")

    if os.path.exists(out_name):
        # No nested same-type quotes inside the f-string, so this also parses
        # on Python versions older than 3.12.
        print(f"Markdown file {out_name} already exists. Skipping.")
        return 0

    documents = SimpleDirectoryReader(
        input_files=[load_file], file_extractor=file_extractor
    ).load_data()

    size = sum(len(doc.text) for doc in documents)

    if size == 0:
        print(f"Error parsing {load_file}. Review the limit credits.")
        return 0

    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(out_name, "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(doc.text)
            f.write("\n\n")
    print(out_name, "saved.")

    return size


In [11]:
# A Markdown result more than this many characters shorter than the extracted
# text is treated as incomplete and parsed again.
size_tolerance = 300

for file in file_names:
    expected_size = file.get("size")
    if expected_size is None:
        # No size was recorded for this file (e.g. the PDF extraction step
        # failed), so there is nothing to compare against.
        print(f"Skipping {file['path']}: no extracted text size available.")
        continue

    md_path = out_path + file["path"].replace(".pdf", ".md")
    txt_path = out_path + file["path"].replace(".pdf", ".txt")
    backup_path = md_path + ".bak"

    limit_try = 2
    size = parse_txt_to_md(file["path"])

    print(
        f"size parsed: {size}, file size: {expected_size}, diff: {expected_size - size}"
    )

    # size == 0 means "skipped or nothing parsed": not retried, as before.
    while 0 < size < expected_size - size_tolerance and limit_try > 0:
        limit_try -= 1
        time.sleep(2)

        # parse_txt_to_md skips files whose .md already exists, so the previous
        # attempt has to be moved aside or the retry is a no-op returning 0.
        if os.path.exists(md_path):
            os.replace(md_path, backup_path)

        try:
            retry_size = parse_txt_to_md(file["path"])
        except Exception as e:
            print(
                f"Error processing file {file['path']} - {limit_try} tries left: {e}"
            )
            retry_size = 0

        if retry_size > size:
            # The retry produced more text: keep it and drop the old attempt.
            size = retry_size
            if os.path.exists(backup_path):
                os.remove(backup_path)
        elif os.path.exists(backup_path):
            # The retry was not better (or failed): restore the old attempt.
            os.replace(backup_path, md_path)

    # Flag on the actual outcome, not on the retry counter, so a file whose
    # last retry succeeded is not reported as an error.
    if 0 < size < expected_size - size_tolerance:
        print(f"Error processing file {file['path']} - {limit_try} tries left.")
        file["llama_error"] = True

    if size != 0:
        # delete the txt file
        if os.path.exists(txt_path):
            os.remove(txt_path)


Started parsing the file under job_id b9efddef-dc42-4add-b215-524c35b40930
../data/data_16_09_24/out/from_pdf/110f0051.md saved.
size parsed: 3332, file size: 3295, diff: -37
Started parsing the file under job_id c189dafc-afde-47f0-8309-9589298e60bd
../data/data_16_09_24/out/from_pdf/12180468.md saved.
size parsed: 2643, file size: 2731, diff: 88
Started parsing the file under job_id 13f44422-ff67-4df8-8cd0-dfc37e6a7641
../data/data_16_09_24/out/from_pdf/123bdf7c.md saved.
size parsed: 1442, file size: 1405, diff: -37
Started parsing the file under job_id 44e28948-cf1d-4986-af63-bfbfee667aba
../data/data_16_09_24/out/from_pdf/12a1fef2.md saved.
size parsed: 654, file size: 643, diff: -11
Started parsing the file under job_id b5490da4-b37c-464c-b647-2a936b07221a
../data/data_16_09_24/out/from_pdf/130b6f2b.md saved.
size parsed: 1864, file size: 1847, diff: -17
Started parsing the file under job_id f56e5a5a-3377-46d5-bb53-5a59b20c1573
../data/data_16_09_24/out/from_pdf/13c81944.md saved.

In [12]:
# Show every file record that carries the "llama_error" flag.
for flagged in (entry for entry in file_names if "llama_error" in entry):
    print(flagged)


Check whether any files were flagged with an error so they can be reviewed manually.


In [13]:
# Collect every record that carries an error flag. The flags are stored under
# the keys "unstructured_error" and "llama_error"; no record ever has a bare
# "error" key, so testing for that key always produced an empty list.
error_flags = ("unstructured_error", "llama_error")
files_with_error = [
    file for file in file_names if any(flag in file for flag in error_flags)
]

print(files_with_error)


[]
