In [1]:
import os
import glob
import pymupdf4llm  # pip install pymupdf4llm


def extract_markdown_from_pdfs(directory):
    """Convert every ``*.pdf`` file directly inside ``directory`` to per-page Markdown.

    Each PDF is handed to ``pymupdf4llm.to_markdown(..., page_chunks=True)``.
    Every page chunk it returns is tagged with the PDF's base filename under
    ``metadata['source']`` so the page can be cited later.

    Args:
        directory: Folder to scan (non-recursively) for ``*.pdf`` files.

    Returns:
        A flat list of page-chunk dicts from all PDFs, ordered by file path
        and then in the order ``to_markdown`` produced them. The list is
        empty when no PDF is found.

    Note:
        Conversion is best-effort: a PDF that raises is reported on stdout
        and skipped, so one bad file does not abort the whole batch. Pages
        appended before the failure are kept.
    """
    # glob returns matches in arbitrary, OS-dependent order; sort so the
    # resulting page order is reproducible across machines and runs.
    pdf_files = sorted(glob.glob(os.path.join(directory, "*.pdf")))
    processed_docs = []

    print(f"Found {len(pdf_files)} PDF files in {directory}")

    for pdf_file in pdf_files:
        source_name = os.path.basename(pdf_file)
        print(f"Processing: {source_name}")
        try:
            # With page_chunks=True the result is a list with one dict per
            # page: [{'text': '...', 'metadata': {...}}, ...]
            doc_data = pymupdf4llm.to_markdown(pdf_file, page_chunks=True)

            for page in doc_data:
                # Add source filename to metadata for citation
                page['metadata']['source'] = source_name
                processed_docs.append(page)

        except Exception as e:
            # Deliberately broad: log and continue with the next PDF.
            print(f"Error processing {pdf_file}: {e}")

    return processed_docs


Consider using the pymupdf_layout package for a greatly improved page layout analysis.


In [3]:
# The PDF folder is given relative to the kernel's working directory.
# A script-relative variant built from `__file__` was tried and disabled,
# presumably because `__file__` is not available in a notebook kernel.
rag_data_dir = "./rag_data"
# NOTE: despite its name, `extracted_text` holds the list of per-page dicts
# returned by extract_markdown_from_pdfs, not a single string.
extracted_text = extract_markdown_from_pdfs(rag_data_dir)

Found 2 PDF files in ./rag_data
Processing: STCW_guide_english.pdf
Processing: training-manual-vol-i-pre-sea.pdf


In [12]:
# Each entry of `extracted_text` is a per-page dict, so the Markdown itself
# lives under its 'text' key. Collect just those strings, in page order.
only_texts = [page["text"] for page in extracted_text]

INTERNATIONAL TRANSPORT WORKERS’ FEDERATION


Ratings forming part of a navigational watch





C/R certificate required. D/P Documentary proof. T/O Training onboard. E/R Endorsement required.


General requirements to obtain a certificate of competency as a rating

forming part of a navigational watch


Applies only for service on ships of 500 gross tonnage or more.


The rating must:


a) **Age** : be not less than 16 years of age.


b) **Seagoing service** : have completed approved seagoing service, including not less

than six months' training and experience, or


c) **Education & training** : have received special training, either pre-sea or onboard ship,

including an approved period of seagoing service which shall not be less than two

months; **and**


d) **Competence** : meet the standard of competence specified in section A-II/4.


STCW: A GUIDE FOR SEAFARERS **33**





In [None]:
# Persist the extracted Markdown next to the source PDFs so it can be
# inspected in an editor instead of flooding the notebook output.
output_file = os.path.join(rag_data_dir, "extracted_content.txt")
with open(output_file, "w", encoding="utf-8") as f:
    for page in extracted_text:
        # Each item is a page-chunk dict, and file.write() only accepts str,
        # so write the page's Markdown rather than the dict itself.
        page_text = page["text"]
        f.write(page_text)
        # Keep consecutive pages from running into each other on one line.
        if not page_text.endswith("\n"):
            f.write("\n")

print(f"Extracted dictionary length: {len(extracted_text)}")
print(f"Saved extracted text to: {output_file}")