# **Split PDF into 50-page PDFs**

In [3]:
# Use this to split the PDF into smaller chunks
# Splits the PDF into 50 page chunks for use with LandingAI

import PyPDF2
import os

# Base name of the source PDF; also used for the output folder and chunk file names.
old_name = 'Pathfinder Core Rulebook'

# Number of pages per output chunk (50-page chunks for use with LandingAI).
pages_per_chunk = 50

save_dir = f'splitPDF/{old_name}'
os.makedirs(save_dir, exist_ok=True)

# Keep the source PDF open for the whole loop: the pages are added to each
# writer and written out while the reader's underlying file is still open.
# The context manager guarantees the handle is closed even if a write fails.
with open(f'pdf/{old_name}.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)

    for start in range(0, total_pages, pages_per_chunk):
        # `end` is the inclusive, zero-based index of the last page in this chunk.
        end = min(start + pages_per_chunk - 1, total_pages - 1)
        pdf_writer = PyPDF2.PdfWriter()

        for page_num in range(start, end + 1):
            pdf_writer.add_page(pdf_reader.pages[page_num])

        # File names use one-based, inclusive page numbers, e.g. "<name>_1_50.pdf".
        output_filename = f"{old_name}_{start + 1}_{end + 1}.pdf"
        output_path = os.path.join(save_dir, output_filename)

        # Open the destination inside save_dir and write to that handle, so no
        # empty stray file is created in the current working directory.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

## **Install required library for LandingAI**

In [None]:
!pip install agentic-doc

## **Directory of split PDFs to parse with LandingAI**

In [4]:
# Directory whose files are listed and passed to the LandingAI parser in the next cell.
path_to_split_pdf_dir = 'splitPDF/Legends'

In [56]:
from agentic_doc.parse import parse
import os
import json

# Name of the sub-folder under data/ that receives the parsed output.
fn = 'Legends'

output_dir = os.path.join('data', fn)
os.makedirs(output_dir, exist_ok=True)

# Sort the listing so chunks are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(path_to_split_pdf_dir)):
    # Skip anything that is not a PDF (e.g. hidden files such as .DS_Store),
    # so non-PDF entries are never sent to the parser.
    if not file.lower().endswith('.pdf'):
        continue

    # splitext strips only the final extension, so names that contain
    # additional dots are not truncated.
    old_name = os.path.splitext(file)[0]

    result = parse(os.path.join(path_to_split_pdf_dir, file))
    # Only the first element of the returned sequence is used, since a single
    # path is passed in.
    parsed_doc = result[0]

    # Convert each chunk to a plain dictionary using model_dump() so it can
    # be serialized as JSON.
    all_chunks = [chunk.model_dump() for chunk in parsed_doc.chunks]

    # Write all chunks of this PDF to a single JSON file.
    with open(os.path.join(output_dir, f'{old_name}.json'), 'w', encoding='utf-8') as f:
        json.dump(all_chunks, f, indent=2)

    # Write the markdown rendering next to the JSON; explicit UTF-8 keeps
    # non-ASCII text from failing on platforms with a different default encoding.
    with open(os.path.join(output_dir, f'{old_name}.md'), 'w', encoding='utf-8') as f:
        f.write(parsed_doc.markdown)




[2m2025-06-10 18:09:52[0m [info   [0m] [1mAPI key is valid.             [0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:44)
[2m2025-06-10 18:09:52[0m [info   [0m] [1mParsing 1 documents           [0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:233)


Parsing documents:   0%|          | 0/1 [00:00<?, ?it/s]

[2m2025-06-10 18:09:52[0m [info   [0m] [1mSplitting PDF: 'splitPDF/Legends/Pathfinder Legends_101_130.pdf' into 3 parts under '/var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppyy9w804'[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:216)
[2m2025-06-10 18:09:52[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppyy9w804/Pathfinder Legends_101_130_1.pdf[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:232)
[2m2025-06-10 18:09:52[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppyy9w804/Pathfinder Legends_101_130_2.pdf[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:232)
[2m2025-06-10 18:09:52[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppyy9w804/Pathfinder Legends_101_130_3.pdf[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:232)
[2m2025-06-10 18:09:52[0m [info   [0m] [1mStart parsing document part: 'File name: Pathfinder Legends_101_130_1.pdf	Pag




[2m2025-06-10 18:09:52[0m [info   [0m] [1mStart parsing document part: 'File name: Pathfinder Legends_101_130_3.pdf	Page: [20:29]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:468)


Parsing document parts from 'Pathfinder Legends_101_130.pdf':   0%|          | 0/3 [00:00<?, ?it/s][A

HTTP Request: POST https://api.va.landing.ai/v1/tools/agentic-document-analysis "HTTP/1.1 200 OK" (_client.py:1025)
[2m2025-06-10 18:10:12[0m [info   [0m] [1mTime taken to successfully parse a document chunk: 19.34 seconds[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:552)
[2m2025-06-10 18:10:12[0m [info   [0m] [1mSuccessfully parsed document part: 'File name: Pathfinder Legends_101_130_1.pdf	Page: [0:9]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:474)




HTTP Request: POST https://api.va.landing.ai/v1/tools/agentic-document-analysis "HTTP/1.1 200 OK" (_client.py:1025)
[2m2025-06-10 18:10:15[0m [info   [0m] [1mTime taken to successfully parse a document chunk: 22.79 seconds[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:552)
[2m2025-06-10 18:10:15[0m [info   [0m] [1mSuccessfully parsed document part: 'File name: Pathfinder Legends_101_130_2.pdf	Page: [10:19]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:474)




HTTP Request: POST https://api.va.landing.ai/v1/tools/agentic-document-analysis "HTTP/1.1 200 OK" (_client.py:1025)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mTime taken to successfully parse a document chunk: 48.37 seconds[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:552)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mSuccessfully parsed document part: 'File name: Pathfinder Legends_101_130_3.pdf	Page: [20:29]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:474)


Parsing document parts from 'Pathfinder Legends_101_130.pdf': 100%|██████████| 3/3 [00:48<00:00, 16.13s/it]
Parsing documents: 100%|██████████| 1/1 [00:48<00:00, 48.65s/it]


[2m2025-06-10 18:10:41[0m [info   [0m] [1mAPI key is valid.             [0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:44)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mParsing 1 documents           [0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:233)


Parsing documents:   0%|          | 0/1 [00:00<?, ?it/s]

[2m2025-06-10 18:10:41[0m [info   [0m] [1mSplitting PDF: 'splitPDF/Legends/Pathfinder Legends_0_49.pdf' into 5 parts under '/var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppqwln6yu'[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:216)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppqwln6yu/Pathfinder Legends_0_49_1.pdf[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:232)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppqwln6yu/Pathfinder Legends_0_49_2.pdf[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:232)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppqwln6yu/Pathfinder Legends_0_49_3.pdf[0m [[0m[1m[34magentic_doc.utils[0m][0m (utils.py:232)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mCreated /var/folders/kd/8jptvn1s1c7dqlrnrwvj24100000gn/T/tmppqwln6yu/Pathfinder Legends_0




[2m2025-06-10 18:10:41[0m [info   [0m] [1mStart parsing document part: 'File name: Pathfinder Legends_0_49_3.pdf	Page: [20:29]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:468)


Parsing document parts from 'Pathfinder Legends_0_49.pdf':   0%|          | 0/5 [00:00<?, ?it/s][A

[2m2025-06-10 18:10:41[0m [info   [0m] [1mStart parsing document part: 'File name: Pathfinder Legends_0_49_4.pdf	Page: [30:39]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:468)
[2m2025-06-10 18:10:41[0m [info   [0m] [1mStart parsing document part: 'File name: Pathfinder Legends_0_49_5.pdf	Page: [40:49]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:468)
HTTP Request: POST https://api.va.landing.ai/v1/tools/agentic-document-analysis "HTTP/1.1 200 OK" (_client.py:1025)
HTTP Request: POST https://api.va.landing.ai/v1/tools/agentic-document-analysis "HTTP/1.1 200 OK" (_client.py:1025)
[2m2025-06-10 18:11:02[0m [info   [0m] [1mTime taken to successfully parse a document chunk: 20.59 seconds[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:552)
[2m2025-06-10 18:11:02[0m [info   [0m] [1mSuccessfully parsed document part: 'File name: Pathfinder Legends_0_49_5.pdf	Page: [40:49]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:474)
[2m2025-06-10 



HTTP Request: POST https://api.va.landing.ai/v1/tools/agentic-document-analysis "HTTP/1.1 200 OK" (_client.py:1025)
[2m2025-06-10 18:11:30[0m [info   [0m] [1mTime taken to successfully parse a document chunk: 48.75 seconds[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:552)
[2m2025-06-10 18:11:30[0m [info   [0m] [1mSuccessfully parsed document part: 'File name: Pathfinder Legends_0_49_2.pdf	Page: [10:19]'[0m [[0m[1m[34magentic_doc.parse[0m][0m (parse.py:474)


Parsing document parts from 'Pathfinder Legends_0_49.pdf': 100%|██████████| 5/5 [00:48<00:00,  9.75s/it]
Parsing documents: 100%|██████████| 1/1 [00:48<00:00, 48.94s/it]


### Testing

In [54]:
# all_chunks = []


#     # Parse through the results and save as json
# for chunk in result[0].chunks:
#     # Convert each chunk to a dictionary using model_dump()
#     chunk_dict = chunk.model_dump()
#     all_chunks.append(chunk_dict)

# # Write all chunks to a single JSON file
# with open(f'data/{fn}/Pathfinder Legends_51_100.json', 'w') as f:
#     json.dump(all_chunks, f, indent=2)

In [55]:
# with open(f'data/{fn}/Pathfinder Legends_51_100.md', 'w') as f:
#     f.write(result[0].markdown)