In [12]:
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import json

In [3]:
# Read PDF file and return full text
def read_pdf(filepath):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        filepath: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenation of each page's extracted text, with a newline
        appended after every page. Pages for which ``extract_text()``
        returns a falsy value (``None`` or an empty string) contribute
        nothing. An empty string is returned when no page yields text.
    """
    page_texts = []
    with pdfplumber.open(filepath) as document:
        for extracted in (page.extract_text() for page in document.pages):
            # Skip pages without extractable text so no blank lines are added.
            if not extracted:
                continue
            page_texts.append(extracted + "\n")
    return "".join(page_texts)

In [8]:
# Chunk text with RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=600, chunk_overlap=100):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded unchanged as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded unchanged as the splitter's ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text(text)`` returns.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [13]:
# Save chunks as JSON to chunks/ directory
def save_chunks_to_json(chunks, filename, output_dir="chunks"):
    """Write ``chunks`` to ``output_dir/filename`` as a JSON array.

    Each chunk is stored as an object ``{"id": <position>, "text": <chunk>}``
    where ``id`` is the chunk's zero-based position in ``chunks``. The output
    directory is created if it does not exist, the file is written as UTF-8
    with non-ASCII characters kept verbatim, and a one-line summary is
    printed afterwards.

    Args:
        chunks: Iterable of chunk values to store.
        filename: Name of the JSON file inside ``output_dir``.
        output_dir: Directory to write into (created when missing).

    Returns:
        None.
    """
    output_path = os.path.join(output_dir, filename)
    os.makedirs(output_dir, exist_ok=True)

    # One record per chunk; the id mirrors the chunk's position.
    records = []
    for position, chunk in enumerate(chunks):
        records.append({"id": position, "text": chunk})

    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    record_count = len(records)
    print(f"Saved {record_count} chunks to {output_path}")

In [None]:
# Process every PDF located directly inside the relative data/ directory:
# read its text, split it into chunks, and save the chunks as
# chunks/<pdf name without extension>.json.
data_dir = "data"
# os.listdir returns entries in arbitrary order; sort so that runs are
# reproducible and the log output is stable.
for file in sorted(os.listdir(data_dir)):
    pdf_path = os.path.join(data_dir, file)
    # Match the extension case-insensitively (so ".PDF" is not silently
    # skipped) and ignore directories that merely have a PDF-like name.
    if not (file.lower().endswith(".pdf") and os.path.isfile(pdf_path)):
        continue

    print(f"Processing: {pdf_path}")
    text = read_pdf(pdf_path)
    chunks = chunk_text(text, chunk_size=600, chunk_overlap=100)

    # Output file shares the PDF's base name, with a .json extension.
    base_filename = os.path.splitext(file)[0] + ".json"
    save_chunks_to_json(chunks, base_filename)

📚 Processing: data/catan_knights_3to4p.pdf
✅ Saved 140 chunks to chunks/catan_knights_3to4p.json
📚 Processing: data/catan_barbarians_3to4p.pdf
✅ Saved 167 chunks to chunks/catan_barbarians_3to4p.json
📚 Processing: data/catan_seafarers_3to4p.pdf
✅ Saved 126 chunks to chunks/catan_seafarers_3to4p.json
📚 Processing: data/catan_knights_5to6p.pdf
✅ Saved 12 chunks to chunks/catan_knights_5to6p.json
📚 Processing: data/catan_barbarians_5to6p.pdf
✅ Saved 30 chunks to chunks/catan_barbarians_5to6p.json
📚 Processing: data/catan_pirates_3to4p.pdf
✅ Saved 112 chunks to chunks/catan_pirates_3to4p.json
📚 Processing: data/catan_pirates_5to4p.pdf
✅ Saved 23 chunks to chunks/catan_pirates_5to4p.json
📚 Processing: data/catan_base_5to6p.pdf
✅ Saved 22 chunks to chunks/catan_base_5to6p.json
📚 Processing: data/catan_base_3to4p.pdf
✅ Saved 111 chunks to chunks/catan_base_3to4p.json
📚 Processing: data/catan_seafarers_5to6p.pdf
✅ Saved 41 chunks to chunks/catan_seafarers_5to6p.json
