In [1]:
#---------------------------
#        cell 1
#----------------------------

import os
from pathlib import Path
from PyPDF2 import PdfReader
import json

# Folder paths (relative to the directory the notebook kernel runs in)
data_raw_path = Path("../data_raw")      # Where your PDFs are
json_output_path = Path("../data_json")  # Where JSON chunks will be saved

# parents=True also creates any missing intermediate folders, and
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
json_output_path.mkdir(parents=True, exist_ok=True)

print("Setup done. Ready to select a PDF.")


Setup done. Ready to select a PDF.


In [2]:
#---------------------------
#        cell 2
#----------------------------
# List available PDFs.
# Sorted so the menu numbering is stable from run to run; glob() itself
# makes no promise about the order in which it yields paths.
pdf_files = sorted(data_raw_path.glob("*.pdf"))
if not pdf_files:
    # Fail early with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f"No PDF files found in {data_raw_path.resolve()}")

print(f"Found {len(pdf_files)} PDFs:")
for i, pdf in enumerate(pdf_files, start=1):
    print(f"{i}. {pdf.name}")

# Choose PDF interactively (the menu is 1-based, the list index is 0-based)
raw_choice = input("Enter the number of the PDF to process: ")
try:
    pdf_index = int(raw_choice) - 1
except ValueError:
    raise ValueError(
        f"Expected a number between 1 and {len(pdf_files)}, got {raw_choice!r}"
    ) from None

# Explicit range check: without it, entering 0 would become index -1 and
# silently select the last PDF in the list.
if not 0 <= pdf_index < len(pdf_files):
    raise ValueError(
        f"Choice {raw_choice.strip()} is out of range; "
        f"enter a number between 1 and {len(pdf_files)}"
    )

selected_pdf = pdf_files[pdf_index]
print(f"Selected PDF: {selected_pdf.name}")


Found 1 PDFs:
1. Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org).pdf
Selected PDF: Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org).pdf


In [3]:
#---------------------------
#        cell 3
#----------------------------

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import json

# Chunking parameters, kept in one place so they are easy to tune.
# Sizes are measured by the splitter's length function (characters by default).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Load PDF
loader = PyPDFLoader(str(selected_pdf))
documents = loader.load()

# Combine all pages into a single string
text = "\n\n".join(doc.page_content for doc in documents)

# Stop here if nothing could be extracted, rather than silently writing an
# empty JSON file that later steps would have to cope with.
if not text.strip():
    raise ValueError(
        f"No extractable text found in {selected_pdf.name}; "
        "the PDF may consist of scanned images and need OCR first."
    )

# Paragraph-aware splitting: separators are tried in order, so the text is
# split on blank lines first, then single newlines, then spaces, and finally
# between characters when no other separator is available.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_text(text)

# Save to JSON: one {"id", "text"} record per chunk, ids starting at 1
out_file = json_output_path / f"{selected_pdf.stem}_chunks.json"
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(
        [{"id": i, "text": c} for i, c in enumerate(chunks, start=1)],
        f,
        ensure_ascii=False,  # keep non-ASCII characters readable in the file
        indent=2,
    )

print(f"PDF split into {len(chunks)} paragraph-aware chunks and saved to {out_file}")


PDF split into 3315 paragraph-aware chunks and saved to ..\data_json\Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org)_chunks.json


In [6]:
#---------------------------
#        cell 4     
#----------------------------


# Make sure the destination folder is present (no-op when it already exists)
json_output_path.mkdir(exist_ok=True, parents=True)

# Every chunk of this PDF goes into one file named after the PDF's stem
out_file = json_output_path / f"{selected_pdf.stem}_chunks.json"

# Build one record per chunk, numbered from 1. "topic" and "subtopic" are
# left empty here as placeholders for later LLM processing.
json_data = [
    {"id": position, "source": selected_pdf.name, "text": chunk_text, "topic": "", "subtopic": ""}
    for position, chunk_text in enumerate(chunks, start=1)
]

# Write the full record list as pretty-printed UTF-8 JSON
with out_file.open("w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2, ensure_ascii=False)

print(f"All {len(chunks)} chunks saved in a single JSON: {out_file}")



All 3315 chunks saved in a single JSON: ..\data_json\Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org)_chunks.json
