In [1]:
# =========================
# 01_pdf_extraction.ipynb
# FinWise: PDF Ingestion, Cleaning, and Chunking
# =========================

# --------------------------------------
# Step 0: Setup
# --------------------------------------
import sys
from pathlib import Path
import json
from collections import defaultdict

# If your notebook is in notebooks/, this adds the project root to sys.path
# (the parent of the current working directory) so that `src` can be imported.
project_root = Path().resolve().parent  # adjust if needed
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Project-local imports: these resolve only because the project root was
# appended to sys.path earlier in this cell.
# NOTE(review): `json` is already imported at the top of this cell, so the
# import below is redundant (harmless, but can be dropped).
import json
from src.data.pdf_loader import ingest_all_sources, save_docs_to_json
from src.data.cleaners import clean_docs,split_all_cleaned_jsons
from src.data.chunkers import chunk_docs_streaming

print("Setup complete. src/ modules are now importable.")


Setup complete. src/ modules are now importable.


## Step 1: Load PDFs from all sources

We load PDFs from both Varsity and SEBI sources, adding rich metadata per page:

- source filename
- page number
- category
- total pages
- load date
- source path


In [None]:
# Ingest every PDF page found under ../data/raw and report how many were loaded.
print("\nLoading PDFs from Varsity and SEBI sources...")
docs = ingest_all_sources(base_raw_path="../data/raw")
print(f"Total pages loaded: {len(docs)}")

# Tally pages by the 'category' value stored in each page's metadata
pages_per_source = defaultdict(int)
for category in (page.metadata['category'] for page in docs):
    pages_per_source[category] += 1

print("\nPages loaded per source:")
for source_label, page_total in pages_per_source.items():
    print(f"- {source_label}: {page_total} pages")


Loading PDFs from Varsity and SEBI sources...


In [None]:
# Peek at the first loaded page: its metadata first, then the start of its text
print("Example page metadata:")
first_page = docs[0]
print(first_page.metadata)

print("\nExample page content (first 500 chars):")
content_preview = first_page.page_content[:500]
print(content_preview)


## Step 2: Save raw pages to JSON

We save all pages with metadata to `data/interim/raw/`.
This allows us to restart cleaning or chunking without re-reading PDFs.


In [None]:
# Write the loaded pages to disk so later stages can start from the JSON files
raw_json_dir = "../data/interim/raw/"
print("\nSaving raw PDF pages to JSON files (one per source)...")
save_docs_to_json(docs, output_folder=raw_json_dir)
print("Raw PDF pages saved to data/interim/raw/")

## Step 3: Clean the PDF pages

Cleaning steps:

- Remove extra whitespace
- Remove repeated headers/footers
- Normalize quotes and hyphens
- Keep metadata intact


In [None]:
print("\nCleaning all PDF JSON files in data/interim/...")
clean_docs(input_folder="../data/interim/raw", output_folder="../data/interim/cleaned")
print("Cleaning complete. Each input JSON now has a *_cleaned.json counterpart.")

# Inspect a sample cleaned page.
# The matches are sorted so the same file is sampled on every run (glob order
# is not guaranteed), and an empty result raises a descriptive error instead of
# the bare StopIteration that next() would give on an exhausted generator.
cleaned_dir = Path("../data/interim/cleaned")
sample_file = next(iter(sorted(cleaned_dir.glob("*_cleaned.json"))), None)
if sample_file is None:
    raise FileNotFoundError(f"No *_cleaned.json files found in {cleaned_dir}")

with open(sample_file, "r", encoding="utf-8") as f:
    cleaned_pages = json.load(f)

print(f"Total cleaned pages in sample file: {len(cleaned_pages)}")
if cleaned_pages:
    # Each entry is read as a dict with "metadata" and "page_content" keys.
    print("\nExample cleaned page metadata:")
    print(cleaned_pages[0]["metadata"])
    print("\nExample cleaned page content (first 500 chars):")
    print(cleaned_pages[0]["page_content"][:500])
else:
    # Avoid an IndexError on an empty file; there is nothing to preview.
    print(f"\n{sample_file.name} contains no pages to preview.")

## Step 4: Split cleaned JSONs into smaller files

Splitting steps:

- Automatically pick all \*\_cleaned.json files in data/interim/cleaned/
- Split each file into chunks of n pages (configurable)
- Save split files with \_partX_cleaned.json suffix for easier downstream processing
- Keep metadata intact


In [None]:
# Split settings gathered in one place, then passed on as keyword arguments
split_options = {
    "input_folder": "../data/interim/cleaned/",
    "output_folder": "../data/interim/split/",
    "pages_per_file": 200,
}

print("\nSplitting all cleaned JSON files in data/interim/cleaned/..")
split_all_cleaned_jsons(**split_options)
print("Splitting complete.")

## Step 5: Chunk cleaned pages

Chunking parameters:

- chunk_size = 500 characters
- overlap = 100 characters

Chunks have a unique `chunk_id` and keep all page metadata.


In [None]:
print("\nChunking all cleaned JSON files in data/interim/...")

# Memory-efficient chunking of the split files into data/processed/.
# jsonl_output=True is requested here and the summary cell looks for
# *_chunks.jsonl, so the completion message names that extension.
# NOTE(review): the exact output file name is decided inside
# chunk_docs_streaming — confirm against src/data/chunkers.py.
chunk_docs_streaming(
    input_folder="../data/interim/split/",
    output_folder="../data/processed/",
    chunk_size=500,
    overlap=100,
    jsonl_output=True,
)

print("Chunking complete. Each cleaned JSON now has a corresponding *_chunks.jsonl file in data/processed/")

# Display number of chunks per source.
# The chunking step is called with jsonl_output=True, so look for
# *_chunks.jsonl first (the pattern the summary cell also uses) and only fall
# back to *_chunks.json when no JSONL files exist. Sorting keeps the report
# order stable between runs.
processed_dir = Path("../data/processed")
chunk_files = sorted(processed_dir.glob("*_chunks.jsonl")) or sorted(processed_dir.glob("*_chunks.json"))
chunks_per_source = defaultdict(int)

for file in chunk_files:
    with open(file, "r", encoding="utf-8") as f:
        # One chunk per line that starts with '{'; lines are streamed, so the
        # file is never loaded into memory as a whole.
        count = sum(1 for line in f if line.lstrip().startswith('{'))
    source_name = file.stem.replace("_chunks", "")
    chunks_per_source[source_name] = count

print("\nChunks per source:")
if not chunk_files:
    # Make an empty report explicit instead of printing only the header.
    print(f"- no *_chunks.jsonl or *_chunks.json files found in {processed_dir}")
for src, count in chunks_per_source.items():
    print(f"- {src}: {count} chunks")


## Step 6: Summary

At this point:

- Raw extracted pages: `data/interim/raw/`
- Cleaned pages: `data/interim/cleaned/`
- Split cleaned pages: `data/interim/split/`
- Chunked pages ready for embedding: `data/processed/`


In [None]:
# Stage directories, one per pipeline step (relative to the working directory)
raw_folder, cleaned_folder, split_folder, processed_folder = (
    Path(stage_dir)
    for stage_dir in (
        "../data/interim/raw/",
        "../data/interim/cleaned/",
        "../data/interim/split/",
        "../data/processed/",
    )
)

def count_json_lines(file_path):
    """Count the records stored in a ``.jsonl`` or ``.json`` file.

    Args:
        file_path: ``str`` or ``pathlib.Path`` of the file to inspect. The
            suffix selects the strategy: ``.jsonl`` files count one record per
            non-blank line; ``.json`` files are parsed and ``len()`` of the
            top-level value is returned.

    Returns:
        int: Number of records. ``0`` for any other suffix, or when the file
        cannot be read or parsed (a warning is emitted in that case so the
        failure is not silent).
    """
    import warnings  # local import: keeps this helper self-contained in its cell

    try:
        # Accept plain strings as well as Path objects.
        file_path = Path(file_path)
        if file_path.suffix == ".jsonl":
            with open(file_path, "r", encoding="utf-8") as f:
                # Blank lines are not records and must not inflate the total.
                return sum(1 for line in f if line.strip())
        if file_path.suffix == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                return len(json.load(f))
    except Exception as exc:
        # Best effort: the caller sums these counts, so report the problem and
        # let the file contribute 0 rather than aborting the whole summary.
        warnings.warn(f"Could not count records in {file_path}: {exc}")
    return 0

# Gather the files each stage left behind
raw_files = [*raw_folder.glob("*.json")]
cleaned_files = [*cleaned_folder.glob("*_cleaned.json")]
split_files = [*split_folder.glob("*.json")]
chunked_files = [*processed_folder.glob("*_chunks.jsonl")]

# Record totals per stage, summed over that stage's files
total_raw_pages = sum(map(count_json_lines, raw_files))
total_cleaned_pages = sum(map(count_json_lines, cleaned_files))
total_split_pages = sum(map(count_json_lines, split_files))
total_chunks = sum(map(count_json_lines, chunked_files))

# Report: for every stage, the number of files and the records they hold
print("\n✅ PDF extraction, cleaning, and chunking complete!\n")
print(f"📂 Raw JSON files ({raw_folder}): {len(raw_files)}")
print(f"    Total raw pages: {total_raw_pages}")
print(f"📂 Cleaned JSON files ({cleaned_folder}): {len(cleaned_files)}")
print(f"    Total cleaned pages: {total_cleaned_pages}")
print(f"📂 Split JSON files ({split_folder}): {len(split_files)}")
print(f"    Total split pages: {total_split_pages}")
print(f"📂 Chunked JSONL files ({processed_folder}): {len(chunked_files)}")
print(f"    Total chunks: {total_chunks}")
