In [1]:
# =========================
# 01_pdf_extraction.ipynb
# FinWise: PDF Ingestion, Cleaning, and Chunking
# =========================

# --------------------------------------
# Step 0: Setup
# --------------------------------------
import sys
from pathlib import Path
import json
from collections import defaultdict

# If your notebook is in notebooks/, this adds the project root to sys.path
project_root = Path().resolve().parent  # adjust if needed
sys.path.append(str(project_root))

# Now imports from src/ will work
import json
from src.data.pdf_loader import ingest_all_sources, save_docs_to_json
from src.data.cleaners import clean_docs,split_all_cleaned_jsons
from src.data.chunkers import chunk_docs_streaming

print("Setup complete. src/ modules are now importable.")


Setup complete. src/ modules are now importable.


## Step 1: Load PDFs from all sources

We load PDFs from both Varsity and SEBI sources, adding rich metadata per page:

- source filename
- page number
- category
- total pages
- load date
- source path


In [2]:
print("\nLoading PDFs from Varsity and SEBI sources...")
docs = ingest_all_sources(base_raw_path="../data/raw")
print(f"Total pages loaded: {len(docs)}")

# Compute pages per source
pages_per_source = defaultdict(int)
for doc in docs:
    pages_per_source[doc.metadata['category']] += 1

print("\nPages loaded per source:")
for src, count in pages_per_source.items():
    print(f"- {src}: {count} pages")


Loading PDFs from Varsity and SEBI sources...
Total pages loaded: 2929

Pages loaded per source:
- varsity: 2186 pages
- sebi_education: 743 pages


In [3]:
# Inspect a sample page
print("Example page metadata:")
print(docs[0].metadata)

print("\nExample page content (first 500 chars):")
print(docs[0].page_content[:500])


Example page metadata:
{'source': 'varsity_module01_introduction_to_stock_markets.pdf', 'source_path': '..\\data\\raw\\varsity\\varsity_module01_introduction_to_stock_markets.pdf', 'category': 'varsity', 'page_number': 1, 'page_label': '1', 'total_pages': 111, 'text_length': 58, 'load_date': '2025-08-14T16:32:15.043596'}

Example page content (first 500 chars):
Introduction to 
Stock Markets
ZERODHA.COM/VARSITY
ZERODHA


## Step 2: Save raw pages to JSON

We save all pages with metadata to `data/interim/`.
This allows us to restart cleaning or chunking without re-reading PDFs.


In [4]:
print("\nSaving raw PDF pages to JSON files (one per source)...")
save_docs_to_json(docs, output_folder="../data/interim/raw/")
print("Raw PDF pages saved to data/interim/raw/")


Saving raw PDF pages to JSON files (one per source)...
Saved 2186 pages to ..\data\interim\raw\all_pdfs_varsity.json
Saved 743 pages to ..\data\interim\raw\all_pdfs_sebi_education.json
Raw PDF pages saved to data/interim/raw/


## Step 3: Clean the PDF pages

Cleaning steps:

- Remove extra whitespace
- Remove repeated headers/footers
- Normalize quotes and hyphens
- Keep metadata intact


In [5]:
print("\nCleaning all PDF JSON files in data/interim/...")
clean_docs(input_folder="../data/interim/raw", output_folder="../data/interim/cleaned")
print("Cleaning complete. Each input JSON now has a *_cleaned.json counterpart.")

# Inspect a sample cleaned page
sample_cleaned_file = Path("../data/interim/cleaned").glob("*_cleaned.json")
sample_file = next(sample_cleaned_file)
with open(sample_file, "r", encoding="utf-8") as f:
    cleaned_pages = json.load(f)

print(f"Total cleaned pages in sample file: {len(cleaned_pages)}")
print("\nExample cleaned page metadata:")
print(cleaned_pages[0]["metadata"])
print("\nExample cleaned page content (first 500 chars):")
print(cleaned_pages[0]["page_content"][:500])


Cleaning all PDF JSON files in data/interim/...
Saved 743 cleaned pages to ..\data\interim\cleaned\all_pdfs_sebi_education_cleaned.json
Saved 2186 cleaned pages to ..\data\interim\cleaned\all_pdfs_varsity_cleaned.json
Cleaning complete. Each input JSON now has a *_cleaned.json counterpart.
Total cleaned pages in sample file: 743

Example cleaned page metadata:
{'source': 'sebi_buyback_open_offer.pdf', 'source_path': '..\\data\\raw\\sebi_education\\sebi_buyback_open_offer.pdf', 'category': 'sebi_education', 'page_number': 1, 'page_label': '1', 'total_pages': 32, 'text_length': 35, 'load_date': '2025-08-14T16:34:03.027404'}

Example cleaned page content (first 500 chars):
Buyback of and Open Offer of Shares


## Step 4: Split cleaned JSONs into smaller files

Splitting steps:

- Automatically pick all \*\_cleaned.json files in data/interim/
- Split each file into chunks of n pages (configurable)
- Save split files with \_partX_cleaned.json suffix for easier downstream processing
- Keep metadata intact


In [6]:
print("\nSplitting all cleaned JSON files in data/interim/cleaned/..")
split_all_cleaned_jsons(
    input_folder="../data/interim/cleaned/",
    output_folder="../data/interim/split/",
    pages_per_file=200
)
print("Splitting complete.")


Splitting all cleaned JSON files in data/interim/cleaned/..
Saved 200 pages to ..\data\interim\split\all_pdfs_sebi_education_cleaned_part1.json
Saved 200 pages to ..\data\interim\split\all_pdfs_sebi_education_cleaned_part2.json
Saved 200 pages to ..\data\interim\split\all_pdfs_sebi_education_cleaned_part3.json
Saved 143 pages to ..\data\interim\split\all_pdfs_sebi_education_cleaned_part4.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part1.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part2.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part3.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part4.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part5.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part6.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleaned_part7.json
Saved 200 pages to ..\data\interim\split\all_pdfs_varsity_cleane

## Step 5: Chunk cleaned pages

Chunking parameters:

- chunk_size = 800 characters
- overlap = 100 characters

Chunks have a unique `chunk_id` and keep all page metadata.


In [7]:
print("\nChunking all cleaned JSON files in data/interim/...")

# Memory-efficient chunking
chunk_docs_streaming(
    input_folder="../data/interim/split/",
    output_folder="../data/processed/",
    chunk_size=800,
    overlap=100,
    jsonl_output=True,
)

print("Chunking complete. Each cleaned JSON now has a corresponding *_chunks.json file in data/processed/")

# Display number of chunks per source
chunk_files = list(Path("../data/processed").glob("*_chunks.json"))
chunks_per_source = defaultdict(int)

for file in chunk_files:
    count = 0
    with open(file, "r", encoding="utf-8") as f:
        # count number of items in JSON array without loading all into memory
        for line in f:
            if line.strip().startswith('{'):
                count += 1
    source_name = file.stem.replace("_chunks", "")
    chunks_per_source[source_name] = count

print("\nChunks per source:")
for src, count in chunks_per_source.items():
    print(f"- {src}: {count} chunks")



Chunking all cleaned JSON files in data/interim/...

Processing all_pdfs_sebi_education_cleaned_part1.json ...


Chunking all_pdfs_sebi_education_cleaned_part1.json: 100%|██████████| 200/200 [00:00<00:00, 86037.01page/s]


Saved chunks to ..\data\processed\all_pdfs_sebi_education_cleaned_part1_chunks.jsonl

Processing all_pdfs_sebi_education_cleaned_part2.json ...


Chunking all_pdfs_sebi_education_cleaned_part2.json: 100%|██████████| 200/200 [00:00<00:00, 73103.34page/s]


Saved chunks to ..\data\processed\all_pdfs_sebi_education_cleaned_part2_chunks.jsonl

Processing all_pdfs_sebi_education_cleaned_part3.json ...


Chunking all_pdfs_sebi_education_cleaned_part3.json: 100%|██████████| 200/200 [00:00<00:00, 53305.00page/s]


Saved chunks to ..\data\processed\all_pdfs_sebi_education_cleaned_part3_chunks.jsonl

Processing all_pdfs_sebi_education_cleaned_part4.json ...


Chunking all_pdfs_sebi_education_cleaned_part4.json: 100%|██████████| 143/143 [00:00<00:00, 28291.77page/s]

Saved chunks to ..\data\processed\all_pdfs_sebi_education_cleaned_part4_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part1.json ...



Chunking all_pdfs_varsity_cleaned_part1.json: 100%|██████████| 200/200 [00:00<00:00, 16142.49page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part1_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part10.json ...


Chunking all_pdfs_varsity_cleaned_part10.json: 100%|██████████| 200/200 [00:00<00:00, 16257.31page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part10_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part11.json ...


Chunking all_pdfs_varsity_cleaned_part11.json: 100%|██████████| 186/186 [00:00<00:00, 16584.62page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part11_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part2.json ...


Chunking all_pdfs_varsity_cleaned_part2.json: 100%|██████████| 200/200 [00:00<00:00, 19397.87page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part2_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part3.json ...


Chunking all_pdfs_varsity_cleaned_part3.json: 100%|██████████| 200/200 [00:00<00:00, 15938.53page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part3_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part4.json ...


Chunking all_pdfs_varsity_cleaned_part4.json: 100%|██████████| 200/200 [00:00<00:00, 43484.57page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part4_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part5.json ...


Chunking all_pdfs_varsity_cleaned_part5.json: 100%|██████████| 200/200 [00:00<00:00, 25341.70page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part5_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part6.json ...


Chunking all_pdfs_varsity_cleaned_part6.json: 100%|██████████| 200/200 [00:00<00:00, 19374.12page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part6_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part7.json ...


Chunking all_pdfs_varsity_cleaned_part7.json: 100%|██████████| 200/200 [00:00<00:00, 16686.11page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part7_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part8.json ...


Chunking all_pdfs_varsity_cleaned_part8.json: 100%|██████████| 200/200 [00:00<00:00, 43986.20page/s]


Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part8_chunks.jsonl

Processing all_pdfs_varsity_cleaned_part9.json ...


Chunking all_pdfs_varsity_cleaned_part9.json: 100%|██████████| 200/200 [00:00<00:00, 23607.94page/s]

Saved chunks to ..\data\processed\all_pdfs_varsity_cleaned_part9_chunks.jsonl
Chunking complete. Each cleaned JSON now has a corresponding *_chunks.json file in data/processed/

Chunks per source:





## Step 6: Summary

At this point:

- Raw extracted pages: `data/interim/`
- Cleaned pages: `data/interim/`
- Chunked pages ready for embedding: `data/processed/`


In [12]:
# Paths
raw_folder = Path("../data/interim/raw/")
cleaned_folder = Path("../data/interim/cleaned/")
split_folder = Path("../data/interim/split/")
processed_folder = Path("../data/processed/")

def count_json_lines(file_path):
    """Count number of JSON objects in a file (works for JSONL or JSON array)."""
    count = 0
    try:
        if file_path.suffix == ".jsonl":
            with open(file_path, "r", encoding="utf-8") as f:
                for _ in f:
                    count += 1
        elif file_path.suffix == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                count = len(data)
    except Exception:
        count = 0
    return count

# Count files
raw_files = list(raw_folder.glob("*.json"))
cleaned_files = list(cleaned_folder.glob("*_cleaned.json"))
split_files = list(split_folder.glob("*.json"))
chunked_files = list(processed_folder.glob("*_chunks.jsonl"))

# Totals
total_raw_pages = sum(count_json_lines(f) for f in raw_files)
total_cleaned_pages = sum(count_json_lines(f) for f in cleaned_files)
total_split_pages = sum(count_json_lines(f) for f in split_files)
total_chunks = sum(count_json_lines(f) for f in chunked_files)

# Summary
print("\n✅ PDF extraction, cleaning, and chunking complete!\n")
print(f"📂 Raw JSON files ({raw_folder}): {len(raw_files)}")
print(f"    Total raw pages: {total_raw_pages}")
print(f"📂 Cleaned JSON files ({cleaned_folder}): {len(cleaned_files)}")
print(f"    Total cleaned pages: {total_cleaned_pages}")
print(f"📂 Split JSON files ({split_folder}): {len(split_files)}")
print(f"    Total split pages: {total_split_pages}")
print(f"📂 Chunked JSONL files ({processed_folder}): {len(chunked_files)}")
print(f"    Total chunks: {total_chunks}")



✅ PDF extraction, cleaning, and chunking complete!

📂 Raw JSON files (..\data\interim\raw): 2
    Total raw pages: 2929
📂 Cleaned JSON files (..\data\interim\cleaned): 2
    Total cleaned pages: 2929
📂 Split JSON files (..\data\interim\split): 15
    Total split pages: 2929
📂 Chunked JSONL files (..\data\processed): 15
    Total chunks: 5846
