In [1]:
from chonkie import RecursiveChunker

# Options for the recursive chunker, gathered in one place so they are easy to tune.
recipe_options = {
    "lang": "en",
    # Upper bound on the size of a single chunk.
    "chunk_size": 512,
    # Prevent meaningless chunks such as a single word.
    "min_characters_per_chunk": 24,
    # Ask for RecursiveChunk objects back rather than another return form.
    "return_type": "chunks",
}

# Build the chunker from the "markdown" recipe.
chunker = RecursiveChunker.from_recipe("markdown", **recipe_options)

# Sample input 1: a short plain-text paragraph made of three sentences.
text_simple = (
    "This is the first sentence. "
    "This is the second sentence. "
    "And here's a third one with some additional context."
)

# Sample input 2: a structured markdown document containing headings at three
# levels, prose paragraphs and a table. The literal starts with a newline
# because the content begins on the line after the opening triple quote.
text_markdown = """
# Annual Report 2023

## Section 1: Executive Summary

This report summarizes the financial performance of the company for the fiscal year 2023. We saw significant growth in key areas.

## Section 2: Financial Performance

### Revenue Analysis

Total revenue for 2023 reached $1.2 Billion, a 15% increase from $1.04 Billion in 2022. Growth was driven by strong performance in the SaaS division.

| Quarter | Revenue ($M) | Growth (%) |
|---------|--------------|------------|
| Q1 2023 | 250          | 12         |
| Q2 2023 | 290          | 14         |
| Q3 2023 | 330          | 16         |
| Q4 2023 | 380          | 18         |

### Operating Expenses

Operating expenses increased by 10%, primarily due to increased investment in R&D and marketing efforts to support future growth initiatives. R&D spending was $150 Million, while marketing was $200 Million.

### Net Income

Net income for the year was $200 Million, resulting in an earnings per share of $2.50. This represents a 20% improvement compared to the previous year.
"""

  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Chunk both sample documents in a single batch call; the result is iterated
# below as one sequence of chunks per input document, in input order.
texts_to_chunk = [text_simple, text_markdown]

print("Chunking texts...")
batch_chunks = chunker.chunk_batch(texts_to_chunk)

# Report every chunk of every document, numbering documents and chunks from 1.
print("\n--------- Chunking Results ---------")
for doc_number, doc_chunks in enumerate(batch_chunks, start=1):
    print(f"\n+++--- Document {doc_number} ---+++")
    if doc_chunks:
        for chunk_number, chunk in enumerate(doc_chunks, start=1):
            print(f"Chunk {chunk_number}:")
            print(f"  Token count: {chunk.token_count}")
            # NOTE(review): `level` presumably reflects the position in the
            # recursive splitting hierarchy - confirm against the chonkie docs.
            print(f"  Level: {chunk.level}")
            print(f"  Text:\n---START CHUNK---\n{chunk.text}\n---END CHUNK---\n")
    else:
        # An empty result for a document is reported rather than silently skipped.
        print("No chunks generated for this document.")

Chunking texts...


🦛 choooooooooooooooooooonk 100% • 2/2 docs chunked [00:01<00:00,  1.45doc/s] 🌱


--------- Chunking Results ---------

+++--- Document 1 ---+++
Chunk 1:
  Token count: 23
  Level: 0
  Text:
---START CHUNK---
This is the first sentence. This is the second sentence. And here's a third one with some additional context.
---END CHUNK---


+++--- Document 2 ---+++
Chunk 1:
  Token count: 314
  Level: 0
  Text:
---START CHUNK---

# Annual Report 2023

## Section 1: Executive Summary

This report summarizes the financial performance of the company for the fiscal year 2023. We saw significant growth in key areas.

## Section 2: Financial Performance

### Revenue Analysis

Total revenue for 2023 reached $1.2 Billion, a 15% increase from $1.04 Billion in 2022. Growth was driven by strong performance in the SaaS division.

| Quarter | Revenue ($M) | Growth (%) |
|---------|--------------|------------|
| Q1 2023 | 250          | 12         |
| Q2 2023 | 290          | 14         |
| Q3 2023 | 330          | 16         |
| Q4 2023 | 380          | 18         |

### Operating Ex


