In [6]:
from langchain_text_splitters import CharacterTextSplitter

# Sample document shared by the splitter demos: three short paragraphs
# separated by blank lines.
text = """
LangChain is a framework for developing applications powered by language models.
It enables applications that are context-aware and can reason about information.

The framework consists of several components including chains, agents, and memory.
Chains allow you to combine multiple calls to LLMs or other utilities.
Agents enable LLMs to interact with their environment through tools.
Memory systems help maintain state across multiple interactions.

LangChain supports various integrations with popular LLM providers like OpenAI,
Anthropic, and Google. It also provides utilities for working with vector stores,
embeddings, and document loaders.
"""

# Fixed-size chunking on a single separator. Sizes are counted in characters
# because length_function is the built-in len.
splitter = CharacterTextSplitter(
    length_function=len,
    chunk_size=150,
    chunk_overlap=30,
    separator="\n",
)
chunks = splitter.split_text(text)

# Visual rule printed after every chunk (blank line, 60 dashes, blank line).
chunk_rule = "\n" + "-" * 60 + "\n"

print(f"Created {len(chunks)} chunks using CharacterTextSplitter\n")
for i, chunk in enumerate(chunks):
    # One print call; sep="\n" puts header, chunk body and rule on their own lines.
    print(f"Chunk {i + 1} (length: {len(chunk)}):", chunk, chunk_rule, sep="\n")


Created 6 chunks using CharacterTextSplitter

Chunk 1 (length: 80):
LangChain is a framework for developing applications powered by language models.

------------------------------------------------------------

Chunk 2 (length: 80):
It enables applications that are context-aware and can reason about information.

------------------------------------------------------------

Chunk 3 (length: 82):
The framework consists of several components including chains, agents, and memory.

------------------------------------------------------------

Chunk 4 (length: 139):
Chains allow you to combine multiple calls to LLMs or other utilities.
Agents enable LLMs to interact with their environment through tools.

------------------------------------------------------------

Chunk 5 (length: 144):
Memory systems help maintain state across multiple interactions.
LangChain supports various integrations with popular LLM providers like OpenAI,

-----------------------------------------------------------

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitting: the splitter is given an ordered list of separators
# (paragraph break, line break, space, then the empty string) to try in turn.
recursive_options = {
    "chunk_size": 200,
    "chunk_overlap": 50,
    "length_function": len,  # sizes are counted in characters
    "separators": ["\n\n", "\n", " ", ""],  # Try these in order
}
splitter = RecursiveCharacterTextSplitter(**recursive_options)
chunks = splitter.split_text(text)

# Visual rule printed after every chunk (blank line, 60 equals signs, blank line).
chunk_rule = "\n" + "="*60 + "\n"

print(f"Created {len(chunks)} chunks using RecursiveCharacterTextSplitter\n")
for i, chunk in enumerate(chunks):
    header = f"Chunk {i+1} (length: {len(chunk)}):"
    # Header, chunk body and rule each end up on their own line.
    print("\n".join((header, chunk, chunk_rule)))


Created 4 chunks using RecursiveCharacterTextSplitter

Chunk 1 (length: 161):
LangChain is a framework for developing applications powered by language models.
It enables applications that are context-aware and can reason about information.


Chunk 2 (length: 153):
The framework consists of several components including chains, agents, and memory.
Chains allow you to combine multiple calls to LLMs or other utilities.


Chunk 3 (length: 133):
Agents enable LLMs to interact with their environment through tools.
Memory systems help maintain state across multiple interactions.


Chunk 4 (length: 195):
LangChain supports various integrations with popular LLM providers like OpenAI,
Anthropic, and Google. It also provides utilities for working with vector stores,
embeddings, and document loaders.


