In [None]:
# Install required packages
!pip install langchain-text-splitters tiktoken

# Import necessary libraries
import textwrap
from typing import Dict, List, Optional

import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Reached only if the statements above in this cell did not raise.
print("Environment setup complete!")

In [2]:
def create_smart_splitter(
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> RecursiveCharacterTextSplitter:
    """
    Build a recursive text splitter that tries to keep context together.

    Candidate boundaries are listed from coarsest to finest: paragraph
    breaks, line breaks, periods, spaces, and finally the empty string.

    Args:
        chunk_size: Target size for each chunk, measured with ``len``
        chunk_overlap: Amount of overlap between chunks, in the same units

    Returns:
        The configured RecursiveCharacterTextSplitter
    """
    # Most natural boundary first, last-resort fallback last
    boundary_priority = ["\n\n", "\n", ".", " ", ""]

    splitter = RecursiveCharacterTextSplitter(
        separators=boundary_priority,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter

In [None]:
def demonstrate_recursive_splitting():
    """
    Demonstrate how recursive splitting handles different text structures.

    Splits a short sample abstract using a chunk size of 200 and an overlap
    of 50, then prints the original length, the number of chunks, and every
    chunk together with its length. Nothing is returned.
    """
    # Sample research abstract. The literal is indented to line up with the
    # function body, so dedent and strip it: otherwise every line carries four
    # leading spaces that inflate the reported lengths and consume part of
    # each chunk's size budget.
    research_text = textwrap.dedent("""
    Machine Learning in Healthcare: A Comprehensive Review

    This review examines the impact of machine learning in healthcare settings.
    Recent advances have shown promising results in diagnosis and treatment.

    Key Applications:
    • Medical imaging analysis has achieved expert-level accuracy
    • Patient risk prediction models are improving care outcomes
    • Drug discovery is being accelerated through ML techniques

    Methods and Results:
    Our analysis covers 500 research papers from 2020-2025. The findings
    indicate a 45% improvement in early diagnosis rates when ML is applied.
    """).strip()

    # Small chunk size so the short sample is actually divided
    splitter = create_smart_splitter(chunk_size=200, chunk_overlap=50)

    # Split text
    chunks = splitter.split_text(research_text)

    # Analyze and display results
    print(f"Original Length: {len(research_text)} characters")
    print(f"Number of Chunks: {len(chunks)}\n")

    for i, chunk in enumerate(chunks, 1):
        print(f"Chunk {i} ({len(chunk)} chars):")
        print("-" * 50)
        print(chunk)
        print("\n")

# Run the demonstration; its report is printed as this cell's output
demonstrate_recursive_splitting()

**Advanced Configurations and Special Cases**

In [4]:
# Extra separators contributed by each supported language code. They are placed
# after the generic defaults and before the final "" fallback.
_LANGUAGE_SEPARATORS = {
    "en": [],  # English relies on the generic defaults only
    # Chinese/Japanese full-width punctuation
    "zh": ["。", "，", "！", "？"],
    "ja": ["。", "，", "！", "？"],
    # Thai marks used as break hints: MAIYAMOK (U+0E46), PAIYANNOI (U+0E2F)
    "th": ["\u0e46", "\u0e2f"],
}


def create_multilingual_splitter(
    languages: Optional[List[str]] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> RecursiveCharacterTextSplitter:
    """
    Create a splitter that handles multiple languages intelligently.
    Particularly useful for languages without word boundaries.

    Args:
        languages: Language codes whose punctuation should be used as
            separators. ``None`` selects ``["en", "zh", "ja", "th"]``, which
            yields the full separator list. Language-specific separators are
            added in the order the codes are given; codes with no entry in
            ``_LANGUAGE_SEPARATORS`` fall back to the generic defaults.
        chunk_size: Target size for each chunk, measured with ``len``
        chunk_overlap: Amount of overlap between chunks

    Returns:
        The configured RecursiveCharacterTextSplitter
    """
    if languages is None:
        languages = ["en", "zh", "ja", "th"]
    elif isinstance(languages, str):
        # A bare string would otherwise be iterated character by character
        languages = [languages]

    # Default separators shared by every language
    separators = ["\n\n", "\n", ".", " "]
    for code in languages:
        for separator in _LANGUAGE_SEPARATORS.get(code.lower(), []):
            # zh and ja share punctuation; list each separator only once
            if separator not in separators:
                separators.append(separator)
    # Empty string stays last as the final fallback
    separators.append("")

    return RecursiveCharacterTextSplitter(
        separators=separators,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

In [None]:
def test_multilingual_splitting():
    """
    Test splitting text containing multiple languages.

    Splits a short English/Chinese sample with the multilingual splitter and
    prints each resulting chunk. Nothing is returned.
    """

    # Mixed language text. The literal is indented to line up with the
    # function body, so dedent and strip it to keep that indentation out of
    # the text being split.
    mixed_text = textwrap.dedent("""
    Machine Learning Applications
    机器学习应用

    Modern ML systems have transformed industries.
    现代机器学习系统已经改变了行业。

    Key benefits include:
    主要好处包括：
    """).strip()

    # NOTE: the splitter's default chunk size (1000) is larger than this
    # sample, so the output is expected to be a single chunk.
    splitter = create_multilingual_splitter()
    chunks = splitter.split_text(mixed_text)

    # Display results
    for i, chunk in enumerate(chunks, 1):
        print(f"\nChunk {i}:")
        print(chunk)

# Run the multilingual test; the chunks are printed as this cell's output
test_multilingual_splitting()

In [6]:
# Separator priority lists for each supported document format. Every list ends
# with "" so text with no other boundary can still be cut down to chunk_size.
_FORMAT_SEPARATORS = {
    # Scientific papers with sections, citations
    "scientific": [
        "\n## ", "\n# ",  # Section headers
        "\nReference:", "\nCitation:",  # Bibliography
        "\n\n", "\n", ". ",
        " ", ""
    ],
    # Legal documents with articles, clauses
    "legal": [
        "\nArticle ", "\nSection ",
        "\nClause ", "\n\n",
        "\n", ". ", " ", ""
    ],
}


def create_custom_format_splitter(
    format_type: str = "scientific",
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> RecursiveCharacterTextSplitter:
    """
    Create a splitter optimized for specific document formats.

    Args:
        format_type: Type of document format to handle; one of
            ``"scientific"`` or ``"legal"``
        chunk_size: Target size for each chunk, measured with ``len``
        chunk_overlap: Amount of overlap between chunks

    Returns:
        The configured RecursiveCharacterTextSplitter

    Raises:
        ValueError: If ``format_type`` is not a supported format
    """
    if format_type not in _FORMAT_SEPARATORS:
        # Previously an unknown format fell through both branches and failed
        # later with an UnboundLocalError on `separators`.
        supported = ", ".join(sorted(_FORMAT_SEPARATORS))
        raise ValueError(
            f"Unsupported format_type {format_type!r}; expected one of: {supported}"
        )

    return RecursiveCharacterTextSplitter(
        # Copy so the splitter never shares the module-level list
        separators=list(_FORMAT_SEPARATORS[format_type]),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

In [7]:
def split_with_metadata(
    document: Optional[Dict] = None,
    splitter=None
) -> List[Dict]:
    """
    Demonstrate splitting while preserving document metadata.

    Args:
        document: Mapping with a ``'content'`` string and an optional
            ``'metadata'`` dict. When omitted, a small built-in sample
            document is used.
        splitter: Object exposing ``split_text(str) -> List[str]``. When
            omitted, ``create_smart_splitter()`` supplies the default.

    Returns:
        One dict per chunk with the chunk text under ``'content'`` and, under
        ``'metadata'``, a copy of the document metadata extended with
        ``'chunk_number'`` (1-based) and ``'total_chunks'``.

    Raises:
        KeyError: If ``document`` has no ``'content'`` key
    """
    if document is None:
        document = {
            'content': "Your long document text here...",
            'metadata': {
                'author': "Jane Doe",
                'date': "2025-02-10",
                'category': "Technical"
            }
        }
    if splitter is None:
        splitter = create_smart_splitter()

    # A document without metadata still gets the chunk bookkeeping fields
    base_metadata = document.get('metadata') or {}
    chunks = splitter.split_text(document['content'])
    total_chunks = len(chunks)

    # Preserve metadata across chunks; each chunk gets its own dict so the
    # caller's metadata is never mutated
    return [
        {
            'content': chunk,
            'metadata': {
                **base_metadata,
                'chunk_number': chunk_number,
                'total_chunks': total_chunks
            }
        }
        for chunk_number, chunk in enumerate(chunks, start=1)
    ]