In [2]:
def hierarchical_chunking(text, level_sizes):
    """
    Segments the input text into a hierarchical structure of chunks.

    Args:
        text (str): The input text to chunk.
        level_sizes (list[int]): List of chunk sizes for each level in the hierarchy.

    Returns:
        dict: Hierarchical structure of text chunks.
    """
    def chunk_text(text, chunk_size):
        """Helper function to split text into chunks of a given size."""
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    hierarchy = {}
    current_level_text = text

    for level, chunk_size in enumerate(level_sizes, start=1):
        chunks = chunk_text(current_level_text, chunk_size)
        hierarchy[f"Level {level}"] = chunks

        # For the next level, treat each chunk as a new text to be further chunked
        current_level_text = " ".join(chunks)  # Combine chunks for hierarchical processing

    return hierarchy

In [3]:
# Example Document
sample_document = (
    "Natural Language Processing (NLP) involves understanding and generating human language. "
    "It has applications in sentiment analysis, machine translation, and more. "
    "One challenge in NLP is segmenting long text streams into manageable pieces. "
    "Hierarchical chunking organizes text into multi-level structures for better understanding. "
    "This approach is useful for tasks like summarization, knowledge graph building, and question answering."
)

# Define hierarchical levels with respective chunk sizes (in characters)
level_sizes = [120, 60, 30]

# Perform hierarchical chunking
hierarchical_chunks = hierarchical_chunking(sample_document, level_sizes)

# Display the hierarchical chunks
for level, chunks in hierarchical_chunks.items():
    print(f"\n{level}:")
    for i, chunk in enumerate(chunks, start=1):
        print(f"  Chunk {i}: {chunk}")



Level 1:
  Chunk 1: Natural Language Processing (NLP) involves understanding and generating human language. It has applications in sentiment
  Chunk 2:  analysis, machine translation, and more. One challenge in NLP is segmenting long text streams into manageable pieces. H
  Chunk 3: ierarchical chunking organizes text into multi-level structures for better understanding. This approach is useful for ta
  Chunk 4: sks like summarization, knowledge graph building, and question answering.

Level 2:
  Chunk 1: Natural Language Processing (NLP) involves understanding and
  Chunk 2:  generating human language. It has applications in sentiment
  Chunk 3:   analysis, machine translation, and more. One challenge in 
  Chunk 4: NLP is segmenting long text streams into manageable pieces. 
  Chunk 5: H ierarchical chunking organizes text into multi-level struc
  Chunk 6: tures for better understanding. This approach is useful for 
  Chunk 7: ta sks like summarization, knowledge graph building, and

### Using LangChain

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def hierarchical_chunking_with_langchain(text, level_sizes):
    """
    Hierarchical chunking using LangChain's RecursiveCharacterTextSplitter.

    Args:
        text (str): The input text to chunk.
        level_sizes (list[int]): List of chunk sizes for each level in the hierarchy.

    Returns:
        dict: A dictionary with hierarchical chunks at each level.
    """
    hierarchy = {}
    current_text = [text]  # Start with the full text

    for level, chunk_size in enumerate(level_sizes, start=1):
        level_chunks = []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_size // 4  # Overlap is 25% of the chunk size
        )
        
        # Chunk each piece of text from the previous level
        for chunk in current_text:
            level_chunks.extend(text_splitter.split_text(chunk))
        
        hierarchy[f"Level {level}"] = level_chunks
        current_text = level_chunks  # Use the current level chunks for the next level

    return hierarchy

In [7]:
# Example Document
sample_document = (
    "Natural Language Processing (NLP) involves understanding and generating human language. "
    "It has applications in sentiment analysis, machine translation, and more. "
    "One challenge in NLP is segmenting long text streams into manageable pieces. "
    "Hierarchical chunking organizes text into multi-level structures for better understanding. "
    "This approach is useful for tasks like summarization, knowledge graph building, and question answering."
)

# Define hierarchical levels with respective chunk sizes (in characters)
level_sizes = [150, 100, 50]

# Perform hierarchical chunking
hierarchical_chunks = hierarchical_chunking_with_langchain(sample_document, level_sizes)

# Display the hierarchical chunks
for level, chunks in hierarchical_chunks.items():
    print(f"\n{level}:")
    for i, chunk in enumerate(chunks, start=1):
        print(f"  Chunk {i}: {chunk}")



Level 1:
  Chunk 1: Natural Language Processing (NLP) involves understanding and generating human language. It has applications in sentiment analysis, machine
  Chunk 2: in sentiment analysis, machine translation, and more. One challenge in NLP is segmenting long text streams into manageable pieces. Hierarchical
  Chunk 3: into manageable pieces. Hierarchical chunking organizes text into multi-level structures for better understanding. This approach is useful for tasks
  Chunk 4: This approach is useful for tasks like summarization, knowledge graph building, and question answering.

Level 2:
  Chunk 1: Natural Language Processing (NLP) involves understanding and generating human language. It has
  Chunk 2: human language. It has applications in sentiment analysis, machine
  Chunk 3: in sentiment analysis, machine translation, and more. One challenge in NLP is segmenting long text
  Chunk 4: is segmenting long text streams into manageable pieces. Hierarchical
  Chunk 5: into manageable