In [2]:
import re

def dynamic_chunking(text, max_chunk_size):
    """
    Splits text into dynamic chunks based on punctuation and semantic boundaries.

    Args:
        text (str): The input text to chunk.
        max_chunk_size (int): Maximum size of each chunk.

    Returns:
        List[str]: List of dynamic chunks.
    """
    sentences = re.split(r'(\.|!|\?)', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [3]:
# Example Usage
text = "Dynamic chunking adapts to text structure. It improves context preservation! This is great for NLP tasks."
chunks = dynamic_chunking(text, max_chunk_size=50)
print(chunks)

['Dynamic chunking adapts to text structure.', 'It improves context preservation!', 'This is great for NLP tasks.']


In [4]:
# Example Usage 2
text2 = (
    "Dynamic chunking ensures that text is processed adaptively. This technique works especially well for conversational systems, "
    "summarization models, and document parsing. By analyzing the structure, we achieve better results!"
)
chunks2 = dynamic_chunking(text2, max_chunk_size=70)
print("Example 2 Chunks:", chunks2)

Example 2 Chunks: ['Dynamic chunking ensures that text is processed adaptively.', 'This technique works especially well for conversational systems, summarization models, and document parsing', '. By analyzing the structure, we achieve better results!']


### Using LangChain

In [5]:
%pip install langchain

Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/d0/a8/0a8f868615b7a30636b1d15b718e3ea9875bf0dccced03583477c2372495/langchain-0.3.14-py3-none-any.whl.metadata
  Downloading langchain-0.3.14-py3-none-any.whl.metadata (7.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Obtaining dependency information for SQLAlchemy<3,>=1.4 from https://files.pythonhosted.org/packages/b1/03/d12b7c1d36fd80150c1d52e121614cf9377dac99e5497af8d8f5b2a8db64/SQLAlchemy-2.0.36-cp311-cp311-win_amd64.whl.metadata
  Downloading SQLAlchemy-2.0.36-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Obtaining dependency information for aiohttp<4.0.0,>=3.8.3 from https://files.pythonhosted.org/packages/fc/db/2192489a8a51b52e06627506f8ac8df69ee221de88ab9bdea77aa793aa6a/aiohttp-3.11.11-cp311-cp311-win_amd64.whl.metadata
  Downloading aiohttp-3.11.11-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting lang

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'D:\\Mentoring\\HayStack\\venv\\Lib\\site-packages\\~umpy.libs\\libscipy_openblas64_-43e11ff0749b8cbe0a615c9cf6737e0e.dll'
Check the permissions.


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample text
text = (
    "Dynamic chunking ensures that text is processed adaptively. "
    "This technique works especially well for conversational systems, "
    "summarization models, and document parsing. By analyzing the structure, "
    "we achieve better results!"
)

# Define a RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,            # Maximum size of each chunk
    chunk_overlap=20,          # Overlap between chunks for context retention
    separators=["\n\n", ". ", "! ", "? "]  # Dynamic boundary indicators
)

# Apply the splitter
chunks = text_splitter.split_text(text)

# Output the dynamic chunks
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}: {chunk}")


ModuleNotFoundError: No module named 'langchain'