# Chunking Strategies

1. Fixed Size Chunking
2. Recursive Split Chunking
3. Document Based Chunking
4. Semantic Chunking
5. Agentic Chunking

# 1. Fixed Size Chunking

* It breaks down the text into chunks of a specified number of characters, regardless of their content or structure.


In [None]:
! pip install -U langchain-text-splitters

In [None]:
# Fixed-size chunking: plain chunks, no custom metadata

from langchain_text_splitters import CharacterTextSplitter

# Splitter configuration, gathered in one mapping so the knobs are easy to tweak.
splitter_options = {
    "separator": "\n\n",          # cut only at blank lines (paragraph breaks)
    "chunk_size": 1000,           # target upper bound per chunk, measured by length_function
    "chunk_overlap": 200,         # characters shared between neighbouring chunks
    "length_function": len,       # measure chunks in characters
    "is_separator_regex": False,  # the separator is a literal string, not a regex
}
# NOTE: this splitter only cuts at the separator, so a paragraph that contains
# no blank line may presumably still exceed chunk_size - check the maximum
# printed at the end of this cell.
text_splitter = CharacterTextSplitter(**splitter_options)

# Read the whole source document into memory.
with open("C:/Users/admin/OneDrive/Desktop/Gen-AI/0_Dataset/Math.txt", encoding='utf-8') as source_file:
    text = source_file.read()

# Split the text; each resulting Document carries only page_content.
chunks = text_splitter.create_documents([text])

# Report how many chunks were produced and show the tail (last 150 characters)
# of the first one.
chunk_count = len(chunks)
first_chunk_tail = chunks[0].page_content[-150:]
print('Total number of chunks is:', chunk_count)
print(first_chunk_tail)

# Report the largest and smallest chunk, in characters.
chunk_sizes = []
for document in chunks:
    chunk_sizes.append(len(document.page_content))
largest_chunk = max(chunk_sizes)
smallest_chunk = min(chunk_sizes)
print('Maximum chunk size among all:', largest_chunk)
print('Minimum chunk size among all:', smallest_chunk)



In [None]:
# Chunking with Metadata
#
# Splits the document into fixed-size chunks and attaches a metadata dict to
# EACH chunk (index, length, and real character offsets in the source text).

from langchain_text_splitters import CharacterTextSplitter

# Step 1: Load the long document to be split
with open("C:/Users/admin/OneDrive/Desktop/Gen-AI/0_Dataset/Math.txt", encoding='utf-8') as f:
    text = f.read()

# Step 2: Initialize the text splitter with the desired parameters
text_splitter = CharacterTextSplitter(
    separator="\n\n",          # Split text on double newlines (paragraph breaks)
    chunk_size=1000,           # Target maximum chunk length, measured by length_function
    chunk_overlap=200,         # Overlap 200 characters between consecutive chunks
    length_function=len,       # Use Python's len() function to calculate chunk length
    is_separator_regex=False,  # Treat the separator as plain text, not a regular expression
    add_start_index=True,      # Record each chunk's offset in the source text as metadata["start_index"]
)

# Step 3: Split the text into Document chunks.
# `metadatas=` is deliberately NOT passed here: create_documents expects one
# metadata dict per *input text*, not per chunk, so a per-chunk list would
# result in the first dict being copied onto every chunk.
chunks_with_metadata = text_splitter.create_documents([text])

# Plain-text view of the same chunks, kept under its original name.
chunks_without_metadata = [doc.page_content for doc in chunks_with_metadata]

# Step 4: Build the metadata for each chunk and attach it to that chunk
metadata = []
for idx, doc in enumerate(chunks_with_metadata, start=1):
    chunk_length = len(doc.page_content)
    # Offset reported by the splitter. Because chunks overlap and separators
    # are dropped, summing previous chunk lengths would NOT give this value.
    start_position = doc.metadata.get("start_index", -1)
    # -1 means the splitter could not locate the chunk in the source text;
    # propagate -1 instead of inventing an end offset.
    end_position = start_position + chunk_length if start_position >= 0 else -1

    chunk_metadata = {
        "document_id": "Math.txt",         # The name of the original document
        "chunk_index": idx,                # Index of the chunk (starting from 1)
        "text_length": chunk_length,       # Length of the chunk in characters
        "start_position": start_position,  # Offset of the chunk's first character in the full text
        "end_position": end_position,      # Offset just past the chunk's last character
        "section_title": "Math Section",   # You can modify this based on document structure, if applicable
        "timestamp": "2025-01-18",         # Fixed example date; replace with the real processing date if needed
    }
    metadata.append(chunk_metadata)
    doc.metadata = chunk_metadata          # Step 5: attach this chunk's own metadata

# Step 6: Print the total number of chunks and the first chunk (content + metadata)
print("Total number of chunks:", len(chunks_with_metadata))
print("First chunk:", chunks_with_metadata[0])

# Step 7: Print maximum and minimum chunk sizes
chunk_sizes = [len(chunk.page_content) for chunk in chunks_with_metadata]
print('Maximum chunk size among all:', max(chunk_sizes))
print('Minimum chunk size among all:', min(chunk_sizes))



# 2. Recursive Split Chunking

* It divides text into smaller chunks using a set of separators in a hierarchical and iterative manner.
* If the chunks don't meet the desired size, the method recursively applies different separators until the desired size is achieved.

In [None]:
# Recursive chunking: plain chunks, no custom metadata

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separators listed in the order they are passed to the splitter:
# paragraph break, line break, space, and finally the empty string.
separator_priority = ["\n\n", "\n", " ", ""]

# Step 1: Configure the recursive splitter.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    separators=separator_priority,
    is_separator_regex=False,   # separators are literal strings
    length_function=len,        # measure chunks in characters
    chunk_size=1000,            # target maximum characters per chunk
    chunk_overlap=200,          # characters shared between neighbouring chunks
)

# Step 2: Read the whole source document into memory.
with open("C:/Users/admin/OneDrive/Desktop/Gen-AI/0_Dataset/Math.txt", encoding='utf-8') as source_file:
    text = source_file.read()

# Step 3: Split the text; each Document carries only page_content.
chunks = recursive_text_splitter.create_documents([text])

# Step 4: Report the chunk count and show the tail (last 100 characters)
# of the first chunk.
chunk_count = len(chunks)
first_chunk_tail = chunks[0].page_content[-100:]
print('Total number of chunks is:', chunk_count)
print(first_chunk_tail)

# Step 5: Report the largest and smallest chunk, in characters.
chunk_sizes = []
for document in chunks:
    chunk_sizes.append(len(document.page_content))
print('Maximum chunk size among all:', max(chunk_sizes))
print('Minimum chunk size among all:', min(chunk_sizes))


In [None]:
# Chunking with Metadata
#
# Splits the document recursively and attaches a metadata dict to each chunk
# (index, length, and real character offsets in the source text).

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 1: Initialize the RecursiveCharacterTextSplitter
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,                    # Target maximum chunk size of 1000 characters
    chunk_overlap=200,                  # 200 characters overlap between chunks
    length_function=len,                # Use Python's len() function to calculate chunk length
    is_separator_regex=False,           # Treat the separators as plain text, not regular expressions
    separators=["\n\n", "\n", " ", ""], # Separators to try, in this order
    add_start_index=True,               # Record each chunk's offset in the source text as metadata["start_index"]
)

# Step 2: Load the long document to be split
with open("C:/Users/admin/OneDrive/Desktop/Gen-AI/0_Dataset/Math.txt", encoding='utf-8') as f:
    text = f.read()

# Step 3: Split the text into Document chunks
chunks = recursive_text_splitter.create_documents([text])

# Step 4: Build the metadata for each chunk and attach it to that chunk
metadata = []
for idx, chunk in enumerate(chunks, start=1):
    chunk_length = len(chunk.page_content)
    # Offset reported by the splitter. Consecutive chunks overlap, so summing
    # the lengths of the previous chunks would overstate the true position.
    start_position = chunk.metadata.get("start_index", -1)
    # -1 means the splitter could not locate the chunk in the source text;
    # propagate -1 instead of inventing an end offset.
    end_position = start_position + chunk_length if start_position >= 0 else -1

    chunk_metadata = {
        "document_id": "Math.txt",         # The name of the original document
        "chunk_index": idx,                # Index of the chunk (starting from 1)
        "text_length": chunk_length,       # Length of the chunk in characters
        "start_position": start_position,  # Offset of the chunk's first character in the full text
        "end_position": end_position,      # Offset just past the chunk's last character
        "section_title": "Math Section",   # You can modify this based on document structure, if applicable
        "timestamp": "2025-01-18",         # Fixed example date; replace with the real processing date if needed
    }
    metadata.append(chunk_metadata)
    chunk.metadata = chunk_metadata        # Step 5: attach this chunk's own metadata

# Step 6: Print the total number of chunks and the first chunk (content + metadata)
print('Total number of chunks is:', len(chunks))
print("First chunk:", chunks[0])

# Step 7: Print maximum and minimum chunk sizes
chunk_sizes = [len(chunk.page_content) for chunk in chunks]  # Use page_content to get chunk size
print('Maximum chunk size among all:', max(chunk_sizes))
print('Minimum chunk size among all:', min(chunk_sizes))


3.