<a href="https://colab.research.google.com/github/Es101-am/Chunkingdata/blob/main/chunk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1.	Character splitting**

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Very long sample text
text = (
    "LangChain is an open-source framework designed to simplify the development of applications "
    "powered by large language models (LLMs). It provides tools for chaining together LLM calls "
    "and integrating them with external data sources, such as APIs and databases. LangChain enables "
    "developers to build powerful applications such as chatbots, agents, and question-answering systems. "
    "With built-in support for memory, context management, and document loaders, it abstracts away many "
    "of the complexities involved in building production-ready language model applications."
)

# Initialize the CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    separator=' ',
    strip_whitespace=False
)

# Create documents
docs = text_splitter.create_documents([text])

# Print the resulting chunks
for i, doc in enumerate(docs):
    print(f"Chunk {i+1}: {doc.page_content!r}")


Chunk 1: 'LangChain is an open-source framework designed to'
Chunk 2: 'to simplify the development of applications'
Chunk 3: 'powered by large language models (LLMs). It'
Chunk 4: '(LLMs). It provides tools for chaining together'
Chunk 5: 'together LLM calls and integrating them with'
Chunk 6: 'them with external data sources, such as APIs and'
Chunk 7: 'APIs and databases. LangChain enables developers'
Chunk 8: 'developers to build powerful applications such as'
Chunk 9: 'such as chatbots, agents, and question-answering'
Chunk 10: 'systems. With built-in support for memory, context'
Chunk 11: 'context management, and document loaders, it'
Chunk 12: 'it abstracts away many of the complexities'
Chunk 13: 'involved in building production-ready language'
Chunk 14: 'language model applications.'


# **Recursive Character Text Splitting**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = (
    "LangChain is an open-source framework designed to simplify the development of applications.\n"
    "It is powered by large language models (LLMs).\n"
    "It provides tools for chaining together LLM calls and integrating them with external data sources,\n"
    "such as APIs and databases.\n"
    "LangChain enables developers to build powerful applications such as chatbots, agents,\n"
    "and question-answering systems.\n"
    "With built-in support for memory, context management, and document loaders,\n"
    "it abstracts away many of the complexities involved in building production-ready language model applications."
)

# Create the text splitter that only splits on newline
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=65,
    chunk_overlap=0,
    separators=["\n"]
)

# Split the text into documents
docs = text_splitter.create_documents([text])

# Print each chunk
for i, doc in enumerate(docs):
    print(f"Chunk {i+1}: {doc.page_content!r}")


Chunk 1: 'LangChain is an open-source framework designed to simplify the development of applications.'
Chunk 2: 'It is powered by large language models (LLMs).'
Chunk 3: '\nIt provides tools for chaining together LLM calls and integrating them with external data sources,'
Chunk 4: 'such as APIs and databases.'
Chunk 5: '\nLangChain enables developers to build powerful applications such as chatbots, agents,'
Chunk 6: 'and question-answering systems.'
Chunk 7: '\nWith built-in support for memory, context management, and document loaders,'
Chunk 8: '\nit abstracts away many of the complexities involved in building production-ready language model applications.'


# **Document Specific Splitting**

In [None]:
import re
from langchain.schema import Document

markdown_text_1 = """
# Exploring Japan

## Tokyo
Visit the Shibuya crossing and try sushi at Tsukiji Market.

## Kyoto
Don’t miss the bamboo grove in Arashiyama and Fushimi Inari shrine.

## Nara
See the free-roaming deer and visit Todai-ji temple.
"""

# Split based on Markdown headers (lines starting with '#')
header_blocks = re.split(r'(?=^#{1,6} )', markdown_text_1, flags=re.MULTILINE)

# Remove empty blocks and wrap into LangChain Document objects
docs = [Document(page_content=block.strip()) for block in header_blocks if block.strip()]

# Print the chunks
for i, doc in enumerate(docs):
    print(f"Chunk {i+1}:\n{doc.page_content!r}\n")



Chunk 1:
'# Exploring Japan'

Chunk 2:
'## Tokyo\nVisit the Shibuya crossing and try sushi at Tsukiji Market.'

Chunk 3:
'## Kyoto\nDon’t miss the bamboo grove in Arashiyama and Fushimi Inari shrine.'

Chunk 4:
'## Nara\nSee the free-roaming deer and visit Todai-ji temple.'



In [None]:
import re

def split_code_blocks(text):
    blocks = re.split(r'\n(?=\w|#)', text.strip())  # split where new block starts
    return [block.strip() for block in blocks if block.strip()]

code_text = """
# Python Basics

x = 10  # variable

for i in range(x):
    print(i)

def greet(name):
    return f"Hello, {name}"
"""

# Split code into logical blocks
blocks = split_code_blocks(code_text)
for i, block in enumerate(blocks):
    print(f"Block {i+1}:\n{block}\n")


Block 1:
# Python Basics

Block 2:
x = 10  # variable

Block 3:
for i in range(x):
    print(i)

Block 4:
def greet(name):
    return f"Hello, {name}"



# **PDF, Table **

In [1]:
!pip install layoutparser
!pip install pdf2image

Collecting layoutparser
  Downloading layoutparser-0.3.4-py3-none-any.whl.metadata (7.7 kB)
Collecting iopath (from layoutparser)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pdfplumber (from layoutparser)
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdf2image (from layoutparser)
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting portalocker (from iopath->layoutparser)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting pdfminer.six==20250506 (from pdfplumber->layoutparser)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber->layou

In [2]:
!pip install "unstructured[pdf]"

Collecting unstructured[pdf]
  Downloading unstructured-0.18.9-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured[pdf])
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured[pdf])
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured[pdf])
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting dataclasses-json (from unstructured[pdf])
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting python-iso639 (from unstructured[pdf])
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured[pdf])
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured[pdf]

In [3]:
!apt-get install -y poppler-utils tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (269 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [4]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

# Define the PDF file path
filename = "/content/drive/MyDrive/1.pdf"

# Extract elements from the PDF
elements = partition_pdf(
    filename=filename,
    strategy="hi_res",              # high-resolution parsing
    infer_table_structure=True,     # enable table structure inference
    model_name="yolox"              # layout model used for parsing
)

# View the parsed elements
elements




preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

[<unstructured.documents.elements.NarrativeText at 0x7c4179932810>,
 <unstructured.documents.elements.Title at 0x7c4179931210>,
 <unstructured.documents.elements.Text at 0x7c4179931390>,
 <unstructured.documents.elements.NarrativeText at 0x7c4179931090>,
 <unstructured.documents.elements.Text at 0x7c4179930390>,
 <unstructured.documents.elements.Text at 0x7c4179931ed0>,
 <unstructured.documents.elements.Text at 0x7c4179931850>,
 <unstructured.documents.elements.NarrativeText at 0x7c41799318d0>,
 <unstructured.documents.elements.NarrativeText at 0x7c41799315d0>,
 <unstructured.documents.elements.Image at 0x7c4179947c90>,
 <unstructured.documents.elements.Text at 0x7c4179945650>,
 <unstructured.documents.elements.Text at 0x7c4179946a50>,
 <unstructured.documents.elements.Image at 0x7c4179933290>,
 <unstructured.documents.elements.Text at 0x7c4179946a10>,
 <unstructured.documents.elements.Text at 0x7c4179945050>,
 <unstructured.documents.elements.Text at 0x7c41799476d0>,
 <unstructured.do

# **Table Extraction**

In [None]:
from unstructured.documents.elements import Table

# Find all table elements
tables = [el for el in elements if isinstance(el, Table)]

# Show how many were found
print(f"Found {len(tables)} table(s).")

# Preview the first one
if tables:
    from IPython.display import HTML, display
    for i in range (0,len(tables)):
      display(HTML(tables[i].metadata.text_as_html))


Found 1 table(s).


| 1,fl,f2— f3,CU__| None,A
3,fl fl,f2,f3 f2— f3,1.02\ +15
4,,,fl f2> f3,2500


# **Image Extraction from pdf**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:

from typing import Any
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf


filepath = "/content/drive/MyDrive/4.pdf"

#  Extract rich elements using advanced settings
raw_pdf_elements = partition_pdf(
    filename=filepath,
    strategy="hi_res",                    # High-res strategy for OCR
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=400,
    new_after_n_chars=380,
    combine_text_under_n_chars=200,
    image_output_dir_path="/content/drive/MyDrive/",
    languages=["eng"]
)



yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

In [7]:
for i, el in enumerate(raw_pdf_elements[:5]):
    print(f"\n--- Element {i} ---")
    print(el)



--- Element 0 ---
3.2.2. Stage1:

--- Element 1 ---
Multimodal pretraining In this stage, we combine the unimodal models as explained in Section 3.1 and train the whole model on a broad mixture of large-scale visionlanguage tasks. Contrary to most recent VLMs, our core goal is to train a base model that finetunes well to a wide range of tasks, not merely to align the modalities. Intuitively, we want a mix of tasks which force the model to acquire a

--- Element 2 ---
broad range of “skills”, regardless of the task’s user (or benchmark) friendliness out of the box. More on this in Section 3.2.5. It is common practice, also followed by previous PaLI versions, to keep the image encoder frozen during the first multimodal pretraining stage. This is partially due to findings as in LiT [132] reporting multimodal tuning of pretrained image encoders degrading their

--- Element 3 ---
representations. However, more recent work such as CapPa [110] and LocCa [115] have shown that captioning and o

In [10]:
!pip install -q langchain langchain-community


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
!pip install -q -U langchain-openai


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
from dotenv import load_dotenv
load_dotenv()  # This loads the environment variables from .env


True

In [17]:
# Then run:
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import numpy as np
load_dotenv()

# Initialize embeddings
embedding_model = OpenAIEmbeddings()  # will read OPENAI_API_KEY from .env

# Sample text split into sentences
sentences = [
    "AI is changing the world rapidly.",
    "Companies are adopting it across industries.",
    "However, challenges in trust and explainability remain.",
    "Researchers are working on making AI more transparent.",
    "Some tools like SHAP and LIME are gaining popularity.",
    "These tools help users understand AI predictions.",
    "But they don’t solve every issue in AI trust."
]

# Create 3-sentence sliding windows
windows = [" ".join(sentences[i:i+3]) for i in range(len(sentences)-2)]

# Embed the windows
window_vectors = embedding_model.embed_documents(windows)

# Compare cosine similarity between consecutive windows
for i in range(len(window_vectors) - 1):
    sim = cosine_similarity([window_vectors[i]], [window_vectors[i+1]])[0][0]
    print(f"Window {i+1} vs {i+2} similarity: {sim:.3f}")
    if sim < 0.90:
        print(" 🔹 Suggested break point.")

Window 1 vs 2 similarity: 0.959
Window 2 vs 3 similarity: 0.942
Window 3 vs 4 similarity: 0.957
Window 4 vs 5 similarity: 0.942


In [18]:
# Then run:
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import numpy as np
load_dotenv()

# Initialize embeddings
embedding_model = OpenAIEmbeddings()  # will read OPENAI_API_KEY from .env

# Sample text split into sentences
sentences = [
    "AI is changing the world rapidly.",
    "Companies are adopting it across industries.",
    "However, challenges in trust and explainability remain.",
    "The rainforest canopy hosts thousands of undiscovered species.",
    "Some tools like SHAP and LIME are gaining popularity.",
    "Mountaineering requires specialized gear and training.",
    "The economy of ancient Rome was heavily based on agriculture."
]


# Create 3-sentence sliding windows
windows = [" ".join(sentences[i:i+3]) for i in range(len(sentences)-2)]

# Embed the windows
window_vectors = embedding_model.embed_documents(windows)

# Compare cosine similarity between consecutive windows
for i in range(len(window_vectors) - 1):
    sim = cosine_similarity([window_vectors[i]], [window_vectors[i+1]])[0][0]
    print(f"Window {i+1} vs {i+2} similarity: {sim:.3f}")
    if sim < 0.90:
        print(" 🔹 Suggested break point.")

Window 1 vs 2 similarity: 0.881
 🔹 Suggested break point.
Window 2 vs 3 similarity: 0.940
Window 3 vs 4 similarity: 0.917
Window 4 vs 5 similarity: 0.897
 🔹 Suggested break point.


# **Agentic Chunking**

In [19]:
essay_propositions = [
    "The month is October.",
    "The year is 2023.",
    "I was a child at some past time.",
    "At that past time, I did not understand something important about the world.",
    "The important thing I did not understand is the degree to which the returns for performance are superlinear.",
    "Teachers and coaches implicitly told us the returns were linear.",
    "Teachers and coaches meant well.",
    "The statement 'You get out what you put in' was heard a thousand times by the speaker.",
    "The statement 'You get out what you put in' is rarely true.",
    "If your product is only half as good as your competitor's product, you do not get half the results.",
]


In [24]:
import re
from typing import List

class AgenticChunker:
    def __init__(self):
        self.chunks = []

    def is_date(self, sentence: str) -> bool:
        # Detects simple dates or temporal patterns
        return bool(re.search(r'\b(19|20)\d{2}\b', sentence)) or "month" in sentence.lower() or "time" in sentence.lower()

    def is_quote_or_statement(self, sentence: str) -> bool:
        return '"' in sentence or "statement" in sentence.lower()

    def is_performance_related(self, sentence: str) -> bool:
        return any(word in sentence.lower() for word in ["returns", "performance", "superlinear", "linear"])

    def chunk_by_rules(self, propositions: List[str]) -> List[List[str]]:
        current_chunk = []
        last_tag = None

        def tag_sentence(s):
            if self.is_date(s):
                return "time"
            elif self.is_performance_related(s):
                return "performance"
            elif self.is_quote_or_statement(s):
                return "quote"
            else:
                return "general"

        for prop in propositions:
            tag = tag_sentence(prop)

            if last_tag is None:
                current_chunk.append(prop)
                last_tag = tag
            elif tag == last_tag:
                current_chunk.append(prop)
            else:
                self.chunks.append(current_chunk)
                current_chunk = [prop]
                last_tag = tag

        if current_chunk:
            self.chunks.append(current_chunk)

        return self.chunks

    def pretty_print_chunks(self):
        for i, chunk in enumerate(self.chunks):
            print(f"\n🔹 Chunk {i+1} ({len(chunk)} items):")
            for sentence in chunk:
                print("  -", sentence)

    def get_chunks(self, get_type='list_of_strings'):
        if get_type == 'list_of_strings':
            return [" ".join(chunk) for chunk in self.chunks]
        return self.chunks


In [28]:
ac = AgenticChunker()
ac.chunk_by_rules(essay_propositions)
ac.pretty_print_chunks()


🔹 Chunk 1 (4 items):
  - The month is October.
  - The year is 2023.
  - I was a child at some past time.
  - At that past time, I did not understand something important about the world.

🔹 Chunk 2 (2 items):
  - The important thing I did not understand is the degree to which the returns for performance are superlinear.
  - Teachers and coaches implicitly told us the returns were linear.

🔹 Chunk 3 (1 items):
  - Teachers and coaches meant well.

🔹 Chunk 4 (1 items):
  - The statement 'You get out what you put in' was heard a thousand times by the speaker.

🔹 Chunk 5 (1 items):
  - The statement 'You get out what you put in' is rarely true.

🔹 Chunk 6 (1 items):
  - If your product is only half as good as your competitor's product, you do not get half the results.
