# Approach Overview

Two chunking methods combined:

 1. Split by the markdown headers. The input is expected to be pre-processed into markdown by Document Intelligence (or a similar tool).
 2. If a chunk exceeds MAX_CHUNK_TOKEN_SIZE, split it further with the semantic chunker.
 3. If a resulting semantic chunk is still larger than MAX_CHUNK_TOKEN_SIZE, run the semantic chunker again on that chunk alone.
 No recursion: the oversized chunk is simply put back into the queue.


Uses LangChain splitters. A LlamaIndex semantic chunker is also included as an example.

In [26]:
!pip install tiktoken langchain_experimental langchain_openai langchain-text-splitters python-dotenv llama-index llama-index-embeddings-azure-openai



In [6]:
import tiktoken
import os
from dotenv import load_dotenv
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from collections import deque

In [7]:
# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

# Upper bound, in tokens, for a single chunk; larger chunks are split further.
MAX_CHUNK_TOKEN_SIZE = 512

In [54]:
# [ SEMANTIC SPLITTER ]
# Azure OpenAI embedding client used by the semantic chunker.
# Deployment name, endpoint and API key are read from environment variables;
# a missing variable raises KeyError here rather than failing later.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_MODEL"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    openai_api_version="2024-02-01",
)

# The chunker embeds neighbouring sentences and compares them by cosine distance.
# If the distance between two neighbours is above the threshold, a new chunk starts there.
# Otherwise the sentences stay merged in the same chunk.
# With the "percentile" type the amount is a percentile on a 0-100 scale (library default: 95).
# A value of 0.95 would mean the 0.95th percentile and break after almost every sentence.
semantic_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95.0,
)

In [6]:
# [ MARKDOWN SPLITTER ]
# Each pair is (markdown header prefix, metadata key).
# The second value (like "header_1") becomes a key in the metadata output from the splitter.
headers_to_split_on = [
    ("#", "header_1"),
    ("##", "header_2"),
    ("###", "header_3"),
]

# strip_headers=False keeps the header line inside the text of its chunk.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

In [7]:
def token_len(input: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in ``input`` for the given tiktoken encoding.

    Args:
        input: Text to measure. (The name shadows the ``input`` builtin; it is
            kept so existing keyword callers continue to work.)
        encoding_name: Name of the tiktoken encoding. The default
            ``"cl100k_base"`` is the encoding this notebook was written for.

    Returns:
        The length of the token sequence produced for ``input``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default encode() raises ValueError when the text contains a special-token
    # string such as "<|endoftext|>". Documents are arbitrary text, so count such
    # strings as ordinary text instead of failing.
    return len(encoding.encode(input, disallowed_special=()))

In [73]:
def process_semantic_split(pages):
    """Split oversized pages semantically until the chunks fit the token limit.

    Pages are processed through a queue instead of recursion: a page above
    MAX_CHUNK_TOKEN_SIZE is split with ``semantic_splitter`` and its pieces are
    put back at the front of the queue, so they are re-checked and the output
    keeps the original document order.

    Args:
        pages: Iterable of text chunks (strings).

    Returns:
        List of text chunks in document order. A chunk may still exceed
        MAX_CHUNK_TOKEN_SIZE when the semantic splitter cannot divide it any
        further (for example a single very long sentence).
    """
    queue = deque(pages)
    result = []

    while queue:
        current_page = queue.popleft()

        if token_len(current_page) <= MAX_CHUNK_TOKEN_SIZE:
            result.append(current_page)
            continue

        semantic_chunks = semantic_splitter.split_text(current_page)
        if len(semantic_chunks) <= 1:
            # The splitter made no progress. Re-queueing the same text would loop
            # forever, so keep the oversized chunk unchanged instead.
            result.append(current_page)
            continue

        # extendleft() pushes items one by one, which reverses them; feeding the
        # reversed list therefore leaves the chunks in their original order.
        queue.extendleft(reversed(semantic_chunks))

    return result

In [None]:
# [ TEST ]
# Reading source document
# Normally would be expecting here a markdown document, but for the sake of the example we are using a plain text
# Explicit encoding so the result does not depend on the platform default
# (assumes the sample file is stored as UTF-8).
with open("./sample_data/Tesla_Model_S.txt", encoding="utf-8") as f:
    input_text = f.read()

In [None]:
# Markdown split: break the document into sections at the configured headers
md_header_splits = markdown_splitter.split_text(input_text)

# Keep only the text of every section; the header metadata is not used further
plain_content = [section.page_content for section in md_header_splits]

In [None]:
# Semantic split: sections above the token limit are divided further
semantic_list = process_semantic_split(plain_content)

# Report the number of chunks, then the token count of each chunk
print(f"Number of semantic chunks:{len(semantic_list)}")
for split in semantic_list:
    print(token_len(split))

In [75]:
# Write results: one separator line followed by the chunk text, for every chunk.
# Explicit UTF-8 so non-ASCII characters in the chunks cannot fail on platforms
# whose default encoding is not UTF-8.
# NOTE(review): 'slit_results.txt' looks like a typo for 'split_results.txt';
# the name is kept so anything reading the existing file keeps working.
with open('slit_results.txt', 'w', encoding="utf-8") as file:
    for item in semantic_list:
        file.write("------------\n")
        file.write(f"{item}\n")

### OPTIONAL EXPERIMENTS 
Experimenting with LlamaIndex

In [13]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.readers.file import FlatReader
from pathlib import Path
import os

from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core import SimpleDirectoryReader  
from llama_index.core import Settings  
from llama_index.core.node_parser import (  
    SemanticSplitterNodeParser,  
)  

In [9]:
# Azure OpenAI embedding model for LlamaIndex; deployment, key and endpoint come from the environment.
# NOTE(review): the model name is hard-coded while the deployment is configurable -
# confirm the deployment really serves "text-embedding-ada-002".
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name=os.environ["AZURE_OPENAI_EMBEDDING_MODEL"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version="2024-02-01",
)

# LlamaIndex semantic splitter configured with a 95th-percentile breakpoint threshold
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

In [16]:
# Normally would be expecting here a markdown document, but for the sake of the example we are using a plain text
# Load the sample file with FlatReader
md_docs = FlatReader().load_data(Path("./sample_data/Tesla_Model_S.txt"))
# Turn the loaded documents into nodes with the markdown node parser
parser = MarkdownNodeParser()
nodes = parser.get_nodes_from_documents(md_docs)

In [None]:
# Run the semantic splitter over the markdown nodes
split_nodes = splitter.get_nodes_from_documents(nodes)

# Number of nodes after semantic splitting
print(len(split_nodes))

In [None]:
# Show the content of one sample node. The index is guarded because a short
# document may produce fewer nodes than the sample index.
sample_index = 6
if sample_index < len(split_nodes):
    print(split_nodes[sample_index].get_content())
else:
    print(f"Only {len(split_nodes)} nodes available; no node at index {sample_index}.")