# Document Splitting 

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [4]:
chunk_size =26
chunk_overlap = 4

In [6]:
# Initialize two different text splitters:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [7]:
text1 = 'abcdefghijklmnopqrstuvwxyz'

In [8]:
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [9]:
# Let's try with longer string past the 26 characters count:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz'

In [10]:
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefghijklmnopqrstuv', 'stuvwxyz']

In [11]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [12]:
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [13]:
# The issue is that the character text splitter splits on single character and by default that character is a new line character but below there are n new lines:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [14]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

# Recursive splitting details

In [15]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentences. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [16]:
len(some_text)

496

In [18]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
# Seperators mean that it will first try to split by double new lines, and if it still needs to split more it will go by newline and then by space and finally character by character.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)

In [19]:
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentences. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [20]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentences. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

## Let's reduce the chunk size a bit and add a period to our separators:

In [21]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentences. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [22]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentences. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [23]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("../public/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [24]:
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [25]:
docs = text_splitter.split_documents(pages)

In [26]:
len(docs)

77

In [27]:
len(pages)

22

In [28]:
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("../public/Notion_DB")
notion_db = loader.load()

In [29]:
docs = text_splitter.split_documents(notion_db)

In [30]:
len(notion_db)

1

In [31]:
len(docs)

13

## Token splitting 
We can also split on token count explicity, if we want.

This can be useful because LLMs often have context windows designated in tokens.

Tokens are often ~4 characters.

In [32]:
from langchain_text_splitters import TokenTextSplitter

In [37]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [46]:
text1 = "Hello there my name is David."

In [48]:
text2 = "Hello there my name is Davie."

In [47]:
text_splitter.split_text(text1)

['Hello', ' there', ' my', ' name', ' is', ' David', '.']

In [49]:
text_splitter.split_text(text2)

['Hello', ' there', ' my', ' name', ' is', ' Dav', 'ie', '.']

In [50]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [51]:
docs = text_splitter.split_documents(pages)

In [52]:
docs[0]

Document(page_content='MachineLearning-Lecture01  \n', metadata={'source': '../public/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0})

In [53]:
pages[0].metadata

{'source': '../public/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}

## Context aware splitting

Chunking aims to keep text with common context together.

A text splitting often uses sentences or other delimiters to keep related text together but many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting.

We can use `MarkdownHeaderTextSplitter` to preserve header metadata in our chunks, as show below.

In [54]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [56]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [57]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [58]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [59]:
md_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})

In [60]:
md_header_splits[1]

Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})

Try on a real Markdown file, like a Notion database.

In [61]:
loader = NotionDirectoryLoader("../public/Notion_DB")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])

In [62]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

In [63]:
md_header_splits = markdown_splitter.split_text(txt)

In [64]:
md_header_splits[0]

Document(page_content='Created: March 6, 2024 8:32 PM\nTags: Personal  \nLangChain is an open-source developer framework for building LLM applications.\nIt has Python and TypeScript packages.\nFocused on composition and modularity.  \nLangChain key values add modular components that can be used in conjunction with each other or by themselves and the other key value is the use cases where common ways to combine the components to end to end applications.\nLangChain includes prompts, models, indexes, chains.\nPrompts include prompt templates, output parsers\nModels include 20+ integrations of LLMs, chat models and text embedding models (10+ integrations)\nIndexes are ways for ingesting data and combining it with models. It contains document loaders, text splitters, vector stores and retrievers\nChains are more end-to-end use cases can be used as building blocks for other chains\nAgents consist of algorithms for getting LLMs to use tools and uses the model as a reasoning engine.  \nModels 