In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv() (python-dotenv helpers).
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
# NOTE(review): openai_api_key is not referenced in the cells visible here — confirm it is still needed.
openai_api_key = os.environ["OPENAI_API_KEY"]

## Splitters
* We use splitters to divide documents into small chunks for the RAG technique.
* How we split a document matters a great deal, since it has a big impact on the quality of RAG retrieval.
* It is important to understand how to optimize the splitting process.
* Two important techniques to optimize splitting are:
    * How we build the chunks (ideally, whole sentences or paragraphs).
    * The metadata we add to each chunk.

## Character Splitter
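* This splits on a single separator string (by default "\n\n") and measures chunk length by number of characters. Because it only cuts at that separator, a piece that does not contain it can end up longer than the requested chunk size.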

In [1]:
from langchain.text_splitter import CharacterTextSplitter

In [10]:
# Chunk length and overlap (both counted in characters) used by the splitters built below.
chunk_size, chunk_overlap = 10, 4

In [11]:
# Single-separator splitter configured with the shared size/overlap settings.
character_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [14]:
# Sample input without any whitespace or newlines: 34 characters with a single "." in the middle.
text1 = "abcdefghijklmnopq.rstuvwxyzabcdefg"

In [15]:
# text1 contains no "\n\n", so the whole string comes back as one chunk even though it is longer than chunk_size.
character_splitter.split_text(text1)

['abcdefghijklmnopq.rstuvwxyzabcdefg']

In [16]:
# Sample input made of two paragraphs separated by one blank line ("\n\n").
text2 = """
Data that Speak
LLM Applications are revolutionizing industries such as 
banking, healthcare, insurance, education, legal, tourism, 
construction, logistics, marketing, sales, customer service, 
and even public administration.

The aim of our programs is for students to learn how to 
create LLM Applications in the context of a business, 
which presents a set of challenges that are important 
to consider in advance.
"""

In [17]:
# The text is cut only at the blank line, giving two paragraph-sized chunks;
# an oversized piece is kept whole and a warning about its size is emitted.
character_splitter.split_text(text2)

Created a chunk of size 227, which is longer than the specified 10


['Data that Speak\nLLM Applications are revolutionizing industries such as \nbanking, healthcare, insurance, education, legal, tourism, \nconstruction, logistics, marketing, sales, customer service, \nand even public administration.',
 'The aim of our programs is for students to learn how to \ncreate LLM Applications in the context of a business, \nwhich presents a set of challenges that are important \nto consider in advance.']

In [19]:
# Count the chunks; the text is split again here, so the size warning is emitted a second time.
len(character_splitter.split_text(text2))

Created a chunk of size 227, which is longer than the specified 10


2

## Recursive Character Splitter
This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [21]:
# Recursive splitter built with the same size/overlap settings as the character splitter.
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [22]:
# text1 has no newline or space, so the splitter ends up cutting between characters:
# the result is a series of 10-character windows that overlap by 4.
recursive_splitter.split_text(text1)

['abcdefghij', 'ghijklmnop', 'mnopq.rstu', 'rstuvwxyza', 'xyzabcdefg']

In [23]:
# With a 10-character limit, short words are grouped or kept whole, while longer
# words are cut into overlapping pieces.
recursive_splitter.split_text(text2)

['Data that',
 'Speak',
 'LLM',
 'Applicati',
 'cations',
 'are',
 'revolutio',
 'utionizing',
 'industrie',
 'tries',
 'such as',
 'banking,',
 'healthcar',
 'hcare,',
 'insurance',
 'ance,',
 'education',
 'tion,',
 'legal,',
 'tourism,',
 'construct',
 'ruction,',
 'logistics',
 'tics,',
 'marketing',
 'ting,',
 'sales,',
 'customer',
 'service,',
 'and even',
 'public',
 'administr',
 'istration.',
 'The aim',
 'aim of',
 'of our',
 'programs',
 'is for',
 'students',
 'to learn',
 'how to',
 'create',
 'LLM',
 'Applicati',
 'cations',
 'in the',
 'context',
 'of a',
 'business,',
 'which',
 'presents',
 'a set of',
 'challenge',
 'enges',
 'that are',
 'important',
 'to',
 'consider',
 'in',
 'advance.']

In [24]:
# Larger chunks with a custom separator list: paragraphs, lines, sentence ends, words, characters.
# The third separator is a regular expression (a lookbehind that splits right after ". "),
# so the splitter is told to interpret separators as regexes; without is_separator_regex=True
# the pattern would be matched as literal text and never fire.
# The raw string avoids the invalid "\." escape sequence of the non-raw form.
second_recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)

In [25]:
# Display the chunks obtained with chunk_size=150 and no overlap.
second_recursive_splitter.split_text(text2)

['Data that Speak\nLLM Applications are revolutionizing industries such as \nbanking, healthcare, insurance, education, legal, tourism,',
 'construction, logistics, marketing, sales, customer service, \nand even public administration.',
 'The aim of our programs is for students to learn how to \ncreate LLM Applications in the context of a business,',
 'which presents a set of challenges that are important \nto consider in advance.']

In [26]:
# Keep the resulting list of chunk strings in a variable for later inspection.
chunks = second_recursive_splitter.split_text(text2)

In [27]:
# Number of chunks produced by the 150-character recursive splitter.
len(chunks)

4

## Adding helpful metadata to the text chunks

In [28]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [30]:
# Sample markdown document with three header levels (#, ##, ###).
# NOTE: this is a non-raw string, so each "\n" is a real newline and a trailing
# backslash is a line continuation; the text mixes escaped and physical newlines.
document_with_markdown = """
# Title: My book\n\n \

## Chapter 1: The day I was born\n\n \
I was born in a very sunny day of summer...\n\n \

### Section 1.1: My family \n\n \
My father had a big white car... \n\n 

## Chapter 2: My school\n\n \
My first day at the school was...\n\n \

"""

In [31]:
# (markdown header prefix, metadata key) pairs, ordered from the top header level down.
headers_to_split_on = [("#", "Book title"), ("##", "Chapter"), ("###", "Section")]

In [32]:
# Splitter that breaks a markdown document at the configured header levels.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [33]:
# Split the sample document; each resulting Document carries its enclosing headers as metadata.
md_header_splits = markdown_splitter.split_text(document_with_markdown)

In [34]:
# First split: the Chapter 1 text, tagged with the book title and the chapter.
md_header_splits[0]

Document(page_content='I was born in a very sunny day of summer...', metadata={'Book title': 'Title: My book', 'Chapter': 'Chapter 1: The day I was born'})

In [23]:
# Second split: the nested section, whose metadata also includes the Section header.
md_header_splits[1]

Document(page_content='My father had a big white car...', metadata={'Book title': 'Title: My book', 'Chapter': 'Chapter 1: The day I was born', 'Section': 'Section 1.1: My family'})

In [24]:
# Third split: the Chapter 2 text; its metadata has no Section key.
md_header_splits[2]

Document(page_content='My first day at the school was...', metadata={'Book title': 'Title: My book', 'Chapter': 'Chapter 2: My school'})