In [36]:
from langchain.text_splitter import (CharacterTextSplitter,
                                     RecursiveJsonSplitter,
                                     HTMLHeaderTextSplitter,
                                     MarkdownHeaderTextSplitter,
                                     RecursiveCharacterTextSplitter)

In [30]:
from langchain_text_splitters import HTMLSectionSplitter

In [2]:
# Load the raw text that the character-based splitter cells below operate on.
# Despite its name, `state_of_the_union` simply holds the full contents of
# data/output.txt.
with open("data/output.txt") as source_file:
    state_of_the_union = source_file.read()

In [7]:
# Demonstration splitter: the chunk size is deliberately tiny (100, with an
# overlap of 20) so that the effect of splitting is easy to see in the output.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,  # chunk length is measured with the built-in len()
    chunk_overlap=20,
    chunk_size=100,
)

In [8]:
# Split the loaded file contents (passed as a one-element list) with the
# character splitter. NOTE: the name `texts` is rebound later in this notebook
# by the JSON-splitting cell, so this value is only available until then.
texts = text_splitter.create_documents([state_of_the_union])


In [9]:
# Preview only the first two chunks rather than dumping the whole list.
text_splitter.split_text(state_of_the_union)[:2]

['"ID","Name","Statement Period"\n"28","PRASHANT GORDHANDAS SHETH","01-06-2024 TO 30-06-2024"',
 '"ISIN","Security","Transaction Particulars","Date","Op. Bal","Credit","Debit","Cl. Bal","Stamp Duty']

In [11]:
import requests

# Download the OpenAPI spec that serves as sample JSON for the JSON splitter.
# - timeout: without one, requests waits indefinitely on an unresponsive
#   server and the cell never finishes.
# - raise_for_status(): fail loudly on a 4xx/5xx response instead of trying to
#   parse an error body as if it were the spec.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()



In [12]:
# JSON-aware splitter capped at max_chunk_size=300
# (presumably characters of serialized JSON — confirm against the library docs).
splitter = RecursiveJsonSplitter(max_chunk_size=300)



In [13]:
# Split the downloaded JSON into smaller chunks using the JSON splitter.
json_chunks = splitter.split_json(json_data=json_data)


In [14]:
# Two alternative views of the same JSON split:
#   docs  - produced by create_documents (takes a list of JSON objects)
#   texts - produced by split_text (takes a single JSON object)
# NOTE: `texts` rebinds the name assigned earlier by the character-splitter cell.
docs = splitter.create_documents(texts=[json_data])
texts = splitter.split_text(json_data=json_data)



In [19]:
# Report how many chunks split_text produced for the JSON document.
print(len(texts))

1700


In [22]:

# Small HTML sample with nested <div>s and h1/h2/h3 headings, used to show how
# the header-based splitter groups text under its enclosing headings.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Some intro text about Foo.</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about Bar.</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar.</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar.</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
    </div>
</body>
</html>
"""

# (tag, metadata key) pairs for heading levels 1-3: ("h1", "Header 1"), ...
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (1, 2, 3)]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)

# Display the resulting documents as the cell output.
html_header_splits

[Document(page_content='Foo'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Some intro text about Foo.  \nBar main section Bar subsection 1 Bar subsection 2'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, page_content='Some intro text about Bar.'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 1'}, page_content='Some text about the first subtopic of Bar.'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 2'}, page_content='Some text about the second subtopic of Bar.'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Baz'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='Some text about Baz'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Some concluding text about Foo')]

In [23]:
# Page to fetch and split by headers in the following cells.
url = "https://plato.stanford.edu/entries/goedel/"

# (tag, metadata key) pairs for heading levels 1-4: ("h1", "Header 1"), ...
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

In [24]:
# Rebuild the header splitter so it uses the current headers_to_split_on list.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


In [25]:
# Fetch the page at `url` and split it by headers (requires network access).
# This rebinds html_header_splits, replacing the result from the inline HTML sample.
html_header_splits = html_splitter.split_text_from_url(url)


In [26]:
# Second-stage splitter: header-based sections can still be long, so they are
# cut further into chunks of at most 500 with an overlap of 30.
chunk_size, chunk_overlap = 500, 30
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)

In [27]:
# Apply the character splitter to the header-based documents, then show a
# five-item slice from the middle instead of the full list.
splits = text_splitter.split_documents(html_header_splits)
splits[80:85]


[Document(metadata={'Header 1': 'Kurt Gödel', 'Header 2': '2. Gödel’s Mathematical Work', 'Header 3': '2.2 The Incompleteness Theorems', 'Header 4': '2.2.1 The First Incompleteness Theorem'}, page_content='We see that Gödel first tried to reduce the consistency problem for analysis to that of arithmetic. This seemed to require a truth definition for arithmetic, which in turn led to paradoxes, such as the Liar paradox (“This sentence is false”) and Berry’s paradox (“The least number not defined by an expression consisting of just fourteen English words”). Gödel then noticed that such paradoxes would not necessarily arise if truth were replaced by provability. But this means that arithmetic truth'),
 Document(metadata={'Header 1': 'Kurt Gödel', 'Header 2': '2. Gödel’s Mathematical Work', 'Header 3': '2.2 The Incompleteness Theorems', 'Header 4': '2.2.1 The First Incompleteness Theorem'}, page_content='means that arithmetic truth and arithmetic provability are not co-extensive — whence th

In [34]:


# Section-based splitting of the sample HTML: split on <div> elements and store
# the matched section under the "divisions" metadata key.
divs_to_split_on = [("div", "divisions")]

html_splitter = HTMLSectionSplitter(headers_to_split_on=divs_to_split_on)
html_splits = html_splitter.split_text(html_string)

# Display the resulting documents as the cell output.
html_splits

[Document(metadata={'divisions': 'Foo\nSome intro text about Foo.\n\nBar main section\nSome intro text about Bar.\nBar subsection 1\nSome text about the first subtopic of Bar.\nBar subsection 2\nSome text about the second subtopic of Bar.\n\n\nBaz\nSome text about Baz\n\n\nSome concluding text about Foo'}, page_content='Foo \n Some intro text about Foo.'),
 Document(metadata={'divisions': 'Bar main section\nSome intro text about Bar.\nBar subsection 1\nSome text about the first subtopic of Bar.\nBar subsection 2\nSome text about the second subtopic of Bar.'}, page_content='Bar main section \n Some intro text about Bar. \n Bar subsection 1 \n Some text about the first subtopic of Bar. \n Bar subsection 2 \n Some text about the second subtopic of Bar.'),
 Document(metadata={'divisions': 'Baz\nSome text about Baz'}, page_content='Baz \n Some text about Baz \n \n \n Some concluding text about Foo')]

In [35]:
# Second-stage split of the <div>-based sections produced by HTMLSectionSplitter.
chunk_size = 500
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split the section documents (html_splits), not html_header_splits: the latter
# holds the header-based split of the downloaded web page, whereas this cell's
# recorded output carries the 'divisions' metadata of the section split.
splits = text_splitter.split_documents(html_splits)
splits

[Document(metadata={'divisions': 'Foo\nSome intro text about Foo.\n\nBar main section\nSome intro text about Bar.\nBar subsection 1\nSome text about the first subtopic of Bar.\nBar subsection 2\nSome text about the second subtopic of Bar.\n\n\nBaz\nSome text about Baz\n\n\nSome concluding text about Foo'}, page_content='Foo \n Some intro text about Foo.'),
 Document(metadata={'divisions': 'Bar main section\nSome intro text about Bar.\nBar subsection 1\nSome text about the first subtopic of Bar.\nBar subsection 2\nSome text about the second subtopic of Bar.'}, page_content='Bar main section \n Some intro text about Bar. \n Bar subsection 1 \n Some text about the first subtopic of Bar. \n Bar subsection 2 \n Some text about the second subtopic of Bar.'),
 Document(metadata={'divisions': 'Baz\nSome text about Baz'}, page_content='Baz \n Some text about Baz \n \n \n Some concluding text about Foo')]

In [37]:
# Markdown sample with three heading levels, used to show header-based
# splitting of Markdown text.
markdown_document = "# Foo\n\n    ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly"

# (marker, metadata key) pairs for heading levels 1-3: ("#", "Header 1"), ...
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Display the resulting documents as the cell output.
md_header_splits

[Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}, page_content='Hi this is Jim  \nHi this is Joe'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}, page_content='Hi this is Lance'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='Hi this is Molly')]