#### Recursively split text by characters

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

In [5]:
# Load TDP.txt as LangChain Document objects. The encoding is passed explicitly
# so decoding does not depend on the platform's default text encoding.
docs = TextLoader("TDP.txt", encoding="utf-8").load()

In [6]:
# Break the loaded documents into chunks of up to 100 characters, letting
# neighbouring chunks share up to 10 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)
final_doc = text_splitter.split_documents(docs)

In [8]:
# Show the chunks produced by split_documents; each keeps the loader's
# `source` metadata, and consecutive chunks repeat a few words at the boundary.
final_doc

[Document(metadata={'source': 'TDP.txt'}, page_content='The Telugu Desam Party follows a pro-Telugu ideology. It was founded as an alternative to the'),
 Document(metadata={'source': 'TDP.txt'}, page_content='to the Congress hegemony, by emphasizing Telugu regional pride and serving as the party for'),
 Document(metadata={'source': 'TDP.txt'}, page_content='party for farmers, backward castes and middle-class people. Since the 1990s, it has followed an'),
 Document(metadata={'source': 'TDP.txt'}, page_content='an economically liberal policy that has been seen as pro-business and pro-development.[19]'),
 Document(metadata={'source': 'TDP.txt'}, page_content='The TDP uses yellow as the background colour for its flag, with a hut, wheel and plough symbol in'),
 Document(metadata={'source': 'TDP.txt'}, page_content="symbol in the foreground. The party's electoral symbol is bicycle.")]

In [9]:
# Read the whole file into a plain string. The encoding is given explicitly so
# decoding does not depend on the platform's locale default.
with open("TDP.txt", encoding="utf-8") as f:
    txt = f.read()

In [14]:
# Same recursive splitter, now with a 50-character chunk size, applied to the
# raw string instead of loader-produced documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
)
text = text_splitter.create_documents([txt])

In [15]:
# Show the chunks built from the raw string; their metadata is empty because
# create_documents was given only the text.
text

[Document(metadata={}, page_content='The Telugu Desam Party follows a pro-Telugu'),
 Document(metadata={}, page_content='ideology. It was founded as an alternative to the'),
 Document(metadata={}, page_content='to the Congress hegemony, by emphasizing Telugu'),
 Document(metadata={}, page_content='Telugu regional pride and serving as the party'),
 Document(metadata={}, page_content='the party for farmers, backward castes and'),
 Document(metadata={}, page_content='and middle-class people. Since the 1990s, it has'),
 Document(metadata={}, page_content='it has followed an economically liberal policy'),
 Document(metadata={}, page_content='policy that has been seen as pro-business and'),
 Document(metadata={}, page_content='and pro-development.[19]'),
 Document(metadata={}, page_content='The TDP uses yellow as the background colour for'),
 Document(metadata={}, page_content='for its flag, with a hut, wheel and plough symbol'),
 Document(metadata={}, page_content="symbol in the foreground.

#### Character text splitter

In [16]:
from langchain_text_splitters import CharacterTextSplitter

In [17]:
# Split the loaded documents on blank lines ("\n\n") only, requesting
# 100-character chunks with a 10-character overlap.
test_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=100,
    chunk_overlap=10,
)
final_docs = test_splitter.split_documents(docs)

Created a chunk of size 352, which is longer than the specified 100


In [18]:
# Show the resulting chunks: one per paragraph, even though the first is
# longer than the requested chunk_size (see the warning above).
final_docs

[Document(metadata={'source': 'TDP.txt'}, page_content='The Telugu Desam Party follows a pro-Telugu ideology. It was founded as an alternative to the Congress hegemony, by emphasizing Telugu regional pride and serving as the party for farmers, backward castes and middle-class people. Since the 1990s, it has followed an economically liberal policy that has been seen as pro-business and pro-development.[19]'),
 Document(metadata={'source': 'TDP.txt'}, page_content="The TDP uses yellow as the background colour for its flag, with a hut, wheel and plough symbol in the foreground. The party's electoral symbol is bicycle.")]

In [19]:
# Read the raw file contents. The encoding is given explicitly so decoding
# does not depend on the platform's locale default.
with open("TDP.txt", encoding="utf-8") as f:
    txt = f.read()

# CharacterTextSplitter splits only on its separator (left at its default
# here), so a paragraph longer than chunk_size is kept whole and a warning is
# logged instead of the text being cut further.
text_splitter = CharacterTextSplitter(chunk_size=50, chunk_overlap=10)
text = text_splitter.create_documents([txt])

Created a chunk of size 352, which is longer than the specified 50


In [20]:
# Show the chunks built from the raw string: still one per paragraph, with
# empty metadata.
text

[Document(metadata={}, page_content='The Telugu Desam Party follows a pro-Telugu ideology. It was founded as an alternative to the Congress hegemony, by emphasizing Telugu regional pride and serving as the party for farmers, backward castes and middle-class people. Since the 1990s, it has followed an economically liberal policy that has been seen as pro-business and pro-development.[19]'),
 Document(metadata={}, page_content="The TDP uses yellow as the background colour for its flag, with a hut, wheel and plough symbol in the foreground. The party's electoral symbol is bicycle.")]

#### HTML header text splitter

In [21]:
from langchain_text_splitters import HTMLHeaderTextSplitter

In [22]:
# Sample page used as splitter input: an <h1> section that contains nested
# <h2> and <h3> sections, plus intro and concluding paragraphs.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Some intro text about Foo.</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about Bar.</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar.</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar.</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
    </div>
</body>
</html>
"""

# (tag, metadata key) pairs for heading levels 1-3: the tag to split on and
# the key under which that heading's text appears in each chunk's metadata.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in (1, 2, 3)]

In [24]:
# Build a splitter keyed on the configured heading tags, then split the
# sample HTML string into header-scoped documents.
html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)
html_header_splits = html_splitter.split_text(html_string)

In [25]:
# Show the header-based splits; each chunk's metadata records the headings
# it sits under.
html_header_splits

[Document(metadata={}, page_content='Foo'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Some intro text about Foo.  \nBar main section Bar subsection 1 Bar subsection 2'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, page_content='Some intro text about Bar.'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 1'}, page_content='Some text about the first subtopic of Bar.'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 2'}, page_content='Some text about the second subtopic of Bar.'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Baz'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='Some text about Baz'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Some concluding text about Foo')]