# HTML Text Splitter

`HTMLHeaderTextSplitter` is a "structure-aware" chunker that splits text at the HTML element level and adds metadata for each header "relevant" to any given chunk. It can return chunks element by element or combine elements with the same metadata, with the objectives of keeping related text grouped semantically and preserving the context-rich information encoded in document structures. It can be used with other text splitters as part of a chunking pipeline.

In [1]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# Sample document: one <h1>, two <h2> sections (each in its own <div>), and
# two <h3> subsections nested under the first <h2>.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>Foo</h1>
        <p>Sone intro text about Foo</p>
        <div>
            <h2>Bar main section</h2>
            <p>Some intro text about bar</p>
            <h3>Bar subsection 1</h3>
            <p>Some text about the first subtopic of Bar</p>
            <h3>Bar subsection 2</h3>
            <p>Some text about the second subtopic of Bar</p>
        </div>
        <div>
            <h2>Baz</h2>
            <p>Some text about Baz</p>
        </div>
        <br>
        <p>Some concluding text about Foo</p>
        
    </div>
</body>
</html>
"""

# Pair each header tag with the metadata key its text is stored under:
# [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]
headers_to_split_on = [
    (f"h{level}", f"Header {level}") for level in range(1, 4)
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the in-memory HTML; each resulting Document carries the headers that
# apply to it in its metadata.
html_header_splits = html_splitter.split_text(html_string)

# Bare expression so the notebook displays the resulting list.
html_header_splits


[Document(metadata={'Header 1': 'Foo'}, page_content='Foo'),
 Document(metadata={'Header 1': 'Foo'}, page_content='Sone intro text about Foo'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, page_content='Bar main section'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section'}, page_content='Some intro text about bar'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 1'}, page_content='Bar subsection 1'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 1'}, page_content='Some text about the first subtopic of Bar'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 2'}, page_content='Bar subsection 2'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar main section', 'Header 3': 'Bar subsection 2'}, page_content='Some text about the second subtopic of Bar'),
 Document(metadata={'Header 1': 

In [2]:
# Page to download and split (running this cell requires network access).
url = "https://en.wikipedia.org/wiki/Virat_Kohli"

# Same tag-to-metadata-key mapping as before, extended down to <h4>:
# [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3"), ("h4", "Header 4")]
headers_to_split_on = [
    (f"h{level}", f"Header {level}") for level in range(1, 5)
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Fetch the page at `url` and split it on the configured headers.
html_header_splits = html_splitter.split_text_from_url(url)

# Bare expression so the notebook displays the resulting list.
html_header_splits


[Document(metadata={}, page_content='Jump to content  \nMain menu  \nMain menu  \nmove to sidebar  \nhide  \nNavigation  \nMain page  \nContents  \nCurrent events  \nRandom article  \nAbout Wikipedia  \nContact us  \nContribute  \nHelp  \nLearn to edit  \nCommunity portal  \nRecent changes  \nUpload file  \nSpecial pages  \nSearch  \nSearch  \nAppearance  \nDonate  \nCreate account  \nLog in  \nPersonal tools  \nDonate  \nCreate account  \nLog in  \nPages for logged out editors  \nlearn more  \nContributions  \nTalk  \nCentralNotice'),
 Document(metadata={'Header 2': 'Contents'}, page_content='Contents'),
 Document(metadata={}, page_content='move to sidebar  \nhide  \n(Top)  \n1  \nEarly life  \n2  \nYouth career  \nToggle Youth career subsection  \n2.1  \nDelhi team  \n2.2  \nIndia U19 team  \n3  \nInternational career  \nToggle International career subsection  \n3.1  \nDebut and maiden stint (2008–2009)  \n3.2  \nRise through the ranks (2010–2011)  \n3.3  \nCaptaincy resignation and 