In [10]:
from typing import Any, Dict, List

from bs4 import BeautifulSoup
from langchain_text_splitters import HTMLHeaderTextSplitter

class HTMLSplitter:
    """Split HTML content into sections keyed by header tags.

    Thin wrapper around ``HTMLHeaderTextSplitter`` that fixes the set of
    tags to split on and exposes a single ``split_html_content`` method.
    """

    def __init__(self,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200):
        """Store the size settings and the tag/metadata-key pairs.

        Args:
            chunk_size: Size setting kept on the instance. It is only
                stored; ``split_html_content`` does not read it.
            chunk_overlap: Overlap setting kept on the instance. It is only
                stored; ``split_html_content`` does not read it.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # (tag, metadata key) pairs handed to HTMLHeaderTextSplitter.
        # NOTE(review): "div" is a container rather than a heading tag --
        # confirm that treating it as a header is the intended behavior.
        self.headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            ("h3", "Header 3"),
            ("div", "Division")
        ]

    def split_html_content(self, html_content: str) -> List[Any]:
        """Split an HTML string on the configured header tags.

        Args:
            html_content: The HTML markup to split.

        Returns:
            Whatever ``HTMLHeaderTextSplitter.split_text`` returns, passed
            through unchanged. According to the LangChain documentation
            these are ``Document`` objects (text in ``page_content``, header
            values in ``metadata``) rather than plain strings, which is why
            the annotation is ``List[Any]`` and not ``List[str]``.
        """
        # A new splitter is built per call so that later changes to
        # ``self.headers_to_split_on`` are picked up.
        splitter = HTMLHeaderTextSplitter(headers_to_split_on=self.headers_to_split_on)
        return splitter.split_text(html_content)

In [7]:
def test_html_splitting():
    """Split a small sample page and print every resulting chunk."""
    # One h1 followed by two "section" divs, each with its own h2.
    page = """
    <html>
        <body>
            <h1>AI Applications</h1>
            <div class="section">
                <h2>Machine Learning</h2>
                <p>Machine learning is transforming industries.</p>
                <ul>
                    <li>Predictive Analytics</li>
                    <li>Pattern Recognition</li>
                </ul>
            </div>
            <div class="section">
                <h2>Deep Learning</h2>
                <p>Neural networks are advancing rapidly.</p>
            </div>
        </body>
    </html>
    """

    pieces = HTMLSplitter().split_html_content(page)

    # Chunks are numbered from 1 for display.
    for number, piece in enumerate(pieces, start=1):
        print(f"\nChunk {number}:")
        print(piece)

**Additional Examples**

In [None]:
def test_complex_layouts():
   """Example 1: split a page with a sidebar and an article, then print the chunks."""
   # Sidebar div plus a main-content div whose article holds h2/h3 sections.
   portal_page = """
   <html>
       <body>
           <h1>AI Research Portal</h1>
           <div class="sidebar">
               <h2>Categories</h2>
               <ul>
                   <li>Machine Learning</li>
                   <li>Neural Networks</li>
               </ul>
           </div>
           <div class="main-content">
               <article>
                   <h2>Latest Research</h2>
                   <section>
                       <h3>Transformer Models</h3>
                       <p>Recent advances in transformer architectures...</p>
                   </section>
                   <section>
                       <h3>Results</h3>
                       <p>Performance benchmarks show...</p>
                   </section>
               </article>
           </div>
       </body>
   </html>
   """
   pieces = HTMLSplitter().split_html_content(portal_page)

   print("\nComplex Layout Results:")
   # Chunks are numbered from 1 for display.
   for number, piece in enumerate(pieces, start=1):
      print(f"\nChunk {number}:")
      print(piece)

def test_nested_structures():
   """Example 2: split markup with several levels of nested divs, then print the chunks."""
   # Four nested divs carrying h1 through h4 headings.
   course_markup = """
   <div class="course-module">
       <h1>Programming Course</h1>
       <div class="lesson">
           <h2>Python Basics</h2>
           <div class="section">
               <h3>Variables</h3>
               <p>Understanding Python variables</p>
               <div class="example">
                   <h4>Example Code</h4>
                   <code>x = 42</code>
               </div>
           </div>
       </div>
   </div>
   """
   pieces = HTMLSplitter().split_html_content(course_markup)

   print("\nNested Structure Results:")
   # Chunks are numbered from 1 for display.
   for number, piece in enumerate(pieces, start=1):
      print(f"\nChunk {number}:")
      print(piece)

def test_interactive_elements():
   """Example 3: split a registration form's markup, then print the chunks."""
   # A form with two "form-group" divs, each with an h3, a label and an input.
   registration_markup = """
   <h1>User Registration</h1>
   <div class="registration-form">
       <h2>Create Account</h2>
       <form>
           <div class="form-group">
               <h3>Personal Information</h3>
               <label>Name:</label>
               <input type="text">
           </div>
           <div class="form-group">
               <h3>Account Details</h3>
               <label>Email:</label>
               <input type="email">
           </div>
       </form>
   </div>
   """
   pieces = HTMLSplitter().split_html_content(registration_markup)

   print("\nInteractive Elements Results:")
   # Chunks are numbered from 1 for display.
   for number, piece in enumerate(pieces, start=1):
      print(f"\nChunk {number}:")
      print(piece)

def test_dynamic_content():
   """Example 4: split dashboard markup containing template placeholders, then print the chunks."""
   # The {{...}} placeholders are literal text here; this is not an f-string.
   dashboard_markup = """
   <h1>Dashboard</h1>
   <div class="widgets">
       <h2>Analytics</h2>
       <div class="widget" id="users-online">
           <h3>Current Users</h3>
           <p>Active users: <span class="dynamic-content">{{user_count}}</span></p>
       </div>
       <div class="widget" id="system-status">
           <h3>System Status</h3>
           <p>Status: <span class="dynamic-content">{{status}}</span></p>
       </div>
   </div>
   """
   pieces = HTMLSplitter().split_html_content(dashboard_markup)

   print("\nDynamic Content Results:")
   # Chunks are numbered from 1 for display.
   for number, piece in enumerate(pieces, start=1):
      print(f"\nChunk {number}:")
      print(piece)

def run_all_examples():
   """Print a banner, then run each additional example in order."""
   print("Testing HTML Splitting Examples...")
   # Order matters only for the order of the printed output.
   examples = (
      test_complex_layouts,
      test_nested_structures,
      test_interactive_elements,
      test_dynamic_content,
   )
   for example in examples:
      example()

# Run every additional example when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
   run_all_examples()