In [1]:
from unstructured import partition
from unstructured.partition.html import partition_html
from unstructured.chunking.title import chunk_by_title
import os


In [None]:
# Test file
saved_file = "raw_html/Weapons.html"


In [None]:
def partition_saved_html_with_strategy(html_file_path):

    try:
        # Check if file exists
        if not os.path.exists(html_file_path):
            print(f"Error: File {html_file_path} does not exist")
            return None
        
        # Partition the HTML file with section-based chunking
        elements = partition_html(
            filename=html_file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy="by_title",
            include_page_breaks=True,
            # Section-based chunking parameters
            max_characters=10000,  # Adjust based on your needs
            combine_text_under_n_chars= 100,  # Combine small text fragments
            )
        
        # print(f"Successfully partitioned {html_file_path}")
        # print(f"Found {len(elements)} elements")
        
        return elements
        
    except Exception as e:
        print(f"Error partitioning HTML file: {e}")
        return None

In [52]:
element = partition_saved_html_with_strategy(saved_file)

Successfully partitioned raw_html/Weapons.html
Found 15 elements


In [53]:
element

[<unstructured.documents.elements.CompositeElement at 0x1c168a28fd0>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a28310>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a29850>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2a190>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a29d90>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2a3d0>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2a510>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2a7d0>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2a910>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2ab50>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2ae90>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2af90>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2b210>,
 <unstructured.documents.elements.CompositeElement at 0x1c168a2b410>,
 <unstructured.docum

In [77]:
def partition_files(files):
    texts = []
    tables = []

    for file in files:
        print (file)
        elements = partition_saved_html_with_strategy("raw_html/" + file)

        for i in elements:
            page_title = file.split(".")[0]
            section_title = page_title

            if "CompositeElement" in str(type(i)):
                for j in i.metadata.orig_elements:
                    if "Title" in str(type(j)):
                        section_title = j.text
                    elif "Table" in str(type(j)):
                        table={}
                        table["page_title"] = page_title
                        table["section_title"] = section_title
                        table["table"] = j
                        tables.append(table)
                    else:
                        text={}
                        text["page_title"] = page_title
                        text["section_title"] = section_title
                        text["text"] = j
                        texts.append(text)
                        
            elif "Table" in str(type(i)):
                table={}
                table["page_title"] = page_title
                table["section_title"] = section_title
                table["table"] = i
                tables.append(table)
            else:
                text={}
                text["page_title"] = page_title
                text["section_title"] = section_title
                text["text"] = i
                texts.append(text)
    return texts, tables



In [78]:
from pathlib import Path

raw_html_path = Path('raw_html')
file_names = [f.name for f in raw_html_path.iterdir() if f.is_file()]
file_names = file_names[100:110]

In [80]:
texts, tables = partition_files(file_names)

Anchor.html
Successfully partitioned raw_html/Anchor.html
Found 6 elements
Anchor_(Furniture).html
Successfully partitioned raw_html/Anchor_(Furniture).html
Found 6 elements
Anchovy.html
Successfully partitioned raw_html/Anchovy.html
Found 9 elements
Ancient_Doll.html
Successfully partitioned raw_html/Ancient_Doll.html
Found 8 elements
Ancient_Drum.html
Successfully partitioned raw_html/Ancient_Drum.html
Found 6 elements
Ancient_Fruit.html
Successfully partitioned raw_html/Ancient_Fruit.html
Found 9 elements
Ancient_Fruit_Seeds.html
Successfully partitioned raw_html/Ancient_Fruit_Seeds.html
Found 5 elements
Ancient_Seed.html
Successfully partitioned raw_html/Ancient_Seed.html
Found 9 elements
Ancient_Seeds.html
Successfully partitioned raw_html/Ancient_Seeds.html
Found 5 elements
Ancient_Sword.html
Successfully partitioned raw_html/Ancient_Sword.html
Found 8 elements


In [81]:
tables

[{'page_title': 'Anchor',
  'section_title': 'Anchor',
  'table': <unstructured.documents.elements.Table at 0x1c168b33f50>},
 {'page_title': 'Anchor',
  'section_title': 'Gifting',
  'table': <unstructured.documents.elements.Table at 0x1c168bfebd0>},
 {'page_title': 'Anchor',
  'section_title': 'History',
  'table': <unstructured.documents.elements.Table at 0x1c168bf3110>},
 {'page_title': 'Anchor_(Furniture)',
  'section_title': 'Anchor (furniture)',
  'table': <unstructured.documents.elements.Table at 0x1c168bf1310>},
 {'page_title': 'Anchor_(Furniture)',
  'section_title': 'Anchor_(Furniture)',
  'table': <unstructured.documents.elements.TableChunk at 0x1c168b31850>},
 {'page_title': 'Anchor_(Furniture)',
  'section_title': 'Anchor_(Furniture)',
  'table': <unstructured.documents.elements.TableChunk at 0x1c168ac4250>},
 {'page_title': 'Anchovy',
  'section_title': 'Anchovy',
  'table': <unstructured.documents.elements.Table at 0x1c168c1ff90>},
 {'page_title': 'Anchovy',
  'section_t

In [27]:
element[3].metadata.orig_elements[1].to_dict()

{'type': 'NarrativeText',
 'element_id': '07b64c01-cb27-493b-908e-2280e8e33ba7',
 'text': 'Clubs are heavy weapons that tend to be slower than swords and daggers but with a wide swing. Their secondary attack causes the player to slam the ground in front of them, causing an area-of-effect attack in a radius around the player that has strong knockback. Damage dealt by this attack will never be a critical hit.[1]',
 'metadata': {'link_texts': ['[1]'],
  'link_urls': ['#cite_note-hammercrit-1'],
  'last_modified': '2025-10-10T20:33:16'}}