In [1]:
# from unstructured import partition
from unstructured.partition.html import partition_html
# from unstructured.chunking.title import chunk_by_title
import os
from tqdm import tqdm
import json
import requests
import re


In [2]:
def partition_saved_html_with_strategy(html_file_path):

    try:
        # Check if file exists
        if not os.path.exists(html_file_path):
            print(f"Error: File {html_file_path} does not exist")
            return None
        
        # Partition the HTML file with section-based chunking
        elements = partition_html(
            filename=html_file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy="by_title",
            include_page_breaks=True,
            # Section-based chunking parameters
            max_characters=10000,  # Adjust based on your needs
            combine_text_under_n_chars= 100,  # Combine small text fragments
            )
        
        # print(f"Successfully partitioned {html_file_path}")
        # print(f"Found {len(elements)} elements")
        
        return elements
        
    except Exception as e:
        print(f"Error partitioning HTML file: {e}")
        return None

In [3]:
EXCLUDED_SECTIONS = [
    "references", "reference", "navigation", "navigation menu", "history",
    "see also", "notes", "external links", "trivia", "gallery",
    "quotes", "bugs", "changelog", "patch history", "credits",
    "footnotes", "footer", "acknowledgements", "disclaimer"
]

def should_exclude_section(section_title: str) -> bool:
    if not section_title:
        return False
    normalized = section_title.strip().lower()
    return any(excluded in normalized for excluded in EXCLUDED_SECTIONS)

In [4]:
def partition_files(files):
    texts = []
    tables = []

    for file in tqdm(files):
        # print (file)
        elements = partition_saved_html_with_strategy("raw_html/" + file)

        if not elements:
            continue

        for i in elements:
            page_title = file.split(".")[0]
            section_title = page_title

            if "CompositeElement" in str(type(i)):
                for j in i.metadata.orig_elements:
                    if "Title" in str(type(j)):
                        section_title = j.text.strip()
                    elif "Table" in str(type(j)):
                        if not should_exclude_section(section_title):
                            tables.append({
                                "page_title": page_title,
                                "section_title": section_title,
                                "table": j,
                            })
                    else:
                        if not should_exclude_section(section_title):
                            text_content = j.text.strip() if hasattr(j, "text") else str(j).strip()
                            if len(text_content) >= 100:
                                texts.append({
                                    "page_title": page_title,
                                    "section_title": section_title,
                                    "text": text_content,
                                })

            elif "Table" in str(type(i)):
                if not should_exclude_section(section_title):
                    tables.append({
                        "page_title": page_title,
                        "section_title": section_title,
                        "table": i,
                    })
            else:
                if not should_exclude_section(section_title):
                    text_content = i.text.strip() if hasattr(i, "text") else str(i).strip()
                    if len(text_content) >= 100:
                        texts.append({
                            "page_title": page_title,
                            "section_title": section_title,
                            "text": text_content,
                        })

    return texts, tables



In [5]:
from pathlib import Path

raw_html_path = Path('raw_html')
file_names = [f.name for f in raw_html_path.iterdir() if f.is_file()]

In [6]:
texts, tables = partition_files(file_names)

100%|██████████| 2683/2683 [05:27<00:00,  8.19it/s]


In [77]:
len(texts), len(tables)

(19039, 12010)

In [7]:
summarize_prompt = """
You are an assistant tasked with summarizing the tables. 
Give a concise and short summary of the table.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table chunk: {element}
"""

In [8]:
test_element = partition_saved_html_with_strategy("raw_html/Weapons.html")
swords_table = test_element[3].metadata.orig_elements[2].metadata.text_as_html
test_propmpt = summarize_prompt.format(element=swords_table)

In [9]:
from openai import OpenAI

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-5-nano",
    messages=[{"role": "user", "content": test_propmpt}]
)

result3 = response.choices[0].message.content
result3

'28 swords listed with level, damage range, critical chance, stat bonuses, acquisition location, and buy/sell prices; ranges from level 1 Rusty Sword (2–5 dmg) to level 17 Infinity Blade (80–100 dmg) with varied bonuses (speed, defense, weight, vampiric power) and sources (mines, guilds, chests, fishing, festivals, dungeons).'

In [10]:
import asyncio
import re
import time
from tqdm.asyncio import tqdm_asyncio
from openai import AsyncOpenAI, RateLimitError, APIError, APITimeoutError

In [31]:
MAX_CONCURRENT = 15          # 2–5 is usually safe
RETRY_LIMIT = 3
BACKOFF_BASE = 2            # seconds to wait * attempt number

In [None]:
client = AsyncOpenAI()

# customize limits safely (OpenAI rate limits depend on your account tier)


async def summarize_one_table(table_info, sem):
    """Summarize a single table with retry + backoff."""
    async with sem:
        html_content = getattr(table_info["table"].metadata, "text_as_html", "")
        if not html_content:
            table_info["summary"] = "[No table HTML]"
            return table_info

        # truncate long tables to avoid token overflow
        html_content = html_content[:8000]
        prompt = summarize_prompt.format(element=html_content)

        for attempt in range(RETRY_LIMIT):
            try:
                response = await client.chat.completions.create(
                    model="gpt-5-nano",
                    messages=[{"role": "user", "content": prompt}],
                    timeout=60,
                )
                summary = response.choices[0].message.content
                summary = re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip()
                table_info["summary"] = summary
                return table_info

            except (RateLimitError, APIError, APITimeoutError) as e:
                wait_time = BACKOFF_BASE * (attempt + 1)
                print(f"Rate/API error ({attempt+1}/{RETRY_LIMIT}) for {table_info['page_title']}: waiting {wait_time}s -> {type(e).__name__}")
                await asyncio.sleep(wait_time)

            except Exception as e:
                print(f"Unexpected error ({attempt+1}/{RETRY_LIMIT}) for {table_info['page_title']}: {e}")
                await asyncio.sleep(2 * (attempt + 1))

        table_info["summary"] = "[FAILED TO SUMMARIZE]"
        return table_info


async def summarize_tables_async(tables):
    """Main async driver"""
    sem = asyncio.Semaphore(MAX_CONCURRENT)
    tasks = [summarize_one_table(t, sem) for t in tables]
    results = await tqdm_asyncio.gather(*tasks, desc="Summarizing tables", total=len(tasks))
    return results


def summarize_tables(tables):
    """Sync wrapper for normal scripts"""
    import nest_asyncio, asyncio
    nest_asyncio.apply()
    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            return asyncio.ensure_future(summarize_tables_async(tables))
        else:
            return loop.run_until_complete(summarize_tables_async(tables))
    except RuntimeError:
        return asyncio.run(summarize_tables_async(tables))

In [None]:
summarized_tables = summarize_tables(tables[100:1000])

Summarizing tables:   0%|          | 0/900 [00:00<?, ?it/s]

Summarizing tables: 100%|██████████| 900/900 [26:29<00:00,  1.77s/it]


In [15]:
summarized_tables

<Task finished name='Task-1' coro=<summarize_tables_async() done, defined at C:\Users\eghiasva\AppData\Local\Temp\ipykernel_12316\3811060167.py:45> result=[{'page_title': '%271000_Years_From_Now%27', 'section_title': "'1000 Years From Now'", 'summary': "The item '10...nnot be sold.", 'table': <unstructured...002247E7D1940>}, {'page_title': '%271000_Years_From_Now%27', 'section_title': '%271000_Years_From_Now%27', 'summary': 'A categorize...ach category.', 'table': <unstructured...002247E7D2900>}, {'page_title': '%271000_Years_From_Now%27', 'section_title': '%271000_Years_From_Now%27', 'summary': 'Two categori...d Catalogue).', 'table': <unstructured...002247E5FD810>}, {'page_title': '%27Abstract%27', 'section_title': "'Abstract'", 'summary': 'Indoor-only ...not sellable.', 'table': <unstructured...002247E6056E0>}, {'page_title': '%27Abstract%27', 'section_title': '%27Abstract%27', 'summary': 'Catalog of f...ach category.', 'table': <unstructured...002247E5FE990>}, {'page_title': '%27Ab

In [22]:
tables[99]

{'page_title': '%27Spires%27',
 'section_title': "'Spires'",
 'table': <unstructured.documents.elements.Table at 0x2247e7cb310>,
 'summary': "Spires is indoor-only furniture and cannot be placed outdoors. Sources and prices: Carpenter's Shop 800g; Traveling Cart 250–2,500g; Casino 3,000g; Furniture Catalogue 0g. Sell price: cannot be sold."}

In [26]:
tables[388]

{'page_title': 'Abby',
 'section_title': 'Six Hearts',
 'table': <unstructured.documents.elements.Table at 0x22404e2eb30>,
 'summary': 'Abigail, in the graveyard, wants adventures and asks about swords; player dialogue changes friendship: +10 for two positive replies, -100 for a dangerous reply, and 0 for "No." Pierre interrupts about cooking; Abigail blames gender expectations, they hide to talk, and the cutscene ends with her asking you to untangle her hair.'}

In [None]:
summarized_tables = summarize_tables(tables[1000:2000])

Summarizing tables: 100%|██████████| 1000/1000 [09:52<00:00,  1.69it/s]


In [34]:
tables[1998]

{'page_title': 'Chef%27s_Bundle',
 'section_title': 'Pantry',
 'table': <unstructured.documents.elements.Table at 0x2240631de80>,
 'summary': 'Fall Crops Bundle includes Corn (Summer/Fall), Eggplant, Pumpkin, and Yam (Fall; 3% drop from Duggies on Mines floors 6–29); reward is a Bee House (1).'}

In [None]:
summarized_tables = summarize_tables(tables[2000:4000])

Summarizing tables: 100%|██████████| 2000/2000 [16:11<00:00,  2.06it/s]


In [36]:
tables[3999]

{'page_title': 'Fall_Crops_Bundle',
 'section_title': 'Traveling Cart Availability',
 'table': <unstructured.documents.elements.Table at 0x224069aeba0>,
 'summary': 'Three columns of item lists with asterisks marking silver+ quality. Column 1: Aged Roe, Amethyst*, Ancient Doll*, Ancient Fruit, Aquamarine, Blobfish*, Bone Fragment*, Diamond*. Column 2: Dinosaur, Mayonnaise, Earth Crystal, Emerald*, Fire Quartz, Frozen Geode, Frozen Tear, Hay, Lava Eel*, Prismatic Shard. Column 3: Quartz Roe*, Ruby*, Squid Ink*, Sweet Gem Berry*, Topaz*, Void Salmon, Wheat Flour*, White Algae*.'}

In [None]:
summarized_tables = summarize_tables(tables[4000:8000])

Summarizing tables: 100%|██████████| 4000/4000 [30:38<00:00,  2.18it/s]


In [38]:
tables[7999]

{'page_title': 'Pepper',
 'section_title': 'Hot Pepper',
 'table': <unstructured.documents.elements.Table at 0x2240c07a270>,
 'summary': 'Hot Pepper: Fiery hot with a hint of sweetness. Seed: Pepper Seeds. Growth time: 5 days. Season: Summer. XP: 9 Farming XP. Energy/Health: 13 5 18 8 23 10 33 14. Sell prices: Base Tiller (+10%) 40g 50g 60g 80g; 44g 55g 66g 88g. Artisan Sell Prices (+40%) 120g 150g 180g 240g 130g 325g 168g 210g 252g 336g 182g 455g.'}

In [None]:
summarized_tables = summarize_tables(tables[8000:12010])

Summarizing tables: 100%|██████████| 4010/4010 [31:18<00:00,  2.14it/s] 


In [41]:
tables[-1]

{'page_title': 'Yoba%E2%80%99s_Blessing',
 'section_title': 'Available Buffs',
 'table': <unstructured.documents.elements.Table at 0x2240c88aeb0>,
 'summary': '- The table lists temporary buffs that alter player stats (Farming, Mining, Fishing, Foraging, Attack, Defense, Max Energy, Luck, Magnetic Radius, Speed) or grant special effects (invincibility, debuffs, immunity, etc.).\n- Buffs are primarily triggered by consuming various foods or ingredients (and some drinks) or by specific actions (e.g., killing enemies with certain rings or using Monster Musk).\n- Durations are mostly variable, but a few have fixed short durations: Monster Musk 10m, Oil of Garlic 10m, Squid Ink Ravioli 3m, Tipsy 30s, Adrenaline Rush 3s, Warrior Energy 5s, Yoba’s Blessing 5s, Burnt 6s, Darkness 5s, Frozen 2s, Jinxed 8s, Nauseated 2m, Slimed 2.5–3s, Weakness 10s.'}

In [49]:
tables2= tables.copy()

In [60]:
def serialize_tables(tables):
    serialized = []
    for t in tables:
        try:
            html = getattr(t["table"].metadata, "text_as_html", None)
            if not html and hasattr(t["table"], "text"):
                html = t["table"].text
            serialized.append({
                "page_title": t["page_title"],
                "section_title": t["section_title"],
                "table_html": html or "[No HTML available]",
                "summary": t.get("summary", "")
            })
        except Exception as e:
            serialized.append({
                "page_title": t["page_title"],
                "section_title": t["section_title"],
                "table_html": "[Serialization error]",
                "summary": t.get("summary", ""),
                "error": str(e)
            })
    return serialized

In [69]:
serialized_tables = serialize_tables(tables2)

In [70]:
serialized_tables[1000]

{'page_title': 'Barbed_Hook',
 'section_title': 'Barbed Hook',
 'table_html': '<table><tr><td>Barbed Hook</td></tr><tr><td/></tr><tr><td>Makes your catch more secure, causing the &quot;fishing bar&quot; to cling to your catch. Works best on slow, weak fish.</td></tr><tr><td>Information</td></tr><tr><td>Source</td><td>Fish Shop • Crafting • Festival of Ice</td></tr><tr><td>Sell Price</td><td>data-sort-value=&quot;500&quot;&gt; 500g</td></tr><tr><td>Crafting</td></tr><tr><td>Recipe Source</td><td>Fishing (Level 8)</td></tr><tr><td>Ingredients</td><td>Copper Bar (1) Iron Bar (1) Gold Bar (1)</td></tr></table>',
 'summary': 'Barbed Hook: a fishing item that makes catches more secure (best for slow/weak fish); available from Fish Shop, Crafting, and Festival of Ice; sells for 500g; crafted from Copper Bar, Iron Bar, and Gold Bar at Fishing level 8.'}

In [71]:

with open("summarized_tables.json", "w", encoding="utf-8") as f:
    json.dump(serialized_tables, f, ensure_ascii=False, indent=2)


In [72]:
with open("summarized_tables2.jsonl", "w", encoding="utf-8") as f:
    for row in serialized_tables:
        json.dump(row, f, ensure_ascii=False)
        f.write("\n")

In [76]:
len(texts)

19039

In [79]:
with open("summarized_texts.json", "w", encoding="utf-8") as f:
    json.dump(texts, f, ensure_ascii=False, indent=2)


In [80]:
with open("summarized_texts2.jsonl", "w", encoding="utf-8") as f:
    for row in texts:
        json.dump(row, f, ensure_ascii=False)
        f.write("\n")