## Reading the raw HTML files and using `unstructured` to chunk them into texts and tables

In [None]:
from unstructured.partition.html import partition_html
import os
from tqdm import tqdm
import json
import requests
import re
import asyncio
import time
from tqdm.asyncio import tqdm_asyncio
from openai import AsyncOpenAI, RateLimitError, APIError, APITimeoutError


In [None]:
def partition_saved_html_with_strategy(html_file_path):
    """Partition a saved HTML file into title-based chunks.

    Args:
        html_file_path: Path of the HTML file on disk.

    Returns:
        Whatever ``partition_html`` returns for the file, or ``None`` when
        the file does not exist or partitioning raises. Problems are
        printed rather than raised.
    """
    # Keyword arguments forwarded verbatim to partition_html.
    partition_options = {
        "filename": html_file_path,
        "infer_table_structure": True,
        "strategy": "hi_res",
        "chunking_strategy": "by_title",
        "include_page_breaks": True,
        # Section-based chunking parameters
        "max_characters": 10000,
        "combine_text_under_n_chars": 100,  # combine small text fragments
    }

    try:
        if os.path.exists(html_file_path):
            return partition_html(**partition_options)
        print(f"Error: File {html_file_path} does not exist")
    except Exception as e:
        print(f"Error partitioning HTML file: {e}")

    return None

In [None]:
# Lowercase section headings whose content is skipped during extraction.
EXCLUDED_SECTIONS = [
    "references", "reference", "navigation", "navigation menu", "history",
    "see also", "notes", "external links", "trivia", "gallery",
    "quotes", "bugs", "changelog", "patch history", "credits",
    "footnotes", "footer", "acknowledgements", "disclaimer"
]


def should_exclude_section(section_title: str) -> bool:
    """Tell whether a section heading names an excluded section.

    The heading is stripped and lowercased, then checked for any entry of
    ``EXCLUDED_SECTIONS`` appearing as a whole word or phrase. Matching on
    word boundaries keeps "Notes and references" excluded while avoiding
    false positives such as "Preferences" (contains "reference") or
    "Prehistory" (contains "history").

    Args:
        section_title: Heading text; ``None`` or an empty string is allowed.

    Returns:
        ``True`` if the section should be skipped, ``False`` otherwise
        (including for an empty or missing heading).
    """
    if not section_title:
        return False
    normalized = section_title.strip().lower()
    # The list is read on every call so later edits to it are honoured;
    # the re module caches the compiled patterns.
    return any(
        re.search(rf"\b{re.escape(excluded)}\b", normalized)
        for excluded in EXCLUDED_SECTIONS
    )

In [None]:
_MIN_TEXT_CHARS = 100  # text fragments shorter than this are discarded


def _element_kind(element):
    """Return the class name of *element* (e.g. ``"Title"`` or ``"Table"``)."""
    return type(element).__name__


def partition_files(files, html_dir="raw_html"):
    """Partition HTML files and collect their text and table records.

    Each file is partitioned with ``partition_saved_html_with_strategy``.
    Files that yield no elements are skipped. The current section heading
    is tracked per file: it starts as the page title and is updated every
    time a title element is seen. It is deliberately NOT reset between
    chunks, because by-title chunking emits tables as chunks of their own
    and may split a long section over several chunks; those chunks belong
    to the most recent heading. Content under a heading rejected by
    ``should_exclude_section`` is dropped.

    Args:
        files: File names (not full paths) located inside ``html_dir``.
        html_dir: Directory containing the files. Defaults to ``"raw_html"``.

    Returns:
        A ``(texts, tables)`` tuple. ``texts`` holds dicts with
        ``page_title``, ``section_title`` and ``text`` (only fragments of
        at least 100 characters); ``tables`` holds dicts with
        ``page_title``, ``section_title`` and ``table`` (the element).
    """
    texts = []
    tables = []

    for file in tqdm(files):
        elements = partition_saved_html_with_strategy(os.path.join(html_dir, file))
        if not elements:
            continue

        # splitext removes only the final extension, so a dotted name such
        # as "Dr. Foo.html" keeps its full title instead of becoming "Dr".
        page_title = os.path.splitext(file)[0]
        section_title = page_title

        for element in elements:
            if "CompositeElement" in _element_kind(element):
                # Walk the original elements of a combined chunk; fall back
                # to the chunk itself when they were not preserved.
                parts = getattr(element.metadata, "orig_elements", None) or [element]
            else:
                parts = [element]

            for part in parts:
                kind = _element_kind(part)

                if "Title" in kind:
                    section_title = part.text.strip()
                    continue

                if should_exclude_section(section_title):
                    continue

                if "Table" in kind:
                    tables.append({
                        "page_title": page_title,
                        "section_title": section_title,
                        "table": part,
                    })
                    continue

                text_content = part.text.strip() if hasattr(part, "text") else str(part).strip()
                if len(text_content) >= _MIN_TEXT_CHARS:
                    texts.append({
                        "page_title": page_title,
                        "section_title": section_title,
                        "text": text_content,
                    })

    return texts, tables



In [None]:
from pathlib import Path

# Directory with the raw HTML files, and the names (not full paths) of the
# regular files directly inside it.
# NOTE(review): every file is listed regardless of extension; confirm the
# directory contains only HTML pages.
raw_html_path = Path('raw_html')
file_names = [f.name for f in raw_html_path.iterdir() if f.is_file()]

In [None]:
# Partition every listed file into text records and table records.
texts, tables = partition_files(file_names)

In [None]:
# Display how many text and table records were extracted.
len(texts), len(tables)

## Generating a summary for each table to help us in the retrieval process

In [None]:
# Prompt template for table summarization. The ``{element}`` placeholder is
# filled with the table's HTML via ``str.format`` in ``summarize_one_table``.
summarize_prompt = """
You are an assistant tasked with summarizing the tables. 
Give a concise and short summary of the table.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table chunk: {element}
"""

In [None]:
# Size of the semaphore that bounds how many summaries run at once.
MAX_CONCURRENT = 15          
# Number of attempts per table before it is marked as failed.
RETRY_LIMIT = 3
# Linear backoff: the wait after a failed attempt is BACKOFF_BASE * attempt.
BACKOFF_BASE = 2            # seconds to wait * attempt number

In [None]:
# Shared async OpenAI client used for every summarization request. It is
# built without explicit arguments (presumably credentials come from the
# environment; confirm).
client = AsyncOpenAI()

async def summarize_one_table(table_info, sem):
    """Summarize a single table with retry + backoff."""
    async with sem:
        html_content = getattr(table_info["table"].metadata, "text_as_html", "")
        if not html_content:
            table_info["summary"] = "[No table HTML]"
            return table_info

        # truncate long tables to avoid token overflow
        html_content = html_content[:8000]
        prompt = summarize_prompt.format(element=html_content)

        for attempt in range(RETRY_LIMIT):
            try:
                response = await client.chat.completions.create(
                    model="gpt-5-nano",
                    messages=[{"role": "user", "content": prompt}],
                    timeout=60,
                )
                summary = response.choices[0].message.content
                summary = re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip()
                table_info["summary"] = summary
                return table_info

            except (RateLimitError, APIError, APITimeoutError) as e:
                wait_time = BACKOFF_BASE * (attempt + 1)
                print(f"Rate/API error ({attempt+1}/{RETRY_LIMIT}) for {table_info['page_title']}: waiting {wait_time}s -> {type(e).__name__}")
                await asyncio.sleep(wait_time)

            except Exception as e:
                print(f"Unexpected error ({attempt+1}/{RETRY_LIMIT}) for {table_info['page_title']}: {e}")
                await asyncio.sleep(2 * (attempt + 1))

        table_info["summary"] = "[FAILED TO SUMMARIZE]"
        return table_info


async def summarize_tables_async(tables):
    """Summarize all tables concurrently behind a progress bar.

    One ``summarize_one_table`` coroutine is created per table; they share
    a semaphore of size ``MAX_CONCURRENT``. Returns the gathered results.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENT)
    pending = []
    for table_info in tables:
        pending.append(summarize_one_table(table_info, limiter))
    return await tqdm_asyncio.gather(
        *pending, desc="Summarizing tables", total=len(pending)
    )


def summarize_tables(tables):
    """Synchronously summarize *tables*, from a script or a notebook.

    Always blocks until every summary is done and returns the results of
    ``summarize_tables_async``. It never returns an unfinished task, so
    code that runs next can rely on the summaries being present.

    Args:
        tables: Table records accepted by ``summarize_tables_async``.

    Returns:
        The list returned by ``summarize_tables_async``.
    """
    import asyncio

    import nest_asyncio

    # Make the event loop re-entrant so that run_until_complete also works
    # when a loop is already running (e.g. inside Jupyter).
    nest_asyncio.apply()

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running (plain script): start one for this call.
        return asyncio.run(summarize_tables_async(tables))

    return loop.run_until_complete(summarize_tables_async(tables))

In [None]:
# Summaries are written into the dicts of `tables` in place.
summarized_tables = summarize_tables(tables)
# Shallow copy: a new list that shares the same dict objects as `tables`.
tables2= tables.copy()

In [None]:
def serialize_tables(tables):
    """Convert table records into JSON-serializable dicts.

    Each output dict carries ``page_title``, ``section_title``,
    ``table_html`` and ``summary`` (empty string when absent). The HTML is
    taken from ``metadata.text_as_html``, falling back to the element's
    ``text`` and finally to a placeholder. If building a record raises, a
    record with ``"[Serialization error]"`` and an ``error`` message is
    emitted instead.
    """

    def _markup(table):
        # Prefer the HTML rendering; fall back to plain text if available.
        markup = getattr(table.metadata, "text_as_html", None)
        if markup:
            return markup
        return table.text if hasattr(table, "text") else markup

    records = []
    for entry in tables:
        try:
            markup = _markup(entry["table"])
            record = {
                "page_title": entry["page_title"],
                "section_title": entry["section_title"],
                "table_html": markup if markup else "[No HTML available]",
                "summary": entry.get("summary", ""),
            }
        except Exception as exc:
            record = {
                "page_title": entry["page_title"],
                "section_title": entry["section_title"],
                "table_html": "[Serialization error]",
                "summary": entry.get("summary", ""),
                "error": str(exc),
            }
        records.append(record)
    return records



In [None]:
# Convert the table records into JSON-serializable dicts.
serialized_tables = serialize_tables(tables2)

In [None]:
# Persist the serialized tables. The output directory is created first so
# the write cannot fail on a fresh checkout after the summaries were paid for.
os.makedirs("data", exist_ok=True)
with open("data/summarized_tables.json", "w", encoding="utf-8") as f:
    json.dump(serialized_tables, f, ensure_ascii=False, indent=2)


In [None]:
# Persist the extracted text records as they are (no summarization is
# applied to them here). The output directory is created if missing.
os.makedirs("data", exist_ok=True)
with open("data/summarized_texts.json", "w", encoding="utf-8") as f:
    json.dump(texts, f, ensure_ascii=False, indent=2)
