In [1]:
# -----------------------------------------------------------
# Import Required Libraries
# -----------------------------------------------------------

import re
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# -----------------------------------------------------------
# Define Structured DOM Parsing Function (Version 2)
# -----------------------------------------------------------

def parse_patch_notes_structured(filepath):
    '''
    Reads the Diablo IV patch notes HTML file.
    Uses BeautifulSoup to preserve DOM structure.
    Traverses elements sequentially (no get_text flattening).
    Detects patch headers using regex.
    Tracks hierarchical context dynamically.
    Returns a structured pandas DataFrame.
    '''

    # -----------------------------------------------------------
    # STEP 1: Read and Clean HTML
    # -----------------------------------------------------------

    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    for tag in soup(["script", "style"]):
        tag.decompose()


    # -----------------------------------------------------------
    # STEP 2: Patch Header Pattern
    # -----------------------------------------------------------

    patch_pattern = re.compile(
        r"(\d+\.\d+\.\d+)\s+Build\s+#(\d+).*—(.+)"
    )


    # -----------------------------------------------------------
    # STEP 3: Initialize Context State
    # -----------------------------------------------------------

    records = []

    current_patch = None
    current_build = None
    current_date = None

    current_major_section = None
    current_subsection = None
    current_category = None


    # -----------------------------------------------------------
    # STEP 4: Traverse DOM Sequentially
    # -----------------------------------------------------------

    for element in soup.find_all(["p", "li", "h1", "h2", "h3", "h4", "h5"]):

        text = element.get_text(strip=True)

        if not text:
            continue

        # --- Detect Patch Header ---
        patch_match = patch_pattern.match(text)

        if patch_match:
            current_patch = patch_match.group(1)
            current_build = patch_match.group(2)
            current_date = patch_match.group(3)

            # Reset section context when new patch starts
            current_major_section = None
            current_subsection = None
            current_category = None

            continue


        # --- Skip anything before first patch header ---
        if current_patch is None:
            continue


        # -----------------------------------------------------------
        # SECTION DETECTION (Dynamic)
        # -----------------------------------------------------------

        if text in [
            "Bug Fixes", "Game Updates", "Balance Update",
            "Expansion", "Base Game", "Accessibility",
            "Dungeons and The Pit", "Gameplay",
            "User Interface and User Experience",
            "Miscellaneous", "Activities"
        ]:
            current_major_section = text
            current_subsection = None
            current_category = None
            continue


        # Subsection detection (class names, etc.)
        if text in [
            "Barbarian", "Druid", "Sorcerer",
            "Rogue", "Necromancer", "Paladin",
            "Spiritborn"
        ]:
            current_subsection = text
            current_category = None
            continue


        # Category detection
        if text in [
            "Skills", "Passives", "Items",
            "Legendary Aspects", "Paragon",
            "Tempering", "Enchantments",
            "Unique Items"
        ]:
            current_category = text
            continue


        # -----------------------------------------------------------
        # Record Actual Content Nodes
        # -----------------------------------------------------------

        records.append({
            "patch": current_patch,
            "build": current_build,
            "date": current_date,
            "major_section": current_major_section,
            "subsection": current_subsection,
            "category": current_category,
            "tag_type": element.name,
            "full_text": text
        })


    # -----------------------------------------------------------
    # STEP 5: Build DataFrame
    # -----------------------------------------------------------

    df = pd.DataFrame(records)

    return df

In [3]:
# -----------------------------------------------------------
# Run Structured Parser (Version 2)
# -----------------------------------------------------------

input_file = "Diablo_IV_Patch_Notes.html"

df_v2 = parse_patch_notes_structured(input_file)

print("✅ Structured Parsing Complete.")
print("Total structured records extracted:", len(df_v2))

✅ Structured Parsing Complete.
Total structured records extracted: 0
