In [1]:
# -----------------------------------------------------------
# Import Required Libraries
# -----------------------------------------------------------

import re
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# -----------------------------------------------------------
# Define Structured DOM Parsing Function (Version 2)
# -----------------------------------------------------------

def parse_patch_notes_structured(filepath):
    '''
    Reads the Diablo IV patch notes HTML file.
    Uses BeautifulSoup to preserve DOM structure.
    Traverses elements sequentially (no get_text flattening).
    Detects patch headers using regex.
    Tracks hierarchical context dynamically.
    Returns a structured pandas DataFrame.
    '''

    # -----------------------------------------------------------
    # STEP 1: Read and Clean HTML
    # -----------------------------------------------------------

    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    for tag in soup(["script", "style"]):
        tag.decompose()


    # -----------------------------------------------------------
    # STEP 2: Patch Header Pattern
    # -----------------------------------------------------------

    patch_pattern = re.compile(
        r"(\d+\.\d+\.\d+)\s+Build\s+#(\d+).*—(.+)"
    )


    # -----------------------------------------------------------
    # STEP 3: Initialize Context State
    # -----------------------------------------------------------

    records = []

    current_patch = None
    current_build = None
    current_date = None

    current_major_section = None
    current_subsection = None
    current_category = None


    # -----------------------------------------------------------
    # STEP 4: Traverse DOM Sequentially
    # -----------------------------------------------------------

    for element in soup.find_all(["p", "li", "h1", "h2", "h3", "h4", "h5"]):

        text = element.get_text(strip=True)

        if not text:
            continue

        # --- Detect Patch Header ---
        patch_match = patch_pattern.match(text)

        if patch_match:
            current_patch = patch_match.group(1)
            current_build = patch_match.group(2)
            current_date = patch_match.group(3)

            # Reset section context when new patch starts
            current_major_section = None
            current_subsection = None
            current_category = None

            continue


        # --- Skip anything before first patch header ---
        if current_patch is None:
            continue


        # -----------------------------------------------------------
        # SECTION DETECTION (Dynamic)
        # -----------------------------------------------------------

        if text in [
            "Bug Fixes", "Game Updates", "Balance Update",
            "Expansion", "Base Game", "Accessibility",
            "Dungeons and The Pit", "Gameplay",
            "User Interface and User Experience",
            "Miscellaneous", "Activities"
        ]:
            current_major_section = text
            current_subsection = None
            current_category = None
            continue


        # Subsection detection (class names, etc.)
        if text in [
            "Barbarian", "Druid", "Sorcerer",
            "Rogue", "Necromancer", "Paladin",
            "Spiritborn"
        ]:
            current_subsection = text
            current_category = None
            continue


        # Category detection
        if text in [
            "Skills", "Passives", "Items",
            "Legendary Aspects", "Paragon",
            "Tempering", "Enchantments",
            "Unique Items"
        ]:
            current_category = text
            continue


        # -----------------------------------------------------------
        # Record Actual Content Nodes
        # -----------------------------------------------------------

        records.append({
            "patch": current_patch,
            "build": current_build,
            "date": current_date,
            "major_section": current_major_section,
            "subsection": current_subsection,
            "category": current_category,
            "tag_type": element.name,
            "full_text": text
        })


    # -----------------------------------------------------------
    # STEP 5: Build DataFrame
    # -----------------------------------------------------------

    df = pd.DataFrame(records)

    return df

In [3]:
# -----------------------------------------------------------
# Run Structured Parser (Version 2)
# -----------------------------------------------------------

input_file = "Diablo_IV_Patch_Notes.html"

df_v2 = parse_patch_notes_structured(input_file)

print("✅ Structured Parsing Complete.")
print("Total structured records extracted:", len(df_v2))

✅ Structured Parsing Complete.
Total structured records extracted: 0


In [4]:
# -----------------------------------------------------------
# Inspect First 25 Structured Rows
# -----------------------------------------------------------

pd.set_option("display.max_colwidth", None)

df_v2.head(25)

In [5]:
# -----------------------------------------------------------
# DIAGNOSTIC — What tags actually exist?
# -----------------------------------------------------------

with open(input_file, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

unique_tags = sorted(set(tag.name for tag in soup.find_all(True)))

print("Unique tags found in HTML:")
print(unique_tags[:40])
print("Total unique tag types:", len(unique_tags))

Unique tags found in HTML:
['a', 'article', 'b', 'blockquote', 'blz-age-gate', 'blz-footer-icon', 'blz-footer-link', 'blz-header', 'blz-icon', 'blz-icon-button', 'blz-icon-button-icon', 'blz-icon-group', 'blz-image', 'blz-loading-spinner', 'blz-locale-selector', 'blz-logo-globe', 'blz-logo-main', 'blz-nav', 'blz-nav-content-block', 'blz-nav-content-block-private', 'blz-nav-dropdown', 'blz-nav-footer', 'blz-nav-legal-ratings-private', 'blz-nav-link', 'blz-nav-link-icon', 'blz-news-social-sharing', 'blz-timestamp', 'body', 'br', 'button', 'circle', 'clippath', 'defs', 'div', 'em', 'footer', 'g', 'h1', 'h2', 'h3']
Total unique tag types: 71


In [6]:
# -----------------------------------------------------------
# DIAGNOSTIC — Print All h2 Text
# -----------------------------------------------------------

for tag in soup.find_all("h2"):
    print("H2:", repr(tag.get_text()))

H2: 'Base Game'
H2: 'Expansion'
H2: 'Base Game'
H2: 'Base Game'
H2: 'Expansion'
H2: 'Base Game'
H2: 'Balance Update'
H2: 'Masterworking'
H2: 'Tempering'
H2: 'Toughness'
H2: 'Miscellaneous'
H2: 'Bug Fixes'
H2: 'Game Updates'
H2: 'Bug Fixes'
H2: 'Game Updates'
H2: 'Balance Updates'
H2: 'Enemy Updates'
H2: 'Economy Updates'
H2: 'Nightmare Dungeons'
H2: 'Monster Updates'
H2: 'Bug Fixes'
H2: ''
H2: 'Do Not Sell or Share My Personal Information'


In [7]:
# -----------------------------------------------------------
# DIAGNOSTIC — Locate Patch Header Node
# -----------------------------------------------------------

for tag in soup.find_all(True):
    text = tag.get_text(strip=True)
    if "2.5.3 Build" in text:
        print("TAG:", tag.name)
        print("TEXT:", repr(text))
        print("PARENT:", tag.parent.name)
        print("-" * 50)
        break

TAG: html
TEXT: 'Diablo IV Patch Notes — Diablo IV — Blizzard NewsundefinedDiablo IVDiablo IV Patch NotesBlizzard EntertainmentThe Diablo IV team has been monitoring your feedback. As we introduce patches to keep your experience in Sanctuary smooth, we will update the below list and denote whether the fixes are intended for PC, Xbox, PlayStation™, or all platforms.2.5.3 Build #70356 (All Platforms)—January 28, 2026Bug FixesFixed an issue where the Executioner Monster Affix sound effect played continuously.Developer’s Note:This affix will be re-enabled with the release of this patch.Fixed an issue where certain Silent Chests in Nahantu did not count towards the Season Rank objective Test of Luck.Fixed an issue where Zagraal in the Dark Citadel didn\'t drop loot.Fixed an issue where some Tower bosses had significantly more health than others.Fixed an issue where an error would occur when trying to view the profile of a leaderboard entry and the player had a private profile.Fixed an issue

In [9]:
# -----------------------------------------------------------
# DIAGNOSTIC — Locate Patch Header Node
# -----------------------------------------------------------

for tag in soup.find_all(True):
    text = tag.get_text(strip=True)
    if "2.5.3 Build" in text:
        print("TAG:", tag.name)
        print("TEXT:", repr(text))
        print("PARENT:", tag.parent.name)
        print("GRANDPARENT:", tag.parent.name)
        print("-" * 60)
        break

TAG: html
TEXT: 'Diablo IV Patch Notes — Diablo IV — Blizzard NewsundefinedDiablo IVDiablo IV Patch NotesBlizzard EntertainmentThe Diablo IV team has been monitoring your feedback. As we introduce patches to keep your experience in Sanctuary smooth, we will update the below list and denote whether the fixes are intended for PC, Xbox, PlayStation™, or all platforms.2.5.3 Build #70356 (All Platforms)—January 28, 2026Bug FixesFixed an issue where the Executioner Monster Affix sound effect played continuously.Developer’s Note:This affix will be re-enabled with the release of this patch.Fixed an issue where certain Silent Chests in Nahantu did not count towards the Season Rank objective Test of Luck.Fixed an issue where Zagraal in the Dark Citadel didn\'t drop loot.Fixed an issue where some Tower bosses had significantly more health than others.Fixed an issue where an error would occur when trying to view the profile of a leaderboard entry and the player had a private profile.Fixed an issue

In [10]:
# -----------------------------------------------------------
# DIAGNOSTIC — Locate Patch Header Node
# -----------------------------------------------------------

for tag in soup.find_all(True):
    text = tag.get_text(strip=True)
    if "2.5.3 Build" in text:
        print("TAG:", tag.name)
        print("TEXT:", repr(text))
        print("PARENT:", tag.parent.name if tag.parent else None)
        print("GRANDPARENT:", tag.parent.parent.name if tag.parent and tag.parent.parent else None)
        print("-" * 60)
        break

TAG: html
TEXT: 'Diablo IV Patch Notes — Diablo IV — Blizzard NewsundefinedDiablo IVDiablo IV Patch NotesBlizzard EntertainmentThe Diablo IV team has been monitoring your feedback. As we introduce patches to keep your experience in Sanctuary smooth, we will update the below list and denote whether the fixes are intended for PC, Xbox, PlayStation™, or all platforms.2.5.3 Build #70356 (All Platforms)—January 28, 2026Bug FixesFixed an issue where the Executioner Monster Affix sound effect played continuously.Developer’s Note:This affix will be re-enabled with the release of this patch.Fixed an issue where certain Silent Chests in Nahantu did not count towards the Season Rank objective Test of Luck.Fixed an issue where Zagraal in the Dark Citadel didn\'t drop loot.Fixed an issue where some Tower bosses had significantly more health than others.Fixed an issue where an error would occur when trying to view the profile of a leaderboard entry and the player had a private profile.Fixed an issue

In [11]:
# -----------------------------------------------------------
# DIAGNOSTIC — Recreate Soup + Locate Patch Header Node
# -----------------------------------------------------------

from bs4 import BeautifulSoup

with open("Diablo_IV_Patch_Notes.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

found = False

for tag in soup.find_all(True):
    text = tag.get_text(strip=True)
    if "2.5.3 Build" in text:
        print("TAG:", tag.name)
        print("TEXT:", repr(text))
        print("PARENT:", tag.parent.name if tag.parent else None)
        print("GRANDPARENT:", tag.parent.parent.name if tag.parent and tag.parent.parent else None)
        print("GREAT-GRANDPARENT:", tag.parent.parent.parent.name 
              if tag.parent and tag.parent.parent and tag.parent.parent.parent else None)
        print("-" * 60)
        found = True
        break

if not found:
    print("❌ '2.5.3 Build' not found anywhere in parsed DOM.")

TAG: html
TEXT: 'Diablo IV Patch Notes — Diablo IV — Blizzard NewsundefinedDiablo IVDiablo IV Patch NotesBlizzard EntertainmentThe Diablo IV team has been monitoring your feedback. As we introduce patches to keep your experience in Sanctuary smooth, we will update the below list and denote whether the fixes are intended for PC, Xbox, PlayStation™, or all platforms.2.5.3 Build #70356 (All Platforms)—January 28, 2026Bug FixesFixed an issue where the Executioner Monster Affix sound effect played continuously.Developer’s Note:This affix will be re-enabled with the release of this patch.Fixed an issue where certain Silent Chests in Nahantu did not count towards the Season Rank objective Test of Luck.Fixed an issue where Zagraal in the Dark Citadel didn\'t drop loot.Fixed an issue where some Tower bosses had significantly more health than others.Fixed an issue where an error would occur when trying to view the profile of a leaderboard entry and the player had a private profile.Fixed an issue

In [12]:
# -----------------------------------------------------------
# DIAGNOSTIC — Does raw file contain patch header string?
# -----------------------------------------------------------

with open("Diablo_IV_Patch_Notes.html", "r", encoding="utf-8") as f:
    raw_html = f.read()

print("Contains '2.5.3 Build'? ->", "2.5.3 Build" in raw_html)
print("Contains 'Executioner Monster'? ->", "Executioner Monster" in raw_html)

Contains '2.5.3 Build'? -> True
Contains 'Executioner Monster'? -> True


In [13]:
# -----------------------------------------------------------
# DIAGNOSTIC — Locate Exact Patch Header Text Node
# -----------------------------------------------------------

from bs4 import NavigableString

with open("Diablo_IV_Patch_Notes.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

for element in soup.descendants:
    if isinstance(element, NavigableString):
        if "2.5.3 Build" in element:
            print("FOUND TEXT NODE:")
            print(repr(element.strip()))
            print("PARENT TAG:", element.parent.name)
            print("GRANDPARENT:", element.parent.parent.name)
            print("-" * 60)
            break

FOUND TEXT NODE:
'2.5.3 Build #70356 (All Platforms)—January 28, 2026'
PARENT TAG: a
GRANDPARENT: div
------------------------------------------------------------
