In [1]:
# -----------------------------------------------------------
# Import Required Libraries
# -----------------------------------------------------------

import re
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime

In [2]:
# -----------------------------------------------------------
# Define Robust Text-Based Parsing Function (Final Version)
# -----------------------------------------------------------

def parse_patch_notes_text(filepath):
    '''
    Reads the Diablo IV patch notes HTML file.
    Extracts ordered text nodes directly.
    Uses regex-based state tracking for structure.
    Returns structured DataFrame.
    '''

    # -----------------------------------------------------------
    # STEP 1: Read HTML
    # -----------------------------------------------------------

    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")


    # -----------------------------------------------------------
    # STEP 2: Define Patterns
    # -----------------------------------------------------------

    patch_pattern = re.compile(
        r"(\d+\.\d+\.\d+)\s+Build\s+#(\d+).*?([A-Za-z]+\s+\d{1,2},\s+\d{4})"
    )

    section_headers = {
        "Bug Fixes", "Game Updates", "Balance Update",
        "Balance Updates", "Expansion", "Base Game",
        "Accessibility", "Gameplay", "Miscellaneous",
        "Activities", "Enemy Updates", "Economy Updates",
        "Monster Updates", "Nightmare Dungeons",
        "Masterworking", "Tempering", "Toughness"
    }

    category_headers = {
        "Skills", "Passives", "Items",
        "Legendary Aspects", "Paragon",
        "Enchantments", "Unique Items"
    }


    # -----------------------------------------------------------
    # STEP 3: Initialize State
    # -----------------------------------------------------------

    records = []

    current_patch = None
    current_build = None
    current_date = None

    current_section = None
    current_category = None

    previous_buffer = None


    # -----------------------------------------------------------
    # STEP 4: Iterate Text Nodes in Order
    # -----------------------------------------------------------

    for element in soup.descendants:

        if not isinstance(element, NavigableString):
            continue

        text = element.strip()

        if not text:
            continue


        # --- Detect Patch Header ---
        patch_match = patch_pattern.search(text)

        if patch_match:
            current_patch = patch_match.group(1)
            current_build = patch_match.group(2)
            current_date = patch_match.group(3)

            current_section = None
            current_category = None

            continue


        # Skip anything before first patch header
        if current_patch is None:
            continue


        # --- Detect Section Headers ---
        if text in section_headers:
            current_section = text
            current_category = None
            continue


        # --- Detect Category Headers ---
        if text in category_headers:
            current_category = text
            continue


        # --- Detect Previous / Now Blocks ---
        if text.startswith("Previous"):
            previous_buffer = text.replace("Previous:", "").strip()
            continue

        if text.startswith("Now") and previous_buffer:
            now_text = text.replace("Now:", "").strip()

            records.append({
                "patch": current_patch,
                "build": current_build,
                "date": current_date,
                "section": current_section,
                "category": current_category,
                "change_type": "comparison",
                "previous": previous_buffer,
                "now": now_text,
                "full_text": f"Changed from {previous_buffer} to {now_text}"
            })

            previous_buffer = None
            continue


        # --- Regular Content ---
        records.append({
            "patch": current_patch,
            "build": current_build,
            "date": current_date,
            "section": current_section,
            "category": current_category,
            "change_type": "single",
            "previous": None,
            "now": None,
            "full_text": text
        })


    # -----------------------------------------------------------
    # STEP 5: Build DataFrame
    # -----------------------------------------------------------

    df = pd.DataFrame(records)

    return df

In [3]:
# -----------------------------------------------------------
# Run Robust Text Parser
# -----------------------------------------------------------

input_file = "Diablo_IV_Patch_Notes.html"

df_text = parse_patch_notes_text(input_file)

print("✅ Text-Based Parsing Complete.")
print("Total records extracted:", len(df_text))

✅ Text-Based Parsing Complete.
Total records extracted: 2281
