In [1]:
# -----------------------------------------------------------
# Import Required Libraries
# -----------------------------------------------------------

import re
import pandas as pd
from datetime import datetime

In [2]:
# -----------------------------------------------------------
# Define Clean Structured Text Parsing Function (Version 3)
# -----------------------------------------------------------

def parse_patch_notes_clean(filepath):
    '''
    Parses CLEAN Diablo IV patch notes text file.

    This version:
    - Does NOT rely on DOM structure.
    - Reads file line-by-line.
    - Detects patch headers via regex.
    - Tracks hierarchical structure dynamically.
    - Classifies change types (fix/update/balance).
    - Returns structured pandas DataFrame.
    '''

    # -----------------------------------------------------------
    # STEP 1: Read File as Plain Text
    # -----------------------------------------------------------

    with open(filepath, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f.readlines()]

    # Remove empty lines
    lines = [line for line in lines if line]


    # -----------------------------------------------------------
    # STEP 2: Patch Header Pattern
    # -----------------------------------------------------------

    patch_pattern = re.compile(
        r"(\d+\.\d+\.\d+)\s+Build\s+#(\d+).*—(.+)"
    )


    # -----------------------------------------------------------
    # STEP 3: Initialize Context State
    # -----------------------------------------------------------

    records = []

    current_patch = None
    current_build = None
    current_date = None

    current_major_section = None
    current_subsection = None
    current_category = None

    previous_text = None
    awaiting_now = False


    # -----------------------------------------------------------
    # STEP 4: Iterate Line-by-Line
    # -----------------------------------------------------------

    for line in lines:

        # --- Detect Patch Header ---
        patch_match = patch_pattern.match(line)

        if patch_match:
            current_patch = patch_match.group(1)
            current_build = patch_match.group(2)
            current_date = patch_match.group(3)

            # Reset hierarchy
            current_major_section = None
            current_subsection = None
            current_category = None

            continue


        # Skip everything before first patch header
        if current_patch is None:
            continue


        # -----------------------------------------------------------
        # SECTION DETECTION
        # -----------------------------------------------------------

        if line in [
            "Bug Fixes",
            "Game Updates",
            "Balance Update",
            "Balance Updates",
            "Enemy Updates",
            "Economy Updates",
            "Healing Updates",
            "Monster Updates"
        ]:
            current_major_section = line
            current_subsection = None
            current_category = None
            continue


        # -----------------------------------------------------------
        # SUBSECTION DETECTION (Classes / Expansion / Base Game)
        # -----------------------------------------------------------

        if line in [
            "Base Game",
            "Expansion",
            "Barbarian",
            "Druid",
            "Sorcerer",
            "Rogue",
            "Necromancer",
            "Paladin",
            "Spiritborn",
            "All Classes",
            "Vessel of Hatred"
        ]:
            current_subsection = line
            current_category = None
            continue


        # -----------------------------------------------------------
        # CATEGORY DETECTION
        # -----------------------------------------------------------

        if line in [
            "Skills",
            "Passives",
            "Items",
            "Unique Items",
            "Legendary Aspects",
            "Paragon",
            "Tempering",
            "Enchantments",
            "Affixes",
            "Power",
            "General",
            "Activities",
            "Gameplay",
            "Accessibility",
            "Miscellaneous"
        ]:
            current_category = line
            continue


        # -----------------------------------------------------------
        # HANDLE Previous / Now BLOCKS
        # -----------------------------------------------------------

        if line.startswith("Previous"):
            previous_text = line
            awaiting_now = True
            continue

        if line.startswith("Now") and awaiting_now:
            records.append({
                "patch": current_patch,
                "build": current_build,
                "date": current_date,
                "major_section": current_major_section,
                "subsection": current_subsection,
                "category": current_category,
                "change_type": "comparison",
                "previous": previous_text,
                "now": line,
                "full_text": previous_text + " " + line
            })

            previous_text = None
            awaiting_now = False
            continue


        # -----------------------------------------------------------
        # DETECT CHANGE TYPE
        # -----------------------------------------------------------

        if line.startswith("Fixed"):
            change_type = "bug_fix"
        elif any(word in line.lower() for word in ["increased", "reduced", "adjusted", "changed", "replaced"]):
            change_type = "balance_change"
        else:
            change_type = "update"


        # -----------------------------------------------------------
        # RECORD STANDARD ENTRY
        # -----------------------------------------------------------

        records.append({
            "patch": current_patch,
            "build": current_build,
            "date": current_date,
            "major_section": current_major_section,
            "subsection": current_subsection,
            "category": current_category,
            "change_type": change_type,
            "previous": None,
            "now": None,
            "full_text": line
        })


    # -----------------------------------------------------------
    # STEP 5: Build DataFrame
    # -----------------------------------------------------------

    df = pd.DataFrame(records)

    return df

In [3]:
# -----------------------------------------------------------
# Run Clean Parser (Version 3)
# -----------------------------------------------------------

input_file = "clean_patch_notes.html"

df_v3 = parse_patch_notes_clean(input_file)

print("✅ Clean Structured Parsing Complete.")
print("Total structured records extracted:", len(df_v3))

✅ Clean Structured Parsing Complete.
Total structured records extracted: 0


In [4]:
# -----------------------------------------------------------
# DIAGNOSTIC — Show First 40 Raw Lines of File
# -----------------------------------------------------------

with open("clean_patch_notes.html", "r", encoding="utf-8") as f:
    raw_preview = f.readlines()

for i, line in enumerate(raw_preview[:40]):
    print(f"{i+1}: {repr(line)}")

1: '<!DOCTYPE html><html lang="en-US"><head><title>Diablo IV Patch Notes — Diablo IV — Blizzard News</title><meta name="viewport" content="width=device-width,initial-scale=1,user-scalable=yes"><meta property="og:type" content="website"><meta http-equiv="Blz-App-Path" content="/"><meta property="og:locale" content="en_US"><link rel="canonical" href="https://news.blizzard.com/en-us/article/24244466/diablo-iv-patch-notes"/><link rel="alternate" href="https://news.blizzard.com/de-de/article/24244466/diablo-iv-patch-notes" hreflang="de-de"/><link rel="alternate" href="https://news.blizzard.com/en-gb/article/24244466/diablo-iv-patch-notes" hreflang="en-gb"/><link rel="alternate" href="https://news.blizzard.com/es-es/article/24244466/diablo-iv-patch-notes" hreflang="es-es"/><link rel="alternate" href="https://news.blizzard.com/es-mx/article/24244466/diablo-iv-patch-notes" hreflang="es-mx"/><link rel="alternate" href="https://news.blizzard.com/fr-fr/article/24244466/diablo-iv-patch-notes" href