In [1]:
# -----------------------------------------------------------
# Import Required Libraries
# -----------------------------------------------------------

import re              
import pandas as pd    
from bs4 import BeautifulSoup

In [None]:
# -----------------------------------------------------------
# Define Improved Parsing Function
# -----------------------------------------------------------

def parse_patch_notes(filepath):
    '''
    Reads the Diablo IV patch notes HTML file.
    Uses BeautifulSoup to remove scripts and styles.
    Extracts visible text only.
    Parses patch headers and change entries.
    Returns a clean pandas DataFrame.
    '''

    # -----------------------------------------------------------
    # STEP 1: Read and Clean HTML
    # -----------------------------------------------------------

    # Open file as raw HTML
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Remove script and style elements
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Extract only visible text
    text = soup.get_text(separator="\n")

    # Split into clean lines
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # -----------------------------------------------------------
    # STEP 2: Patch Header Pattern
    # -----------------------------------------------------------

    patch_pattern = re.compile(r"(\d+\.\d+\.\d+)\s+Build\s+#(\d+).*—(.+)")

    # -----------------------------------------------------------
    # STEP 3: Prepare storage
    # -----------------------------------------------------------

    records = []

    current_patch = None
    current_build = None
    current_date = None
    current_section = None

    # -----------------------------------------------------------
    # STEP 4: Loop through cleaned lines
    # -----------------------------------------------------------

    i = 0

    while i < len(lines):

        line = lines[i]

        # Detect patch header
        patch_match = patch_pattern.match(line)

        if patch_match:
            current_patch = patch_match.group(1)
            current_build = patch_match.group(2)
            current_date = patch_match.group(3)
            i += 1
            continue

        # Detect sections
        if line in [
            "Bug Fixes", "Game Updates", "Balance Update",
            "Base Game", "Expansion", "Accessibility",
            "Skills", "Passives", "Items",
            "Legendary Aspects", "Paragon",
            "Tempering", "Miscellaneous"
        ]:
            current_section = line
            i += 1
            continue

        # Detect Previous/Now comparison
        if line.startswith("Previous"):

            previous_text = line.replace("Previous:", "").strip()

            if i + 1 < len(lines) and lines[i + 1].startswith("Now"):

                now_text = lines[i + 1].replace("Now:", "").strip()

                # ✅ Only record if we have already detected a patch header
                if current_patch is not None:

                    records.append({
                        "patch": current_patch,
                        "build": current_build,
                        "date": current_date,
                        "section": current_section,
                        "change_type": "comparison",
                        "previous": previous_text,
                        "now": now_text,
                        "full_text": f"Changed from {previous_text} to {now_text}"
                    })

                i += 2
                continue

        # Only record lines AFTER first patch header is found
        if current_patch is not None:

        records.append({
            "patch": current_patch,
            "build": current_build,
            "date": current_date,
            "section": current_section,
            "change_type": "single",
            "previous": None,
            "now": None,
            "full_text": line
        })

        i += 1

    df = pd.DataFrame(records)

    return df

In [3]:
# -----------------------------------------------------------
# Run Parser
# -----------------------------------------------------------

# Assigning file to variable and creating dataframe
input_file = "Diablo_IV_Patch_Notes.html"

df = parse_patch_notes(input_file)

print("✅ Parsing complete.")
print("Total records extracted:", len(df))

✅ Parsing complete.
Total records extracted: 1283


In [4]:
# -----------------------------------------------------------
# Inspect First 20 Rows
# -----------------------------------------------------------

df.head(20)

Unnamed: 0,patch,build,date,section,change_type,previous,now,full_text
0,,,,,single,,,Diablo IV Patch Notes — Diablo IV — Blizzard News
1,,,,,single,,,undefined
2,,,,,single,,,Diablo IV
3,,,,,single,,,Diablo IV Patch Notes
4,,,,,single,,,Blizzard Entertainment
5,,,,,single,,,The Diablo IV team has been monitoring your fe...
6,2.5.3,70356.0,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where the Executioner Monster A...
7,2.5.3,70356.0,"January 28, 2026",Bug Fixes,single,,,Developer’s Note:
8,2.5.3,70356.0,"January 28, 2026",Bug Fixes,single,,,This affix will be re-enabled with the release...
9,2.5.3,70356.0,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where certain Silent Chests in ...


In [5]:
df["change_type"].value_counts()

change_type
single        1219
comparison      64
Name: count, dtype: int64