In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Parse the locally saved patch-notes page into a BeautifulSoup tree.
with open("Diablo_IV_Patch_Notes.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# Patterns applied to each panel title: a dotted version (e.g. 2.5.3)
# and the digits following the word "Build" (optionally prefixed by '#').
version_pattern = re.compile(r"\d+\.\d+\.\d+")
build_pattern = re.compile(r"Build\s+#?(\d+)")

# One record per non-empty <li> found inside a panel body.
data = []

# Every patch lives in its own <div class="panel">.
panels = soup.find_all("div", class_="panel")

for panel in panels:
    heading = panel.find("div", class_="panel-title")
    content = panel.find("div", class_="panel-body")

    # A panel is only usable when it has both a title and a body.
    if not heading or not content:
        continue

    heading_text = heading.get_text(strip=True)

    # Fall back to "UNKNOWN" when the title does not carry the field.
    found_version = version_pattern.search(heading_text)
    version = found_version.group(0) if found_version else "UNKNOWN"

    found_build = build_pattern.search(heading_text)
    build = found_build.group(1) if found_build else "UNKNOWN"

    # Flatten each list item to plain text; items with no text are dropped.
    item_texts = (
        li.get_text(separator=" ", strip=True)
        for li in content.find_all("li")
    )
    data.extend(
        {"version": version, "build": build, "bullet_text": item_text}
        for item_text in item_texts
        if item_text
    )

df_bullets = pd.DataFrame(data)

print("Total bullets extracted:", len(df_bullets))
df_bullets.head()

Total bullets extracted: 863


Unnamed: 0,version,build,bullet_text
0,2.5.3,70356,Fixed an issue where the Executioner Monster A...
1,2.5.3,70356,Fixed an issue where certain Silent Chests in ...
2,2.5.3,70356,Fixed an issue where Zagraal in the Dark Citad...
3,2.5.3,70356,Fixed an issue where some Tower bosses had sig...
4,2.5.3,70356,Fixed an issue where an error would occur when...


In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Load HTML
with open("Diablo_IV_Patch_Notes.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")


def _version_sort_key(versions):
    """Map dotted version strings to integer tuples so they sort numerically.

    A plain string sort would place "2.10.0" before "2.5.0". The "UNKNOWN"
    placeholder used when no version was found is mapped to a key that
    sorts after every real version.
    """
    def as_tuple(value):
        if value == "UNKNOWN":
            return (float("inf"),)
        return tuple(int(part) for part in value.split("."))

    return versions.map(as_tuple)


# Column order of the resulting frame; passing it explicitly keeps the
# columns present even when no bullet was extracted.
BULLET_COLUMNS = ["version", "build", "date", "bullet_text"]

data = []

# Each patch is inside a div.panel
panels = soup.find_all("div", class_="panel")

for panel in panels:

    # --- Extract Title Block ---
    title_div = panel.find("div", class_="panel-title")
    if not title_div:
        continue

    title_text = title_div.get_text(strip=True)

    # --- Extract Version (e.g. 2.5.3) ---
    version_match = re.search(r"\d+\.\d+\.\d+", title_text)
    version = version_match.group(0) if version_match else "UNKNOWN"

    # --- Extract Build (digits after "Build", optional '#') ---
    build_match = re.search(r"Build\s+#?(\d+)", title_text)
    build = build_match.group(1) if build_match else "UNKNOWN"

    # --- Extract Date (embedded in title text) ---
    # Only full English month names followed by "D, YYYY" are recognised.
    date_match = re.search(
        r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}",
        title_text
    )
    date = date_match.group(0) if date_match else "UNKNOWN"

    # --- Extract Patch Body ---
    body = panel.find("div", class_="panel-body")
    if not body:
        continue

    # Extract bullet points.
    # NOTE(review): find_all("li") also returns nested <li> elements, and a
    # parent's get_text() already contains its children's text -- confirm
    # whether the page has nested lists before relying on the bullet count.
    bullets = body.find_all("li")

    for li in bullets:
        text = li.get_text(separator=" ", strip=True)

        # Skip list items that contain no text.
        if text:
            data.append({
                "version": version,
                "build": build,
                "date": date,
                "bullet_text": text
            })

# Create DataFrame
df_bullets = pd.DataFrame(data, columns=BULLET_COLUMNS)

print("Total bullets extracted:", len(df_bullets))

# Verify structure: one row per distinct (version, build, date), ordered by
# the numeric value of the version rather than its text.
display(
    df_bullets[["version", "build", "date"]]
    .drop_duplicates()
    .sort_values("version", key=_version_sort_key)
)

Total bullets extracted: 863


Unnamed: 0,version,build,date
127,2.5.0,69713,"December 11, 2025"
91,2.5.1,69864,"December 18, 2025"
11,2.5.2,70156,"January 12, 2026"
0,2.5.3,70356,"January 28, 2026"
