In [36]:
# -----------------------------------------------------------
# Import Required Libraries
# -----------------------------------------------------------

import re              
import pandas as pd    
from bs4 import BeautifulSoup
from datetime import datetime

In [16]:
# -----------------------------------------------------------
# Define Improved Parsing Function
# -----------------------------------------------------------

def parse_patch_notes(filepath):
    '''
    Reads the Diablo IV patch notes HTML file.
    Uses BeautifulSoup to remove scripts and styles.
    Extracts visible text only.
    Parses patch headers and change entries.
    Returns a clean pandas DataFrame.
    '''

    # -----------------------------------------------------------
    # STEP 1: Read and Clean HTML
    # -----------------------------------------------------------

    # Open file as raw HTML
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Remove script and style elements
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Extract only visible text
    text = soup.get_text(separator="\n")

    # Split into clean lines
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # -----------------------------------------------------------
    # STEP 2: Patch Header Pattern
    # -----------------------------------------------------------

    patch_pattern = re.compile(r"(\d+\.\d+\.\d+)\s+Build\s+#(\d+).*—(.+)")

    # -----------------------------------------------------------
    # STEP 3: Prepare storage
    # -----------------------------------------------------------

    records = []

    current_patch = None
    current_build = None
    current_date = None
    current_section = None

    # -----------------------------------------------------------
    # STEP 4: Loop through cleaned lines
    # -----------------------------------------------------------

    i = 0

    while i < len(lines):

        line = lines[i]

        # Detect patch header
        patch_match = patch_pattern.match(line)

        if patch_match:
            current_patch = patch_match.group(1)
            current_build = patch_match.group(2)
            current_date = patch_match.group(3)
            i += 1
            continue

        # Detect sections
        if line in [
            "Bug Fixes", "Game Updates", "Balance Update",
            "Base Game", "Expansion", "Accessibility",
            "Skills", "Passives", "Items",
            "Legendary Aspects", "Paragon",
            "Tempering", "Miscellaneous"
            ]:
            current_section = line
            i += 1
            continue

        # Detect Previous/Now comparison
        if line.startswith("Previous"):

            previous_text = line.replace("Previous:", "").strip()

            if i + 1 < len(lines) and lines[i + 1].startswith("Now"):

                now_text = lines[i + 1].replace("Now:", "").strip()

                # Only record if we have already detected a patch header
                if current_patch is not None:

                    records.append({
                        "patch": current_patch,
                        "build": current_build,
                        "date": current_date,
                        "section": current_section,
                        "change_type": "comparison",
                        "previous": previous_text,
                        "now": now_text,
                        "full_text": f"Changed from {previous_text} to {now_text}"
                    })

                i += 2
                continue

        # Only record lines AFTER first patch header is found
        if current_patch is not None:

            records.append({
            "patch": current_patch,
            "build": current_build,
            "date": current_date,
            "section": current_section,
            "change_type": "single",
            "previous": None,
            "now": None,
            "full_text": line
             })

        i += 1

    df = pd.DataFrame(records)

    return df

In [17]:
# -----------------------------------------------------------
# Run Parser
# -----------------------------------------------------------

# Assigning file to variable and creating dataframe
input_file = "Diablo_IV_Patch_Notes.html"

df = parse_patch_notes(input_file)

print("✅ Parsing complete.")
print("Total records extracted:", len(df))

✅ Parsing complete.
Total records extracted: 1277


In [19]:
# Parse date
df["date_parsed"] = pd.to_datetime(df["date"])

# Extract time components
df["year"] = df["date_parsed"].dt.year
df["month"] = df["date_parsed"].dt.month

# Split patch into sortable parts
df[["patch_major", "patch_minor", "patch_hotfix"]] = (
    df["patch"]
    .str.split(".", expand=True)
    .astype(int)
)

In [20]:
# Removing parse date
df = df.drop(columns=["date_parsed", "year", "month"], errors="ignore")

In [21]:
# -----------------------------------------------------------
# Inspect First 20 Rows
# -----------------------------------------------------------

df.head(20)

Unnamed: 0,patch,build,date,section,change_type,previous,now,full_text,patch_major,patch_minor,patch_hotfix
0,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where the Executioner Monster A...,2,5,3
1,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Developer’s Note:,2,5,3
2,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,This affix will be re-enabled with the release...,2,5,3
3,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where certain Silent Chests in ...,2,5,3
4,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where Zagraal in the Dark Citad...,2,5,3
5,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where some Tower bosses had sig...,2,5,3
6,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where an error would occur when...,2,5,3
7,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where resetting a piece of mast...,2,5,3
8,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where the reward for defeating ...,2,5,3
9,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where other Divine Gifts could ...,2,5,3


In [13]:
df["change_type"].value_counts()

change_type
single        1213
comparison      64
Name: count, dtype: int64

In [22]:
df.isna().sum()
df["section"].value_counts()
df["patch"].value_counts()

patch
2.5.0    1125
2.5.2      95
2.5.1      44
2.5.3      13
Name: count, dtype: int64

In [23]:
# ---------------------------------------
# Light Structural Cleanup
# ---------------------------------------

'''
Purpose:
Perform light structural cleanup before feature engineering and classification.

This includes:
- Trimming whitespace
- Removing empty rows
- Normalizing section names
- Checking duplicates
- Ensuring version sorting works
'''

# --- Trim whitespace from key string columns ---
string_cols = ["patch", "build", "date", "section", "change_type", "full_text"]

for col in string_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

# --- Remove rows where full_text is empty or NaN-like ---
df = df[df["full_text"].notna()]
df = df[df["full_text"].str.len() > 0]

# --- Normalize section names (strip + consistent casing) ---
df["section"] = df["section"].str.strip()

# --- Drop exact duplicate rows (if any) ---
df = df.drop_duplicates()

# --- Sort by proper patch order using numeric parts ---
if all(col in df.columns for col in ["patch_major", "patch_minor", "patch_hotfix"]):
    df = df.sort_values(
        by=["patch_major", "patch_minor", "patch_hotfix"],
        ascending=[False, False, False]
    ).reset_index(drop=True)

# --- Final sanity check ---
print("✅ Structural cleanup complete.")
print(f"Total rows: {len(df)}")

✅ Structural cleanup complete.
Total rows: 1175


In [24]:
df.head()

Unnamed: 0,patch,build,date,section,change_type,previous,now,full_text,patch_major,patch_minor,patch_hotfix
0,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where the Executioner Monster A...,2,5,3
1,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Developer’s Note:,2,5,3
2,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,This affix will be re-enabled with the release...,2,5,3
3,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where certain Silent Chests in ...,2,5,3
4,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where Zagraal in the Dark Citad...,2,5,3


In [25]:
print("Rows after cleanup:", len(df))

print("\nChange type counts:")
print(df["change_type"].value_counts())

print("\nAny empty full_text remaining?")
print((df["full_text"].str.len() == 0).sum())

Rows after cleanup: 1175

Change type counts:
change_type
single        1111
comparison      64
Name: count, dtype: int64

Any empty full_text remaining?
0


In [26]:
df["patch"].value_counts()

patch
2.5.0    1024
2.5.2      94
2.5.1      44
2.5.3      13
Name: count, dtype: int64

In [27]:
# ---------------------------------------
# DIAGNOSTIC — Recreate Original Parsed DF
# ---------------------------------------

df_original = parse_patch_notes(input_file)

print("Original rows:", len(df_original))

Original rows: 1277


In [28]:
# ---------------------------------------
# DIAGNOSTIC — Compare Removed Rows
# ---------------------------------------

print("Original rows:", len(df_original))
print("Cleaned rows:", len(df))

# Find rows that existed originally but are missing now
removed_rows = pd.concat([df_original, df]).drop_duplicates(keep=False)

print("Rows removed during cleanup:", len(removed_rows))

Original rows: 1277
Cleaned rows: 1175
Rows removed during cleanup: 2297


In [29]:
removed_rows.head(20)

Unnamed: 0,patch,build,date,section,change_type,previous,now,full_text,patch_major,patch_minor,patch_hotfix
0,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where the Executioner Monster A...,,,
1,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Developer’s Note:,,,
2,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,This affix will be re-enabled with the release...,,,
3,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where certain Silent Chests in ...,,,
4,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where Zagraal in the Dark Citad...,,,
5,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where some Tower bosses had sig...,,,
6,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where an error would occur when...,,,
7,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where resetting a piece of mast...,,,
8,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where the reward for defeating ...,,,
9,2.5.3,70356,"January 28, 2026",Bug Fixes,single,,,Fixed an issue where other Divine Gifts could ...,,,


In [30]:
removed_rows["patch"].value_counts()
removed_rows["change_type"].value_counts()

change_type
single        2169
comparison     128
Name: count, dtype: int64

In [31]:
# Rows that were in original but not in cleaned
actually_removed = df_original.merge(
    df,
    how="left",
    indicator=True
).query('_merge == "left_only"')

len(actually_removed)

2

In [32]:
actually_removed[["patch", "section", "full_text"]]

Unnamed: 0,patch,section,full_text
666,2.5.0,Game Updates,Changed from Gain 60 Maximum Resource. When ta...
975,2.5.0,Legendary Aspects,Changed from Gain the effect of the Teleport E...


In [33]:
# Check if these exact rows exist in cleaned df
df[
    (df["patch"] == "2.5.0") &
    (df["full_text"].str.contains("Gain 60 Maximum Resource"))
]

Unnamed: 0,patch,build,date,section,change_type,previous,now,full_text,patch_major,patch_minor,patch_hotfix
638,2.5.0,69713,"December 11, 2025",Game Updates,comparison,"Gain 60 Maximum Resource. When taking damage, ...",,Changed from Gain 60 Maximum Resource. When ta...,2,5,0


In [34]:
df[
    (df["patch"] == "2.5.0") &
    (df["full_text"].str.contains("Teleport"))
]

Unnamed: 0,patch,build,date,section,change_type,previous,now,full_text,patch_major,patch_minor,patch_hotfix
304,2.5.0,69713,"December 11, 2025",Tempering,single,,,"We heard the feedback from PTR loud and clear,...",2,5,0
306,2.5.0,69713,"December 11, 2025",Tempering,single,,,Teleport Enchantment,2,5,0
307,2.5.0,69713,"December 11, 2025",Tempering,single,,,Adjusted the description to specify that it in...,2,5,0
308,2.5.0,69713,"December 11, 2025",Skills,single,,,Shimmering Teleport,2,5,0
309,2.5.0,69713,"December 11, 2025",Skills,comparison,Previous (PTR): Teleport Dazes enemies at its ...,You gain 25%[+] All Resistance for 3 seconds a...,Changed from Previous (PTR): Teleport Dazes en...,2,5,0
324,2.5.0,69713,"December 11, 2025",Legendary Aspects,single,,,Teleport's Cooldown reduction increased from 1...,2,5,0
329,2.5.0,69713,"December 11, 2025",Legendary Aspects,comparison,Previous (PTR): Casting Teleport grants a stac...,Each enemy hit by Teleport increases the damag...,Changed from Previous (PTR): Casting Teleport ...,2,5,0
332,2.5.0,69713,"December 11, 2025",Legendary Aspects,comparison,"Previous (PTR): After Casting Teleport, Close ...",Teleport pulls Close enemies to you and grants...,Changed from Previous (PTR): After Casting Tel...,2,5,0
335,2.5.0,69713,"December 11, 2025",Legendary Aspects,single,,,Ranks to Shocking Impact replaced with 30-40% ...,2,5,0
449,2.5.0,69713,"December 11, 2025",Bug Fixes,single,,,Fixed an issue where the Teleport Enchantment ...,2,5,0


In [35]:
# ---------------------------------------
# Save Clean Baseline Dataset
# ---------------------------------------

'''
Purpose:
Save the cleaned, structurally verified patch note dataset
before feature engineering and classification.
'''

output_path = "diablo_iv_patch_notes_clean_baseline.csv"

df.to_csv(output_path, index=False)

print(f"✅ Baseline dataset saved to: {output_path}")
print(f"Total rows saved: {len(df)}")

✅ Baseline dataset saved to: diablo_iv_patch_notes_clean_baseline.csv
Total rows saved: 1175


In [37]:
# Saving timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
output_path = f"diablo_iv_patch_notes_clean_{timestamp}.csv"

df.to_csv(output_path, index=False)

print(f"✅ Timestamped dataset saved to: {output_path}")

✅ Timestamped dataset saved to: diablo_iv_patch_notes_clean_20260221_1452.csv
