In [2]:
import json
import os

# Load JSON
with open("diablo_iv_patches_structured.json", "r", encoding="utf-8") as f:
    patches = json.load(f)

total_bullets = 0

for patch in patches:
    # Count line breaks as proxy for bullet points
    lines = patch["content"].split("\n")
    non_empty_lines = [line for line in lines if line.strip() != ""]
    total_bullets += len(non_empty_lines)

print("Total text segments (approx bullets):", total_bullets)

Total text segments (approx bullets): 1374


In [3]:
import json
import pandas as pd
import os

# Load JSON
with open("diablo_iv_patches_structured.json", "r", encoding="utf-8") as f:
    patches = json.load(f)

rows = []

for patch in patches:
    version = patch.get("version")
    build = patch.get("build")
    date = patch.get("date")
    
    lines = patch["content"].split("\n")
    
    for line in lines:
        clean_line = line.strip()
        
        # Skip empty lines
        if clean_line == "":
            continue
        
        # Optional: skip section headers (simple heuristic)
        if len(clean_line.split()) <= 2:
            continue
        
        rows.append({
            "version": version,
            "build": build,
            "date": date,
            "bullet_text": clean_line
        })

df_bullets = pd.DataFrame(rows)

print("Total bullet-level samples:", len(df_bullets))

df_bullets.to_csv("diablo_iv_bullets.csv", index=False)

Total bullet-level samples: 957


In [9]:
import pandas as pd

df = pd.read_csv("diablo_iv_bullets.csv")

buff_words = [
    "increase", "increased", "gain", "bonus",
    "improve", "improved", "enhance", "enhanced",
    "boost", "boosted"
]

nerf_words = [
    "reduce", "reduced", "decrease", "decreased",
    "lowered", "no longer", "less", "down from",
    "cooldown increased", "damage reduced"
]

bug_words = [
    "bug", "fixed", "issue", "resolved",
    "crash", "exploit", "addressed"
]

new_words = [
    "added", "introducing", "new",
    "now available", "unlocked",
    "new dungeon", "new mode"
]

qol_words = [
    "quality of life", "ui", "tooltip",
    "clarified", "improved visibility",
    "updated description"
]

gameplay_words = [
    "mechanic", "system", "spawn",
    "drop rate", "difficulty",
    "scaling", "interaction",
    "behavior", "enemy density"
]

def contains_any(text, keywords):
    text = text.lower()
    return int(any(word in text for word in keywords))

df["BUFF"] = df["bullet_text"].apply(lambda x: contains_any(x, buff_words))
df["NERF"] = df["bullet_text"].apply(lambda x: contains_any(x, nerf_words))
df["BUG_FIX"] = df["bullet_text"].apply(lambda x: contains_any(x, bug_words))
df["NEW_CONTENT"] = df["bullet_text"].apply(lambda x: contains_any(x, new_words))
df["QOL"] = df["bullet_text"].apply(lambda x: contains_any(x, qol_words))
df["GAMEPLAY"] = df["bullet_text"].apply(lambda x: contains_any(x, gameplay_words))

# No fallback rule anymore

df.to_csv("diablo_iv_bullets_weak_labeled_v2.csv", index=False)

print("Improved weak labeling complete.")

Improved weak labeling complete.


In [5]:
import pandas as pd

df = pd.read_csv("diablo_iv_bullets_weak_labeled.csv")

print(df.head())
print(df.columns)
print("Total rows:", len(df))

  version  build              date  \
0   2.5.3  70356  January 28, 2026   
1   2.5.3  70356  January 28, 2026   
2   2.5.3  70356  January 28, 2026   
3   2.5.3  70356  January 28, 2026   
4   2.5.3  70356  January 28, 2026   

                                         bullet_text  BUFF  NERF  BUG_FIX  \
0  Fixed an issue where the Executioner Monster A...     0     0        1   
1  This affix will be re-enabled with the release...     0     0        0   
2  Fixed an issue where certain Silent Chests in ...     0     0        1   
3  Fixed an issue where Zagraal in the Dark Citad...     0     0        1   
4  Fixed an issue where some Tower bosses had sig...     0     0        1   

   NEW_CONTENT  QOL  GAMEPLAY  
0            0    0         0  
1            0    0         1  
2            0    0         0  
3            0    0         0  
4            0    0         0  
Index(['version', 'build', 'date', 'bullet_text', 'BUFF', 'NERF', 'BUG_FIX',
       'NEW_CONTENT', 'QOL', 'GAMEPLAY'

In [6]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]
print(df[label_cols].sum())

BUFF           231
NERF            86
BUG_FIX        184
GAMEPLAY       461
NEW_CONTENT     19
QOL             49
dtype: int64


In [7]:
df["label_count"] = df[label_cols].sum(axis=1)
print(df["label_count"].value_counts())

label_count
1    888
2     65
3      4
Name: count, dtype: int64


In [10]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]

print(df[label_cols].sum())

df["label_count"] = df[label_cols].sum(axis=1)
print(df["label_count"].value_counts())

BUFF           232
NERF           111
BUG_FIX        184
GAMEPLAY        51
NEW_CONTENT     28
QOL             49
dtype: int64
label_count
1    435
0    417
2     95
3     10
Name: count, dtype: int64


In [11]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]

df["OTHER"] = (df[label_cols].sum(axis=1) == 0).astype(int)

print(df["OTHER"].sum())

417


In [12]:
df.to_csv("diablo_iv_bullets_weak_labeled_v3.csv", index=False)

In [13]:
other_df = df[df["OTHER"] == 1]

print(len(other_df))
print(other_df["bullet_text"].sample(20))

417
608                           Aspect of Anger Management
900    The following elixirs and incenses have been u...
818      Rare: 5 Veiled Crystals and 5,000-200,000 gold.
683    Previous: Hemorrhage grants 1.6% of your Maxim...
826    Salvaging a Rare item provides 1 Common Salvag...
673                            Blighted Corpse Explosion
776                          Esadora's Overflowing Cameo
839         Scrolls of Restoration have been redesigned.
825          Yields for Item Salvage have been adjusted.
729    Now: Teleport generates 3 Charges of Crackling...
277               Maximum quality is now 25, up from 20.
833    The number of Temper rolls available on an ite...
529    Maximum Life affix replaced with Critical Stri...
613                                    Bane of Ahjad-Den
796    With the changes to Renown, the pace of the ea...
409    Armor percentage affix replaced with All Resis...
254    Previous: When you Freeze an enemy there is a ...
253                        

In [14]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]

df["OTHER"] = (df[label_cols].sum(axis=1) == 0).astype(int)

print("OTHER count:", df["OTHER"].sum())

OTHER count: 417


In [15]:
df.to_csv("diablo_iv_bullets_weak_labeled_v3.csv", index=False)

In [16]:
other_df = df[df["OTHER"] == 1]

print("Total OTHER rows:", len(other_df))
print(other_df["bullet_text"].sample(25, random_state=42))

Total OTHER rows: 417
472                                +424-457 Maximum Life
885                           Duration has been doubled.
912    Characters that skip either campaign now unloc...
285                         Item Quality 15: 66 Obducite
243    Now: Casting a Skill grants 3%[+] All Resistan...
920    For more details on Season Rank, please read t...
863                               Healing Potion Changes
559                           Aspect of the Prudent Heat
166                                    Fields of Crimson
203    Now: Damaging 150-100 Enemies with Nature Magi...
806                                     Altars of Lilith
600    Previous: Each point of Fury generated while a...
895    Instances of single element resistance affixes...
145    Azmodan has been granted resilience to bring h...
900    The following elixirs and incenses have been u...
704     Maximum Life % affix replaced with Intelligence.
377                   Path of the Emissary: Unique Boots
567    Ca

In [17]:
import pandas as pd
import re

df = pd.read_csv("diablo_iv_bullets.csv")

# ---------- Keyword Lists ---------- #

buff_words = [
    "increase", "increased", "gain", "bonus",
    "improve", "improved", "enhance", "enhanced",
    "boost", "boosted",
    "doubled", "now grants", "now provides",
    "now deals", "now lasts", "now increases",
    "grants", "provides", "up from"
]

nerf_words = [
    "reduce", "reduced", "decrease", "decreased",
    "lowered", "no longer", "less", "fewer",
    "down from", "halved",
    "now only", "reduced to",
    "cooldown increased to"
]

bug_words = [
    "bug", "fixed", "issue", "resolved",
    "crash", "exploit", "addressed"
]

new_words = [
    "added", "introducing", "new",
    "now available", "unlocked",
    "new dungeon", "new mode"
]

qol_words = [
    "quality of life", "ui", "tooltip",
    "clarified", "improved visibility",
    "updated description",
    "can now", "no longer requires",
    "skip", "automatically",
    "cancelled", "redesigned"
]

gameplay_words = [
    "mechanic", "system", "spawn",
    "drop rate", "difficulty",
    "scaling", "interaction",
    "behavior", "enemy density"
]

# ---------- Helper Functions ---------- #

def contains_any(text, keywords):
    text = text.lower()
    return int(any(word in text for word in keywords))

def contains_numeric_change(text):
    text = text.lower()
    pattern = r"\+\d+|\d+%|\d+-\d+|up from|down from"
    return int(bool(re.search(pattern, text)))

# ---------- Label Assignment ---------- #

df["BUFF"] = df["bullet_text"].apply(
    lambda x: int(
        contains_any(x, buff_words) or
        ("now:" in x.lower() and contains_numeric_change(x))
    )
)

df["NERF"] = df["bullet_text"].apply(
    lambda x: int(
        contains_any(x, nerf_words)
    )
)

df["BUG_FIX"] = df["bullet_text"].apply(lambda x: contains_any(x, bug_words))
df["NEW_CONTENT"] = df["bullet_text"].apply(lambda x: contains_any(x, new_words))
df["QOL"] = df["bullet_text"].apply(lambda x: contains_any(x, qol_words))
df["GAMEPLAY"] = df["bullet_text"].apply(lambda x: contains_any(x, gameplay_words))

# ---------- Temporary OTHER ---------- #

label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]

df["OTHER"] = (df[label_cols].sum(axis=1) == 0).astype(int)

# ---------- Diagnostics ---------- #

print("Label Totals:")
print(df[label_cols + ["OTHER"]].sum())

df["label_count"] = df[label_cols].sum(axis=1)
print("\nLabel Count Distribution:")
print(df["label_count"].value_counts())

df.to_csv("diablo_iv_bullets_weak_labeled_v4.csv", index=False)

print("\n✅ v4 labeling complete.")

Label Totals:
BUFF           295
NERF           114
BUG_FIX        184
GAMEPLAY        51
NEW_CONTENT     28
QOL             59
OTHER          352
dtype: int64

Label Count Distribution:
label_count
1    491
0    352
2    102
3     12
Name: count, dtype: int64

✅ v4 labeling complete.


In [18]:
other_df.to_csv("diablo_iv_OTHER_v4.csv", index=False)

In [19]:
import re

def is_header_like(text):
    text = text.strip()

    # Keep if contains numeric change pattern
    if re.search(r"\d+|\[x\]|%|from .* to", text.lower()):
        return False

    # Keep if ends with period
    if text.endswith("."):
        return False

    # Remove if very short and no punctuation
    if len(text.split()) <= 4 and not re.search(r"\.|,|%", text):
        return True

    # Remove if mostly Title Case
    words = text.split()
    if all(w.istitle() for w in words):
        return True

    return False

In [20]:
df = df[~df["bullet_text"].apply(is_header_like)]

In [21]:
df["IS_HEADER"] = df["bullet_text"].apply(is_header_like)

In [22]:
df["IS_HEADER"].value_counts()

IS_HEADER
False    840
Name: count, dtype: int64

In [24]:
df[df["IS_HEADER"] == True]["bullet_text"].sample(n=min(20, len(df[df["IS_HEADER"] == True])), random_state=42)

Series([], Name: bullet_text, dtype: object)

In [25]:
df[df["IS_HEADER"] == False]["bullet_text"].sample(20, random_state=42)

804    Killing the last Champion in a pack, or any El...
933    Fixed an issue with Infiltrator's Aspect where...
34                         Various updates to The Tower.
692    Maximum Life % affix replaced with Maximum Lif...
107    Various fixes and improvements to consistency,...
282    Masterworking cost has been adjusted, and scal...
644    Now restores 100 spirit instead of fully resto...
394    Now: Your Maximum Vigor is increased by 50%, a...
527    Maximum Life % affix replaced with Maximum Lif...
624      Fortify reduced from 50% to 5% of Maximum Life.
508    For each Defensive Skill not on your Action Ba...
292        Movement Speed changed from 16-25% to 15-25%.
288    Resource gain Tempering affixes have been adde...
74     Fixed an issue where Belial could be damaged b...
772    Now: After Casting Teleport, Close enemies are...
245    Barrier per Rank reduced from 10% to 5% of Max...
46     Fixed an issue where the player would get stuc...
219    Damage increased from 40

In [27]:
df.to_csv("diablo_iv_with_header_flag_v4.csv", index=False)

In [28]:
import re

def detect_numeric_direction(text):
    """
    Detects 'from X to Y' numeric patterns and determines direction.
    Returns 'BUFF', 'NERF', or None.
    """
    pattern = r"from\s+(\d+\.?\d*)%?\s+to\s+(\d+\.?\d*)%?"
    matches = re.findall(pattern, text.lower())

    for before, after in matches:
        before = float(before)
        after = float(after)

        if after > before:
            return "BUFF"
        elif after < before:
            return "NERF"

    return None

In [29]:
def classify_patch_v5(text):
    text_lower = text.lower()

    # --- Explicit Keyword Rules ---
    if any(word in text_lower for word in ["increased", "now grants", "now deals", "improved"]):
        return "BUFF"

    if any(word in text_lower for word in ["reduced", "decreased", "no longer", "removed"]):
        return "NERF"

    if any(word in text_lower for word in ["fixed", "issue where", "bug"]):
        return "BUGFIX"

    # --- Numeric Direction Detection ---
    direction = detect_numeric_direction(text)
    if direction:
        return direction

    # --- Fallback ---
    return "OTHER"

In [30]:
df["LABEL_V5"] = df["bullet_text"].apply(classify_patch_v5)