In [2]:
import json
import os

# Load JSON
with open("diablo_iv_patches_structured.json", "r", encoding="utf-8") as f:
    patches = json.load(f)

total_bullets = 0

for patch in patches:
    # Count line breaks as proxy for bullet points
    lines = patch["content"].split("\n")
    non_empty_lines = [line for line in lines if line.strip() != ""]
    total_bullets += len(non_empty_lines)

print("Total text segments (approx bullets):", total_bullets)

Total text segments (approx bullets): 1374


In [3]:
import json
import pandas as pd
import os

# Load JSON
with open("diablo_iv_patches_structured.json", "r", encoding="utf-8") as f:
    patches = json.load(f)

rows = []

for patch in patches:
    version = patch.get("version")
    build = patch.get("build")
    date = patch.get("date")
    
    lines = patch["content"].split("\n")
    
    for line in lines:
        clean_line = line.strip()
        
        # Skip empty lines
        if clean_line == "":
            continue
        
        # Optional: skip section headers (simple heuristic)
        if len(clean_line.split()) <= 2:
            continue
        
        rows.append({
            "version": version,
            "build": build,
            "date": date,
            "bullet_text": clean_line
        })

df_bullets = pd.DataFrame(rows)

print("Total bullet-level samples:", len(df_bullets))

df_bullets.to_csv("diablo_iv_bullets.csv", index=False)

Total bullet-level samples: 957


In [9]:
import pandas as pd

df = pd.read_csv("diablo_iv_bullets.csv")

buff_words = [
    "increase", "increased", "gain", "bonus",
    "improve", "improved", "enhance", "enhanced",
    "boost", "boosted"
]

nerf_words = [
    "reduce", "reduced", "decrease", "decreased",
    "lowered", "no longer", "less", "down from",
    "cooldown increased", "damage reduced"
]

bug_words = [
    "bug", "fixed", "issue", "resolved",
    "crash", "exploit", "addressed"
]

new_words = [
    "added", "introducing", "new",
    "now available", "unlocked",
    "new dungeon", "new mode"
]

qol_words = [
    "quality of life", "ui", "tooltip",
    "clarified", "improved visibility",
    "updated description"
]

gameplay_words = [
    "mechanic", "system", "spawn",
    "drop rate", "difficulty",
    "scaling", "interaction",
    "behavior", "enemy density"
]

def contains_any(text, keywords):
    text = text.lower()
    return int(any(word in text for word in keywords))

df["BUFF"] = df["bullet_text"].apply(lambda x: contains_any(x, buff_words))
df["NERF"] = df["bullet_text"].apply(lambda x: contains_any(x, nerf_words))
df["BUG_FIX"] = df["bullet_text"].apply(lambda x: contains_any(x, bug_words))
df["NEW_CONTENT"] = df["bullet_text"].apply(lambda x: contains_any(x, new_words))
df["QOL"] = df["bullet_text"].apply(lambda x: contains_any(x, qol_words))
df["GAMEPLAY"] = df["bullet_text"].apply(lambda x: contains_any(x, gameplay_words))

# No fallback rule anymore

df.to_csv("diablo_iv_bullets_weak_labeled_v2.csv", index=False)

print("Improved weak labeling complete.")

Improved weak labeling complete.


In [5]:
import pandas as pd

df = pd.read_csv("diablo_iv_bullets_weak_labeled.csv")

print(df.head())
print(df.columns)
print("Total rows:", len(df))

  version  build              date  \
0   2.5.3  70356  January 28, 2026   
1   2.5.3  70356  January 28, 2026   
2   2.5.3  70356  January 28, 2026   
3   2.5.3  70356  January 28, 2026   
4   2.5.3  70356  January 28, 2026   

                                         bullet_text  BUFF  NERF  BUG_FIX  \
0  Fixed an issue where the Executioner Monster A...     0     0        1   
1  This affix will be re-enabled with the release...     0     0        0   
2  Fixed an issue where certain Silent Chests in ...     0     0        1   
3  Fixed an issue where Zagraal in the Dark Citad...     0     0        1   
4  Fixed an issue where some Tower bosses had sig...     0     0        1   

   NEW_CONTENT  QOL  GAMEPLAY  
0            0    0         0  
1            0    0         1  
2            0    0         0  
3            0    0         0  
4            0    0         0  
Index(['version', 'build', 'date', 'bullet_text', 'BUFF', 'NERF', 'BUG_FIX',
       'NEW_CONTENT', 'QOL', 'GAMEPLAY'

In [6]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]
print(df[label_cols].sum())

BUFF           231
NERF            86
BUG_FIX        184
GAMEPLAY       461
NEW_CONTENT     19
QOL             49
dtype: int64


In [7]:
df["label_count"] = df[label_cols].sum(axis=1)
print(df["label_count"].value_counts())

label_count
1    888
2     65
3      4
Name: count, dtype: int64


In [10]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]

print(df[label_cols].sum())

df["label_count"] = df[label_cols].sum(axis=1)
print(df["label_count"].value_counts())

BUFF           232
NERF           111
BUG_FIX        184
GAMEPLAY        51
NEW_CONTENT     28
QOL             49
dtype: int64
label_count
1    435
0    417
2     95
3     10
Name: count, dtype: int64


In [11]:
label_cols = ["BUFF","NERF","BUG_FIX","GAMEPLAY","NEW_CONTENT","QOL"]

df["OTHER"] = (df[label_cols].sum(axis=1) == 0).astype(int)

print(df["OTHER"].sum())

417
