In [5]:
%pip install -U --force-reinstall pyarrow
%pip install -U transformers accelerate sentencepiece torch

Collecting pyarrow
  Downloading pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.1 kB)
Downloading pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl (34.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 21.0.0
    Uninstalling pyarrow-21.0.0:
      Successfully uninstalled pyarrow-21.0.0
Successfully installed pyarrow-23.0.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# packages 

import pandas as pd
import numpy as np
import torch as th
import re
import json
from pathlib import Path
from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss 

from transformers import ( 
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load Brian's silver-labeled bullet CSV and turn its labels into an indicator matrix.

# The CSV is looked up under the working directory first, then one level up.
data_path = Path.cwd()
csv_path = data_path / "Brian's Work/diablo4_bullets_labeled_silver.csv"
if not csv_path.exists():
    csv_path = data_path.parent / "Brian's Work/diablo4_bullets_labeled_silver.csv"

# Keep the three columns we use, drop incomplete rows, and trim label whitespace.
df = (
    pd.read_csv(csv_path)
    .loc[:, ["bullet_id", "bullet_text", "auto_label"]]
    .dropna()
    .assign(auto_label=lambda frame: frame["auto_label"].astype(str).str.strip())
)

texts = df["bullet_text"].astype(str).tolist()

# MultiLabelBinarizer expects one collection of labels per row, so wrap each label.
label_lists = [[label] for label in df["auto_label"]]

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(label_lists)

# Column order of Y; saved later so predictions can be mapped back to names.
label_names = mlb.classes_.tolist()

print("Rows:", len(texts))
print("Y shape:", Y.shape)
print("Label names:", label_names)
print(df["auto_label"].value_counts())

Rows: 794
Y shape: (794, 6)
Label names: ['Buff', 'Bugfix', 'Nerf', 'New Content', 'Other', 'QoL']
auto_label
Buff           276
Other          187
Bugfix         176
Nerf           108
QoL             45
New Content      2
Name: count, dtype: int64


In [4]:
# Training/Val

# Hold out 20% of the rows for validation (test_size=0.2); the fixed
# random_state makes the split repeatable across runs.
# NOTE(review): no `stratify` argument is passed, so rare labels may end up
# entirely on one side of the split — confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(
    texts, Y, test_size=0.2, random_state=69)

In [7]:
# Tokenizer + loading data

# Pretrained tokenizer for the 'bert-base-uncased' checkpoint (same checkpoint
# name the model cell loads).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Token length passed to ChunkDataset, which pads/truncates every example to it.
MAX_LEN = 256

class ChunkDataset(th.utils.data.Dataset):
    """Dataset that tokenizes a single text on every lookup.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus padding/truncation keyword
            arguments; its result must expose ``input_ids`` and ``attention_mask``.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The number of examples is taken from the texts alone.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for row ``idx``."""
        text, label = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(text, **tokenizer_options)

        # flatten() collapses each encoding to 1-D (presumably dropping a leading
        # batch axis of size 1 added by return_tensors='pt' — confirm).
        sample = {
            field: th.flatten(encoded[field])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = th.tensor(label, dtype=th.float32)
        return sample
    
# Wrap both splits in ChunkDataset, sharing the tokenizer and the length cap.
train_ds, val_ds = (
    ChunkDataset(split_texts, split_labels, tokenizer, MAX_LEN)
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val))
)

print("Train ds:", len(train_ds), "Val ds:", len(val_ds))

Train ds: 635 Val ds: 159


In [8]:
# model creation

# Multi-label classification head on top of pretrained BERT, with one output
# per column of Y. The id<->name maps are stored in the model config so the
# saved checkpoint reports the real label names instead of generic LABEL_<n>
# placeholders; they follow the column order of Y (label_names).
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=Y.shape[1],
    problem_type='multi_label_classification',
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 2188.57it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- 

In [9]:
# Metrics / performance

def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` may also be a tuple of
            model outputs, in which case its first element is used.

    Returns:
        dict with the keys ``micro_f1``, ``macro_f1`` and ``hamming_loss``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Extra outputs may be returned alongside the logits; keep the logits only.
        logits = logits[0]
    logits = np.asarray(logits)

    # Numerically stable sigmoid: sigmoid(x) = exp(-log(1 + exp(-x))).
    # The naive 1 / (1 + exp(-x)) overflows (RuntimeWarning) for large negative x.
    probs = np.exp(-np.logaddexp(0.0, -logits))
    preds = (probs > 0.5).astype(int)  # a label is predicted when its probability exceeds 0.5

    micro_f1 = f1_score(labels, preds, average='micro', zero_division=0)
    macro_f1 = f1_score(labels, preds, average='macro', zero_division=0)
    hloss = hamming_loss(labels, preds)

    return {"micro_f1": micro_f1, "macro_f1": macro_f1, "hamming_loss": hloss}

In [None]:
# Training arguments

args = TrainingArguments(
    output_dir="bert_diablo_multilabel_checkpoints",  # directory for intermediate checkpoints
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=50,
    # presumably reloads the checkpoint with the best metric_for_best_model
    # once training finishes — confirm against the Trainer docs
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",  # key returned by compute_metrics
)

In [13]:
# Trainer

# Bundle the model, the training arguments, both datasets and the metric
# function; compute_metrics supplies micro/macro F1 and Hamming loss.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

In [None]:
# Save

def get_next_run_dir(base_name: str, root_dir: str = "models") -> Path:
    """Create and return the next numbered run directory under ``root_dir``.

    Existing entries named ``<base_name>_<digits>`` are scanned and a new
    directory ``<base_name>_<max + 1>`` (zero-padded to three digits) is created.

    Args:
        base_name: Prefix shared by all run directories.
        root_dir: Parent directory; created if it does not exist.

    Returns:
        Path of the newly created directory.

    Raises:
        FileExistsError: If the chosen name appears between the scan and the
            ``mkdir`` call (e.g. two runs starting at the same time).
    """
    root = Path(root_dir)
    root.mkdir(parents=True, exist_ok=True)

    pattern = re.compile(rf"^{re.escape(base_name)}_(\d+)$")

    # Count every matching entry, not only directories: a stray file named
    # "<base_name>_002" would otherwise be ignored and then collide with mkdir.
    matches = (pattern.match(entry.name) for entry in root.iterdir())
    taken = [int(m.group(1)) for m in matches if m]

    next_n = max(taken, default=0) + 1
    next_dir = root / f"{base_name}_{next_n:03d}"
    next_dir.mkdir(parents=True, exist_ok=False)
    return next_dir

# Fine-tune, then score the resulting model on the validation set.
trainer.train()
metrics = trainer.evaluate()
print("eval metrics:", metrics)


# Every run is written to its own freshly numbered directory.
run_dir = get_next_run_dir("bert_diablo_multilabel", root_dir="models")
print(f"Saving model to {run_dir}")

# Persist the model and the tokenizer side by side in the run directory.
trainer.save_model(str(run_dir))
tokenizer.save_pretrained(str(run_dir))

# Store the label order so output columns can be mapped back to label names.
(run_dir / "label_names.json").write_text(json.dumps(label_names, indent=2))

print("Saved:", run_dir)

  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,No log,0.379994,0.223464,0.141215,0.145702,4.999,31.806,2.0


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
  super().__init__(loader)
