In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [1]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive are readable.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


***Utils***

In [4]:
import csv
import itertools
import json

import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

from mention import Mention, decode_bio
from score import ScoringCounts, score_mentions

In [29]:
# Pretrained checkpoint name; it is passed to both the tokenizer and the
# token-classification model loaders below. Uncomment an alternative to compare.
# model_name = "distilbert-base-cased"
# model_name = "bert-base-cased"
model_name = "bert-base-uncased"

In [15]:
# All three BIO-annotated splits live in the same Drive folder.
DATA_DIR = "/content/drive/MyDrive/cosi216 final project/annotation_dataset"
TRAIN_PATH = f"{DATA_DIR}/train_BIO.jsonl"
DEV_PATH = f"{DATA_DIR}/dev_BIO.jsonl"
TEST_PATH = f"{DATA_DIR}/test_BIO.jsonl"

In [6]:
# Label vocabulary: "O" followed by a B-/I- pair for each entity type.
# The list position of a tag is its integer id.
labels = [
    "O",
    "B-CHAR", "I-CHAR",
    "B-LOCA", "I-LOCA",
    "B-CREA", "I-CREA",
    "B-ITEM", "I-ITEM",
]
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

In [7]:
def align(instance):
    """Tokenize one pre-split sentence and expand its word labels to sub-tokens.

    Args:
        instance: mapping with "text" (list of words) and "label" (list of
            tag strings, one per word, each a key of ``label2id``).

    Returns:
        The tokenizer's encoding with two extra entries:
        "word_ids" - source word index for each sub-token, ``None`` where the
        tokenizer reports no source word (e.g. special tokens);
        "labels"   - label id for each sub-token, ``-100`` where the word id
        is ``None``.
    """
    words = instance["text"]
    gold = [label2id[tag] for tag in instance["label"]]

    encode = tokenizer(words, is_split_into_words=True, truncation=True)
    word_ids = encode.word_ids()

    # Every sub-token of a word receives that word's label id unchanged, so a
    # word tagged B-X yields B-X on each of its pieces. Positions without a
    # source word get the -100 sentinel, which compute_preds() later skips.
    aligned = [-100 if wid is None else gold[wid] for wid in word_ids]

    # word_ids is stored so predictions can be folded back to word level.
    encode["word_ids"] = word_ids
    encode["labels"] = aligned
    return encode

In [8]:
def reverse_align(word_ids, subword_labels):
    """Collapse sub-token labels back to one label per word.

    Args:
        word_ids: word index for each sub-token; ``None`` entries are dropped
            together with the label at the same position.
        subword_labels: one label per sub-token, paired positionally with
            ``word_ids``.

    Returns:
        A list with one label per run of consecutive equal word ids: the first
        label in the run that is not ``-100``, or ``0`` if the run has none.
    """
    # Drop positions that do not belong to any word before grouping.
    pairs = [
        (wid, label)
        for wid, label in zip(word_ids, subword_labels)
        if wid is not None
    ]

    word_labels = []
    for _, run in itertools.groupby(pairs, key=lambda pair: pair[0]):
        # The first non-sentinel label in the run represents the word.
        chosen = next((label for _, label in run if label != -100), 0)
        word_labels.append(chosen)
    return word_labels

In [9]:
def compute_preds(eval_pred):
    """Turn model logits into per-sentence lists of predicted tag strings.

    Args:
        eval_pred: indexable whose item 0 holds the logits and item 1 the gold
            label ids. Accessed by index rather than unpacked, so objects with
            additional trailing fields are accepted.

    Returns:
        One list of tag strings per sentence, containing a prediction for
        every position whose gold label id is not ``-100``.
    """
    logits = eval_pred[0]
    gold_ids = eval_pred[1]
    # Highest-scoring label id at each position (argmax over the last axis).
    pred_ids = np.argmax(logits, axis=-1)

    true_preds = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # -100 marks positions to ignore; they carry no usable prediction.
        true_preds.append(
            [id2label[p] for p, g in zip(pred_row, gold_row) if g != -100]
        )
    return true_preds

In [10]:
def calculation_matrix(reference_labels, predict_labels):
    """Decode two tag sequences and score the prediction against the reference.

    Both sequences are passed through ``decode_bio`` and the decoded results
    are handed to ``score_mentions`` (reference first).

    Returns:
        A tuple ``(matrix, reference, predict)``: the ``score_mentions``
        result followed by the two decoded sequences.
    """
    reference, predict = decode_bio(reference_labels), decode_bio(predict_labels)
    return score_mentions(reference, predict), reference, predict

In [11]:
def evaluate(y_true, y_pred):
    """Accumulate scoring counts over all sentences and derive P/R/F1.

    Args:
        y_true: iterable of gold tag sequences, one per sentence.
        y_pred: iterable of predicted tag sequences, paired positionally with
            ``y_true``.

    Returns:
        dict with "overall_f1", "precision", "recall" and the summed "TP",
        "FP", "FN" counts. Each ratio is 0 when its denominator is 0.
    """
    TP = FP = FN = 0
    for gold_tags, pred_tags in zip(y_true, y_pred):
        # Only the counts are needed here; the decoded sequences are ignored.
        counts, _, _ = calculation_matrix(gold_tags, pred_tags)
        # The first three positions of the counts are summed as TP, FP, FN.
        TP += counts[0]
        FP += counts[1]
        FN += counts[2]

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "overall_f1": f1,
        "precision": precision,
        "recall": recall,
        "TP": TP,
        "FP": FP,
        "FN": FN
    }

In [12]:
def get_metrics(predictions, dev_tokenized):
    """Convert raw model predictions into word-level tag sequences.

    Despite the name, no metric is computed here; the result is meant to be
    scored separately (see ``evaluate``).

    Args:
        predictions: prediction output accepted by ``compute_preds``.
        dev_tokenized: indexable of tokenized examples, each exposing a
            "word_ids" entry, in the same order as ``predictions``.

    Returns:
        One list of tags per sentence, one tag per word.
    """
    true_preds = compute_preds(predictions)
    results = []
    for i, sentence_preds in enumerate(true_preds):
        # compute_preds() keeps exactly the positions whose label is not -100,
        # i.e. those with a real word id, so drop the None word ids wherever
        # they occur instead of assuming one special token at each end.
        word_ids = [wid for wid in dev_tokenized[i]["word_ids"] if wid is not None]
        results.append(reverse_align(word_ids, sentence_preds))
    return results

***Start training***

In [16]:
def load_jsonl(path, encoding="latin1"):
    """Read a JSON Lines file into a list of parsed records.

    Blank or whitespace-only lines are skipped instead of being handed to
    ``json.loads``, which would raise on them.

    NOTE(review): latin1 is kept from the original code. It never fails to
    decode, but it would garble non-ASCII text if the files are actually
    UTF-8 - confirm the encoding the annotation files were written with.
    """
    with open(path, "r", encoding=encoding) as f:
        return [json.loads(line) for line in f if line.strip()]


train_dataset = Dataset.from_list(load_jsonl(TRAIN_PATH))
dev_dataset = Dataset.from_list(load_jsonl(DEV_PATH))
test_dataset = Dataset.from_list(load_jsonl(TEST_PATH))

In [30]:
# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Collator handed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [31]:
# Tokenize each split one example at a time with align(), then drop the raw
# columns so only the tokenizer outputs, word_ids and labels remain.
RAW_COLUMNS = ["text", "label"]
train_tokenized = train_dataset.map(align, batched=False).remove_columns(RAW_COLUMNS)
dev_tokenized = dev_dataset.map(align, batched=False).remove_columns(RAW_COLUMNS)
test_tokenized = test_dataset.map(align, batched=False).remove_columns(RAW_COLUMNS)

Map:   0%|          | 0/1471 [00:00<?, ? examples/s]

Map:   0%|          | 0/184 [00:00<?, ? examples/s]

Map:   0%|          | 0/184 [00:00<?, ? examples/s]

In [33]:
# Seed shared by every run so results are reproducible and configurations
# start from the same initial weights.
SEED = 42

# Hyper-parameter grid; every combination of the listed values is trained once.
grid = {
    "learning_rate": [5e-05],
    # "learning_rate": [5e-05, 3e-05],
    "per_device_train_batch_size": [8],
    "num_train_epochs": [5],
}

results = []
for lr, bs, ep in itertools.product(
        grid["learning_rate"],
        grid["per_device_train_batch_size"],
        grid["num_train_epochs"]):
    run_name = f"bert_lr{lr}_bs{bs}_ep{ep}"
    args = TrainingArguments(
        run_name,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        learning_rate=lr,
        num_train_epochs=ep,
        logging_steps=50,
        save_strategy="no",
        report_to="none",
        seed=SEED,
    )
    # The classification head is newly initialised when the model is loaded,
    # which happens before the Trainer exists; seed the RNGs first so that
    # initialisation is reproducible as well.
    set_seed(SEED)
    model = AutoModelForTokenClassification.from_pretrained(
           model_name,
           num_labels=len(labels),
           id2label=id2label, label2id=label2id)
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tokenized,
        data_collator=data_collator
    )
    trainer.train()

    # Dev split: predict, fold sub-token predictions back to words, score.
    dev_predictions = trainer.predict(dev_tokenized)
    dev_true_preds = get_metrics(dev_predictions, dev_tokenized)
    dev_true_ref = dev_dataset["label"]
    dev_metrics = evaluate(dev_true_ref, dev_true_preds)

    # Test split: same procedure.
    test_predictions = trainer.predict(test_tokenized)
    test_true_preds = get_metrics(test_predictions, test_tokenized)
    test_true_ref = test_dataset["label"]
    test_metrics = evaluate(test_true_ref, test_true_preds)

    results.append({
        "run": run_name,
        "lr": lr,
        "batch_size": bs,
        "epochs": ep,
        "dev_overall_f1": dev_metrics["overall_f1"],
        "dev_TP": dev_metrics["TP"],
        "dev_FP": dev_metrics["FP"],
        "dev_FN": dev_metrics["FN"],
        "test_overall_f1": test_metrics["overall_f1"],
        "test_TP": test_metrics["TP"],
        "test_FP": test_metrics["FP"],
        "test_FN": test_metrics["FN"],
    })

# save to CSV (one row per hyper-parameter combination)
with open("hp_sweep_bert_uncase.csv","w",newline="") as out:
    w = csv.DictWriter(out, fieldnames=results[0].keys())
    w.writeheader()
    w.writerows(results)

print("Grid search finished, results in hp_sweep_bert_uncase.csv")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6396
100,0.3602
150,0.2778
200,0.2314
250,0.1961
300,0.1086
350,0.1213
400,0.0978
450,0.0608
500,0.0831


Grid search finished, results in hp_sweep_bert_uncase.csv
