In [1]:
import json

def load_ndjson(path):
    data = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

# Load all splits
train_1 = load_ndjson("train00.json")
train_2 = load_ndjson("train01.json")
train_3 = load_ndjson("train02.json")
train_4 = load_ndjson("train03.json")
dev_data = load_ndjson("valid.json")
test_data = load_ndjson("test.json")

# Merge train splits
train_data = train_1 + train_2 + train_3 + train_4
print(f"Train sentences: {len(train_data)}  | Dev: {len(dev_data)} | Test: {len(test_data)}")

Train sentences: 59924  | Dev: 8528 | Test: 8262


In [2]:
# Optional: quick alignment check
def check_alignment(data, name):
    bad = sum(1 for e in data if len(e["tokens"]) != len(e["tags"]))
    print(f"{name}: {bad} misaligned sentences")

check_alignment(train_data, "train")
check_alignment(dev_data,   "dev")
check_alignment(test_data,  "test")

train: 0 misaligned sentences
dev: 0 misaligned sentences
test: 0 misaligned sentences


In [7]:
# Load the label dictionary (tag string -> index) and build inverse mapping
with open("label.json", encoding="utf-8") as f:
    tag2idx = json.load(f)


tag2idx['<PAD_TAG>'] = len(tag2idx)
idx2tag = {i: t for t, i in tag2idx.items()}
PAD_TAG_IDX = tag2idx['<PAD_TAG>']
num_labels = len(tag2idx)


# Convert dataset entries into (word, tag_str) pairs per sentence
def to_pairs(data):
    sents = []
    for e in data:
        tokens = e["tokens"]
        tags_i = e["tags"]
        tags_s = [idx2tag[int(t)] for t in tags_i]
        sents.append(list(zip(tokens, tags_s)))
    return sents

train_sents = to_pairs(train_data)
dev_sents   = to_pairs(dev_data)
test_sents  = to_pairs(test_data)
print("Sample sentence:", train_sents[0][:10])


Sample sentence: [('People', 'O'), ('start', 'O'), ('their', 'O'), ('own', 'O'), ('businesses', 'O'), ('for', 'O'), ('many', 'O'), ('reasons', 'O'), ('.', 'O')]


# Tokenizer and WordPiece label alignment

In [4]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
MAX_LEN = 128  # adjust based on sentence lengths

def encode_sentence(sent):
    """
    Align labels to first subword of each token.
    Mask other subwords and special/pad tokens with -100.
    """
    tokens = [w for (w, _) in sent]
    tags   = [tag2idx[t] for (_, t) in sent]  # convert tag strings to ids

    enc = tokenizer(tokens,
                    is_split_into_words=True,
                    truncation=True,
                    padding="max_length",
                    max_length=MAX_LEN)

    word_ids = enc.word_ids()
    aligned_labels = []
    prev_word = None
    for wid in word_ids:
        if wid is None:
            aligned_labels.append(-100)          # [CLS], [SEP], padding
        elif wid != prev_word:
            aligned_labels.append(tags[wid])     # first subword gets label
        else:
            aligned_labels.append(-100)          # subsequent subwords ignored in loss
        prev_word = wid

    enc["labels"] = aligned_labels
    return enc

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Dataset class

In [5]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, sentences):
        self.encodings = [encode_sentence(s) for s in sentences]

    def __getitem__(self, idx):
        return {k: torch.tensor(v) for k, v in self.encodings[idx].items()}

    def __len__(self):
        return len(self.encodings)

train_ds = NERDataset(train_sents)
dev_ds   = NERDataset(dev_sents)
test_ds  = NERDataset(test_sents)

# Model and metrics

In [8]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=71dbe39fede46f4afcf69ce423a2a4b4ee5573da9fccbf0f472a16600bb97808
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [9]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [10]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from evaluate import load as load_metric

In [11]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_labels,
    id2label=idx2tag,    # index -> tag string
    label2id=tag2idx     # tag string -> index
)

from evaluate import load as load_metric
seqeval = load_metric("seqeval")

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    true_preds, true_labels = [], []
    for pred, lab in zip(preds, labels):
        pred_tags, lab_tags = [], []
        for p_i, l_i in zip(pred, lab):
            if l_i == -100:
                continue
            pred_tags.append(idx2tag[p_i])
            lab_tags.append(idx2tag[l_i])
        true_preds.append(pred_tags)
        true_labels.append(lab_tags)

    res = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": res["overall_precision"],
        "recall": res["overall_recall"],
        "f1": res["overall_f1"],
        "accuracy": res["overall_accuracy"],
    }

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

# Training arguments

In [13]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="bert-pos-output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="logs",
)

# Train

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mslimaniaymen113[0m ([33mslimaniaymen113-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.306
1000,0.1113
1500,0.0921
2000,0.0847
2500,0.0807
3000,0.0795
3500,0.0716
4000,0.0572
4500,0.0537
5000,0.0513


# Evaluate on the test set

In [None]:
test_results = trainer.evaluate(test_ds)
print("Test results:", test_results)

In [None]:
from seqeval.metrics import classification_report

pred_output = trainer.predict(test_ds)
preds = pred_output.predictions.argmax(-1)
labels = pred_output.label_ids

true_seq, pred_seq = [], []
for pred, lab in zip(preds, labels):
    t_tags, p_tags = [], []
    for p_i, l_i in zip(pred, lab):
        if l_i == -100:
            continue
        t_tags.append(idx2tag[l_i])
        p_tags.append(idx2tag[p_i])
    true_seq.append(t_tags)
    pred_seq.append(p_tags)

print(classification_report(true_seq, pred_seq, digits=3))