In [3]:
!pip install seqeval -q
!pip install -U transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.2
    Uninstalling transformers-4.56.2:
      Successfully uninstalled transformers-4.56.2
Successfully installed tra

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# ===== Baseline: BioBERT fine-tuning on E3C few-shot =====
import os, random, numpy as np
from pathlib import Path
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          DataCollatorForTokenClassification, TrainingArguments, Trainer)
from seqeval.metrics import classification_report, f1_score

# ---- paths ----
BASE = Path("/content/drive/MyDrive/small_data_NER_project")
DATA_DIR = BASE/"conll/fewshot_k5_seed42"   # <-- change to fewshot_k1_seed42 / k10 / k20 if needed
OUT_DIR  = BASE/"results"/"biobert_k5_full"

# ---- read CoNLL ----
def read_conll(path):
    sents, tokens, labels = [], [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line:
                if tokens:
                    sents.append({"tokens":tokens, "ner_tags":labels})
                    tokens, labels = [], []
            else:
                parts = line.split()
                tok, lab = parts[0], parts[-1]
                tokens.append(tok); labels.append(lab)
    if tokens: sents.append({"tokens":tokens, "ner_tags":labels})
    return sents

train = read_conll(DATA_DIR/"train.conll")
dev   = read_conll(DATA_DIR/"dev.conll")
test  = read_conll(DATA_DIR/"test.conll")

print(f"Loaded: train={len(train)} dev={len(dev)} test={len(test)}")
print("Sample:", train[0]["tokens"][:12], "\n", train[0]["ner_tags"][:12])

Loaded: train=5 dev=200 test=851
Sample: ['After', 'hemodynamic', 'stabilization', ',', 'an', 'oesophageo-gastro-duodenoscopy', 'was', 'performed', 'which', 'showed', ':', 'The'] 
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [6]:
# ---- build label list (BIO) ----
all_labels = sorted({l for ex in (train+dev+test) for l in ex["ner_tags"]})
if "O" in all_labels:
    all_labels.remove("O"); all_labels = ["O"] + all_labels
label2id = {l:i for i,l in enumerate(all_labels)}
id2label = {i:l for l,i in label2id.items()}
num_labels = len(all_labels)
print("Labels:", all_labels)

# ---- HF datasets ----
ds = DatasetDict({
    "train": Dataset.from_list(train),
    "validation": Dataset.from_list(dev),
    "test": Dataset.from_list(test),
})

# ---- tokenizer & alignment ----
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_align(batch):
    tokenized = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)
    labels = []
    for i, lbls in enumerate(batch["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        aligned = []
        prev_word = None
        for wid in word_ids:
            if wid is None:
                aligned.append(-100)
            else:
                # Only label the first wordpiece; rest -> -100
                if wid != prev_word:
                    aligned.append(label2id.get(lbls[wid], label2id["O"]))
                else:
                    aligned.append(-100)
                prev_word = wid
        labels.append(aligned)
    tokenized["labels"] = labels
    return tokenized

tokenized = ds.map(tokenize_align, batched=True, remove_columns=["tokens","ner_tags"])


Labels: ['O', 'B-ety', 'I-ety']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [7]:
# ---- model ----
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id)

# ---- metrics ----
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    pred_tags, true_tags = [], []
    for p, l in zip(preds, labels):
        pt, lt = [], []
        for pi, li in zip(p, l):
            if li == -100:  # skip subword positions
                continue
            pt.append(id2label[int(pi)])
            lt.append(id2label[int(li)])
        pred_tags.append(pt); true_tags.append(lt)
    f1 = f1_score(true_tags, pred_tags)
    return {"f1": f1}


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# ---- training args ----
from transformers import TrainingArguments

OUT_DIR.mkdir(parents=True, exist_ok=True)

args = TrainingArguments(
    output_dir=str(OUT_DIR),
    do_train=True,
    do_eval=True,       # 显式启用验证
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir=str(OUT_DIR / "logs"),
    logging_steps=20,
    save_steps=500,     # 每多少步保存一次
    seed=42
)

collator = DataCollatorForTokenClassification(tokenizer)

In [9]:
import os
os.environ["WANDB_MODE"] = "disabled"
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


Step,Training Loss


TrainOutput(global_step=10, training_loss=0.4827435970306396, metrics={'train_runtime': 63.341, 'train_samples_per_second': 0.789, 'train_steps_per_second': 0.158, 'total_flos': 2271056796900.0, 'train_loss': 0.4827435970306396, 'epoch': 10.0})

In [10]:
!nvidia-smi
import torch; print("cuda?", torch.cuda.is_available())

Tue Oct  7 21:38:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P0             27W /   70W |    1974MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
# ---- evaluate (dev + test) ----
def eval_split(name):
    out = trainer.evaluate(tokenized[name])
    print(f"{name.upper()} F1:", round(out["eval_f1"], 4))
    return out["eval_f1"]

f1_dev  = eval_split("validation")
f1_test = eval_split("test")

# ---- save predictions + detailed report on test ----
pred_logits = trainer.predict(tokenized["test"]).predictions
pred_ids = pred_logits.argmax(-1)
pred_tags, true_tags = [], []
for p, l in zip(pred_ids, tokenized["test"]["labels"]):
    pt, lt = [], []
    for pi, li in zip(p, l):
        if li == -100:
            continue
        pt.append(id2label[int(pi)])
        lt.append(id2label[int(li)])
    pred_tags.append(pt); true_tags.append(lt)

print("\nClassification report (test):")
print(classification_report(true_tags, pred_tags))

# save minimal metrics
import json
with open(OUT_DIR/"metrics.json","w") as f:
    json.dump({"f1_dev": float(f1_dev), "f1_test": float(f1_test)}, f, indent=2)
print(f"\nSaved metrics to {OUT_DIR}/metrics.json")

VALIDATION F1: 0.0
TEST F1: 0.0

Classification report (test):


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ety       0.00      0.00      0.00       516

   micro avg       0.00      0.00      0.00       516
   macro avg       0.00      0.00      0.00       516
weighted avg       0.00      0.00      0.00       516


Saved metrics to /content/drive/MyDrive/small_data_NER_project/results/biobert_k5_full/metrics.json
