In [5]:
#!/usr/bin/env python
"""
Joint intent-classification + entity-tagging pipeline
(added id2label / label2id so inference shows real tag names)
"""

import ast, numpy as np
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, BertForSequenceClassification, BertForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from seqeval.metrics import f1_score as seq_f1

CSV_PATH   = "TransitChat-Conversational_Route_and_Schedule_Dataset.csv"
MODEL_NAME = "bert-base-uncased"
INTENT_DIR, SLOT_DIR = "bert_intent_model", "bert_slot_model"
MAX_LEN, EPOCHS, BS, LR, SEED = 128, 3, 16, 2e-5, 42

# ───────────────────────── helpers ──────────────────────────
def tok_intent(batch, tok):
    enc = tok(batch["query"], truncation=True, padding="max_length",
              max_length=MAX_LEN)
    enc["labels"] = batch["intent"]
    return enc

def make_slot_labels(example, tok, slot2id):
    text = example["query"]
    ents = ast.literal_eval(example["entities"])
    enc  = tok(text, return_offsets_mapping=True, truncation=True,
               padding="max_length", max_length=MAX_LEN)
    tags = ["O"] * len(enc.input_ids)
    lo_text = text.lower()
    for t, val in ents.items():
        if not val: continue
        start = lo_text.find(val.lower())
        if start == -1: continue
        end = start + len(val)
        for i, (s, e) in enumerate(enc.offset_mapping):
            if s >= end or e <= start: continue
            tags[i] = f"{'B' if s==start else 'I'}-{t}"
    enc["labels"] = [
        slot2id.get(tags[i], slot2id["O"]) if enc.offset_mapping[i]!=(0,0) else -100
        for i in range(len(tags))
    ]
    return {k: v for k, v in enc.items() if k != "offset_mapping"}

def cls_metrics(pred):
    logits, labels = pred
    y_pred = logits.argmax(-1)
    acc = accuracy_score(labels, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(
        labels, y_pred, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

def seq_metrics(eval_pred, id2slot):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    true_tags, pred_tags = [], []
    for p_row, l_row in zip(preds, labels):
        seq_true, seq_pred = [], []
        for p_id, l_id in zip(p_row, l_row):
            if l_id == -100:
                continue
            seq_true.append(id2slot[l_id])
            seq_pred.append(id2slot[p_id])
        true_tags.append(seq_true)
        pred_tags.append(seq_pred)
    return {"f1": seq_f1(true_tags, pred_tags)}

# ───────────────────────── training wrappers ─────────────────
def train_intent(ds, id2lbl, lbl2id):
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(id2lbl),
        id2label=id2lbl,
        label2id=lbl2id,             # ★ 映射写进 config
    )
    args = TrainingArguments(
        INTENT_DIR, num_train_epochs=EPOCHS,
        per_device_train_batch_size=BS, per_device_eval_batch_size=BS,
        learning_rate=LR, eval_strategy="epoch", save_strategy="epoch",
        seed=SEED, load_best_model_at_end=True, metric_for_best_model="f1"
    )
    Trainer(model=model, args=args,
            train_dataset=ds["train"], eval_dataset=ds["validation"],
            compute_metrics=cls_metrics).train()
    model.save_pretrained(INTENT_DIR)

def train_slots(ds, tok, id2slot, slot2id):
    model = BertForTokenClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(id2slot),
        id2label=id2slot,
        label2id=slot2id,            # ★ 同样写入
    )
    args = TrainingArguments(
        SLOT_DIR, num_train_epochs=EPOCHS,
        per_device_train_batch_size=BS, per_device_eval_batch_size=BS,
        learning_rate=LR, eval_strategy="epoch", save_strategy="epoch",
        seed=SEED, load_best_model_at_end=True, metric_for_best_model="f1"
    )
    collator = DataCollatorForTokenClassification(tok)
    Trainer(model=model, args=args, data_collator=collator,
            train_dataset=ds["train"], eval_dataset=ds["validation"],
            compute_metrics=lambda p: seq_metrics(p, id2slot)).train()
    model.save_pretrained(SLOT_DIR)

# ────────────────────────── main ────────────────────────────
def main():
    raw = load_dataset("csv", data_files=CSV_PATH)["train"]

    # intent label map
    intent_lbl = ClassLabel(names=sorted(set(raw["intent"])))
    raw = raw.cast_column("intent", intent_lbl)
    id2lbl_int = {i: s for i, s in enumerate(intent_lbl.names)}
    lbl2id_int = {s: i for i, s in id2lbl_int.items()}

    # slot label map (BIO)
    slot_types = sorted({k for s in raw["entities"]
                         for k in ast.literal_eval(s)})
    slot_tags = ["O"] + [f"{io}-{t}" for t in slot_types for io in ("B", "I")]
    slot2id = {tag: idx for idx, tag in enumerate(slot_tags)}
    id2slot = {idx: tag for tag, idx in slot2id.items()}

    # split
    split = raw.train_test_split(0.2, seed=SEED, stratify_by_column="intent")
    val_t = split["test"].train_test_split(0.5, seed=SEED,
                                           stratify_by_column="intent")
    ds = {"train": split["train"],
          "validation": val_t["train"],
          "test": val_t["test"]}

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    ds_int = {k: v.map(lambda b: tok_intent(b, tok), batched=True,
                       remove_columns=v.column_names) for k, v in ds.items()}

    ds_slot = {k: v.map(lambda ex: make_slot_labels(ex, tok, slot2id),
                        batched=False, remove_columns=v.column_names)
               for k, v in ds.items()}

    train_intent(ds_int, id2lbl_int, lbl2id_int)
    tok.save_pretrained(INTENT_DIR)

    train_slots(ds_slot, tok, id2slot, slot2id)
    tok.save_pretrained(SLOT_DIR)

    print("✅ finished training both heads")

if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.225029,1.0,1.0,1.0,1.0
2,No log,0.054017,1.0,1.0,1.0,1.0
3,No log,0.033804,1.0,1.0,1.0,1.0


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.351976,0.819672
2,No log,0.059697,1.0
3,No log,0.033263,1.0


✅ finished training both heads


In [1]:
from transformers import pipeline

intent_clf = pipeline("text-classification",
                      model="bert_intent_model",
                      tokenizer="bert_intent_model")

slot_tagger = pipeline("token-classification",
                       model="bert_slot_model",
                       tokenizer="bert_slot_model",
                       aggregation_strategy="simple")  # merge B/I spans

text = "Is the train from JFK Airport to San Francisco running next Monday?"

intent_pred  = intent_clf(text)[0]
slot_preds   = slot_tagger(text)

# Convert slot predictions ➜ dict
entities = {p['entity_group'].split('-')[-1]: p['word'] for p in slot_preds}

print(intent_pred)   # {'label': 'route_query', 'score': 0.97}
print(entities)      # {'source': 'JFK Airport', 'destination': 'Central Park', ...}


Device set to use cuda:0
Device set to use cuda:0


{'label': 'status_query', 'score': 0.8607860803604126}
{'transport_mode': 'train', 'source': 'jfk airport', 'destination': 'san francisco', 'date': 'next monday'}


### 可以了，下面是精准度测试

In [11]:
#!/usr/bin/env python
"""
evaluate_intent_slot.py  ·  精准度评测脚本（intent + slot）

✓ intent 头：accuracy / precision / recall / F1
✓ slot  头：seqeval micro-F1 + 明细报告
"""

import ast, numpy as np, torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, BertForSequenceClassification, BertForTokenClassification,
    DataCollatorForTokenClassification
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from seqeval.metrics import classification_report, f1_score as seq_f1
from torch.utils.data import DataLoader

# ─── paths / hparams ───
CSV_PATH   = "TransitChat-Conversational_Route_and_Schedule_Dataset.csv"
INTENT_DIR = "bert_intent_model"
SLOT_DIR   = "bert_slot_model"
MAX_LEN    = 128
BATCH_SIZE = 32
SEED       = 42
torch.manual_seed(SEED)

# ─── Load models & label maps ───
intent_tok = AutoTokenizer.from_pretrained(INTENT_DIR)
slot_tok   = AutoTokenizer.from_pretrained(SLOT_DIR)

intent_model = BertForSequenceClassification.from_pretrained(INTENT_DIR)
slot_model   = BertForTokenClassification.from_pretrained(SLOT_DIR)

id2intent = intent_model.config.id2label
intent2id = intent_model.config.label2id
id2slot   = slot_model.config.id2label
slot2id   = slot_model.config.label2id

# ─── Prepare test split (与训练脚本保持一致) ───
raw = load_dataset("csv", data_files=CSV_PATH)["train"]

# 1️⃣ 把 intent 列转成 ClassLabel，名称顺序必须与模型一致
def _intent_name(idx):
    if idx in id2intent:          # 键是 int
        return id2intent[idx]
    return id2intent[str(idx)]    # 键被序列化成 str

intent_names = [_intent_name(i) for i in range(len(id2intent))]
intent_cl    = ClassLabel(names=intent_names)
raw          = raw.cast_column("intent", intent_cl)

# 2️⃣ 重现同样的 80-10-10 划分
split = raw.train_test_split(0.20, seed=SEED, stratify_by_column="intent")
test  = split["test"].train_test_split(0.50, seed=SEED,
                                       stratify_by_column="intent")["test"]

# ─── Dataset helpers ───
def tok_int_batch(batch):
    enc = intent_tok(batch["query"], truncation=True, padding="max_length",
                     max_length=MAX_LEN)
    enc["labels"] = batch["intent"] 
    return enc

def make_slot_labels(ex):
    text = ex["query"]
    ents = ast.literal_eval(ex["entities"])
    enc  = slot_tok(text, return_offsets_mapping=True, truncation=True,
                    padding="max_length", max_length=MAX_LEN)
    tags = ["O"] * len(enc.input_ids)
    lo   = text.lower()
    for t, val in ents.items():
        if not val: continue
        start = lo.find(val.lower())
        if start == -1: continue
        end = start + len(val)
        for i, (s, e) in enumerate(enc.offset_mapping):
            if s >= end or e <= start: continue
            tags[i] = f"{'B' if s==start else 'I'}-{t}"
    enc["labels"] = [
        slot2id.get(tags[i], slot2id["O"]) if enc.offset_mapping[i]!=(0,0) else -100
        for i in range(len(tags))
    ]
    del enc["offset_mapping"]
    return enc

intent_ds = test.map(tok_int_batch, batched=True,
                     remove_columns=test.column_names).with_format("torch")

slot_ds   = test.map(make_slot_labels, batched=False,
                     remove_columns=test.column_names).with_format("torch")

# ─── INTENT evaluation ───
int_loader = DataLoader(intent_ds, batch_size=BATCH_SIZE)
intent_model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in int_loader:
        labels = batch.pop("labels")
        logits = intent_model(**batch).logits
        y_pred.extend(torch.argmax(logits, -1).cpu().tolist())
        y_true.extend(labels.cpu().tolist())

acc = accuracy_score(y_true, y_pred)
p,r,f1,_ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted", zero_division=0
)

# ─── SLOT evaluation ───
slot_loader = DataLoader(
    slot_ds, batch_size=BATCH_SIZE,
    collate_fn=DataCollatorForTokenClassification(slot_tok)
)
slot_model.eval()
true_tags, pred_tags = [], []

def _slot_name(idx):
    if idx in id2slot:
        return id2slot[idx]
    return id2slot.get(str(idx), "O")

with torch.no_grad():
    for batch in slot_loader:
        labels = batch.pop("labels")
        mask   = labels != -100
        logits = slot_model(**batch).logits
        preds  = torch.argmax(logits, -1)
        for p_row, l_row, m_row in zip(preds, labels, mask):
            t_seq, p_seq = [], []
            for pid, lid, m in zip(p_row, l_row, m_row):
                if not m: 
                    continue
                t_seq.append(_slot_name(int(lid)))
                p_seq.append(_slot_name(int(pid)))
            true_tags.append(t_seq)
            pred_tags.append(p_seq)

slot_f1 = seq_f1(true_tags, pred_tags)

# ─── Report ───
print("──── Intent classification ────")
print(f"accuracy : {acc:.4f}")
print(f"precision: {p :.4f}")
print(f"recall   : {r :.4f}")
print(f"f1       : {f1:.4f}")

print("\n──── Slot tagging ─────────────")
print(f"micro-F1 : {slot_f1:.4f}")
print("\nDetailed seqeval report:")
print(classification_report(true_tags, pred_tags))


──── Intent classification ────
accuracy : 1.0000
precision: 1.0000
recall   : 1.0000
f1       : 1.0000

──── Slot tagging ─────────────
micro-F1 : 1.0000

Detailed seqeval report:
                precision    recall  f1-score   support

          date       1.00      1.00      1.00        34
   destination       1.00      1.00      1.00        50
        source       1.00      1.00      1.00        50
          time       1.00      1.00      1.00        18
transport_mode       1.00      1.00      1.00        50

     micro avg       1.00      1.00      1.00       202
     macro avg       1.00      1.00      1.00       202
  weighted avg       1.00      1.00      1.00       202

