### Load train data

In [26]:
import json
import pandas as pd
from tqdm.auto import tqdm

In [27]:
with open("data/train_data/entity_train_data.json","r",encoding="utf-8") as f:
    entity_train_data=json.load(f)

In [28]:
test=entity_train_data["restaurant"][0]
test

{'query': ['Where can I dine at Xiayicheng',
  'What is the menu of Xiayicheng restaurant?',
  'Can you reserve a table for me at Xiayicheng tonight?'],
 'entities': [{'entity_text': 'Xiayicheng', 'entity_label': 'restaurant'}]}

In [29]:
label_mapping={
    "restaurant":"restaurant_name",
    "facility":"facility_name",
    "building":"building_name",
    "handbook":"handbook_topic",
    "greeting":"greeting"
}

def construct_data(data_item, category):
    """
    构建训练数据：为每个查询语句创建对应的实体标注行
    """
    rows = []
    queries = data_item['query']
    entities = data_item['entities']
    
    # 为每个查询语句和每个实体创建一行数据
    for query in queries:
        for entity in entities:
            entity["entity_label"]=category
            rows.append({
                'query': query,
                'entity_text': entity['entity_text'],
                'entity_label': entity['entity_label']
            })
    
    return pd.DataFrame(rows)

train_df=pd.DataFrame()
for category, data_items in entity_train_data.items():
    for data_item in data_items:
        train_df=pd.concat([train_df,construct_data(data_item, category)],ignore_index=True)
train_df["index"]=train_df.index
train_df

Unnamed: 0,query,entity_text,entity_label,index
0,Where can I dine at Xiayicheng,Xiayicheng,restaurant,0
1,What is the menu of Xiayicheng restaurant?,Xiayicheng,restaurant,1
2,Can you reserve a table for me at Xiayicheng t...,Xiayicheng,restaurant,2
3,What's the menu of xiayicheng?,xiayicheng,restaurant,3
4,I want to reserve a table at xiayicheng,xiayicheng,restaurant,4
...,...,...,...,...
5231,Can I get a translation for welcome to xmum,welcome to xmum,greeting,5231
5232,I'd like to know more about the welcome to xmu...,welcome to xmum,greeting,5232
5233,How do I use the portal?,Welcome to XMUM,greeting,5233
5234,What are the upcoming events on campus?,Welcome to XMUM,greeting,5234


In [30]:
train_df.to_csv("data/train_data/entity_train_data.csv",index=False)

In [31]:
train_df["entity_label"].value_counts()

entity_label
facility      1086
restaurant    1075
building      1075
handbook      1043
greeting       957
Name: count, dtype: int64

### Train entity recognition model

#### Load and process train data

`spaCy` needs specific tuple data format: `("QUERY", {"entities":[(start, end, "LABEL")...]})`.

So we need to convert train data to that format.

In [32]:
def convert_to_spacy_format(data_item):
    """修复版本的数据转换函数"""
    temp = []
    try:
        query = data_item['query']
        entity_text = data_item['entity_text']
        
        # 尝试精确匹配
        start = query.find(entity_text)
        if start == -1:
            # 如果找不到精确匹配，尝试忽略大小写
            start = query.lower().find(entity_text.lower())
            if start != -1:
                # 重新计算正确的结束位置
                actual_entity = query[start:start+len(entity_text)]
                end = start + len(actual_entity)
            else:
                # 如果还是找不到，跳过这个样本
                print(f"Skipping misaligned entity: '{entity_text}' in '{query}'")
                return None
        else:
            end = start + len(entity_text)
        
        # 验证对齐
        extracted = query[start:end]
        if extracted.lower() != entity_text.lower():
            print(f"Alignment error: expected '{entity_text}', got '{extracted}'")
            return None
            
        temp.append((query, {"entities": [(start, end, data_item['entity_label'])]}))
        
    except Exception as e:
        print(f"Error at index {data_item.get('index', 'unknown')}: {e}")
        return None
    
    return temp

In [33]:
spacy_train_data=train_df.apply(convert_to_spacy_format,axis=1)

Skipping misaligned entity: 'Tuk Tuk Thai&Taro' in 'Where is the nearest restaurant'
Skipping misaligned entity: 'hualianhaoliangcha' in 'What is the phone number of Hualian Haoliangcha?'
Skipping misaligned entity: 'hualianhaoliangcha' in 'Where can I find Hualian Haoliangcha in Shenzhen?'
Skipping misaligned entity: 'hualianhaoliangcha' in 'Is Hualian Haoliangcha open on Sunday?'
Skipping misaligned entity: 'hualianhaoliangcha' in 'How much does a meal cost at Hualian Haoliangcha?'
Skipping misaligned entity: 'shanyicheng' in 'What is the address of Shanyi Cheng?'
Skipping misaligned entity: 'shanyicheng' in 'Where can I find Shanyi Cheng for dinner?'
Skipping misaligned entity: 'shanyicheng' in 'Is Shanyi Cheng a good place for a family meal?'
Skipping misaligned entity: 'admissions office' in 'What are the admission requirements for international students?'
Skipping misaligned entity: 'cake shops' in 'Can you recommend a good cake shop nearby?'
Skipping misaligned entity: 'cake sho

In [34]:
spacy_train_data.dropna(inplace=True)
spacy_train_data

0       [(Where can I dine at Xiayicheng, {'entities':...
1       [(What is the menu of Xiayicheng restaurant?, ...
2       [(Can you reserve a table for me at Xiayicheng...
3       [(What's the menu of xiayicheng?, {'entities':...
4       [(I want to reserve a table at xiayicheng, {'e...
                              ...                        
5228    [(What does it mean by welcome?, {'entities': ...
5229    [(How do I get to the welcome desk?, {'entitie...
5230    [(What is the meaning of welcome to xmum, {'en...
5231    [(Can I get a translation for welcome to xmum,...
5232    [(I'd like to know more about the welcome to x...
Length: 4899, dtype: object

In [35]:
type(spacy_train_data[0])

list

In [36]:
spacy_train_data.to_csv("data/train_data/spacy_entity_train_data.csv",index=True)

#### Train a `spaCy` model

In [37]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from thinc.api import Adam, Config
import random

In [38]:
spacy_train_data

0       [(Where can I dine at Xiayicheng, {'entities':...
1       [(What is the menu of Xiayicheng restaurant?, ...
2       [(Can you reserve a table for me at Xiayicheng...
3       [(What's the menu of xiayicheng?, {'entities':...
4       [(I want to reserve a table at xiayicheng, {'e...
                              ...                        
5228    [(What does it mean by welcome?, {'entities': ...
5229    [(How do I get to the welcome desk?, {'entitie...
5230    [(What is the meaning of welcome to xmum, {'en...
5231    [(Can I get a translation for welcome to xmum,...
5232    [(I'd like to know more about the welcome to x...
Length: 4899, dtype: object

In [39]:
entity_recognizer=spacy.blank("en")

In [40]:
if "ner" not in entity_recognizer.pipe_names:
    ner = entity_recognizer.add_pipe("ner", last=True)

In [41]:
final_train_data=[]
for data_item in spacy_train_data:
    final_train_data.extend(data_item)

In [42]:
final_train_data

[('Where can I dine at Xiayicheng', {'entities': [(20, 30, 'restaurant')]}),
 ('What is the menu of Xiayicheng restaurant?',
  {'entities': [(20, 30, 'restaurant')]}),
 ('Can you reserve a table for me at Xiayicheng tonight?',
  {'entities': [(34, 44, 'restaurant')]}),
 ("What's the menu of xiayicheng?", {'entities': [(19, 29, 'restaurant')]}),
 ('I want to reserve a table at xiayicheng',
  {'entities': [(29, 39, 'restaurant')]}),
 ('Is xiayicheng open for lunch today?',
  {'entities': [(3, 13, 'restaurant')]}),
 ('Where is Xia Yi Cheng located?', {'entities': [(9, 21, 'restaurant')]}),
 ('What time does Xia Yi Cheng open?', {'entities': [(15, 27, 'restaurant')]}),
 ('Is Xia Yi Cheng a good place to eat?',
  {'entities': [(3, 15, 'restaurant')]}),
 ('Is Tuk Tuk Thai&Taro open on weekends?',
  {'entities': [(3, 20, 'restaurant')]}),
 ('What are the opening hours for Tuk Tuk Thai&Taro?',
  {'entities': [(31, 48, 'restaurant')]}),
 ('What is the address of Tuk Tuk Thai?',
  {'entities': [

In [43]:
labels_added = set()
for data_item in final_train_data:
    query, annotation = data_item
    for ent in annotation.get("entities", []):
        label = ent[2] 
        if label not in labels_added:
            ner.add_label(label)
            labels_added.add(label)
            print(f"Added label: {label}")

print(f"\nAdded {len(labels_added)} labels: {labels_added}")


Added label: restaurant
Added label: facility
Added label: building
Added label: handbook
Added label: greeting

Added 5 labels: {'restaurant', 'facility', 'greeting', 'building', 'handbook'}


In [44]:
print("Training entity recognizer...")

# 修复：使用initialize()替代begin_training()
entity_recognizer.initialize()
optimizer = Adam(learn_rate=0.001)
# optimizer = entity_recognizer.create_optimizer()

for i in range(200):
    random.shuffle(final_train_data)
    losses = {}

    batches = minibatch(final_train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        examples = []
        for text, annotations in batch:
            doc = entity_recognizer.make_doc(text)
            example = Example.from_dict(doc, annotations)
            examples.append(example)
        
        # 修复：只传递examples参数，losses作为关键字参数
        entity_recognizer.update(examples, losses=losses)
    
    print(f"Iteration {i+1}, loss: {losses}")

print("Training completed!")
    

Training entity recognizer...




Iteration 1, loss: {'ner': np.float32(7662.4453)}
Iteration 2, loss: {'ner': np.float32(7235.299)}
Iteration 3, loss: {'ner': np.float32(7306.3447)}
Iteration 4, loss: {'ner': np.float32(7174.588)}
Iteration 5, loss: {'ner': np.float32(7119.6597)}
Iteration 6, loss: {'ner': np.float32(6926.452)}
Iteration 7, loss: {'ner': np.float32(6848.4214)}
Iteration 8, loss: {'ner': np.float32(6650.7314)}
Iteration 9, loss: {'ner': np.float32(6190.44)}
Iteration 10, loss: {'ner': np.float32(6093.1304)}
Iteration 11, loss: {'ner': np.float32(5988.7383)}
Iteration 12, loss: {'ner': np.float32(5644.7163)}
Iteration 13, loss: {'ner': np.float32(5770.3203)}
Iteration 14, loss: {'ner': np.float32(5491.2456)}
Iteration 15, loss: {'ner': np.float32(5605.7183)}
Iteration 16, loss: {'ner': np.float32(5446.277)}
Iteration 17, loss: {'ner': np.float32(5247.6133)}
Iteration 18, loss: {'ner': np.float32(5443.1523)}
Iteration 19, loss: {'ner': np.float32(5273.1304)}
Iteration 20, loss: {'ner': np.float32(5036.65

In [45]:
# 保存训练好的模型
entity_recognizer.to_disk("data/trained_model/entity_recognizer")
print("Model saved to data/trained_model/entity_recognizer")


Model saved to data/trained_model/entity_recognizer


In [46]:
entity_recognizer=spacy.load("data/trained_model/entity_recognizer")

In [47]:
# 测试训练好的模型
test_texts = [
    "Where is KK便利店 located?",
    "I want to go to A3图书馆 tomorrow morning",
    "What time does 星巴克STARBUCKS open?",
    "Can I book a room in B1 Activity Building?"
]

print("测试实体识别结果:")
for text in test_texts:
    doc = entity_recognizer(text)
    print(f"\n文本: {text}")
    if doc.ents:
        for ent in doc.ents:
            print(f"  实体: '{ent.text}' -> 标签: {ent.label_}")
    else:
        print("  未识别到实体")


测试实体识别结果:

文本: Where is KK便利店 located?
  实体: 'KK便利店' -> 标签: building

文本: I want to go to A3图书馆 tomorrow morning
  未识别到实体

文本: What time does 星巴克STARBUCKS open?
  未识别到实体

文本: Can I book a room in B1 Activity Building?
  实体: 'B1' -> 标签: handbook


#### Test the model

In [48]:
from sklearn.metrics import classification_report

In [49]:
### 模型性能评估 - Classification Report

import numpy as np
from sklearn.metrics import classification_report
from collections import defaultdict, Counter
import random

# 设置随机种子以确保结果可重现
random.seed(42)

# 准备测试数据 - 从训练数据中随机抽取20%作为测试集
test_size = int(len(final_train_data) * 0.2)
test_data = random.sample(final_train_data, test_size)

print(f"总训练数据量: {len(final_train_data)}")
print(f"测试数据量: {test_size}")
print("="*50)


总训练数据量: 4899
测试数据量: 979


In [50]:
# 提取真实标签和预测标签
y_true = []
y_pred = []
training_counts = Counter()  # 统计每个标签在训练数据中的数量

# 统计训练数据中每个标签的出现次数
for text, annotations in final_train_data:
    for ent in annotations.get("entities", []):
        training_counts[ent[2]] += 1

# 对测试数据进行预测并收集结果
for text, annotations in test_data:
    # 获取真实标签
    true_entities = []
    for ent in annotations.get("entities", []):
        start, end, label = ent
        true_entities.append((start, end, label))
    
    # 使用模型进行预测
    doc = entity_recognizer(text)
    pred_entities = []
    for ent in doc.ents:
        pred_entities.append((ent.start_char, ent.end_char, ent.label_))
    
    # 创建标签序列用于评估
    # 为了简化，我们基于实体的存在与否来创建标签
    true_labels_in_text = set([label for _, _, label in true_entities])
    pred_labels_in_text = set([label for _, _, label in pred_entities])
    
    # 收集所有可能的标签
    all_labels = true_labels_in_text.union(pred_labels_in_text)
    
    for label in labels_added:
        y_true.append(1 if label in true_labels_in_text else 0)
        y_pred.append(1 if label in pred_labels_in_text else 0)

print(f"评估样本数: {len(y_true) // len(labels_added)}")
print(f"标签类别数: {len(labels_added)}")


评估样本数: 979
标签类别数: 5


In [51]:
# 生成基础分类报告
target_names = sorted(list(labels_added))
report = classification_report(
    y_true, y_pred, 
    target_names=target_names,
    output_dict=True,
    zero_division=0
)

# 创建自定义的分类报告，包含训练数据量信息
print("实体识别模型 - 分类报告")
print("="*80)
print(f"{'标签':<20} {'精确率':<10} {'召回率':<10} {'F1分数':<10} {'支持度':<10} {'训练数据量':<10}")
print("-"*80)

# 显示每个标签的性能指标
for label in target_names:
    precision = report[label]['precision']
    recall = report[label]['recall']
    f1_score = report[label]['f1-score']
    support = int(report[label]['support'])
    train_count = training_counts[label]
    
    print(f"{label:<20} {precision:<10.3f} {recall:<10.3f} {f1_score:<10.3f} {support:<10} {train_count:<10}")

print("-"*80)

# 显示总体性能指标
macro_avg = report['macro avg']
weighted_avg = report['weighted avg']
total_support = sum(training_counts.values())

print(f"{'宏平均':<20} {macro_avg['precision']:<10.3f} {macro_avg['recall']:<10.3f} {macro_avg['f1-score']:<10.3f} {int(macro_avg['support']):<10} {total_support:<10}")
print(f"{'加权平均':<20} {weighted_avg['precision']:<10.3f} {weighted_avg['recall']:<10.3f} {weighted_avg['f1-score']:<10.3f} {int(weighted_avg['support']):<10} {total_support:<10}")

print("\n性能总结:")
print(f"• 总标签数: {len(labels_added)}")
print(f"• 总训练样本数: {len(final_train_data)}")
print(f"• 总测试样本数: {test_size}")
print(f"• 平均每个标签的训练数据量: {total_support/len(labels_added):.1f}")
print(f"• 整体准确率: {(sum(1 for i in range(len(y_true)) if y_true[i] == y_pred[i]) / len(y_true)):.3f}")


ValueError: Number of classes, 2, does not match size of target_names, 5. Try specifying the labels parameter

In [None]:
# 训练数据分布分析
print("\n训练数据分布分析:")
print("="*50)
print(f"{'标签':<20} {'训练数据量':<15} {'百分比':<10}")
print("-"*50)

sorted_counts = sorted(training_counts.items(), key=lambda x: x[1], reverse=True)
for label, count in sorted_counts:
    percentage = (count / total_support) * 100
    print(f"{label:<20} {count:<15} {percentage:<10.2f}%")

print(f"\n数据分布统计:")
print(f"• 最多的标签: {sorted_counts[0][0]} ({sorted_counts[0][1]} 个样本)")
print(f"• 最少的标签: {sorted_counts[-1][0]} ({sorted_counts[-1][1]} 个样本)")
print(f"• 数据分布差异: {sorted_counts[0][1] / sorted_counts[-1][1]:.1f}倍")
