### Load train data

In [4]:
import json
import pandas as pd
from tqdm.auto import tqdm

In [5]:
# Read the raw entity training data (a JSON object) from disk.
with open("data/train_data/entity_train_data.json", mode="r", encoding="utf-8") as train_file:
    entity_train_data = json.load(train_file)

In [6]:
# Peek at the first record under the "restaurant" key to see the raw schema.
test = entity_train_data["restaurant"][0]
test

{'query': ["What's the menu of 夏1城?",
  'Can I order from 夏1城 for delivery?',
  'Where is 夏1城 located on campus?'],
 'entities': [{'entity_text': '夏1城', 'entity_label': 'restaurant_name'}]}

In [21]:
def construct_data(data_item):
    """Flatten one raw data item into a DataFrame of (entity, query) rows.

    Parameters
    ----------
    data_item : dict
        Must contain ``'query'`` (a list of query strings) and ``'entities'``
        (a list of dicts with ``'entity_text'`` and ``'entity_label'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``entity_text``, ``entity_label``, ``query``; one row per
        (query, entity) pair. For an item with a single entity this is one
        row per query, in query order.

    Notes
    -----
    The previous implementation repeated the entity lists ``len(query)`` times,
    which only lines up with the query column when there is exactly one entity;
    any other count made ``pd.DataFrame`` raise on unequal column lengths.
    NOTE(review): pairing every query with every entity assumes rows whose
    entity does not occur in the query are filtered out downstream -- confirm.
    """
    rows = [
        {
            'entity_text': entity['entity_text'],
            'entity_label': entity['entity_label'],
            'query': query,
        }
        for query in data_item['query']
        for entity in data_item['entities']
    ]
    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['entity_text', 'entity_label', 'query'])
# Build one small frame per data item, then concatenate them in a single call.
# Calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic) and starts from an empty DataFrame.
item_frames = [
    construct_data(data_item)
    for category_items in entity_train_data.values()
    for data_item in category_items
]
# pd.concat raises on an empty list, so fall back to an empty frame.
train_df = pd.concat(item_frames, ignore_index=True) if item_frames else pd.DataFrame()
# Keep the row number as a column so later error messages can reference it.
train_df["index"] = train_df.index
train_df

Unnamed: 0,entity_text,entity_label,query,index
0,KK便利店,business_name,Where is KK便利店 located?,0
1,KK便利店,business_name,Can I buy snacks at KK便利店?,1
2,KK便利店,business_name,What are the operating hours of KK便利店?,2
3,KIMS SALON,business_name,What is the phone number of KIMS SALON?,3
4,KIMS SALON,business_name,I am looking for a haircut at KIMS SALON.,4
...,...,...,...,...
340,Contact Information,handbook_topic,How do I find the Contact Information for univ...,340
341,Contact Information,handbook_topic,What is the Contact Information for student se...,341
342,Contact Information,handbook_topic,Where can I get the Contact Information for ac...,342
343,Clubs and Societies at XMUM,handbook_topic,What are the activities offered by Clubs and S...,343


In [8]:
# Persist the flattened training table; index=False omits the DataFrame index.
train_df.to_csv("data/train_data/entity_train_data.csv", index=False)

### Train entity recognition model

#### Load and process train data

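`spaCy` expects each training example as a tuple of the form `("QUERY", {"entities": [(start, end, "LABEL"), ...]})`, where `start` and `end` are character offsets of the entity within the query.

So we need to convert the training data to that format.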

In [33]:
def convert_to_spacy_format(data_item):
    """Convert one training row into spaCy's ``(text, annotations)`` format.

    Parameters
    ----------
    data_item : mapping (e.g. a DataFrame row)
        Must provide ``'query'``, ``'entity_text'``, ``'entity_label'`` and
        ``'index'``.

    Returns
    -------
    list or None
        ``[(query, {"entities": [(start, end, label)]})]`` where ``start`` and
        ``end`` are the character offsets of the first exact occurrence of
        ``entity_text`` in ``query``. Returns ``None`` (after printing a
        diagnostic line) when the entity text does not occur in the query or
        either value is not a string.

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    """
    query = data_item['query']
    entity_text = data_item['entity_text']

    # Explicit check instead of a bare `except:`, which would also swallow
    # KeyboardInterrupt and hide unrelated bugs. Non-string values (e.g. NaN)
    # are treated as "not found", matching the previous best-effort behavior.
    if isinstance(query, str) and isinstance(entity_text, str):
        start = query.find(entity_text)
    else:
        start = -1

    if start < 0:
        print(f"Error at index {data_item['index']}: {data_item['query']} {data_item['entity_text']} {data_item['entity_label']}")
        return None

    end = start + len(entity_text)
    return [(query, {"entities": [(start, end, data_item['entity_label'])]})]

In [34]:
# Convert every row; rows whose entity text is not found in the query yield None.
spacy_train_data = train_df.apply(convert_to_spacy_format, axis=1)

Error at index 22: Where is the Peoples Bookstore located? 人民书局 business_name
Error at index 23: Is Peoples Bookstore open on weekends? 人民书局 business_name
Error at index 24: Can I find English books at Peoples Bookstore? 人民书局 business_name
Error at index 41: What is the menu at Let's Kopitiam? Let's kopitiam restaurant_name
Error at index 42: I love their dishes at Let's Kopitiam. Let's kopitiam restaurant_name
Error at index 43: Does Let's Kopitiam have any promotions? Let's kopitiam restaurant_name
Error at index 54: What are the reviews about the food at 大城小厶? 大城小厨 restaurant_name
Error at index 113: What is the address of Chinese Muslim Sizzling Wok? 中国會星尊绋 restaurant_name
Error at index 114: Can I book a table at Chinese Muslim Sizzling Wok for tonight? 中国會星尊绋 restaurant_name
Error at index 115: I love their food, what time does Chinese Muslim Sizzling Wok open? 中国會星尊绋 restaurant_name
Error at index 144: How late is 啵啑啡魚泡泡鸡PaoPaoPot open? 啵啵鱼泡泡鸡PaoPaoPot restaurant_name
Error at i

In [46]:
# Drop the rows for which the conversion returned None. Rebinding the name
# instead of using inplace=True avoids mutating an object that earlier cells
# produced.
spacy_train_data = spacy_train_data.dropna()
spacy_train_data

0      [(Where is KK便利店 located?, {'entities': [(9, 1...
1      [(Can I buy snacks at KK便利店?, {'entities': [(2...
2      [(What are the operating hours of KK便利店?, {'en...
3      [(What is the phone number of KIMS SALON?, {'e...
4      [(I am looking for a haircut at KIMS SALON., {...
                             ...                        
340    [(How do I find the Contact Information for un...
341    [(What is the Contact Information for student ...
342    [(Where can I get the Contact Information for ...
343    [(What are the activities offered by Clubs and...
344    [(Can I find information about sports teams in...
Length: 320, dtype: object

In [49]:
# Use positional access: after dropna() the index labels are no longer
# contiguous, so the label-based lookup `[0]` would fail if row 0 was dropped.
type(spacy_train_data.iloc[0])

list

In [40]:
# Save the converted examples to CSV without the index.
# NOTE(review): the values are Python lists of tuples; presumably the CSV stores
# their string form, so confirm nothing needs to load this file back as objects.
spacy_train_data.to_csv("data/train_data/spacy_entity_train_data.csv", index=False)

#### Train a `spaCy` model

In [37]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random

In [42]:
# Create a blank pipeline for the "en" language code; components are added below.
entity_recognizer = spacy.blank("en")

In [43]:
# Add the NER component if it is missing; otherwise fetch the existing one so
# that `ner` is always bound, even when this cell runs against a pipeline that
# already contains the component.
if "ner" not in entity_recognizer.pipe_names:
    ner = entity_recognizer.add_pipe("ner", last=True)
else:
    ner = entity_recognizer.get_pipe("ner")

In [66]:
# Flatten the Series of per-row example lists into one flat list of examples.
final_train_data = [
    example
    for row_examples in spacy_train_data
    for example in row_examples
]

In [68]:
# Register every entity label that appears in the training data with the NER
# component, reporting each label the first time it is seen.
labels_added = set()
for _, annotation in final_train_data:
    for span in annotation.get("entities", []):
        label = span[2]  # third element of each entity tuple is the label
        if label in labels_added:
            continue
        ner.add_label(label)
        labels_added.add(label)
        print(f"Added label: {label}")

print(f"\nAdded {len(labels_added)} labels: {labels_added}")


Added label: business_name
Added label: restaurant_name
Added label: facility_name
Added label: building_name
Added label: handbook_topic

Added 5 labels: {'restaurant_name', 'facility_name', 'building_name', 'handbook_topic', 'business_name'}


In [71]:
print("Training entity recognizer...")

# Fix: use initialize() instead of begin_training().
# initialize() returns the optimizer, so keep that one and pass it to update()
# explicitly. The previous code built a second optimizer with
# create_optimizer() that was never handed to update().
optimizer = entity_recognizer.initialize()

for i in range(100):
    # Reshuffle in place each epoch so batches differ between iterations.
    random.shuffle(final_train_data)
    losses = {}

    # Batch size comes from compounding(4.0, 32.0, 1.001), recreated every epoch.
    batches = minibatch(final_train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Wrap each (text, annotations) pair in an Example.
        examples = [
            Example.from_dict(entity_recognizer.make_doc(text), annotations)
            for text, annotations in batch
        ]

        # Fix: pass only the examples positionally; losses is a keyword argument.
        entity_recognizer.update(examples, sgd=optimizer, losses=losses)

    print(f"Iteration {i+1}, loss: {losses}")

print("Training completed!")
    

Training entity recognizer...




Iteration 0, loss: {'ner': 756.923224181857}
Iteration 1, loss: {'ner': 362.7646831630668}
Iteration 2, loss: {'ner': 99.99888387987188}
Iteration 3, loss: {'ner': 33.007253018469484}
Iteration 4, loss: {'ner': 10.259407063499822}
Iteration 5, loss: {'ner': 0.04959470595438095}
Iteration 6, loss: {'ner': 0.018940685107898758}
Iteration 7, loss: {'ner': 5.136161576557317e-05}
Iteration 8, loss: {'ner': 8.275047059479117e-05}
Iteration 9, loss: {'ner': 4.346378509956001e-06}
Iteration 10, loss: {'ner': 4.82890302485067e-06}
Iteration 11, loss: {'ner': 3.380761885218649e-05}
Iteration 12, loss: {'ner': 1.794749709246132e-06}
Iteration 13, loss: {'ner': 2.555594527662854e-06}
Iteration 14, loss: {'ner': 2.7151037501189987e-07}
Iteration 15, loss: {'ner': 1.9789654765424434e-06}
Iteration 16, loss: {'ner': 3.1111996312097046e-07}
Iteration 17, loss: {'ner': 1.3391124106149046e-06}
Iteration 18, loss: {'ner': 2.331401182585845e-06}
Iteration 19, loss: {'ner': 3.9513446238563235e-07}
Iteratio

In [72]:
# Save the trained pipeline to disk, then report where it was written.
entity_recognizer.to_disk(
    "data/trained_model/entity_recognizer"
)
print(
    "Model saved to data/trained_model/entity_recognizer"
)


Model saved to data/trained_model/entity_recognizer


In [73]:
# Smoke-test the trained model on a few hand-written queries.
test_texts = [
    "Where is KK便利店 located?",
    "I want to go to A3图书馆 tomorrow morning",
    "What time does 星巴克STARBUCKS open?",
    "Can I book a room in B1 Activity Building?",
]

print("测试实体识别结果:")
for text in test_texts:
    doc = entity_recognizer(text)
    print(f"\n文本: {text}")
    # Report the "no entity" case first, then list every recognized entity.
    if not doc.ents:
        print("  未识别到实体")
        continue
    for ent in doc.ents:
        print(f"  实体: '{ent.text}' -> 标签: {ent.label_}")


测试实体识别结果:

文本: Where is KK便利店 located?
  实体: 'KK便利店' -> 标签: business_name

文本: I want to go to A3图书馆 tomorrow morning
  实体: 'A3图书馆' -> 标签: restaurant_name

文本: What time does 星巴克STARBUCKS open?
  实体: '星巴克STARBUCKS' -> 标签: restaurant_name

文本: Can I book a room in B1 Activity Building?
  实体: 'B1 Activity Building' -> 标签: building_name


#### Test the model

In [74]:
from sklearn.metrics import classification_report