### Import modules and write LLM instructions

In [12]:
import os
import re
import json
from tqdm.auto import tqdm
import pickle
import json

from openai import OpenAI
# Read the DeepSeek API key from api.json (element 0 of the parsed JSON).
# The context manager closes the file handle as soon as the key is read.
with open("api.json","r",encoding="utf-8") as key_file:
    api_key=json.load(key_file)[0]
client=OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

import ollama
# Name of the Ollama model passed to ollama.chat by the generator classes below.
model="llama3.1:8b"

In [13]:
# System prompt for intent data generation. It is sent verbatim (never passed
# through str.format), so it must not contain unfilled {placeholders}; the
# requested number of queries is stated in each user prompt instead.
intent_instruction='''You are an expert NLU data generation assistant for a campus chatbot.
Your task is to generate the requested number of varied data items for the given entity and intent.
**Instructions:**
1. You should generate the requested number of possible queries corresponding to the given entity and intent.
2. The output should be a valid json snippet like the example below.
3. All generated queries should be included in a single list,
i.e. in the json snippet: "query": [all generated queries]
4. Each possible query must contain one or some of the given entities.
5. Each possible query must contain one or some of the given intents.
6. You CANNOT add new entities or intents in the generated query!
For example:
"intent": "ask_business_location", "generated_query":"What are the business hours of KIMS SALON?"
then a NEW intent "ask_business_time" is added.
7. Output json snippet ONLY.

**Example:**
{
  "query": ["where is Nasi Kandar restaurant?", "how can i go to Nasi Kandar restaurant?"],
  "intent": "ask_restaurant_location",
  "entities": [
    {"Nasi Kandar restaurant":"restaurant_name"}
  ]
}
'''

### Construct a class for intent generation

In [16]:
class intent_data_generator:
    """LLM-backed generator of intent-recognition training data.

    ``self.data`` is the parsed library JSON. For each category the code reads
    element 0 as the iterable of entity names and the remaining elements as
    intent names. ``self.intent_responses`` maps each category to ``None``
    (nothing generated yet) or to a list of raw JSON strings from the model.
    """

    def __init__(self,intent_lib_path=r"data\IE\IE_lib.json"):
        """Load the library file and create an empty result slot per category.

        Args:
            intent_lib_path: Path of the JSON library file.
                NOTE(review): the default uses backslash separators, which
                are only directory separators on Windows -- confirm the
                target OS.
        """
        # A context manager closes the file handle deterministically.
        with open(intent_lib_path,"r",encoding="utf-8") as lib_file:
            self.data=json.load(lib_file)
        self.categories=list(self.data.keys())
        self.intent_responses={category:None for category in self.categories}

    def create_intent_prompt(self,category,num_query):
        """Build the chat messages for one category.

        Args:
            category: Key of ``self.data`` to generate prompts for.
            num_query: Number of queries requested in each user prompt.

        Returns:
            A list whose first element is the system message (built from the
            module-level ``intent_instruction``) followed by one user message
            per (intent, entity name) pair.
        """
        message_set=[{"role":"system","content":intent_instruction}]
        entity_names=self.data[category][0]
        for intent in self.data[category][1:]:
            for entity_name in entity_names:
                # Built from explicit pieces so the prompt does not carry the
                # indentation of the surrounding source code.
                prompt=(
                    "**Your Task:**\n"
                    f"Generate {num_query} new and unique query examples for the intent '{intent}' "
                    f"containing entity {entity_name}:{category}.\n"
                )
                message_set.append({"role":"user","content":prompt})
        return message_set

    def get_intent_response(self,category,num_query):
        """Query the model once per user prompt and keep the valid JSON replies.

        Replies that are not valid JSON, and calls that raise, are reported
        with ``print`` and skipped. The surviving raw JSON strings are stored
        in ``self.intent_responses[category]``.
        """
        message_set=self.create_intent_prompt(category,num_query)
        system_message=message_set[0]
        response_set=[]
        for msg in tqdm(message_set[1:],desc=f"Generating data for {category}"):
            try:
                # The DeepSeek client (client.chat.completions.create with
                # model="deepseek-chat") can be substituted here if needed.
                response=ollama.chat(
                    model=model,
                    messages=[system_message,msg]
                )["message"]["content"]

                # Strip Markdown code fences, including stray bare ``` markers
                # (same pattern as entity_data_generator uses).
                response=re.sub(
                    pattern=r"```json\n|\n```|```",
                    repl='',
                    string=response
                )
                json.loads(response)
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON response for {category}, skipping this item")
                continue
            except Exception as e:
                print(f"Error generating response for {category}: {e}")
                continue
            response_set.append(response)

        self.intent_responses[category]=response_set

    def save_intent_data(self,file_path):
        """Write all generated items to ``file_path`` as one JSON object.

        The output maps each category that has at least one response to the
        list of its parsed items. Stored strings that no longer parse as JSON
        are reported and skipped. The file is overwritten, not appended to.
        """
        # Keep only categories that actually produced responses.
        valid_categories=[
            category for category in self.categories
            if self.intent_responses.get(category)
        ]

        payload={}
        for category in valid_categories:
            valid_items=[]
            for data_item in self.intent_responses[category]:
                try:
                    valid_items.append(json.loads(data_item))
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON item in {category}")
            payload[category]=valid_items

        # json.dump guarantees well-formed output; ensure_ascii=False keeps
        # non-ASCII entity names readable in the file.
        with open(file_path,"w",encoding="utf-8") as f:
            json.dump(payload,f,ensure_ascii=False,indent=2)

        print(f"数据已保存到 {file_path}")
        print(f"成功保存的类别: {valid_categories}")
        for category in self.categories:
            if category not in valid_categories:
                print(f"警告: {category} 类别没有有效数据")


In [15]:
# Build the intent generator from the default library path (data\IE\IE_lib.json).
intent_generator=intent_data_generator()

FileNotFoundError: [Errno 2] No such file or directory: 'data\\IE\\intent_lib.json'

### Generate data for intent recognition

In [10]:
# Generate 4 queries per prompt for every category, then pickle the generator.
# The pickle is written only after the whole loop finishes, so an interrupted
# run leaves any previously saved pickle file untouched.
for c in tqdm(intent_generator.categories,desc="Generating data"):
    intent_generator.get_intent_response(c,4)
with open("data/intent_generator_new.pkl", "wb") as f:
    pickle.dump(intent_generator, f)

Generating data:   0%|          | 0/5 [00:00<?, ?it/s]

Generating data for business:   0%|          | 0/39 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [18]:
# Reload the pickled generator and export its responses as JSON.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files
# that were produced by this notebook.
with open("data/intent_generator_new.pkl","rb") as f:
    intent_generator=pickle.load(f)
intent_generator.save_intent_data("data/intent_train_data_new.json")

数据已保存到 data/intent_train_data.json
成功保存的类别: ['business', 'restaurant', 'facility', 'building', 'handbook']


### Construct a class for entity recognition

In [39]:
# System prompt for entity-recognition data generation. The query language
# (English) and the list-valued "query" field match what the per-entity user
# prompt in entity_data_generator.create_entity_prompt asks for.
entity_instruction='''You are an expert NER (Named Entity Recognition) data generation assistant for a campus chatbot.
Your task is to generate training data for entity recognition in English queries.

**Your Task:**
Generate varied English query examples that contain entities from the given categories and entity types.

**Output Format:**
Each generated item should be a JSON object with this exact structure:
{
    "query": ["I want to go to A3 library tomorrow morning", "Is A3 library open tomorrow morning?"],
    "entities": [
        {"entity_text": "tomorrow morning", "entity_label": "time"},
        {"entity_text": "A3 library", "entity_label": "facility_name"}
    ]
}

**Entity Label Types:**
- business_name: business name (e.g. KK便利店, KIMS SALON)
- restaurant_name: restaurant name (e.g. 夏1城, 大树下)
- facility_name: facility name (e.g. library, gym, swimming pool)
- building_name: building name (e.g. B1 Activity Building, A Zone Teaching Building)
- handbook_topic: handbook topic (e.g. birthday cake recommendation, campus clinic)
- time: time expression (e.g. tomorrow morning, next Wednesday, tonight)
- location: location expression (e.g. on campus, below the dormitory)

**Instructions:**
1. Generate natural English queries that students would ask
2. Include 1-3 entities per query
3. Entities must match the provided entity names exactly
4. Time and location entities can be flexible expressions
5. Output valid JSON only, no additional text
6. Each query should be realistic and contextually appropriate

**Example:**
{
    "query": ["What is the opening time of KIMS SALON tomorrow?", "Is KIMS SALON open tomorrow?"],
    "entities": [
        {"entity_text": "KIMS SALON", "entity_label": "business_name"},
        {"entity_text": "tomorrow", "entity_label": "time"}
    ]
}
'''

In [40]:
class entity_data_generator:
    """LLM-backed generator of entity-recognition (NER) training data.

    ``self.data`` is the parsed library JSON; for each category the code reads
    element 0 as the iterable of entity names. ``self.entity_responses`` maps
    each category to ``None`` (nothing generated yet) or to a list of raw JSON
    strings returned by the model.
    """

    def __init__(self, intent_lib_path=r"data\IE\IE_lib.json"):
        """Load the library file and create an empty result slot per category.

        Args:
            intent_lib_path: Path of the JSON library file.
                NOTE(review): the default uses backslash separators, which
                are only directory separators on Windows -- confirm the
                target OS.
        """
        # A context manager closes the file handle deterministically.
        with open(intent_lib_path, "r", encoding="utf-8") as lib_file:
            self.data=json.load(lib_file)
        self.categories=list(self.data.keys())

        # Category name -> entity label used in the prompts.
        self.entity_label_mapping={
            "business": "business_name",
            "restaurant": "restaurant_name",
            "facility": "facility_name",
            "building": "building_name",
            "handbook": "handbook_topic",
            "time": "time",
            "location": "location"
        }

        self.entity_responses={category: None for category in self.categories}

    def create_entity_prompt(self, category, num_query):
        """Build the chat messages for one category.

        Args:
            category: Key of ``self.data``; must also be a key of
                ``self.entity_label_mapping`` (KeyError otherwise).
            num_query: Number of queries requested per entity name.

        Returns:
            A list whose first element is the system message (built from the
            module-level ``entity_instruction``) followed by one user message
            per entity name of the category.
        """
        entity_label=self.entity_label_mapping[category]
        entity_names=self.data[category][0]

        # Show exactly num_query placeholders in the format sample; a fixed
        # three-element sample contradicts requests for other counts.
        query_placeholders=json.dumps(
            [f"query{k}" for k in range(1, int(num_query) + 1)]
        )

        message_set=[
            {"role": "system", "content": entity_instruction}
        ]

        # One generation task per entity name.
        for entity_name in entity_names:
            prompt=f"""**Your Task:**
Generate {num_query} English query examples containing the entity "{entity_name}".

**Requirements:**
1. Each query must contain "{entity_name}" as an entity
2. Optionally include time/location entities for more realistic queries
3. Cannot add new entities (exclude time and location) in the generated query!
4. Output format:
{{
  "query": {query_placeholders},
  "entities": [
    {{"entity_text": "entity_name", "entity_label": "entity_label"}}
  ]
}}
5. Output json snippet ONLY. Do not add any other text.

**Entity to include:**
- Entity: "{entity_name}"
- Label: "{entity_label}"
"""
            message_set.append({
                "role": "user",
                "content": prompt
            })

        return message_set

    def get_entity_response(self, category, num_query):
        """Query the model once per entity name and keep the valid JSON replies.

        Replies that are not valid JSON, and calls that raise, are reported
        with ``print`` and skipped, so one failed request does not discard the
        rest of the category. The surviving raw JSON strings are stored in
        ``self.entity_responses[category]``.
        """
        message_set=self.create_entity_prompt(category, num_query)
        system_message=message_set[0]
        response_set=[]

        for msg in tqdm(message_set[1:], desc=f"Generating entity data for {category}"):
            try:
                # The model call is inside the try block so that request
                # failures reach the generic handler below.
                response=ollama.chat(
                    model=model,
                    messages=[system_message, msg]
                )["message"]["content"]

                # Strip Markdown code fences around the JSON payload.
                response=re.sub(
                    pattern=r"```json\n|\n```|```",
                    repl='',
                    string=response
                )
                json.loads(response)
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON response for {category}, skipping this item")
                continue
            except Exception as e:
                print(f"Error generating response for {category}: {e}")
                continue
            response_set.append(response)

        self.entity_responses[category]=response_set

    def save_entity_data(self,file_path="data/train_data/entity_train_data.json"):
        """Write all generated items to ``file_path`` as one JSON object.

        The output maps each category that has at least one response to the
        list of its parsed items. Stored strings that no longer parse as JSON
        are reported and skipped. The file is overwritten, not appended to.
        """
        # Keep only categories that actually produced responses.
        valid_categories=[
            category for category in self.categories
            if self.entity_responses.get(category)
        ]

        payload={}
        for category in valid_categories:
            valid_items=[]
            for data_item in self.entity_responses[category]:
                try:
                    valid_items.append(json.loads(data_item))
                except json.JSONDecodeError:
                    print(f"Warning: Skipping invalid JSON item in {category}")
            payload[category]=valid_items

        # json.dump guarantees well-formed output; ensure_ascii=False keeps
        # non-ASCII entity names readable in the file.
        with open(file_path,"w",encoding="utf-8") as f:
            json.dump(payload,f,ensure_ascii=False,indent=2)

        print(f"Data saved to {file_path}")
        print(f"Successfully saved categories: {valid_categories}")
        for category in self.categories:
            if category not in valid_categories:
                print(f"Warning: no valid data for {category}")


In [41]:
# Create entity data generator (reads the default library path data\IE\IE_lib.json)
entity_generator=entity_data_generator()

### Generate data for entity recognition

In [42]:
# Generate 4 queries per entity name for every category, then pickle the generator.
# The pickle is written only after the whole loop finishes, so an interrupted
# run leaves any previously saved pickle file untouched.
for c in tqdm(entity_generator.categories,desc="Generating data"):
    entity_generator.get_entity_response(c,4)
with open("data/entity_generator.pkl", "wb") as f:
    pickle.dump(entity_generator, f)

Generating data:   0%|          | 0/5 [00:00<?, ?it/s]

Generating entity data for business:   0%|          | 0/13 [00:00<?, ?it/s]



Generating entity data for restaurant:   0%|          | 0/68 [00:00<?, ?it/s]



Generating entity data for facility:   0%|          | 0/20 [00:00<?, ?it/s]



Generating entity data for building:   0%|          | 0/5 [00:00<?, ?it/s]



Generating entity data for handbook:   0%|          | 0/35 [00:00<?, ?it/s]



In [43]:
# Write the collected responses to the default path data/train_data/entity_train_data.json.
entity_generator.save_entity_data()

Data saved to data/train_data/entity_train_data.json
Successfully saved categories: ['business', 'restaurant', 'facility', 'building', 'handbook']


In [44]:
# Smoke test: generate data for the restaurant category only
test_generator=entity_data_generator()
test_generator.get_entity_response("restaurant", 2)

# Preview at most three of the generated items
restaurant_items=test_generator.entity_responses["restaurant"]
if not restaurant_items:
    print("没有生成数据")
else:
    print("生成的示例数据：")
    for position, raw_item in enumerate(restaurant_items[:3], start=1):
        print(f"\n示例 {position}:")
        try:
            # Re-serialise with indentation so the item is easy to read.
            print(json.dumps(json.loads(raw_item), ensure_ascii=False, indent=2))
        except json.JSONDecodeError:
            print(f"JSON解析错误: {raw_item}")


Generating entity data for restaurant:   0%|          | 0/68 [00:00<?, ?it/s]

生成的示例数据：

示例 1:
{
  "query": [
    "What is the opening time of 夏1城 tomorrow?",
    "Is 夏1城 open on Sundays?",
    "Can I reserve a table at 夏1城 tonight?"
  ],
  "entities": [
    {
      "entity_text": "夏1城",
      "entity_label": "restaurant_name"
    },
    {
      "entity_text": "tomorrow",
      "entity_label": "time"
    }
  ]
}

示例 2:
{
  "query": [
    "I want to book a table at Tuk Tuk Thai&Taro tonight.",
    "What is the phone number of Tuk Tuk Thai&Taro?"
  ],
  "entities": [
    {
      "entity_text": "Tuk Tuk Thai&Taro",
      "entity_label": "restaurant_name"
    }
  ]
}

示例 3:
{
  "query": [
    "What is the menu of 大树下?",
    "I want to order food from 大树下 tonight",
    "Can I make a reservation at 大树下 tomorrow night?"
  ],
  "entities": [
    {
      "entity_text": "大树下",
      "entity_label": "restaurant_name"
    }
  ]
}


In [45]:
def _flatten_entity_items(data):
    """Return the data items contained in a loaded entity-data JSON value.

    Accepts a flat list of items, a single item dict (one that has both
    'query' and 'entities'), or a ``{category: [items]}`` mapping such as the
    one written by ``entity_data_generator.save_entity_data``.
    """
    if isinstance(data, dict):
        if 'query' in data and 'entities' in data:
            return [data]
        items=[]
        for category_items in data.values():
            if isinstance(category_items, list):
                items.extend(category_items)
            else:
                items.append(category_items)
        return items
    return list(data)


def validate_entity_data(file_path):
    """Check the quality of generated entity data and print a report.

    Each item must have a 'query' (a non-empty string, or a non-empty list of
    non-empty strings) and a non-empty 'entities' list of dicts carrying
    'entity_text' and 'entity_label'. An entity text must occur in the query;
    for list-valued queries it must occur in at least one of them.

    Args:
        file_path: Path of the JSON file to validate (flat list of items or
            a category -> items mapping).

    Returns:
        Tuple ``(valid_count, invalid_count, entity_stats)`` where
        ``entity_stats`` maps each entity label to the number of entities
        that passed the per-entity checks.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data=json.load(f)

    items=_flatten_entity_items(data)
    print(f"总共加载了 {len(items)} 条数据")

    # Per-label counts and size statistics.
    entity_stats={}
    query_lengths=[]
    entity_counts=[]

    valid_count=0
    invalid_items=[]

    for i, item in enumerate(items):
        try:
            # Required fields.
            if 'query' not in item or 'entities' not in item:
                invalid_items.append(f"第{i+1}条: 缺少必需字段")
                continue

            query=item['query']
            entities=item['entities']

            # Normalise to a list so string and list queries share one path.
            queries=[query] if isinstance(query, str) else query
            if (not isinstance(queries, list) or len(queries) == 0 or
                    any(not isinstance(q, str) or len(q.strip()) == 0 for q in queries)):
                invalid_items.append(f"第{i+1}条: query无效")
                continue

            # entities must be a non-empty list.
            if not isinstance(entities, list) or len(entities) == 0:
                invalid_items.append(f"第{i+1}条: entities无效")
                continue

            # Check the shape of every entity.
            valid_entities=True
            for entity in entities:
                if not isinstance(entity, dict):
                    invalid_items.append(f"第{i+1}条: entity不是字典格式")
                    valid_entities=False
                    break

                if 'entity_text' not in entity or 'entity_label' not in entity:
                    invalid_items.append(f"第{i+1}条: entity缺少必需字段")
                    valid_entities=False
                    break

                entity_text=entity['entity_text']
                entity_label=entity['entity_label']

                # The entity text has to appear in (one of) the queries.
                if not any(entity_text in q for q in queries):
                    invalid_items.append(f"第{i+1}条: 实体文本'{entity_text}'不在查询中")
                    valid_entities=False
                    break

                # Count the entity under its label.
                entity_stats[entity_label]=entity_stats.get(entity_label, 0) + 1

            if valid_entities:
                valid_count += 1
                query_lengths.extend(len(q) for q in queries)
                entity_counts.append(len(entities))

        except Exception as e:
            invalid_items.append(f"第{i+1}条: 处理错误 - {str(e)}")

    print(f"\n数据质量报告:")
    print(f"有效数据: {valid_count} 条")
    print(f"无效数据: {len(invalid_items)} 条")

    if invalid_items:
        print(f"\n前5个无效数据问题:")
        for issue in invalid_items[:5]:
            print(f"  - {issue}")

    if query_lengths:
        print(f"\n查询长度统计:")
        print(f"  平均长度: {sum(query_lengths)/len(query_lengths):.1f} 字符")
        print(f"  最短: {min(query_lengths)} 字符")
        print(f"  最长: {max(query_lengths)} 字符")

        print(f"\n每条数据实体数量:")
        print(f"  平均: {sum(entity_counts)/len(entity_counts):.1f} 个")
        print(f"  最少: {min(entity_counts)} 个")
        print(f"  最多: {max(entity_counts)} 个")

    print(f"\n实体类型统计:")
    for label, count in sorted(entity_stats.items()):
        print(f"  {label}: {count} 个实体")

    return valid_count, len(invalid_items), entity_stats

# Usage example (run after the data has been generated)
# validate_entity_data("data/train_data/entity_train_data.json")


## 实体识别数据生成器使用说明

### 功能特点
1. **自动化生成**: 基于 `IE_lib.json` 中定义的实体自动生成实体识别训练数据
2. **标准格式**: 生成符合NER训练要求的JSON格式数据
3. **多实体支持**: 每个查询可包含多个实体（时间、地点等）
4. **质量控制**: 内置数据验证和质量检查功能

### 生成的数据格式
```json
{
    "query": "明天上午我想去A3图书馆还书",
    "entities": [
        {"entity_text": "明天上午", "entity_label": "time"},
        {"entity_text": "A3图书馆", "entity_label": "facility_name"}
    ]
}
```

### 支持的实体类型
- `business_name`: 商铺名称
- `restaurant_name`: 餐厅名称
- `facility_name`: 设施名称
- `building_name`: 建筑名称
- `handbook_topic`: 手册主题
- `time`: 时间表达
- `location`: 地点表达

### 使用步骤
1. 创建生成器实例: `entity_generator=entity_data_generator()`
2. 生成数据: 对每个类别调用 `get_entity_response`，例如 `for c in entity_generator.categories: entity_generator.get_entity_response(c, 3)`
3. 保存数据: `entity_generator.save_entity_data("path/to/file.json")`
4. 验证数据: `validate_entity_data("path/to/file.json")`
