### Import modules and write LLM instructions

In [4]:
import os
import re
import json
from tqdm.auto import tqdm
import pickle
import json

import ollama
model="llama3.1:8b"

In [2]:
intent_instruction='''You are an expert NLU data generation assistant for a campus chatbot.
Your task is to generate {num_query} varied data items for given entity and intent.
**Instructions:**
1. You should generate {num_query} possible queries corresponding to the given entity and intent.
2. The output should be a vaild json snippet like {example}
3. The {num_query} queries should be included in a list.
in json file that is "query":[{num_query} generated] 
4. Each possible query must contain one or some of the given entities.
5. Each possible query must contain one or some of the given intents.
6. You CANNOT add new eneities in the generated query! 
For example:
"intent": "ask_business_location", "generated_query":"What are the business hours of KIMS SALON?"
then a NEW intent "ask_business_time" is added.
7. Output json snippet ONLY. 

**Exapmle:**
{
  "query": ["where is Nasi Kandar restaurant?", "how can i go to Nasi Kandar restaurant?"],
  "intent": "ask_restaurant_location",
  "entities": [
    {"Nasi Kandar restaurant":"restaurant_name"} 
  ]
}
'''

### Construct a class for intent generation

In [3]:
class intent_data_generator:
    def __init__(self,intent_lib_path=r"data\extracted_json\database.json"):
        self.data=json.load(open(intent_lib_path,"r",encoding="utf-8"))
        self.categories=list(self.data.keys())
        self.intent_responses={
          category:None for category in self.categories
        }
    
    def create_intent_prompt(self,category,num_query):
        message_set=[
          {"role":"system","content":intent_instruction}
          ]
        for intent in self.data[category]["possible_intent"]:
          for entity_item in list(self.data[category].values())[1:]:
            for entity_name in entity_item["possible_name"]:
                prompt=f"""**Your Task:**
                Generate {num_query} new and unique query examples for the intent '{intent}' containing entity {entity_name}:{category}.
                """
                message_set.append({
                "role":"user",
                "content":prompt
                })
        return message_set
    
    def get_intent_response(self,category,num_query):
      message_set=self.create_intent_prompt(category,num_query)
      response_set=[]
      for msg in tqdm(message_set[1:],desc=f"Generating data for {category}"):
        try:
            response=ollama.chat(model=model, messages=[
                  message_set[0],msg
                ])["message"]["content"]

            response=re.sub(
              pattern=r"```json\n|\n```",
              repl='',
              string=response
            )
            try:
                json.loads(response)
                response_set.append(response)
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON response for {category}, skipping this item")
                continue
                
        except Exception as e:
            print(f"Error generating response for {category}: {e}")
            continue
            
      self.intent_responses[category]=response_set

    def save_intent_data(self,file_path):
        
        with open(file_path,"w",encoding="utf-8") as f:
            f.write("{\n")
            valid_categories=[]
            
            for category in self.categories:
                if (self.intent_responses[category] is not None and 
                    len(self.intent_responses[category]) > 0):
                    valid_categories.append(category)
            
            for i, category in enumerate(valid_categories):
                f.write(f'"{category}":[\n')
                
                valid_items=[]
                for data_item in self.intent_responses[category]:
                    try:
                        # Verify JSON format
                        json.loads(data_item)
                        valid_items.append(data_item)
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON item in {category}")
                        continue
                
                # Write only the valid data items
                for j, data_item in enumerate(valid_items):
                    f.write(data_item)
                    if j == len(valid_items) - 1:
                        f.write("\n")
                    else:
                        f.write(",\n")
                
                # Add comma between categories
                if i == len(valid_categories) - 1:
                    f.write("]\n")
                else:
                    f.write("],\n")
                    
            f.write("}")
            
        print(f"Training data saved to {file_path}")
        print(f"Successfully saved categories: {valid_categories}")
        for category in self.categories:
            if category not in valid_categories:
                print(f"Warning: {category} category has no valid data")


In [4]:
intent_generator=intent_data_generator()

### Generate data for intent recognition

In [5]:
for c in intent_generator.categories:
    intent_generator.get_intent_response(c,4)
with open("data/train_data/intent_generator.pkl", "wb") as f:
    pickle.dump(intent_generator, f)

Generating data for restaurant:   0%|          | 0/426 [00:00<?, ?it/s]



Generating data for facility:   0%|          | 0/138 [00:00<?, ?it/s]



Generating data for building:   0%|          | 0/44 [00:00<?, ?it/s]



Generating data for handbook:   0%|          | 0/154 [00:00<?, ?it/s]



Generating data for greeting:   0%|          | 0/125 [00:00<?, ?it/s]



In [6]:
with open("data/train_data/intent_generator.pkl","rb") as f:
    intent_generator=pickle.load(f)
intent_generator.save_intent_data("data/train_data/intent_train_data.json")

Training data saved to data/train_data/intent_train_data.json
Successfully saved categories: ['restaurant', 'facility', 'building', 'handbook', 'greeting']


### Construct a class for entity recognition

In [5]:
# 实体识别数据生成指令
entity_instruction='''You are an expert NER (Named Entity Recognition) data generation assistant for a campus chatbot.
Your task is to generate training data for entity recognition in Chinese queries.

**Your Task:**
Generate varied Chinese query examples that contain entities from the given categories and entity types.

**Output Format:**
Each generated item should be a JSON object with this exact structure:
{
    "query": "I want to go to A3 library tomorrow morning",
    "entities": [
        {"entity_text": "tomorrow morning", "entity_label": "time"},
        {"entity_text": "A3 library", "entity_label": "facility_name"}
    ]
}

**Entity Label Types:**
- business_name: business name (e.g. KK便利店, KIMS SALON)
- restaurant_name: restaurant name (e.g. 夏1城, 大树下)
- facility_name: facility name (e.g. library, gym, swimming pool)
- building_name: building name (e.g. B1 Activity Building, A Zone Teaching Building)
- handbook_topic: handbook topic (e.g. birthday cake recommendation, campus clinic)
- time: time expression (e.g. tomorrow morning, next Wednesday, tonight)
- location: location expression (e.g. on campus, below the dormitory)

**Instructions:**
1. Generate natural Chinese queries that students would ask
2. Include 1-3 entities per query
3. Entities must match the provided entity names exactly
4. Time and location entities can be flexible expressions
5. Output valid JSON only, no additional text
6. Each query should be realistic and contextually appropriate

**Example:**
{
    "query": ["What is the opening time of KIMS SALON tomorrow?", "What is the opening time of KIMS SALON tomorrow?"],
    "entities": [
        {"entity_text": "KIMS SALON", "entity_label": "business_name"},
        {"entity_text": "tomorrow", "entity_label": "time"}
    ]
}
'''

In [6]:
class entity_data_generator:
    def __init__(self, intent_lib_path=r"data\extracted_json\database.json"):
        self.data=json.load(open(intent_lib_path, "r", encoding="utf-8"))
        self.categories=list(self.data.keys())
        self.entity_responses={
            category: None for category in self.categories
        }
    
    def create_entity_prompt(self, category, num_query):
        # entity_label=self.entity_label_mapping[category]
        entity_label=category
        entity_names=[]
        for catagory in self.categories:
            for entity_item in list(self.data[catagory].values())[1:]:
                entity_names.extend(entity_item["possible_name"])
        
        message_set=[
            {"role": "system", "content": entity_instruction}
        ]
        
        # 为每个实体名称创建生成任务
        for entity_name in entity_names:
            prompt=f"""**Your Task:**
Generate {num_query} English query examples containing the entity "{entity_name}".

**Requirements:**
1. Each query must contain "{entity_name}" as an entity
2. Output format:
{{
  "query": ["query1", "query2", "query3"],
  "entities": [
    {{"entity_text": "entity_name", "entity_label": "entity_label"}}
  ]
}}
3. Output json snippet ONLY. Do not add any other text.

**Entity to include:**
- Entity: "{entity_name}"
- Label: "{entity_label}"
"""
            message_set.append({
                "role": "user",
                "content": prompt
            })
        
        return message_set
    
    def get_entity_response(self, category, num_query):
        message_set=self.create_entity_prompt(category, num_query)
        response_set=[]
        
        for msg in tqdm(message_set[1:], desc=f"Generating entity data for {category}"):

            response=ollama.chat(model=model, messages=[
                message_set[0], msg
            ])["message"]["content"]
            
            response=re.sub(
                pattern=r"```json\n|\n```|```",
                repl='',
                string=response
            )
            
            try:
                json.loads(response)
                response_set.append(response)
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON response for {category}, skipping this item")
                continue
            except Exception as e:
                print(f"Error generating response for {category}: {e}")
                continue
        
        self.entity_responses[category]=response_set
    
    def save_entity_data(self,file_path="data/train_data/entity_train_data.json"):
        
        with open(file_path,"w",encoding="utf-8") as f:
            f.write("{\n")
            valid_categories=[]
            
            
            for category in self.categories:
                if (self.entity_responses[category] is not None and 
                    len(self.entity_responses[category]) > 0):
                    valid_categories.append(category)
            
            for i, category in enumerate(valid_categories):
                f.write(f'"{category}":[\n')
                
                # Filter out valid JSON data items
                valid_items=[]
                for data_item in self.entity_responses[category]:
                    try:
                        # Verify JSON format
                        json.loads(data_item)
                        valid_items.append(data_item)
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON item in {category}")
                        continue
                
                # Write only thevalid data items
                for j, data_item in enumerate(valid_items):
                    f.write(data_item)
                    if j == len(valid_items) - 1:
                        f.write("\n")
                    else:
                        f.write(",\n")
                
                # Add comma between categories
                if i == len(valid_categories) - 1:
                    f.write("]\n")
                else:
                    f.write("],\n")
                    
            f.write("}")
            
        print(f"Data saved to {file_path}")
        print(f"Successfully saved categories: {valid_categories}")
        for category in self.categories:
            if category not in valid_categories:
                print(f"Warning: no valid data for {category}")


In [7]:
# Create entity data generator
entity_generator=entity_data_generator()

### Generate data for entity recognition

In [8]:
for c in tqdm(entity_generator.categories,desc="Generating data"):
    entity_generator.get_entity_response(c,4)
with open("data/train_data/entity_generator.pkl", "wb") as f:
    pickle.dump(entity_generator, f)

Generating data:   0%|          | 0/5 [00:00<?, ?it/s]

Generating entity data for restaurant:   0%|          | 0/371 [00:00<?, ?it/s]



Generating entity data for facility:   0%|          | 0/371 [00:00<?, ?it/s]



Generating entity data for building:   0%|          | 0/371 [00:00<?, ?it/s]



Generating entity data for handbook:   0%|          | 0/371 [00:00<?, ?it/s]



Generating entity data for greeting:   0%|          | 0/371 [00:00<?, ?it/s]



In [9]:
with open("data/train_data/entity_generator.pkl","rb") as f:
    entity_generator=pickle.load(f)
entity_generator.save_entity_data("data/train_data/entity_train_data.json")

Data saved to data/train_data/entity_train_data.json
Successfully saved categories: ['restaurant', 'facility', 'building', 'handbook', 'greeting']
