In [19]:
import os
import re
import json
from tqdm.auto import tqdm
import pickle

from openai import OpenAI
api_key=json.load(open("api.json"))[0]
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

import ollama
model="llama3.1:8b"

In [20]:
instruction='''You are an expert NLU data generation assistant for a campus chatbot.
Your task is to generate {num_query} varied data items for given entity and intent.
**Instructions:**
1. You should generate {num_query} possible queries corresponding to the given entity and intent.
2. The output should be a vaild json snippet like {example}
3. The {num_query} queries should be included in a list.
in json file that is "query":[{num_query} generated] 
4. Each possible query must contain one or some of the given entities.
5. Each possible query must contain one or some of the given intents.
6. You cannot add new eneities in the generated query! 
For example:
"entity":{"gym":"facility_name"}, "generated_query":"When does the gym open on weekends"
then a NEW entity "weekend":"time" is added.
7. Output json snippet ONLY. 

**Exapmle:**
{
  "query": ["where is Nasi Kandar restaurant?", "how can i go to Nasi Kandar restaurant?"],
  "intent": "find_location",
  "entities": [
    {"Nasi Kandar restaurant":"reataurant_name"} 
  ]
}
'''

In [21]:
class NLU_data_generator:
    def __init__(self,intent_lib_path=r"data\IE\intent_lib.json"):
        self.data=json.load(open(intent_lib_path,"r",encoding="utf-8"))
        self.categories=list(self.data.keys())
        self.responses={
          category:None for category in self.categories
        }
    
    def create_prompt(self,category,num_query):
        message_set=[
          {"role":"system","content":instruction}
          ]
        for intent in self.data[category][1:]:
          for entity_name in self.data[category][0]:
            prompt=f"""**Your Task:**
            Generate {num_query} new and unique query examples for the intent '{intent}' containing entity {entity_name}:{category}.
            """
            message_set.append({
              "role":"user",
              "content":prompt
            })
        return message_set
    
    def get_response(self,category,num_query):
      message_set=self.create_prompt(category,num_query)
      response_set=[]
      for msg in tqdm(message_set[1:],desc=f"Generating data for {category}"):
        # response=client.chat.completions.create(
        #     model="deepseek-chat",
        #     messages=[
        #       message_set[0],msg
        #     ],
        #     stream=False
        # ).choices[0].message.content
        response=ollama.chat(model=model, messages=[
              message_set[0],msg
            ])["message"]["content"]

        response=re.sub(
          pattern=r"```json\n|\n```",
          repl='',
          string=response
        )
        response_set.append(response)
      self.responses[category]=response_set

    def save_data(self,file_path):
        with open(file_path,"a",encoding="utf-8") as f:
            f.write("{\n")
            for category in self.categories:
                f.write(f'"{category}":[\n')
                for data_item in self.responses[category]:
                    f.write(data_item)
                    f.write(",\n")
                f.write("],\n")
            f.write("}")


In [22]:
generator=NLU_data_generator()

In [5]:
for c in tqdm(generator.categories,desc="Generating data"):
    generator.get_response(c,5)
with open("py_code/data/generator.pkl", "wb") as f:
    pickle.dump(generator, f)

Generating data:   0%|          | 0/5 [00:00<?, ?it/s]

Generating data for business:   0%|          | 0/39 [00:00<?, ?it/s]

Generating data for restaurant:   0%|          | 0/272 [00:00<?, ?it/s]

Generating data for facility:   0%|          | 0/60 [00:00<?, ?it/s]

Generating data for building:   0%|          | 0/10 [00:00<?, ?it/s]

Generating data for handbook:   0%|          | 0/35 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'py_code/data/generator.pkl'

In [7]:
with open("data/generator.pkl", "wb") as f:
    pickle.dump(generator, f)

In [23]:
import json
with open("data/generator.pkl","rb") as f:
    generator=pickle.load(f)

generator.save_data("data/generator.json")