# Intent Curation

In [195]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install git+https://github.com/huggingface/transformers.git@main accelerate

In [70]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import random

In [22]:
### Select the language model
model_name = "codellama/CodeLlama-13b-Instruct-hf"

In [23]:
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                device_map='auto',
                                torch_dtype=torch.float16,
                                )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
print(model.config.max_position_embeddings)

16384


In [25]:
print(model.config)

LlamaConfig {
  "_name_or_path": "codellama/CodeLlama-13b-Instruct-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 13824,
  "max_position_embeddings": 16384,
  "model_type": "llama",
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "num_key_value_heads": 40,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.0.dev0",
  "use_cache": true,
  "vocab_size": 32016
}



In [190]:
def generate_response(full_prompt, temp):
   
    torch.cuda.empty_cache()

    tokens = tokenizer.encode(full_prompt, add_special_tokens=False)

    prompt_tokens = []

    dialog_tokens = [tokenizer(
        full_prompt,
        return_tensors="pt",
        add_special_tokens=True
    ).input_ids.to("cuda")]
        
    prompt_tokens.append(torch.cat(dialog_tokens, dim=-1))

    input_ids = prompt_tokens[0]

    generation_output = model.generate(
        input_ids=input_ids,
        do_sample=True,
        max_new_tokens= 1500,
        temperature=temp,
        eos_token_id=tokenizer.eos_token_id,
        top_p=0.9
    );

    new_tokens = generation_output[0][input_ids.shape[-1]:]
    new_assistant_response =  tokenizer.decode(new_tokens).strip();

    return new_assistant_response

In [191]:
def main_prompter(input_prompt):

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    DEFAULT_SYSTEM_PROMPT = 'You are a helpful assistant. Provide help in data points creation. Provide output in same format as provided in the input example'
    SYSTEM_PROMPT = DEFAULT_SYSTEM_PROMPT

    dialogs = [{"role": "system", "content": SYSTEM_PROMPT}]

    dialogs.append({"role": "user", "content": input_prompt})
    
    if dialogs[0]["role"] != "system":
        dialogs = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            }
        ] + dialogs

    full_prompt = f"{B_INST} {B_SYS}{(dialogs[0]['content']).strip()}{E_SYS}{(dialogs[1]['content']).strip()} {E_INST}"
    
    return full_prompt

In [121]:
intents = ['flight-cancellation', 'check-reservation', 'book-a-ticket','Shipping-Inquiry','Insurance Info','Medical Appointment Scheduling', 'Restaurant Reservation', 'Job Application Status', 'Product Warranty Information', 'Bank Account Balance', 'Hotel Room Availability', 'Weather Forecast', 'Car Rental Reservation', 'Educational Course Enrollment', 'Technical Support']

In [122]:
print(len(intents))

15


In [134]:
instruction = "Generate 35 new, different and diverse intents. Examples of various intents are given in the below list\n " + str(intents) + "\n All intents must be unique.\n Each intent should consists of minimum 2 and maximum 3 words only.\nDont generate same topics which are mentioned in the examples.\nProvide list of new intents in pyhton list format "
print(instruction)
full_prompt = main_prompter(input_prompt = instruction)
response = generate_response(full_prompt, temp = 1.2)
response

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generate 35 new, different and diverse intents. Examples of various intents are given in the below list
 ['flight-cancellation', 'check-reservation', 'book-a-ticket', 'Shipping-Inquiry', 'Insurance Info', 'Medical Appointment Scheduling', 'Restaurant Reservation', 'Job Application Status', 'Product Warranty Information', 'Bank Account Balance', 'Hotel Room Availability', 'Weather Forecast', 'Car Rental Reservation', 'Educational Course Enrollment', 'Technical Support']
 All intents must be unique.
 Each intent should consists of minimum 2 and maximum 3 words only.
Dont generate same topics which are mentioned in the examples.
Provide list of new intents in pyhton list format 
Entered main_prompter ...
Entered the generator function


"Sure, here are 35 new, different and diverse intents:\n\n['Travel Agent Services', 'Tourism Packages', 'Motorcycle Rental', 'Golf Course Reservations', 'Ski Resort Bookings', 'Luxury Car Rentals', 'Yacht Charter', 'Private Pilot Lessons', 'Airplane Charter', 'Budget Travel Packages', 'All-Inclusive Vacations', 'Mountain Biking Rentals', 'Skydiving Excursions', 'Helicopter Rides', 'Cruise Ship Reservations', 'Food and Wine Tours', 'Vineyard Tours', 'Private Beach Club Rentals', 'Fishing Charters', 'Private Island Resorts', 'Wine Tastings', 'Cultural Tours', 'Adventure Parks', 'Camping Trips', 'Nature Photography Trips', 'Ziplining Adventures', 'Motorcycle Excursions', 'Private Boat Charters', 'Private Jet Reservations', 'Airport Taxis', 'Sightseeing Tours', 'Personalized Travel Plans']</s>"

In [139]:
new_intents = response.replace(']</s>',"").split('[')[-1].split(',')
new_intents = [intent.strip().replace(' ','-').replace('\n]','')[1:-1] for intent in new_intents]
new_intents

['Travel-Agent-Services',
 'Tourism-Packages',
 'Motorcycle-Rental',
 'Golf-Course-Reservations',
 'Ski-Resort-Bookings',
 'Luxury-Car-Rentals',
 'Yacht-Charter',
 'Private-Pilot-Lessons',
 'Airplane-Charter',
 'Budget-Travel-Packages',
 'All-Inclusive-Vacations',
 'Mountain-Biking-Rentals',
 'Skydiving-Excursions',
 'Helicopter-Rides',
 'Cruise-Ship-Reservations',
 'Food-and-Wine-Tours',
 'Vineyard-Tours',
 'Private-Beach-Club-Rentals',
 'Fishing-Charters',
 'Private-Island-Resorts',
 'Wine-Tastings',
 'Cultural-Tours',
 'Adventure-Parks',
 'Camping-Trips',
 'Nature-Photography-Trips',
 'Ziplining-Adventures',
 'Motorcycle-Excursions',
 'Private-Boat-Charters',
 'Private-Jet-Reservations',
 'Airport-Taxis',
 'Sightseeing-Tours',
 'Personalized-Travel-Plans']

In [140]:
len(new_intents)

32

In [141]:
intents.extend(new_intents)
len(intents)

47

In [142]:
intents

['flight-cancellation',
 'check-reservation',
 'book-a-ticket',
 'Shipping-Inquiry',
 'Insurance Info',
 'Medical Appointment Scheduling',
 'Restaurant Reservation',
 'Job Application Status',
 'Product Warranty Information',
 'Bank Account Balance',
 'Hotel Room Availability',
 'Weather Forecast',
 'Car Rental Reservation',
 'Educational Course Enrollment',
 'Technical Support',
 'Travel-Agent-Services',
 'Tourism-Packages',
 'Motorcycle-Rental',
 'Golf-Course-Reservations',
 'Ski-Resort-Bookings',
 'Luxury-Car-Rentals',
 'Yacht-Charter',
 'Private-Pilot-Lessons',
 'Airplane-Charter',
 'Budget-Travel-Packages',
 'All-Inclusive-Vacations',
 'Mountain-Biking-Rentals',
 'Skydiving-Excursions',
 'Helicopter-Rides',
 'Cruise-Ship-Reservations',
 'Food-and-Wine-Tours',
 'Vineyard-Tours',
 'Private-Beach-Club-Rentals',
 'Fishing-Charters',
 'Private-Island-Resorts',
 'Wine-Tastings',
 'Cultural-Tours',
 'Adventure-Parks',
 'Camping-Trips',
 'Nature-Photography-Trips',
 'Ziplining-Adventures'

In [146]:
instruction = "Generate 25 new, different and diverse intents. Examples of various intents are given in the below list\n " + str(random.sample(intents, 10)) + "\n All intents must be unique.\n Each intent should consists of minimum 2 and maximum 3 words only.\nDont generate same topics which are mentioned in the examples.\nProvide list of new intents in pyhton list format "
print(instruction)
full_prompt = main_prompter(input_prompt = instruction)
response = generate_response(full_prompt, temp = 1.2)
response

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generate 25 new, different and diverse intents. Examples of various intents are given in the below list
 ['Vineyard-Tours', 'Product Warranty Information', 'Airplane-Charter', 'Budget-Travel-Packages', 'Bank Account Balance', 'book-a-ticket', 'Medical Appointment Scheduling', 'Mountain-Biking-Rentals', 'Yacht-Charter', 'Ski-Resort-Bookings']
 All intents must be unique.
 Each intent should consists of minimum 2 and maximum 3 words only.
Dont generate same topics which are mentioned in the examples.
Provide list of new intents in pyhton list format 
Entered main_prompter ...
Entered the generator function


"Here are 25 new intents that are different and diverse from the ones provided as examples:\n\n['Trip-Organization', 'Pet-Sitting-Services', 'Hotel-Room-Cleaning', 'Online-Tutoring', 'Teacher-Employment', 'Dental-Care-Plans', 'Household-Organization', 'Event-Planning-Services', 'Housing-Loan-Counseling', 'Food-Bank-Donation', 'Small-Business-Loans', 'Personal-Fitness-Training', 'Mental-Health-Support', 'Art-Tutoring', 'Transportation-Services', 'Home-Repair-Services', 'Medical-Billing-and-Coding', 'Child-Care-Services', 'Online-Translation-Services', 'Career-Coaching', 'Dietary-Supplement-Sales', 'Import-Export-Services', 'Personal-Financial-Planning', 'Travel-Insurance']</s>"

In [147]:
new_intents = response.replace(']</s>',"").split('[')[-1].split(',')
new_intents = [intent.strip().replace(' ','-').replace('\n]','')[1:-1] for intent in new_intents]
new_intents

['Trip-Organization',
 'Pet-Sitting-Services',
 'Hotel-Room-Cleaning',
 'Online-Tutoring',
 'Teacher-Employment',
 'Dental-Care-Plans',
 'Household-Organization',
 'Event-Planning-Services',
 'Housing-Loan-Counseling',
 'Food-Bank-Donation',
 'Small-Business-Loans',
 'Personal-Fitness-Training',
 'Mental-Health-Support',
 'Art-Tutoring',
 'Transportation-Services',
 'Home-Repair-Services',
 'Medical-Billing-and-Coding',
 'Child-Care-Services',
 'Online-Translation-Services',
 'Career-Coaching',
 'Dietary-Supplement-Sales',
 'Import-Export-Services',
 'Personal-Financial-Planning',
 'Travel-Insurance']

In [148]:
intents.extend(new_intents)
len(intents)

71

In [153]:
instruction = "Generate 30 new, different and diverse intents. Examples of various intents are given in the below list\n " + str(random.sample(intents, 10)) + "\n All intents must be unique.\n Each intent should consists of minimum 2 and maximum 3 words only.\nDont generate same topics which are mentioned in the examples.\nProvide list of new intents in pyhton list format "
print(instruction)
full_prompt = main_prompter(input_prompt = instruction)
response = generate_response(full_prompt, temp = 1.2)
response

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generate 30 new, different and diverse intents. Examples of various intents are given in the below list
 ['Cultural-Tours', 'Sightseeing-Tours', 'Household-Organization', 'Food-Bank-Donation', 'Trip-Organization', 'Dietary-Supplement-Sales', 'Dental-Care-Plans', 'Medical-Billing-and-Coding', 'Mental-Health-Support', 'All-Inclusive-Vacations']
 All intents must be unique.
 Each intent should consists of minimum 2 and maximum 3 words only.
Dont generate same topics which are mentioned in the examples.
Provide list of new intents in pyhton list format 
Entered main_prompter ...
Entered the generator function


"Here is a list of 30 new intents that are not included in the previous examples:\n\n1. 'Furniture-Assembly'\n2. 'Meal-Planning'\n3. 'Outdoor-Recreation'\n4. 'Dietary-Restrictions'\n5. 'Pet-Care'\n6. 'Health-and-Wellness'\n7. 'Covid-19-Support'\n8. 'Home-Cleaning'\n9. 'Gardening'\n10. 'Aromatherapy'\n11. 'Organic-Gardening'\n12. 'Sustainable-Living'\n13. 'Self-Care-Routines'\n14. 'Mental-Health-Assessments'\n15. 'Nutrition-Counseling'\n16. 'Travel-Insurance'\n17. 'Health-and-Safety-Tips'\n18. 'Preparedness-Guides'\n19. 'Crisis-Communication'\n20. 'Mental-Health-First-Aid'\n21. 'Disaster-Relief'\n22. 'Community-Building'\n23. 'Gratitude-Journals'\n24. 'Goal-Setting'\n25. 'Mindfulness-Exercises'\n26. 'Natural-Remedies'\n27. 'Personal-Finance-Planning'\n28. 'Healthy-Eating-Recipes'\n29. 'Stress-Management'\n30. 'Emotional-Intelligence-Development'</s>"

In [158]:
new_intents = response.replace('</s>',"").split('\n')[2:]
new_intents = [intent.split(' ')[-1][1:-1] for intent in new_intents]
new_intents

['Furniture-Assembly',
 'Meal-Planning',
 'Outdoor-Recreation',
 'Dietary-Restrictions',
 'Pet-Care',
 'Health-and-Wellness',
 'Covid-19-Support',
 'Home-Cleaning',
 'Gardening',
 'Aromatherapy',
 'Organic-Gardening',
 'Sustainable-Living',
 'Self-Care-Routines',
 'Mental-Health-Assessments',
 'Nutrition-Counseling',
 'Travel-Insurance',
 'Health-and-Safety-Tips',
 'Preparedness-Guides',
 'Crisis-Communication',
 'Mental-Health-First-Aid',
 'Disaster-Relief',
 'Community-Building',
 'Gratitude-Journals',
 'Goal-Setting',
 'Mindfulness-Exercises',
 'Natural-Remedies',
 'Personal-Finance-Planning',
 'Healthy-Eating-Recipes',
 'Stress-Management',
 'Emotional-Intelligence-Development']

In [159]:
intents.extend(new_intents)
len(intents)

101

In [160]:
intents = list(set(intents))
len(intents)

100

In [161]:
intents

['Private-Pilot-Lessons',
 'Tourism-Packages',
 'Vineyard-Tours',
 'Motorcycle-Excursions',
 'Online-Tutoring',
 'Food-and-Wine-Tours',
 'Product Warranty Information',
 'Budget-Travel-Packages',
 'Wine-Tastings',
 'Food-Bank-Donation',
 'Home-Cleaning',
 'Hotel Room Availability',
 'Gratitude-Journals',
 'Medical-Billing-and-Coding',
 'Bank Account Balance',
 'Sustainable-Living',
 'Health-and-Safety-Tips',
 'Housing-Loan-Counseling',
 'Trip-Organization',
 'Transportation-Services',
 'Online-Translation-Services',
 'Pet-Sitting-Services',
 'Mindfulness-Exercises',
 'Luxury-Car-Rentals',
 'Airplane-Charter',
 'Small-Business-Loans',
 'Art-Tutoring',
 'Cruise-Ship-Reservations',
 'Dietary-Restrictions',
 'Travel-Agent-Services',
 'Ski-Resort-Bookings',
 'Medical Appointment Scheduling',
 'Shipping-Inquiry',
 'Dietary-Supplement-Sales',
 'Import-Export-Services',
 'Health-and-Wellness',
 'Pet-Care',
 'Nutrition-Counseling',
 'Job Application Status',
 'Natural-Remedies',
 'Personal-Fina

# Example Curation

In [266]:
import json
import ast

In [278]:
seed_data = [{
  "name": "book-a-ticket",
  "examples": [
    "I want to book a ticket from Hyderabad to Chennai",
    "I'd like to book a plane ticket for a flight on August 25th",
    "Can you help me reserve a seat on a flight from New York to Los Angeles?",
    "I'm interested in flying to Paris from London on the 10th of September.",
    "I want to travel to Tokyo and I need a ticket for a flight next week.",
    "Could you assist me in booking a flight from Chicago to Miami on the 15th of October?",
    "I'm looking for a round-trip ticket from San Francisco to Seattle for the weekend of November 5th.",
    "I need to get to Dubai from Mumbai, preferably on a direct flight.",
    "Can you check for available flights from Toronto to Vancouver for the 20th of December?",
    "I'm planning a trip to Rome, and I'd like to book a business class ticket with extra legroom.",
    "I want to fly to London, departing on the 8th of January and returning on the 20th."
  ]
},
{
  "name": "check-reservation",
  "examples": [
    "What's the reservation status for booking ID ABC123?",
    "Can you provide me with the current boarding status for my flight?",
    "I'd like to know if my reservation for flight XYZ456 is confirmed.",
    "What's the seat number allocated for my reservation with confirmation number DEF789?",
    "Has there been any seat upgrade for my booking on flight LMN567?",
    "Can you check the reservation status for the email address john.doe@email.com?",
    "I want to inquire about the boarding status of my flight with booking reference GHI234.",
    "Please tell me the current status of my reservation for flight JKL890.",
    "Has there been any change in the seat assignment for my booking on flight MNO123?",
    "What's the reservation status for my ticket on the flight with departure code PQR456?"
  ]
}
            ]


In [None]:
counter = 0
invalid = 0

for intent in intents:
    
    if intent in ['book-a-ticket','check-reservation','flight-cancellation']:
        continue

    example = random.choice(seed_data)
    
    one_shot_prompt = f"""
    See below dictiona data point which has two keys 'name' and 'examples'. 'name' key represents the intent.
    'examples' key contains examples related to this intent.
    Below is a sample data point for intent {example["name"]} and corresponding examples.
    
    {example}
    
    Create a data point following the exact same format for intent {intent} and always use "" for strings.
    Do not use placeholder for entities.
    """
    full_prompt = main_prompter(input_prompt = one_shot_prompt)
    response = generate_response(full_prompt, temp = 1.2)
    new_data_point = '{' + response[response.find("{")+1:response.find("}")+1]
    
    try:
        new_data_point = ast.literal_eval(new_data_point)
        seed_data.append(new_data_point)
        counter += 1
        
    except Exception as e:
        invalid += 1
        
    if counter % 5 == 0: print(f'{counter} data points generated ')
    

In [280]:
len(seed_data)

94

{'name': 'book-a-ticket', 'examples': ['I want to book a ticket from Hyderabad to Chennai', "I'd like to book a plane ticket for a flight on August 25th", 'Can you help me reserve a seat on a flight from New York to Los Angeles?', "I'm interested in flying to Paris from London on the 10th of September.", 'I want to travel to Tokyo and I need a ticket for a flight next week.', 'Could you assist me in booking a flight from Chicago to Miami on the 15th of October?', "I'm looking for a round-trip ticket from San Francisco to Seattle for the weekend of November 5th.", 'I need to get to Dubai from Mumbai, preferably on a direct flight.', 'Can you check for available flights from Toronto to Vancouver for the 20th of December?', "I'm planning a trip to Rome, and I'd like to book a business class ticket with extra legroom.", 'I want to fly to London, departing on the 8th of January and returning on the 20th.']}
---
{'name': 'check-reservation', 'examples': ["What's the reservation status for bo

In [282]:
with open("intents.json", "w") as file:
    json.dump(seed_data, file)
