In [6]:
from functools import wraps
from langchain.prompts import PromptTemplate
from langchain.schema import LLMResult
from langchain_aws import ChatBedrock
from numpy.random import choice
from tqdm import tqdm
import time

In [7]:
def retry(max_retries=3, retry_delay=5):
    """Decorator to retry a function or staticmethod if it raises an exception.

    :param max_retries: The maximum number of attempts to retry.
    :param retry_delay: The delay in seconds between retries.
    :return: A decorator that wraps the function or classmethod.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    retries += 1
                    print(f"Retrying {func.__name__} due to exception: {e}")
                    time.sleep(retry_delay)
            else:
                raise Exception(f"Maximum retries ({max_retries}) exceeded for {func.__name__}")

        return wrapper

    return decorator


@retry(max_retries=3, retry_delay=5)
def get_claude_response(
    prompt_string,
    prompt_params,
    model_id="anthropic.claude-3-haiku-20240307-v1:0",
    model_kwargs={
        "max_tokens": 200,
        "temperature": 1.0,
        # "top_k": 1,
        "top_p": 1,
        "stop_sequences": ['User:', '</assistant>'],
    },
):
    model = ChatBedrock(
        region_name="us-east-1",
        model_id=model_id,
        model_kwargs=model_kwargs
    )
    prompt = PromptTemplate.from_template(prompt_string)
    chain = prompt | model  # | SimpleJsonOutputParser() # LCEL
    return chain.invoke(prompt_params)

In [8]:
description = "You are a helpful hotel assistant, your job is to help users in whatever queries they may have."

intent_list = {
    "book_room": "The user wants to book a room in the hotel",
    "cancel_booking": "The user wants to cancel an existing booking",
    "general_enquiries": "The user wants to ask general questions about the hotel"
}

slots_to_fill = {
    "dateFrom", ('book_room'),
    "dateTo", ('book_to'),
    "bookingID", ("cancel_booking")
}

action_slot_pair = {
    "makeBooking": ("dateFrom", "dateTo"),
    "lookUpBooking": ("bookingID"),
    "cancellation": ("bookingID")
}


# description = "You are a helpful restaurant assistant. Your job is to help users in whatever queries they may have."

# # Intents the restaurant chatbot can handle
# intent_list = {
#     "book_table": "The user wants to book a table at the restaurant",
#     "cancel_reservation": "The user wants to cancel an existing reservation",
#     # "menu_enquiry": "The user wants information about the menu or specials",
#     # "opening_hours": "The user wants to know the restaurant's opening hours",
#     "order_takeout": "The user wants to place a takeout order",
#     "chit_chat": "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be 'Sorry, I can only help you with restaurant queries.'"
# }

# # Slots to be filled by the chatbot for specific intents
# slots_to_fill = {
#     "date": ("book_table"),
#     "time": ("book_table"),
#     "party_size": ("book_table"),
#     "reservationID": ("cancel_reservation"),
#     "order_items": ("order_takeout"),
#     "pickup_time": ("order_takeout")
# }

# # Mapping actions to required slots
# action_slot_pair = {
#     "makeReservation": ("date", "time", "party_size"),
#     "lookUpReservation": ("reservationID"),
#     "cancelReservation": ("reservationID"),
#     "placeTakeoutOrder": ("order_items", "pickup_time")
# }
# current_intent = "book_room"
# conv_history = []

In [9]:
from prompts import CONV_PROMPT
current_intent = list(intent_list.keys())[0]
conv_history = []
conv_number = 0

example = get_claude_response(prompt_string=CONV_PROMPT,
                              prompt_params={
                                  'intent_list': str(intent_list),
                                  'slot_list': str(slots_to_fill),
                                  'action_slot_pair': str(action_slot_pair),
                                  'current_intent': current_intent,
                                  "conv_history": conv_history,
                                  'conv_number': conv_number
                              },
                              model_id='anthropic.claude-3-5-sonnet-20240620-v1:0')

In [10]:
example.content

'{\n  "user_utterance": "Hi there, I\'d like to make a reservation for dinner tonight if you have any tables available.",\n  "predicted_system_response": "Certainly! I\'d be happy to help you book a table for dinner tonight. Could you please tell me how many people will be in your party?",\n  "goal_completed": false,\n  "task_success": false,\n  "dialogue_acts": {\n    "intent": "book_table",\n    "action": ""\n  },\n  "belief_state": {},\n  "dialog_turn": 1\n}'

In [11]:
import json

json.loads(example.content.strip())

{'user_utterance': "Hi there, I'd like to make a reservation for dinner tonight if you have any tables available.",
 'predicted_system_response': "Certainly! I'd be happy to help you book a table for dinner tonight. Could you please tell me how many people will be in your party?",
 'goal_completed': False,
 'task_success': False,
 'dialogue_acts': {'intent': 'book_table', 'action': ''},
 'belief_state': {},
 'dialog_turn': 1}

In [5]:
from GRPOTOD.grpotod import GRPOTODAgent

agent = GRPOTODAgent(model_path="init_ft_000", lora_on_top=True, 
                     intent_list=intent_list,action_list=action_slot_pair,
                     slot_list=slots_to_fill)

LOADING BASE MODEL FIRST
==((====))==  Unsloth 2025.7.5: Fast Llama patching. Transformers: 4.53.3. vLLM: 0.9.2.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.975 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit with actual GPU utilization = 69.08%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 21.98 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 4096. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 9.0 GB. Also swap space = 6 GB.
INFO 08-30 15:10:39 [config.py:841] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 08-30 15:10:39 [config.py:

2025-08-30 15:10:40,243 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 08-30 15:10:40 [parallel_state.py:1076] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 08-30 15:10:40 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
INFO 08-30 15:10:40 [gpu_model_runner.py:1770] Starting to load model unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit...
INFO 08-30 15:10:41 [gpu_model_runner.py:1775] Loading model from scratch...
INFO 08-30 15:10:41 [cuda.py:284] Using Flash Attention backend on V1 engine.
INFO 08-30 15:10:41 [bitsandbytes_loader.py:499] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 08-30 15:10:41 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 08-30 15:10:41 [weight_utils.py:345] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-30 15:10:42 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 08-30 15:10:43 [gpu_model_runner.py:1801] Model loading took 5.7739 GiB and 1.666801 seconds
INFO 08-30 15:10:55 [backends.py:508] Using cache directory: /home/sagemaker-user/.cache/vllm/torch_compile_cache/b9c803d01e/rank_0_0/backbone for vLLM's torch.compile
INFO 08-30 15:10:55 [backends.py:519] Dynamo bytecode transform time: 11.42 s
INFO 08-30 15:11:03 [backends.py:155] Directly load the compiled graph(s) for shape None from the cache, took 7.110 s
INFO 08-30 15:11:06 [monitor.py:34] torch.compile takes 11.42 s in total


2025-08-30 15:11:06,806 - INFO - flashinfer.jit: Loading JIT ops: sampling
2025-08-30 15:11:06,829 - INFO - flashinfer.jit: Finished loading JIT ops: sampling


INFO 08-30 15:11:07 [gpu_worker.py:232] Available KV cache memory: 8.71 GiB
INFO 08-30 15:11:08 [kv_cache_utils.py:716] GPU KV cache size: 71,312 tokens
INFO 08-30 15:11:08 [kv_cache_utils.py:720] Maximum concurrency for 4,096 tokens per request: 17.41x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:56<00:00,  1.18it/s]

INFO 08-30 15:12:05 [gpu_model_runner.py:2326] Graph capturing finished in 57 secs, took 1.44 GiB





INFO 08-30 15:12:05 [core.py:172] init engine (profile, create kv cache, warmup model) took 81.95 seconds
Unsloth: Just some info: will skip parsing ['pre_feedforward_layernorm', 'post_feedforward_layernorm', 'k_norm', 'q_norm']
Unsloth: Just some info: will skip parsing ['pre_feedforward_layernorm', 'post_feedforward_layernorm', 'k_norm', 'q_norm']


In [12]:
user_input = "What is life mehn"
response = agent.response(user_input)

In [13]:
response

({'system_response': "Sorry, I can only help you with restaurant queries. Is there anything specific about our restaurant services that you'd like to know?",
  'dialogue_acts': {'intent': 'chit_chat', 'action': ''},
  'belief_state': {}},
 "Sorry, I can only help you with restaurant queries. Is there anything specific about our restaurant services that you'd like to know?")

In [14]:
from GRPOTOD.evaluator import TODEvaluator

evaluator = TODEvaluator()

user_turn = {
    "user_utterance": "Hi, I'd like to book a room at your hotel please.",
    "predicted_system_response": "Certainly! I'd be happy to help you book a room. Could you please provide me with your desired check-in and check-out dates?",
    "goal_completed": False,
    "task_success": False,
    "dialogue_acts": {"intent": "", "action": ""},
    "belief_state": {},
    "task_complete": False,
    "dialog_turn": 1
}

system_turn = {
    "system_response": "I'd be happy to help you book a room. Could you please provide me with your desired check-in and check-out dates?",
    "dialogue_acts": {"intent": "book_room", "action": ""},
    "belief_state": {}
}


# Add the turn with reference for BLEU
evaluator.add_turn(1, user_turn, system_turn)

# Compute metrics including BLEU
metrics = evaluator.compute_metrics()
print(metrics)

{'average_turns': 1, 'goals_completed_rate': 0.0, 'tasks_successful_rate': 0.0, 'inform_precision': 0.0, 'inform_recall': 0.0, 'inform_f1': 0.0, 'slot_value_accuracy': 0.0, 'joint_goal_accuracy': 1.0, 'bleu': 0.0}


In [15]:
evaluator.diagnostics

{1: [{'task_completed': False,
   'dialog_turn': 1,
   'user_utterance': "Hi, I'd like to book a room at your hotel please.",
   'system_response': "I'd be happy to help you book a room. Could you please provide me with your desired check-in and check-out dates?",
   'reference_response': None,
   'gold_belief_state': {},
   'pred_belief_state': {},
   'inform_precision': 0.0,
   'inform_recall': 0.0,
   'inform_f1': 0.0,
   'slot_value_accuracy': 0.0,
   'joint_goal_accuracy': 1,
   'bleu': 0.0}]}

In [17]:
from prompts import CONV_PROMPT, FINETUNE_PROMPT
from GRPOTOD.evaluator import TODEvaluator
from GRPOTOD.utils import safe_parse_json_or_python
num_conversations = 10
evaluator = TODEvaluator()
count = 0

for i in range(num_conversations):
    count += 1
    print(f"ROUND {i} of Conversations")
    
    for intent in list(intent_list.keys()):
        end_convo = False
        current_intent = intent
        conv_history = []
        conv_number = 0
        error = False
        agent.init_session()

        print(f"START OF CONVERSATION: {current_intent}_{count}")
        while not end_convo:
            conv_number += 1
            try:
                user_response = safe_parse_json_or_python(get_claude_response(prompt_string=CONV_PROMPT,
                                              prompt_params={
                                                  'intent_list': str(intent_list),
                                                  'slot_list': str(slots_to_fill),
                                                  'current_intent': current_intent,
                                                  'action_slot_pair': str(action_slot_pair),
                                                  "conv_history": conv_history,
                                                  'conv_number': conv_number
                                              },
                                             model_kwargs={
                                                "max_tokens": 512,
                                                "temperature": 1,
                                                # "top_k": 1,
                                                "top_p": 1,
                                                "stop_sequences": ['User:', '</assistant>'],
                                            },
                                            model_id='anthropic.claude-3-5-sonnet-20240620-v1:0').content)
    
                user_utterance = user_response['user_utterance']
                da_agent, agent_response = agent.response(user_response)
                # print(agent_response)
                print(f"USER: {user_response}")
                print(f"AGENT: {agent_response}")
    
                conv_history.append(f"USER: {user_utterance}")
                conv_history.append(f"AGENT: {da_agent}")
                evaluator.add_turn(f"{current_intent}_{count}", user_response, da_agent)
    
                end_convo = user_response['goal_completed']
    
                if user_response['dialog_turn'] > 15:
                    print("Conversation has passed 15 turns, moving on")
                    end_convo = True
    
            except Exception as err:
                if error:
                    raise err
                else:
                    continue

ROUND 0 of Conversations
START OF CONVERSATION: book_table_1
USER: {'user_utterance': "Hi there! I'd like to make a reservation for dinner tonight if possible.", 'predicted_system_response': "Certainly! I'd be happy to help you book a table for tonight. Could you please let me know how many people will be in your party?", 'goal_completed': False, 'task_success': False, 'dialogue_acts': {'intent': 'book_table', 'action': ''}, 'belief_state': {'date': 'tonight'}, 'dialog_turn': 1}
AGENT: Thank you for providing that information. I can check availability for a table for 4 people tonight. Would you like me to proceed with the booking?
USER: {'user_utterance': 'Yes, please go ahead with the booking for tonight. Could we make it for 7:30 PM?', 'predicted_system_response': 'Certainly! I can book a table for 4 people tonight at 7:30 PM. May I know the date for your reservation?', 'goal_completed': False, 'task_success': False, 'dialogue_acts': {'intent': 'book_table', 'action': ''}, 'belief_st

ERROR:root:Error raised by bedrock service: An error occurred (ServiceUnavailableException) when calling the InvokeModel operation (reached max retries: 4): Bedrock is unable to process your request.


Retrying get_claude_response due to exception: An error occurred (ServiceUnavailableException) when calling the InvokeModel operation (reached max retries: 4): Bedrock is unable to process your request.
USER: {'user_utterance': "Hey, I've got a hankering for some fancy grub tonight. You guys got any spots open for a solo diner?", 'predicted_system_response': "Certainly! I'd be happy to check our availability for tonight for a solo diner. We have a table available at 8:30 PM. Would you like me to go ahead and book that for you?", 'goal_completed': False, 'task_success': False, 'dialogue_acts': {'intent': 'book_table', 'action': ''}, 'belief_state': {'party_size': '1', 'date': 'tonight'}, 'dialog_turn': 1}
AGENT: Certainly! I'd be happy to check our availability for tonight for a solo diner. We have a table available at 8:30 PM. Would you like me to go ahead and book that for you?
USER: {'user_utterance': "Hey there! I'm throwing a surprise party for my best friend's 30th birthday next m

ERROR:root:Error raised by bedrock service: An error occurred (ServiceUnavailableException) when calling the InvokeModel operation (reached max retries: 4): Bedrock is unable to process your request.


Retrying get_claude_response due to exception: An error occurred (ServiceUnavailableException) when calling the InvokeModel operation (reached max retries: 4): Bedrock is unable to process your request.
USER: {'user_utterance': "Hello there! I'm in town for a conference and I've heard your restaurant is a must-try. Any chance you have a table for one available tomorrow evening?", 'predicted_system_response': "Certainly! I'd be happy to check our availability for tomorrow evening for a table for one. We have a table available at 7:30 PM. Would you like me to go ahead and book that for you?", 'goal_completed': False, 'task_success': False, 'dialogue_acts': {'intent': 'book_table', 'action': ''}, 'belief_state': {'party_size': '1', 'date': 'tomorrow'}, 'dialog_turn': 1}
AGENT: Certainly! I'd be happy to help you book a table for one tomorrow evening. We have a table available at 7:30 PM. Would you like me to go ahead and book that for you?
USER: {'user_utterance': "Hey there! I've been dr

KeyboardInterrupt: 

In [18]:
evaluator.compute_metrics()

{'average_turns': 658,
 'goals_completed_rate': 0.0,
 'tasks_successful_rate': 1.0,
 'inform_precision': 0.9218422888979635,
 'inform_recall': 0.904794520541748,
 'inform_f1': 0.9132388473964728,
 'slot_value_accuracy': 0.8645287958115183,
 'joint_goal_accuracy': 0.7537993920972644,
 'bleu': 0.0}

In [18]:
evaluator.diagnostics["book_room_1"][4]

{'dialog_turn': 5,
 'user_utterance': "No, that's all I need. Thanks for your help!",
 'system_response': "You're welcome! I'm glad I could assist you with your booking. If you need any further information, please don't hesitate to ask. Have a great day!",
 'gold_belief_state': {'dateFrom': 'July 15th',
  'dateTo': 'July 20th',
  'room_type': 'standard double room',
  'bookingID': '<bookingID>'},
 'pred_belief_state': {'dateFrom': 'July 15th',
  'dateTo': 'July 20th',
  'room_type': 'standard double room',
  'bookingID': '<bookingID>'},
 'inform_precision': 0.9999999975,
 'inform_recall': 0.9999999975,
 'inform_f1': 0.9999999925,
 'slot_value_accuracy': 1.0,
 'joint_goal_accuracy': 1}

In [12]:
conv_history

["USER: Hi there, I'd like to book a room at your hotel please.",
 "AGENT: Certainly! I'd be happy to help you book a room. Could you please provide me with your preferred check-in and check-out dates?",
 "USER: I'd like to check in on July 15th and check out on July 20th.",
 "AGENT: Thank you for providing those dates. I'll book a room for you from <dateFrom> to <dateTo>. What type of room would you prefer?",
 "USER: I'd like a standard double room, please.",
 "AGENT: Certainly. I've booked a standard double room for you from <dateFrom> to <dateTo>. Is there anything else you need assistance with?",
 "USER: That's perfect, thank you. Could you please confirm my booking details one last time?",
 "AGENT: Certainly! I've booked a standard double room for you from <dateFrom> to <dateTo>. Your booking ID is <bookingID>. Is there anything else you need assistance with?",
 'USER: Great, thank you for confirming. That all looks correct. I appreciate your help!',
 "AGENT: You're welcome! I'm g