In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    # load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.70
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
    use_rslora = False,  # We support rank stabilised LoRA
    loftq_config = None, # And LoftQ
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-31 13:23:50 [__init__.py:244] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.5: Fast Llama patching. Transformers: 4.53.3. vLLM: 0.9.2.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.975 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit with actual GPU utilization = 69.08%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 21.98 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 9.0 GB. Also swap

2025-08-31 13:24:04,540 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 08-31 13:24:05 [parallel_state.py:1076] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 08-31 13:24:05 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
INFO 08-31 13:24:05 [gpu_model_runner.py:1770] Starting to load model unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit...
INFO 08-31 13:24:05 [gpu_model_runner.py:1775] Loading model from scratch...
INFO 08-31 13:24:05 [cuda.py:284] Using Flash Attention backend on V1 engine.
INFO 08-31 13:24:05 [bitsandbytes_loader.py:499] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 08-31 13:24:05 [weight_utils.py:292] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

INFO 08-31 13:24:11 [weight_utils.py:308] Time spent downloading weights for unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit: 5.916271 seconds
INFO 08-31 13:24:11 [weight_utils.py:345] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-31 13:24:12 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 08-31 13:24:13 [gpu_model_runner.py:1801] Model loading took 5.7737 GiB and 7.571209 seconds
INFO 08-31 13:24:25 [backends.py:508] Using cache directory: /home/sagemaker-user/.cache/vllm/torch_compile_cache/7b30a9cbc9/rank_0_0/backbone for vLLM's torch.compile
INFO 08-31 13:24:25 [backends.py:519] Dynamo bytecode transform time: 11.41 s


Inductor Compilation: 100%|██████████| 7/7 [00:01<00:00,  6.91it/s, triton_poi_fused_view_6]                                    

INFO 08-31 13:24:28 [backends.py:181] Cache the graph of shape None for later use



Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 10.99it/s, triton_poi_fused_view_8]                            
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 74.38it/s, triton_poi_fused_view_8]                            
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 58.00it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 61.42it/s, triton_poi_fused_view_8]                            
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 162.71it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 57.99it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 176.97it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 174.40it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████

INFO 08-31 13:25:08 [backends.py:193] Compiling a graph for general shape takes 42.14 s





INFO 08-31 13:26:19 [monitor.py:34] torch.compile takes 53.55 s in total


2025-08-31 13:26:21,336 - INFO - flashinfer.jit: Loading JIT ops: sampling
2025-08-31 13:27:00,586 - INFO - flashinfer.jit: Finished loading JIT ops: sampling


INFO 08-31 13:27:01 [gpu_worker.py:232] Available KV cache memory: 8.93 GiB
INFO 08-31 13:27:02 [kv_cache_utils.py:716] GPU KV cache size: 73,184 tokens
INFO 08-31 13:27:02 [kv_cache_utils.py:720] Maximum concurrency for 2,048 tokens per request: 35.73x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [01:11<00:00,  1.07s/it]

INFO 08-31 13:28:13 [gpu_model_runner.py:2326] Graph capturing finished in 72 secs, took 1.44 GiB





INFO 08-31 13:28:14 [core.py:172] init engine (profile, create kv cache, warmup model) took 240.70 seconds
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'q_norm', 'pre_feedforward_layernorm', 'k_norm']
Unsloth: Just some info: will skip parsing ['post_feedforward_layernorm', 'q_norm', 'pre_feedforward_layernorm', 'k_norm']


Unsloth 2025.7.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
import pandas as pd

In [5]:
import random
pd_df = pd.read_csv('hotel_2000.csv', sep=',')
pd_df

# Only use the below code if you haven't already split your datasets
unique_convs = pd_df['conv_id'].unique().tolist()
random.shuffle(unique_convs)  # shuffle to randomize split

split_idx = int(len(unique_convs) * 0.5)  # 50% train, 50% test
sft_convs = set(unique_convs[:split_idx])
grpo_convs = set(unique_convs[split_idx:])

sft_train = pd_df[pd_df['conv_id'].isin(sft_convs)].reset_index(drop=True)
grpo_train = pd_df[pd_df['conv_id'].isin(grpo_convs)].reset_index(drop=True)

# sft_train.to_csv('hotel_sft_train2000.csv', index=False, sep=',')
# grpo_train.to_csv('hotel_grpo_train2000.csv', index=False, sep=',')

In [6]:
import random

def split_by_conv(df, frac=0.5, id_col="conv_id", seed=42):
    # Split on unique conversation ids to avoid leakage
    rnd = random.Random(seed)
    unique_ids = df[id_col].unique().tolist()
    rnd.shuffle(unique_ids)
    cut = int(len(unique_ids) * frac)
    a_ids = set(unique_ids[:cut])
    b_ids = set(unique_ids[cut:])
    part_a = df[df[id_col].isin(a_ids)].reset_index(drop=True)
    part_b = df[df[id_col].isin(b_ids)].reset_index(drop=True)
    return part_a, part_b

# Create ~50/50 splits for sft_train
sft_a, sft_b = split_by_conv(sft_train, frac=0.5, id_col="conv_id", seed=42)
# Create ~50/50 splits for grpo_train
grpo_a, grpo_b = split_by_conv(grpo_train, frac=0.5, id_col="conv_id", seed=42)

# Optional: save to CSVs
sft_a.to_csv("hotel_sft_train_partA_500.csv", index=False)
sft_b.to_csv("hotel_sft_train_partB_500.csv", index=False)
grpo_a.to_csv("hotel_grpo_train_partA_500.csv", index=False)
grpo_b.to_csv("hotel_grpo_train_partB_500.csv", index=False)

# Quick sanity check sizes
print({
    "sft_total": len(sft_train),
    "sft_partA": len(sft_a),
    "sft_partB": len(sft_b),
    "grpo_total": len(grpo_train),
    "grpo_partA": len(grpo_a),
    "grpo_partB": len(grpo_b),
})

{'sft_total': 1655, 'sft_partA': 826, 'sft_partB': 829, 'grpo_total': 1680, 'grpo_partA': 855, 'grpo_partB': 825}


In [7]:
tokenizer.eos_token

'<|end_of_text|>'

In [8]:
def tokenize_for_sft(sample):
    sample["input"] = sample["input"].replace('<|eot_id|>', "")
    prompt = f'{sample["input"]}{sample["ground_truth"]}{tokenizer.eos_token}'
    sample["input_ids"] = tokenizer.encode(prompt)
    sample["query"] = tokenizer.decode(sample['input_ids'])
    return sample

In [9]:
pd_to_use = sft_a

In [10]:
import datasets
dataset_dict = {}
dataset_dict['train'] = datasets.Dataset.from_pandas(pd_to_use)
training_ddt = datasets.DatasetDict(dataset_dict)
training_ddt

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'user_utterance', 'system_utterance', 'input', 'output', 'ground_truth'],
        num_rows: 826
    })
})

In [11]:
training_ddt = training_ddt.map(tokenize_for_sft, batched=False)
training_ddt.set_format(type="torch")
training_ddt

Map:   0%|          | 0/826 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'user_utterance', 'system_utterance', 'input', 'output', 'ground_truth', 'input_ids', 'query'],
        num_rows: 826
    })
})

In [12]:
print(training_ddt['train'][0]['query'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a conversational agent with the following persona:
You are a helpful hotel assistant, your job is to help users in whatever queries they may have.

ALLOWED INTENTS:
{'book_room': 'The user wants to book a room in the hotel', 'cancel_booking': 'The user wants to cancel an existing booking', 'general_enquiries': 'The user wants to ask general questions about the hotel', 'chit_chat': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be 'Sorry, I can only help you with hotel queries.'"}

ALLOWED SLOTS (must match exactly):
{'book_to', 'book_room', 'bookingID', 'dateTo', 'cancel_booking', 'dateFrom'}

ALLOWED ACTIONS (with their required slots):
{'makeBooking': ('dateFrom', 'dateTo'), 'lookUpBooking': 'bookingID', 'cancellation': 'bookingID'}

TASK:
Given the current conversation history, generate EXACTLY one JSON object that describes ONLY the next system tur

# SFT FIRST!!!

In [13]:
training_data = datasets.Dataset.from_dict({'text': training_ddt['train']['query']})
print(training_data[3]['text'])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a conversational agent with the following persona:
You are a helpful hotel assistant, your job is to help users in whatever queries they may have.

ALLOWED INTENTS:
{'book_room': 'The user wants to book a room in the hotel', 'cancel_booking': 'The user wants to cancel an existing booking', 'general_enquiries': 'The user wants to ask general questions about the hotel', 'chit_chat': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be 'Sorry, I can only help you with hotel queries.'"}

ALLOWED SLOTS (must match exactly):
{'book_to', 'book_room', 'bookingID', 'dateTo', 'cancel_booking', 'dateFrom'}

ALLOWED ACTIONS (with their required slots):
{'makeBooking': ('dateFrom', 'dateTo'), 'lookUpBooking': 'bookingID', 'cancellation': 'bookingID'}

TASK:
Given the current conversation history, generate EXACTLY one JSON object that describes ONLY the next system tur

In [14]:
# Validation doesn't work with unsloth training :(

# split_dataset = training_data.train_test_split(test_size=0.15, seed=42)

# training_data = split_dataset["train"]
# validation_data = split_dataset["test"]


# print(f"Training data size: {len(training_data)}")
# print(f"Validation data size: {len(validation_data)}")

In [15]:
import mlflow
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment("llama3_sft_experiment_hotel_500")

2025/08/31 13:41:24 INFO mlflow.tracking.fluent: Experiment with name 'llama3_sft_experiment_hotel_500' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/sagemaker-user/mlruns/190038803992163367', creation_time=1756647684736, experiment_id='190038803992163367', last_update_time=1756647684736, lifecycle_stage='active', name='llama3_sft_experiment_hotel_500', tags={}>

In [16]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas

In [17]:
from trl import SFTConfig, SFTTrainer
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = training_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 10,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3, # Set this for 1 full training run. Use num_train_epochs for full dataset passes.
        # max_steps = 80, # Use max_steps if you want exactly 60 updates regardless of dataset size.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        learning_rate = 2e-4, # or 5e-5 if dataset is bigger than 500 examples
        logging_steps = 1, # If using num_train_epochs, you may want to set this at a higher value
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to="mlflow",  # 👈 Enable MLflow logging
        dataloader_num_workers=4,  # Parallel data loading
        dataloader_pin_memory=True,  # Speed up data transfer to GPU
        dataloader_persistent_workers=True, # Reuse dataloader workers. Important for reducing I/O time.�
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/826 [00:00<?, ? examples/s]

In [18]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A10G. Max memory = 21.975 GB.
15.447 GB of memory reserved.


In [19]:
print(trainer.train_dataset[0])

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a conversational agent with the following persona:\nYou are a helpful hotel assistant, your job is to help users in whatever queries they may have.\n\nALLOWED INTENTS:\n{\'book_room\': \'The user wants to book a room in the hotel\', \'cancel_booking\': \'The user wants to cancel an existing booking\', \'general_enquiries\': \'The user wants to ask general questions about the hotel\', \'chit_chat\': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be \'Sorry, I can only help you with hotel queries.\'"}\n\nALLOWED SLOTS (must match exactly):\n{\'book_to\', \'book_room\', \'bookingID\', \'dateTo\', \'cancel_booking\', \'dateFrom\'}\n\nALLOWED ACTIONS (with their required slots):\n{\'makeBooking\': (\'dateFrom\', \'dateTo\'), \'lookUpBooking\': \'bookingID\', \'cancellation\': \'bookingID\'}\n\nTASK:\nGiven the current conversation history, genera

In [20]:
# Check a longer sample to see the full format
sample = trainer.train_dataset[5]
print("Full decoded sample:")
print(tokenizer.decode(sample['input_ids']))

Full decoded sample:
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a conversational agent with the following persona:
You are a helpful hotel assistant, your job is to help users in whatever queries they may have.

ALLOWED INTENTS:
{'book_room': 'The user wants to book a room in the hotel', 'cancel_booking': 'The user wants to cancel an existing booking', 'general_enquiries': 'The user wants to ask general questions about the hotel', 'chit_chat': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be 'Sorry, I can only help you with hotel queries.'"}

ALLOWED SLOTS (must match exactly):
{'book_to', 'book_room', 'bookingID', 'dateTo', 'cancel_booking', 'dateFrom'}

ALLOWED ACTIONS (with their required slots):
{'makeBooking': ('dateFrom', 'dateTo'), 'lookUpBooking': 'bookingID', 'cancellation': 'bookingID'}

TASK:
Given the current conversation history, generate EXACTLY one JSON object t

In [21]:
from unsloth.chat_templates import train_on_responses_only

# Note to self if you get an error, change from two \n to one, or vice versa
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|begin_of_text|><|start_header_id|>system<|end_header_id|>",
    response_part="<|start_header_id|>assistant<|end_header_id|>\nDialogue State:\n",
)

Map (num_proc=16):   0%|          | 0/826 [00:00<?, ? examples/s]

In [22]:
from unsloth import unsloth_train
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 826 | Num Epochs = 3 | Total steps = 63
O^O/ \_/ \    Batch size per device = 10 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (10 x 4 x 1) = 40
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 826 | Num Epochs = 3 | Total steps = 69
O^O/ \_/ \    Batch size per device = 9 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (9 x 4 x 1) = 36
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 826 | Num Epochs = 3 | Total steps = 78
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch 

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.7619
2,0.7727
3,0.7302
4,0.6882
5,0.5317
6,0.5086
7,0.453
8,0.4455
9,0.3555
10,0.4873


TypeError: 'NoneType' object is not subscriptable

In [23]:
trainer_stats

NameError: name 'trainer_stats' is not defined

In [24]:
industry = "hotel"

In [25]:
experiment_name = f"{industry}_sft_500"
experiment_name

'hotel_sft_500'

In [26]:
model.save_pretrained_merged(experiment_name + "_merged", tokenizer, save_method = "merged_16bit",)
model.save_pretrained(experiment_name + "_adapter")  # Local saving of adapter only
tokenizer.save_pretrained(experiment_name + "_adapter")

Found HuggingFace hub cache directory: /home/sagemaker-user/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/meta-llama-3.1-8b...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [00:14<00:44, 14.82s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [00:27<00:27, 13.81s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [00:39<00:12, 12.97s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [00:42<00:00, 10.74s/it]


('hotel_sft_500_adapter/tokenizer_config.json',
 'hotel_sft_500_adapter/special_tokens_map.json',
 'hotel_sft_500_adapter/tokenizer.json')

In [27]:
# replace with whatever industry you are working with
prompt = """<|start_header_id|>system<|end_header_id|>
You are a conversational agent with the following persona:
You are a helpful hotel assistant, your job is to help users in whatever queries they may have.

ALLOWED INTENTS:
{'book_room': 'The user wants to book a room in the hotel', 'cancel_booking': 'The user wants to cancel an existing booking', 'general_enquiries': 'The user wants to ask general questions about the hotel', 'chit_chat': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be 'Sorry, I can only help you with hotel queries.'"}

ALLOWED SLOTS (must match exactly):
{'bookingID', 'book_room', 'dateFrom', 'book_to', 'cancel_booking', 'dateTo'}

ALLOWED ACTIONS (with their required slots):
{'makeBooking': ('dateFrom', 'dateTo'), 'lookUpBooking': 'bookingID', 'cancellation': 'bookingID'}

TASK:
Given the current conversation history, generate EXACTLY one JSON object that describes the next system turn. The JSON MUST have the following top-level keys ONLY:
    "system_response" : string
    "dialogue_acts"   : object with keys {"intent": string, "action": string} — "action" may be "" if none
    "belief_state"    : object with ALL slots from {'bookingID', 'book_room', 'dateFrom', 'book_to', 'cancel_booking', 'dateTo'} as keys (values are "" if unfilled)

STRICT FORMAT RULES:
1. Output must be ONLY the JSON object — no extra text, no explanations, no labels, no prefixes, no suffixes.
2. Do not add or remove keys. Do not reorder top-level keys.
3. Use ONLY intents from {'book_room': 'The user wants to book a room in the hotel', 'cancel_booking': 'The user wants to cancel an existing booking', 'general_enquiries': 'The user wants to ask general questions about the hotel', 'chit_chat': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be 'Sorry, I can only help you with hotel queries.'"}, slots from {'bookingID', 'book_room', 'dateFrom', 'book_to', 'cancel_booking', 'dateTo'}, and actions from {'makeBooking': ('dateFrom', 'dateTo'), 'lookUpBooking': 'bookingID', 'cancellation': 'bookingID'}. Do not invent or abbreviate any names. Strings must match exactly.
4. If the SYSTEM turn includes a slot reference AND an action, replace slot values in the "system_response" with the placeholder format "<slot_name>".
5. The "belief_state" object must contain ALL allowed slots, with correct names and either a concrete value (if filled) or "".
6. "system_response" must not repeat the user's last utterance verbatim. It should advance the conversation.
7. Wording should be natural, concise, and free of extraneous commentary.

DIVERSITY RULES:
1. Vary slot values and phrasings used in your "system_response" compared to earlier turns in the same or previous conversations.
2. Use synonyms, alter sentence structures, and vary the order in which you request or confirm slots.
3. Avoid reusing the exact same wording or slot combinations already seen in prior outputs for the same intent.

EXAMPLES:
(keep examples out of final output; they are for guidance only)
<example1>
CONV_HISTORY:
USER: I want to book a hotel room from August 12th to August 15th.
Dialogue State:
{
    "system_response": "Got it. I’ll reserve a room for you from <dateFrom> to <dateTo>. Do you have a preferred bed type?",
    "dialogue_acts": {"intent" : "book_room", "action" : "makeBooking"},
    "belief_state": {"dateFrom" : "2025-08-12", "dateTo" : "2025-08-15", "bookingID" : "", "cancel_booking" : "", "roomType": ""}
}
</example1>

<example2>
CONV_HISTORY:
SYSTEM: How can I help you today?
USER: I’d like to cancel my hotel booking please.
Dialogue State:
{
    "system_response": "Sure, I can cancel your booking. Could you provide your booking ID?",
    "dialogue_acts": {"intent" : "cancel_booking", "action" : ""},
    "belief_state": {"dateFrom" : "", "dateTo" : "", "bookingID" : "", "cancel_booking" : "", "roomType": ""}
}
</example2>

<example3>
CONV_HISTORY:
USER: My booking ID is 78910.
Dialogue State:
{
    "system_response": "Thanks. I’ve cancelled your reservation with booking ID <bookingID>. Is there anything else I can help you with?",
    "dialogue_acts": {"intent" : "cancel_booking", "action" : "cancelBooking"},
    "belief_state": {"dateFrom" : "", "dateTo" : "", "bookingID" : "78910", "cancel_booking" : "", "roomType": ""}
}
</example3>

<example4>
CONV_HISTORY:
USER: Do you have a suite available for this weekend?
Dialogue State:
{
    "system_response": "Let me check. Could you tell me your exact check-in and check-out dates?",
    "dialogue_acts": {"intent" : "book_room", "action" : ""},
    "belief_state": {"dateFrom" : "", "dateTo" : "", "bookingID" : "", "cancel_booking" : "", "roomType": "suite"}
}
</example4>

<example5>
CONV_HISTORY:
USER: What is the meaning of life?
Dialogue State:
{
    "system_response": "No problem, I’ll update your check-out date to <dateTo>. Is the check-in date staying the same?",
    "dialogue_acts": {"intent" : "modify_booking", "action" : "updateBooking"},
    "belief_state": {"dateFrom" : "2025-07-15", "dateTo" : "2025-07-22", "bookingID" : "BK55667", "cancel_booking" : "", "roomType": "double"}
}
</example5>

<|start_header_id|>user<|end_header_id|>
Conversation history so far:

Return ONLY the JSON object for the next system turn.
<|start_header_id|>assistant<|end_header_id|>

Dialogue State:"""

In [28]:
def extract_llm_response(text: str) -> str:
    answer = text.split('<|start_header_id|>assistant<|end_header_id|>')[1]
    return answer.strip()

In [29]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    prompt
], return_tensors = "pt").to("cuda")

# lora_rank = 32 # Larger rank = smarter, but slower

outputs = model.generate(**inputs, max_new_tokens = 1000, use_cache = True)
print(extract_llm_response(tokenizer.batch_decode(outputs)[0]))

Dialogue State: {'system_response': "Certainly! I've updated your booking for a double room from <dateFrom> to <dateTo>. Your booking ID is <bookingID>. Is there anything else you need assistance with?", 'dialogue_acts': {'intent': 'book_room', 'action': ''}, 'belief_state': {'dateFrom': '2025-07-15', 'dateTo': '2025-07-22', 'bookingID': 'BK55667', 'cancel_booking': '', 'roomType': 'double'}}<|end_of_text|>


In [32]:
print("✅ SFT stage complete.")

✅ SFT stage complete.


# NOW GRPO THISS BICH!!

If you would like to load the model from file system

In [33]:
del model, tokenizer, trainer

In [1]:
print("Configuring model for GRPO stage...")

# del model, tokenizer # if this doesn't work, you can simply restart the kernel and go from here.
from unsloth import FastLanguageModel

industry = "hotel"

experiment_name = f"{industry}_sft"
experiment_name = "hotel_sft_500_merged"

max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 32

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = experiment_name,
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.80
)

# target_modules = [
#         "q_proj", "k_proj", "v_proj", "o_proj",
#         "gate_proj", "up_proj", "down_proj",
#     ], # Remove QKVO if out of memory

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
)

Configuring model for GRPO stage...
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-31 14:50:39 [__init__.py:244] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.5: Fast Llama patching. Transformers: 4.53.3. vLLM: 0.9.2.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 21.975 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading hotel_sft_500_merged with actual GPU utilization = 78.95%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 21.98 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.2

2025-08-31 14:50:53,774 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 08-31 14:50:54 [parallel_state.py:1076] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 08-31 14:50:54 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
INFO 08-31 14:50:54 [gpu_model_runner.py:1770] Starting to load model hotel_sft_500_merged...
INFO 08-31 14:50:54 [gpu_model_runner.py:1775] Loading model from scratch...
INFO 08-31 14:50:54 [cuda.py:284] Using Flash Attention backend on V1 engine.
INFO 08-31 14:50:54 [bitsandbytes_loader.py:499] Loading weights with BitsAndBytes quantization. May take a while ...


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-31 14:50:58 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 08-31 14:50:58 [gpu_model_runner.py:1801] Model loading took 5.5303 GiB and 3.475162 seconds
INFO 08-31 14:51:10 [backends.py:508] Using cache directory: /home/sagemaker-user/.cache/vllm/torch_compile_cache/a0d8bd538d/rank_0_0/backbone for vLLM's torch.compile
INFO 08-31 14:51:10 [backends.py:519] Dynamo bytecode transform time: 11.22 s


Inductor Compilation: 100%|██████████| 7/7 [00:00<00:00,  7.10it/s, triton_poi_fused_view_6]                                    

INFO 08-31 14:51:13 [backends.py:181] Cache the graph of shape None for later use



Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 11.00it/s, triton_poi_fused_view_8]                            
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 59.76it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 57.85it/s, triton_poi_fused_view_8]                            
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 52.66it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 58.16it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 164.93it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 54.68it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|██████████| 9/9 [00:00<00:00, 192.48it/s, triton_poi_fused_view_8]                           
Inductor Compilation: 100%|████████

INFO 08-31 14:51:53 [backends.py:193] Compiling a graph for general shape takes 42.18 s





INFO 08-31 14:53:02 [monitor.py:34] torch.compile takes 53.39 s in total


2025-08-31 14:53:04,517 - INFO - flashinfer.jit: Loading JIT ops: sampling
2025-08-31 14:53:04,599 - INFO - flashinfer.jit: Finished loading JIT ops: sampling


INFO 08-31 14:53:05 [gpu_worker.py:232] Available KV cache memory: 11.37 GiB
INFO 08-31 14:53:06 [kv_cache_utils.py:716] GPU KV cache size: 93,136 tokens
INFO 08-31 14:53:06 [kv_cache_utils.py:720] Maximum concurrency for 2,048 tokens per request: 45.48x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [01:10<00:00,  1.05s/it]

INFO 08-31 14:54:16 [gpu_model_runner.py:2326] Graph capturing finished in 70 secs, took 1.45 GiB





INFO 08-31 14:54:16 [core.py:172] init engine (profile, create kv cache, warmup model) took 197.66 seconds
Unsloth: Just some info: will skip parsing ['q_norm', 'post_feedforward_layernorm', 'k_norm', 'pre_feedforward_layernorm']
Unsloth: Just some info: will skip parsing ['q_norm', 'post_feedforward_layernorm', 'k_norm', 'pre_feedforward_layernorm']


Unsloth 2025.7.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
import random
import pandas as pd
pd_df = pd.read_csv('hotel_grpo_train_partA_500.csv', sep=',')
pd_df

#Only use the below code if you haven't already split your datasets
# unique_convs = pd_df['conv_id'].unique().tolist()
# random.shuffle(unique_convs)  # shuffle to randomize split

# split_idx = int(len(unique_convs) * 0.5)  # 50% train, 50% test
# sft_convs = set(unique_convs[:split_idx])
# grpo_convs = set(unique_convs[split_idx:])

# sft_train = pd_df[pd_df['conv_id'].isin(sft_convs)].reset_index(drop=True)
# grpo_train = pd_df[pd_df['conv_id'].isin(grpo_convs)].reset_index(drop=True)

Unnamed: 0,conv_id,user_utterance,system_utterance,input,output,ground_truth
0,cancel_booking_0_7b2c9d4e,"Hello, I need to cancel my hotel reservation.",I understand you want to cancel your reservati...,<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""I understand you want to ..."
1,cancel_booking_0_7b2c9d4e,"Yes, my booking ID is RES9876.",Thank you. I've located your reservation with ...,<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""Thank you. I've located y..."
2,cancel_booking_0_7b2c9d4e,I have to reschedule my trip due to work commi...,"I see, thank you for letting us know. I'm proc...",<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""I see, thank you for lett..."
3,cancel_booking_0_7b2c9d4e,"Yes, please check the cancellation policy for me.",Certainly. I've checked the policy for your bo...,<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""Certainly. I've checked t..."
4,cancel_booking_0_7b2c9d4e,"Yes, please go ahead and cancel the booking.","Alright, I've processed the cancellation for y...",<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""Alright, I've processed t..."
...,...,...,...,...,...,...
850,cancel_booking_339_b2e1c5f6,"Hello, I need to cancel my hotel reservation.",I understand you'd like to cancel a reservatio...,<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""I understand you'd like t..."
851,cancel_booking_339_b2e1c5f6,"Yes, my booking ID is REF8532.",Thank you. I've located your booking with ID <...,<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""Thank you. I've located y..."
852,cancel_booking_339_b2e1c5f6,I have to postpone my trip due to work commitm...,"I see, thank you for letting us know. I'm proc...",<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""I see, thank you for lett..."
853,cancel_booking_339_b2e1c5f6,"Yes, please cancel it.",Certainly. I've successfully cancelled your bo...,<|start_header_id|>system<|end_header_id|>\nYo...,<|start_header_id|>system<|end_header_id|>\nYo...,"{'system_response': ""Certainly. I've successfu..."


else

In [3]:
# pd_df = pd.read_csv('grpo_train2.csv', sep=',')
# pd_df
# dataset_dict = {}

# Use code above if you've already split your dataset
import datasets
dataset_dict = {}
dataset_dict['train'] = datasets.Dataset.from_pandas(pd_df)
training_ddt = datasets.DatasetDict(dataset_dict)
training_ddt

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'user_utterance', 'system_utterance', 'input', 'output', 'ground_truth'],
        num_rows: 855
    })
})

In [4]:
def tokenize_for_sft(sample):
    sample["input"] = sample["input"].replace('<|eot_id|>', "")
    prompt = f'{sample["input"]}{sample["ground_truth"]}{tokenizer.eos_token}'
    sample["input_ids"] = tokenizer.encode(prompt)
    sample["query"] = tokenizer.decode(sample['input_ids'])
    return sample

training_ddt = training_ddt.map(tokenize_for_sft, batched=False)
training_ddt.set_format(type="torch")
training_ddt

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'user_utterance', 'system_utterance', 'input', 'output', 'ground_truth', 'input_ids', 'query'],
        num_rows: 855
    })
})

In [5]:
from fuzzywuzzy import fuzz

def fast_fuzzy(text1, text2):
    """Good balance of speed and typo tolerance"""
    return fuzz.ratio(text1, text2) / 100  # Normalize to 0-1

fast_fuzzy("bookHotel", "cancelBooking")  # Handles typos

0.27

In [6]:
import re
import json
import ast

def safe_parse_json_or_python(s):
    """Try parsing as JSON first, then as Python literal."""
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        try:
            return ast.literal_eval(s)
        except Exception:
            return None


def check_exact_format(generated_output, expected_keys=["system_response", "dialogue_acts", "belief_state"], ground_truth=None):
    try:
        pattern = r"\{(?:[^{}]|\{[^{}]*\})*\}"
        match = re.search(pattern, generated_output)

        if match:
            extracted_dict = match.group(0)
        else:
            print(f"Didn't find match: {generated_output}")
            return None, 0.0, None
        
        parsed = safe_parse_json_or_python(extracted_dict)
        if not isinstance(parsed, dict):
            print(f"Parsed object is not a dict: {parsed}")
            return None, 0.0, None

        # --- Key presence score ---
        found_keys = list(parsed.keys())
        key_score = sum(1 for key in expected_keys if key in parsed) / len(expected_keys)

        # --- dialogue_acts validation ---
        dialogue_acts_score = 0
        intent_found = False
        action_found = False
        if isinstance(parsed.get("dialogue_acts"), dict):
            intent_found = "intent" in parsed["dialogue_acts"]
            action_found = "action" in parsed["dialogue_acts"]

            intent_correct = True
            action_correct = True

            
            if ground_truth:
                gt_intent = ground_truth.get("dialogue_acts", {}).get("intent", "").strip()
                gt_action = ground_truth.get("dialogue_acts", {}).get("action", "").strip()
                if gt_intent:
                    intent_correct = intent_found
                else:
                    intent_correct = not intent_found
                if gt_action:
                    action_correct = action_found
                else:
                    action_correct = not action_found

            if intent_correct and action_correct:
                dialogue_acts_score = 1

        belief_state_score = 0
        bs = parsed.get("belief_state")

        if isinstance(bs, dict):
            belief_state_score = 1
        elif isinstance(bs, str):
            if safe_parse_json_or_python(bs):
                belief_state_score = 1
            else:
                belief_state_score = 0
        
        return parsed, (key_score + dialogue_acts_score + belief_state_score) / 3, found_keys
    
    except Exception as err:
        print(f"Complete fail from check: {generated_output}, Error: {err}")
        return None, 0.0, None



def intent_accuracy_reward(generated, ground_truth):
    try:
        gen_intent = generated["dialogue_acts"]["intent"]
        gt_intent = ground_truth["dialogue_acts"]["intent"]
        
        if gen_intent.lower() == gt_intent.lower():
            return 1.0
        
        # Simple fuzzy matching - replace with your preferred implementation
        return fast_fuzzy(gen_intent, gt_intent)
        
    except (KeyError, TypeError) as err:
        print(f"IAR exception: generated {generated}, error: {err}")
        return 0.0


def action_accuracy_reward(generated, ground_truth):
    """Reward for correct action prediction"""
    try:
        gen_action = generated["dialogue_acts"].get("action")
        gt_action = ground_truth["dialogue_acts"].get("action")
        
        if gen_action == gt_action:
            return 1.0
        
        return fast_fuzzy(gen_action, gt_action)
    except Exception as err:
        print(f"AAR exception: generated {generated}, error: {err}")
        
    return 0.0

def belief_state_reward(gen_bs, gt_bs):
    """Reward based on slot accuracy (JGA-style)."""
    if not isinstance(gen_bs, dict) or not isinstance(gt_bs, dict):
        return 0.0

    if not gt_bs:  # ground truth empty
        return 1.0 if not gen_bs else 0.0

    # Slot-level accuracy
    correct = sum(1 for k, v in gt_bs.items() if gen_bs.get(k) == v)
    total = len(gt_bs)
    slot_acc = correct / total

    # JGA-style: exact match gives full credit
    if gen_bs == gt_bs:
        return 1.0

    return slot_acc

testes = "{'system_response': 'I want to check in on July 15th and check out on July 20th.', 'dialogue_acts': {'intent': 'book_room'}, 'belief_state': {'dateFrom': 'July 15th', 'dateTo': 'July 20th'}}<|eot_id|>"

check_exact_format(testes)

({'system_response': 'I want to check in on July 15th and check out on July 20th.',
  'dialogue_acts': {'intent': 'book_room'},
  'belief_state': {'dateFrom': 'July 15th', 'dateTo': 'July 20th'}},
 1.0,
 ['system_response', 'dialogue_acts', 'belief_state'])

In [7]:
import random
import ast

global_step = 0  # keep track of training steps manually


def combined_reward_function(generated_output, ground_truth, step, total_steps=250, **kwargs):
    """Combined reward function with curriculum weighting."""
    gt = safe_parse_json_or_python(ground_truth)
    if not isinstance(gt, dict):
        return 0.0

    # Parse generated output
    parsed, format_score, _ = check_exact_format(generated_output, ground_truth=gt)
    if not parsed:
        return format_score * 0.5  # small reward if just format partially matched

    # Individual rewards
    intent_reward = intent_accuracy_reward(parsed, gt)
    action_reward = action_accuracy_reward(parsed, gt)
    bs_reward = belief_state_reward(parsed.get("belief_state", {}), gt.get("belief_state", {}))

    # Curriculum weights: start format-heavy, shift to task correctness
    progress = step / max(1, total_steps)
    weights = {
        "format": 0.3 * (1 - progress) + 0.1 * progress,   # 0.3 → 0.1
        "intent": 0.2 * (1 - progress) + 0.3 * progress,   # 0.2 → 0.3
        "action": 0.2 * (1 - progress) + 0.3 * progress,   # 0.2 → 0.3
        "belief": 0.3 * (1 - progress) + 0.3 * progress    # 0.1 → 0.3
    }

    # Combine rewards
    total_reward = (
        format_score * weights["format"] +
        intent_reward * weights["intent"] +
        action_reward * weights["action"] +
        bs_reward * weights["belief"]
    )

    return total_reward

# Create a wrapper that matches GRPO's expected signature
def reward_fn(prompts, completions, completion_ids, ground_truths, **reward_kwargs):
    global global_step
    print(global_step)
    # print(completions[random.randrange(0,5)])
    reward = [combined_reward_function(completion, gt, global_step) for completion, gt in zip(completions, ground_truths)]

    global_step += 1

    return reward

In [8]:
combined_reward_function("{'system_response': 'I want to check in on July 15th and check out on July 20th.', 'dialogue_acts': {'intent': 'book_room'}, 'belief_state': {'dateFrom': 'July 15th', 'dateTo': 'July 20th'}}<|eot_id|>", "{'system_response': 'I want to check in on July 15th and check out on July 20th.',"
  "'dialogue_acts': {'intent': 'book_room'},"
  "'belief_state': {'dateFrom': 'July 15th', 'dateTo': 'July 20th'}}", 342)

1.0

In [9]:
training_ddt = training_ddt.rename_column('input', 'prompt')
# training_ddt = training_ddt.rename_column('output', 'query')

In [10]:
import mlflow
mlflow.set_experiment("llama3_grpo_experiment_hotel_500")

2025/08/31 14:57:04 INFO mlflow.tracking.fluent: Experiment with name 'llama3_grpo_experiment_hotel_500' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/sagemaker-user/mlruns/361881582858500346', creation_time=1756652224812, experiment_id='361881582858500346', last_update_time=1756652224812, lifecycle_stage='active', name='llama3_grpo_experiment_hotel_500', tags={}>

In [11]:
max_prompt_length = 2048
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 10,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 2, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_prompt_length,
    num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 250,
    max_grad_norm = 0.5,
    report_to="mlflow",  # 👈 Enable MLflow logging
    dataloader_num_workers=4,  # Parallel data loading
    dataloader_pin_memory=True,  # Speed up data transfer to GPU
    dataloader_persistent_workers=True, # Reuse dataloader workers. Important for reducing I/O time.
    output_dir = "outputs",
)

In [12]:
training_data = datasets.Dataset.from_dict({'prompt': training_ddt['train']['prompt'], 'ground_truths': training_ddt['train']['ground_truth']})
training_data

Dataset({
    features: ['prompt', 'ground_truths'],
    num_rows: 855
})

In [13]:
training_ddt['train']['prompt'][5]

'<|start_header_id|>system<|end_header_id|>\nYou are a conversational agent with the following persona:\nYou are a helpful hotel assistant, your job is to help users in whatever queries they may have.\n\nALLOWED INTENTS:\n{\'book_room\': \'The user wants to book a room in the hotel\', \'cancel_booking\': \'The user wants to cancel an existing booking\', \'general_enquiries\': \'The user wants to ask general questions about the hotel\', \'chit_chat\': "Queries outside of the other intents specified. Apart from greetings and hellos, the response for this one should be \'Sorry, I can only help you with hotel queries.\'"}\n\nALLOWED SLOTS (must match exactly):\n{\'book_to\', \'book_room\', \'bookingID\', \'dateTo\', \'cancel_booking\', \'dateFrom\'}\n\nALLOWED ACTIONS (with their required slots):\n{\'makeBooking\': (\'dateFrom\', \'dateTo\'), \'lookUpBooking\': \'bookingID\', \'cancellation\': \'bookingID\'}\n\nTASK:\nGiven the current conversation history, generate EXACTLY one JSON object

In [14]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        reward_fn
    ],
    args = training_args,
    train_dataset = training_data,
)

In [15]:
from unsloth import unsloth_train
global global_step
global_step = 0
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 855 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after 

0


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / reward_fn / mean,rewards / reward_fn / std
10,0.0,0.833295,0.052653,73.425,60.1,88.1,0.0,73.425,60.1,88.1,0.002529,0.833295,0.089124
20,0.0,0.808999,0.067323,78.05,62.9,92.5,0.0,78.05,62.9,92.5,0.00232,0.808999,0.114047
30,0.0,0.853056,0.031623,79.3,69.9,90.9,0.0,79.3,69.9,90.9,0.002793,0.853056,0.065876
40,0.0,0.867541,0.057306,76.2,65.3,88.1,0.0,76.2,65.3,88.1,0.002859,0.867541,0.093453
50,0.0,0.835639,0.04395,84.3,72.1,97.3,0.0,84.3,72.1,97.3,0.002127,0.835639,0.121359
60,0.0,0.793489,0.104403,79.575,65.8,93.2,0.0,79.575,65.8,93.2,0.002736,0.793489,0.201214
70,0.0,0.832736,0.074204,82.7,71.4,95.3,0.0,82.7,71.4,95.3,0.002792,0.832736,0.125665
80,0.0,0.849326,0.070756,81.1,65.7,96.7,0.0,81.1,65.7,96.7,0.002532,0.849326,0.10239
90,0.0,0.879802,0.055934,78.425,67.7,89.4,0.0,78.425,67.7,89.4,0.002486,0.879802,0.072049
100,0.0,0.814363,0.061403,84.1,69.7,98.4,0.0,84.1,69.7,98.4,0.002266,0.814363,0.138766


1
Unsloth: Will smartly offload gradients to save VRAM!
2
3
4
Parsed object is not a dict: None
5
6
7
8
9
10
11
12
13
Parsed object is not a dict: None
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
Parsed object is not a dict: None
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
Parsed object is not a dict: None
55
56
57
58
Parsed object is not a dict: None
59
IAR exception: generated {'system_response': "Excellent. I've noted your check-in date for May 15th and check-out date for May 18th. Now, what type of room would you prefer?", 'dialogity_acts': {'intent': 'book_room', 'action': ''}, 'belief_state': {'dateFrom': 'May 15th', 'dateTo': 'May 18th'}}, error: 'dialogue_acts'
AAR exception: generated {'system_response': "Excellent. I've noted your check-in date for May 15th and check-out date for May 18th. Now, what type of room would you prefer?", 'dialogity_acts': {'intent': 'book_room', 'action': ''}, 'belief_state': {'dateFrom': 'May 15th', 'dateTo': 'Ma

TypeError: 'NoneType' object is not subscriptable

In [16]:
model.save_pretrained_merged(experiment_name + "_grpo"+ '-merged', tokenizer, save_method="merged_16bit")

ValueError: Could not determine original model ID from None. If using a local model, ensure the path exists and contains safetensors files.

In [27]:
prompt

NameError: name 'prompt' is not defined

In [42]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    prompt
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 250, use_cache = True)
extract_llm_response(tokenizer.batch_decode(outputs)[0])

'Dialogue State: {\'system_response\': "I apologize, but I can only help you with hotel queries. For questions about the meaning of life, you might want to consult a philosopher or a spiritual leader. Is there anything about our hotel services or facilities that I can assist you with?", \'dialogue_acts\': {\'intent\': \'chit_chat\', \'action\': \'\'}, \'belief_state\': {}}<|end_of_text|>'

- OpenAI’s GPT Models / Large LLM models
- SOTA Approaches - Soloist, SimpleTOD, SimpleTOD, ZS-TOD, AutoTOD