# Prepare a train set to fine-tune the model

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from booking_baseline import BookingAgent, available_functions, system_message_1, system_message_2

In [2]:
# Settings for the baseline agent: the callable tools, the model name and the
# two system messages imported from booking_baseline.
agent_kwargs = {
    'functions': available_functions,
    'model_name': 'mistral',
    'system_message_1': system_message_1,
    'system_message_2': system_message_2,
}
booking_agent = BookingAgent(**agent_kwargs)

In [3]:
# Load the user queries that the baseline agent will answer.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df

Unnamed: 0,available,query
0,True,Subject: Room Reservation Request at Premier ...
1,False,"Hello, I'd like to make a reservation for a r..."
2,True,I need some suggestions for hotels available ...
3,True,"Hi there, could you please provide me with a ..."
4,False,Hi there! I'm interested in booking a room at...
...,...,...
459,True,Could you please provide me with a list of ho...
460,False,"Hi, I'm looking for available hotels in Paris..."
461,False,Subject: Room Reservation Request for Marrow ...
462,True,Hi there! I'd like to make a reservation at L...


In [4]:
def _run_agent(row):
    """Send one row's query to the booking agent and return what the agent returns."""
    return booking_agent(row['query'])


# The agent's result is expanded into the two response columns, row by row,
# with a tqdm progress bar.
train_df[['llm_response_1', 'llm_response_2']] = train_df.progress_apply(
    _run_agent,
    axis=1,
    result_type='expand',
)

  0%|                                                   | 0/464 [00:00<?, ?it/s]ERROR:root:Function calls failed with error: book_hotel() missing 1 required positional argument: 'checkout_date'
  3%|█▎                                        | 15/464 [01:03<29:53,  3.99s/it]ERROR:root:Function calls failed with error: book_hotel() missing 4 required positional arguments: 'name', 'city', 'checkin_date', and 'checkout_date'
  8%|███▍                                      | 38/464 [02:58<34:12,  4.82s/it]ERROR:root:Function calls failed with error: get_hotels() missing 1 required positional argument: 'checkout_date'
  9%|███▉                                      | 43/464 [03:20<32:02,  4.57s/it]ERROR:root:Function calls failed with error: get_hotels() missing 1 required positional argument: 'checkout_date'
 11%|████▌                                     | 50/464 [03:57<31:22,  4.55s/it]ERROR:root:Function calls failed with error: book_hotel() got an unexpected keyword argument 'num_of_guests

In [5]:
train_df

Unnamed: 0,available,query,llm_response_1,llm_response_2
0,True,Subject: Room Reservation Request at Premier ...,{'comment': 'User provided the city and checki...,"Dear User,\n\nThank you for reaching out to u..."
1,False,"Hello, I'd like to make a reservation for a r...","{'comment': '', 'function_calls': [{'name': 'b...","I'm sorry for the inconvenience, but it seems..."
2,True,I need some suggestions for hotels available ...,"{'comment': '', 'function_calls': [{'name': 'g...",Based on the response from the booking servic...
3,True,"Hi there, could you please provide me with a ...","{'comment': '', 'function_calls': [{'name': 'g...",I'm sorry for any inconvenience. The booking ...
4,False,Hi there! I'm interested in booking a room at...,"{'comment': '', 'function_calls': [{'name': 'b...","I'm sorry for the inconvenience, but it seems..."
...,...,...,...,...
459,True,Could you please provide me with a list of ho...,"{'comment': '', 'function_calls': [{'name': 'g...",Based on the response I received from the boo...
460,False,"Hi, I'm looking for available hotels in Paris...","{'comment': '', 'function_calls': [{'name': 'g...",I'm sorry for any inconvenience. It seems the...
461,False,Subject: Room Reservation Request for Marrow ...,"{'comment': '', 'function_calls': [{'name': 'b...","I'm sorry for any inconvenience. However, bas..."
462,True,Hi there! I'd like to make a reservation at L...,"{'comment': '', 'function_calls': [{'name': 'b...","I'm sorry for the inconvenience, but it seems..."


In [6]:
# Persist the raw agent responses (without the DataFrame index).
train_df.to_excel(excel_writer='train_raw.xlsx', index=False)

## Finalize the train set to fine-tune the 1st stage (tool call generation)
The dataset was checked and corrected manually; the corrected version is loaded below from `train_first_response_only.xlsx`.

In [1]:
import json, os

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

from booking_baseline import system_message_1

In [2]:
# Load the training data for the first stage (queries and first-stage responses).
train_df = pd.read_excel(io='train_first_response_only.xlsx')
train_df

Unnamed: 0,available,query,llm_response_1
0,True,Subject: Room Reservation Request at Premier ...,"{'comment': '', 'function_calls': [{'name': 'b..."
1,False,"Hello, I'd like to make a reservation for a r...","{'comment': '', 'function_calls': [{'name': 'b..."
2,True,I need some suggestions for hotels available ...,"{'comment': '', 'function_calls': [{'name': 'g..."
3,True,"Hi there, could you please provide me with a ...",{'comment': 'Missing check-in and check-out da...
4,False,Hi there! I'm interested in booking a room at...,"{'comment': 'Missing city name', 'function_cal..."
...,...,...,...
456,True,Could you please provide me with a list of ho...,"{'comment': '', 'function_calls': [{'name': 'g..."
457,False,"Hi, I'm looking for available hotels in Paris...","{'comment': 'Missing checkout date', 'function..."
458,False,Subject: Room Reservation Request for Marrow ...,"{'comment': '', 'function_calls': [{'name': 'b..."
459,True,Hi there! I'd like to make a reservation at L...,"{'comment': '', 'function_calls': [{'name': 'b..."


In [3]:
# Directory holding the local model files (config.json and the tokenizer are
# read from here). The location can be overridden with the BOOKING_MODEL_PATH
# environment variable so the notebook is not tied to one machine; the
# original path remains the default.
model_path = os.environ.get(
    'BOOKING_MODEL_PATH',
    '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/',
)

Maximum number of tokens supported by the model (`max_position_embeddings` in its `config.json`):

In [4]:
# Read the model's config.json from the model directory and show it.
config_path = os.path.join(model_path, 'config.json')
with open(config_path) as config_file:
    model_config = json.loads(config_file.read())
print(model_config)

{'architectures': ['MistralForCausalLM'], 'attention_dropout': 0.0, 'bos_token_id': 1, 'eos_token_id': 2, 'head_dim': None, 'hidden_act': 'silu', 'hidden_size': 4096, 'initializer_range': 0.02, 'intermediate_size': 14336, 'max_position_embeddings': 32768, 'model_type': 'mistral', 'num_attention_heads': 32, 'num_hidden_layers': 32, 'num_key_value_heads': 8, 'quantization_config': {'_load_in_4bit': True, '_load_in_8bit': False, 'bnb_4bit_compute_dtype': 'float16', 'bnb_4bit_quant_storage': 'uint8', 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'llm_int8_skip_modules': None, 'llm_int8_threshold': 6.0, 'load_in_4bit': True, 'load_in_8bit': False, 'quant_method': 'bitsandbytes'}, 'rms_norm_eps': 1e-05, 'rope_theta': 1000000.0, 'sliding_window': None, 'tie_word_embeddings': False, 'torch_dtype': 'float16', 'transformers_version': '4.52.4', 'use_cache': False, 'vocab_size': 32768}


In [5]:
# Maximum number of positions (tokens) declared in the model's config.json.
model_config['max_position_embeddings']

32768

Use a smaller limit to save memory (it was checked that the longest example in the training set is shorter than 2000 tokens).

In [6]:
# Fixed length that every tokenized example is truncated/padded to in
# preprocess_function.
max_tokens = 2048

In [7]:
# Load the tokenizer from the local model directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [8]:
def _string_param(description):
    """Return the schema entry for a string-typed parameter."""
    return {"type": "string", "description": description}


def _function_tool(name, description, properties):
    """Build one function-tool specification.

    All parameters in ``properties`` are listed as required, in the order in
    which they appear in the mapping.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": list(properties),
            },
        },
    }


# Tool specifications that are serialized into the first-stage prompt.
tools = [
    _function_tool(
        "get_hotels",
        "Find hotels with rooms available for booking in a given city between given check-in and check-out dates",
        {
            "city": _string_param("The city where available hotels should be found"),
            "checkin_date": _string_param("Date when the user wants to check-in"),
            "checkout_date": _string_param("Date when the user wants to check-out"),
        },
    ),
    _function_tool(
        "book_hotel",
        "Book a room in the specified hotel from the provided check-in date to the provided check-out date",
        {
            "name": _string_param("Name of the hotel where the user wants a room to be booked"),
            "city": _string_param("The city where the hotel is located"),
            "checkin_date": _string_param("Date when the user wants to check-in"),
            "checkout_date": _string_param("Date when the user wants to check-out"),
        },
    ),
]

In [9]:
# Only the query and the first-stage target are needed for this dataset.
ds = Dataset.from_pandas(
    train_df.loc[:, ['query', 'llm_response_1']],
    split="train",
    preserve_index=False,
)

In [10]:
def preprocess_function(examples):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    For every example the prompt (system message, tool specifications and the
    user query) and the target response are joined into ONE token sequence:
    ``prompt tokens + target tokens + EOS``. A decoder-only model computes its
    loss by comparing position ``i`` of the logits with position ``i + 1`` of
    ``labels``, so ``labels`` must be aligned with ``input_ids``. Tokenizing
    prompt and target as two independently padded sequences (the
    encoder-decoder recipe) breaks that alignment and also trains on padding.

    Args:
        examples: A batch mapping with the columns ``"query"`` and
            ``"llm_response_1"``, each a list of strings.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        Every row has exactly ``max_tokens`` entries. In ``labels`` the prompt
        and padding positions are set to -100 so that only the target tokens
        (and the closing EOS) contribute to the loss.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss

    # The tool specification is identical for every example: serialize it once.
    tools_json = json.dumps(tools, indent=2)
    prompts = [
        f"System:\n{system_message_1}\nTools:\n{tools_json}\nUser request:\n{query}"
        for query in examples["query"]
    ]
    targets = examples["llm_response_1"]

    # The prompt keeps the tokenizer's special tokens (e.g. BOS); the target is
    # appended mid-sequence, so it must not get another set of them.
    prompt_ids_batch = tokenizer(prompts)["input_ids"]
    target_ids_batch = tokenizer(targets, add_special_tokens=False)["input_ids"]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Some tokenizers define no pad token; padded positions are masked out
        # by attention_mask and by the -100 labels, so EOS is a safe filler.
        pad_id = eos_id

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, target_ids in zip(prompt_ids_batch, target_ids_batch):
        answer_ids = list(target_ids) + [eos_id]

        # Over-long examples are cut on the right, like the tokenizer's default
        # truncation would do.
        input_ids = (list(prompt_ids) + answer_ids)[:max_tokens]
        labels = ([ignore_index] * len(prompt_ids) + answer_ids)[:max_tokens]

        # Right-pad to the fixed length expected downstream.
        n_pad = max_tokens - len(input_ids)
        model_inputs["input_ids"].append(input_ids + [pad_id] * n_pad)
        model_inputs["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        model_inputs["labels"].append(labels + [ignore_index] * n_pad)

    return model_inputs

In [11]:
# Tokenize in batches; the original text columns are dropped so that only the
# columns returned by preprocess_function remain.
tokenized_datasets = ds.map(
    function=preprocess_function,
    batched=True,
    remove_columns=ds.column_names,
)

Map:   0%|          | 0/461 [00:00<?, ? examples/s]

In [12]:
# Store the tokenized first-stage training set on disk.
tokenized_datasets.save_to_disk(dataset_path='./train_ds_1st_stage/')

Saving the dataset (0/1 shards):   0%|          | 0/461 [00:00<?, ? examples/s]