# Generate a synthetic dataset
Generate valid and invalid queries, including queries about unavailable hotels and irrelevant questions.

In [1]:
import random
from datetime import date, timedelta

import pandas as pd
from ollama import ChatResponse, chat
from tqdm import tqdm

%load_ext chime

In [2]:
def generate_random_date(start_date: date, end_date: date):
    """
    Generates a random date between two given datetime objects (inclusive).

    Args:
        start_date (datetime): The beginning of the date range.
        end_date (datetime): The end of the date range.

    Returns:
        datetime: A randomly generated date within the specified range.
    """
    # Calculate the difference in days between the start and end dates
    time_delta = end_date - start_date
    total_seconds = time_delta.total_seconds()

    # Generate a random number of seconds within that difference
    random_seconds = random.uniform(0, total_seconds)

    # Add the random seconds to the start date to get the random date
    random_date = start_date + timedelta(seconds=random_seconds)
    return random_date

In [3]:
def random_stay() -> tuple[date, date]:
    """Generate random check-in and check-out dates"""
    START_OF_PERIOD = date(2025, 8, 1)
    END_OF_PERIOD = date(2026, 8, 1)
    check_in = generate_random_date(START_OF_PERIOD, END_OF_PERIOD)
    check_out = generate_random_date(check_in, END_OF_PERIOD)
    return check_in, check_out

In [4]:
hotels_df = pd.read_csv('hotel_db.csv')

List of cities including some that are not in the hotels database

In [5]:
existing_cities = list(set(hotels_df.city.to_list()))
cities = existing_cities + ['Cape Town', 'Los Angeles', 'Keokuk', 'Paris']
cities

['Mumbai',
 'Cairo',
 'Jakarta',
 'Karachi',
 'Seoul',
 'Guangzhou',
 'Beijing',
 'Sao Paulo',
 'Kolkata',
 'Manila',
 'Tokyo',
 'Shanghai',
 'Dhaka',
 'Delhi',
 'Mexico City',
 'Moscow',
 'Shenzhen',
 'New York',
 'Bangkok',
 'Cape Town',
 'Los Angeles',
 'Keokuk',
 'Paris']

List of hotels including some that are not in the hotels database

In [6]:
hotels = list(set(hotels_df.name.to_list()))
hotels.extend(['Funny Inn', 'Nowhere Near', 'Keokuk Plaza', 'Golden Palace', 'Perfect Bliss'])
hotels

['Crowne Plaza',
 'Premier Boutique Stay',
 'Majestic Moments',
 'Serenity Sands',
 'Urban Luxe Suites',
 'Signature Boutique Hotel',
 'Hotel Excellence Artistry',
 'The Luxury Collection',
 'Four Seasons',
 'InterContinental',
 'The Exquisite Retreat',
 'Solara Retreat',
 'Hilton',
 'Sheraton',
 "Enigma's Embrace Haven",
 'Prime Prestige Studio',
 'Elite Craft Hotel',
 'Whispering Dreams',
 'Emberline Hotel',
 'Crafting Comfort Legacy',
 'Artisan Hotel Collective',
 'Majestic Urban Boutique',
 'Masterpiece Manor',
 'The Velvet Key',
 'The Heritage Corner',
 'LegacyCraft Suites',
 'The Ritz-Carlton',
 'The Gilded Harbor',
 'Marrow & Stone',
 'Supreme Solitude',
 'Hyatt',
 'CityView Boutique',
 'Hotel Artistry Studio',
 'Opulent Overlook',
 'Pinnacle Perfection',
 'Havencrest Inn',
 'Elite Escapes Oasis',
 'Epic Estates',
 'St. Regis',
 'Marriott',
 'Regal Charm Boutique',
 'Funny Inn',
 'Nowhere Near',
 'Keokuk Plaza',
 'Golden Palace',
 'Perfect Bliss']

In [7]:
user_queries = []

In [8]:
def generate_query(prompt: str) -> str:
    system_message = 'Talk like a user interested in booking a hotel room. Your requests should be short. Do not ask for extra information'
    response: ChatResponse = chat(
        'mistral',
        messages=[
            {'role': 'system', 'content': system_message},
            {'role': 'user', 'content': prompt}
        ],
        stream=False,
        options={'temperature': 1.0}
    )
    return response.message.content

Generate valid availability queries:

In [9]:
for _ in tqdm(range(128)):
    city = random.choice(cities)
    check_in, check_out = random_stay()
    user_queries.append(
        {
            'available': city in existing_cities,
            'query': generate_query(f'Inquire about hotels available in {city} from {check_in} to {check_out}')
        }
    )

100%|█████████████████████████████████████████| 128/128 [03:01<00:00,  1.42s/it]


In [10]:
user_queries

[{'available': True,
  'query': ' Can you provide me with a list of hotels available for stay in Moscow, Russia from August 14, 2025, to May 5, 2026? Please include details such as the hotel name, star rating, and average nightly rate.'},
 {'available': True,
  'query': ' Could you please provide a list of hotels available for stay in Jakarta, Indonesia from February 16, 2026 to July 9, 2026? Thanks!'},
 {'available': True,
  'query': ' "Hey there, could you please provide me with a list of hotels available in Seoul from August 7, 2025, to July 23, 2026? I\'m looking forward to seeing the options."'},
 {'available': True,
  'query': ' "Can you provide a list of hotels available for stay in Bangkok from August 26, 2025, to September 17, 2025?"'},
 {'available': True,
  'query': ' "Hi there! I\'m looking for hotel availability in Sao Paulo from December 5, 2025, to February 12, 2026. Could you please provide me with some options and prices?"'},
 {'available': False,
  'query': " Hello! I

Invalid queries with missing dates

In [11]:
%%chime
for _ in tqdm(range(64)):
    city = random.choice(cities)
    user_queries.append(
        {
            'available': city in existing_cities,
            'query': generate_query(f'Inquire about hotels available in {city}')
        }
    )

100%|███████████████████████████████████████████| 64/64 [00:48<00:00,  1.31it/s]


Invalid queries with missing checkout date

In [12]:
%%chime
for _ in tqdm(range(64)):
    city = random.choice(cities)
    check_in, check_out = random_stay()
    user_queries.append(
        {
            'available': city in existing_cities,
            'query': generate_query(f'Inquire about hotels available in {city} after {check_in}')
        }
    )

100%|███████████████████████████████████████████| 64/64 [01:19<00:00,  1.24s/it]


Valid booking requests

In [13]:
for _ in tqdm(range(128)):
    city = random.choice(cities)
    hotel = random.choice(hotels)
    check_in, check_out = random_stay()
    user_queries.append(
        {
            'available': len(hotels_df.query('city == @city & name == @hotel')) > 0,
            'query': generate_query(f'Write a request to book a room in {hotel} hotel, {city}, from {check_in} to {check_out}')
        }
    )

100%|█████████████████████████████████████████| 128/128 [04:37<00:00,  2.17s/it]


Invalid booking request with missing dates

In [14]:
for _ in tqdm(range(32)):
    city = random.choice(cities)
    hotel = random.choice(hotels)
    user_queries.append(
        {
            'available': len(hotels_df.query('city == @city & name == @hotel')) > 0,
            'query': generate_query(f'Write a request to book a room in {hotel} hotel, {city}')
        }
    )

100%|███████████████████████████████████████████| 32/32 [00:59<00:00,  1.87s/it]


Invalid booking request with missing hotel names

In [15]:
for _ in tqdm(range(32)):
    city = random.choice(cities)
    check_in, check_out = random_stay()
    user_queries.append(
        {
            'available': len(hotels_df.query('city == @city & name == @hotel')) > 0,
            'query': generate_query(f'Write a request to book a room in a hotel in {city} from {check_in} to {check_out}')
        }
    )

100%|███████████████████████████████████████████| 32/32 [00:58<00:00,  1.83s/it]


Invalid booking request with missing cities

In [16]:
for _ in tqdm(range(32)):
    hotel = random.choice(hotels)
    check_in, check_out = random_stay()
    user_queries.append(
        {
            'available': len(hotels_df.query('city == @city & name == @hotel')) > 0,
            'query': generate_query(f'Write a request to book a room in {hotel} hotel from {check_in} to {check_out}')
        }
    )

100%|███████████████████████████████████████████| 32/32 [00:56<00:00,  1.77s/it]


Irrelevant questions

In [17]:
def generate_irrelevant_query() -> str:
    response: ChatResponse = chat(
        'mistral',
        messages=[
            {'role': 'user', 'content': 'Write a question disrelated to hotel booking'}
        ],
        stream=False,
        options={'temperature': 1.0}
    )
    return response.message.content

In [18]:
for _ in tqdm(range(32)):
    user_queries.append(
        {
            'available': False,
            'query': generate_irrelevant_query()
        }
    )

100%|███████████████████████████████████████████| 32/32 [00:37<00:00,  1.18s/it]


In [21]:
df = pd.DataFrame.from_records(user_queries)
df

Unnamed: 0,available,query
0,True,Can you provide me with a list of hotels avai...
1,True,Could you please provide a list of hotels ava...
2,True,"""Hey there, could you please provide me with ..."
3,True,"""Can you provide a list of hotels available f..."
4,True,"""Hi there! I'm looking for hotel availability..."
...,...,...
507,False,Sure! Here's a question that is not related t...
508,False,Sure! Here's a question that is not related t...
509,False,"Sure! Here's an unrelated question: ""If you c..."
510,False,What are some creative ways businesses can us...


In [22]:
df.to_excel('dataset_raw.xlsx', index=False)

## Generate more booking requests
Random combinations of city and hotel names above resulted in mostly unavailable options. Need more valid valid queries

In [26]:
extra_queries = []
for _ in tqdm(range(128)):
    index = random.randint(0, len(hotels_df) - 1)
    hotel = hotels_df.at[index, 'name']
    city = hotels_df.at[index, 'city']
    check_in, check_out = random_stay()
    extra_queries.append(
        {
            'available': True,
            'query': generate_query(f'Write a request to book a room in {hotel} hotel, {city}, from {check_in} to {check_out}')
        }
    )

100%|█████████████████████████████████████████| 128/128 [04:54<00:00,  2.30s/it]


In [28]:
addendum = pd.DataFrame.from_records(extra_queries)

In [29]:
addendum

Unnamed: 0,available,query
0,True,"""Hi there! I'm looking to book a stay at the ..."
1,True,"""Hey there! I'd like to make a reservation fo..."
2,True,Subject: Room Reservation Request - April 202...
3,True,Hello! I'm interested in booking a stay at th...
4,True,"""Hello, I'd like to make a reservation for a ..."
...,...,...
123,True,Hello there! I'd like to make a reservation a...
124,True,"Dear Hotel Reservation,\n\nI'd like to make a..."
125,True,"""Hey there! I'd like to reserve a room at the..."
126,True,Subject: Room Reservation Inquiry - Hotel Exc...


In [30]:
addendum.to_excel('dataset_raw_addendum.xlsx', index=False)