In [1]:
# imports
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset, concatenate_datasets
import torch

# variables
# System prompt stored with every training example (used below as the 'Instruction'/'system' value).
content_system = 'Your name is Trip Assistant bot. You are a useful assistant as an AI chatbot to complete the tasks of the airline and hotel booking system. You can search, book, cancel and update flights and hotel rooms. The assistant is helpful, resourceful, smart and very friendly. If the user wants, he can talk about anything as soon as the assistant finishes filling in the following fields from the user: username, (when booking an air ticket: place of departure, destination), (when booking a hotel: location, number of stars), travel dates, trip budget. Any additional preferences or constraints the user may have. Get user detail step by step in proper conversion.'
# Hugging Face Hub id passed to AutoTokenizer.from_pretrained below.
model_name = 'KvrParaskevi/Llama-2-7b-Hotel-Booking-Model'
# Text file with replacement assistant replies for a plain "hi" (split on newlines when read).
path_hi_answers = 'additional_data/hi_answers.txt'
# Text file with extra "Q:"/"A:" pairs (split on blank lines when read).
path_additional_data = 'additional_data/additional_data.txt'
# Directory the final dataset is written to with save_to_disk.
# NOTE(review): 'fights' looks like a typo for 'flights' — confirm before renaming, other code may load this path.
path_to_save_dataset = 'hotel_fights_data'




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load tokenizer and datasets
# In the cells shown here the tokenizer is only used for its chat template (see add_text).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Flight-ticket conversations; later cells touch the columns
# 'Instruction', 'Human', 'Bot', 'text' and '__index_level_0__'.
dataset_ticket = load_dataset("mithlesh/Flight_Ticket_Booking_Conversion")
# Hotel conversations; later cells split each row's 'text' column on '###'.
dataset_hotel = load_dataset("KvrParaskevi/hotel_data")

In [7]:
# change data | add my instruction
# Every row gets the shared system prompt as its 'Instruction'; the two
# columns listed in remove_columns are dropped in the same pass.
dataset_ticket = dataset_ticket.map(
    lambda _row: {'Instruction': content_system},
    remove_columns=['__index_level_0__', 'text'],
)

In [8]:
# change data | rename columns
# Switch to the system/user/assistant naming used by the rest of the notebook.
column_renames = {'Instruction': 'system', 'Human': 'user', 'Bot': 'assistant'}
dataset_ticket = dataset_ticket.rename_columns(column_renames)
dataset_ticket

DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 2126
    })
})

In [9]:
# Opening and reading the file containing responses to "hi"
# One reply per line. Blank lines (including the empty string a trailing
# newline would otherwise produce) are dropped so that an empty reply is
# never assigned to an example. The encoding is fixed so the result does not
# depend on the platform default.
with open(path_hi_answers, encoding='utf-8') as f:
    hi_answers = [line.strip() for line in f if line.strip()]

In [10]:
def list_to_cyclic_generator(input_list: list):
    """
    Converts a list into a cyclic generator that infinitely yields items from the list.

    Args:
        input_list (list): The list to be converted. Must not be empty.

    Yields:
        Any: Items from the input list, cyclically.

    Raises:
        ValueError: If ``input_list`` is empty. Because this is a generator
            function, the error surfaces on the first ``next()`` call.
    """
    # Without this guard an empty list would make the loop below spin forever
    # without ever yielding, hanging the first next() call.
    if not input_list:
        raise ValueError("input_list must contain at least one item")
    while True:
        yield from input_list

In [11]:
# Creating a cyclic generator for hi answers
# Kept at module level: change_answers_hi calls next() on this object, so
# successive matching examples receive successive replies from hi_answers.
hi_answers_gen = list_to_cyclic_generator(hi_answers)
def change_answers_hi(example: dict) -> dict:
    """
    Swap in the next canned greeting when the user message is exactly 'hi'.

    The comparison ignores case only; surrounding whitespace is not stripped.
    Replies are pulled from the module-level ``hi_answers_gen`` iterator, so
    each matching example consumes one reply.

    Args:
        example (dict): A dataset row with 'user' and 'assistant' keys.
    Returns:
        dict: The same row, with 'assistant' replaced if the user said 'hi'.
    """
    said_hi = example['user'].lower() == 'hi'
    if not said_hi:
        return example
    example["assistant"] = next(hi_answers_gen)
    return example

In [12]:
# change data | hi answers, apply change_answers_hi
# Rows whose user message is 'hi' (any casing) get a reply taken from hi_answers_gen;
# all other rows pass through unchanged.
dataset_ticket = dataset_ticket.map(change_answers_hi)
# Bare expression so the notebook displays the resulting dataset summary.
dataset_ticket

Map: 100%|██████████| 2126/2126 [00:00<00:00, 25234.72 examples/s]


DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 2126
    })
})

In [13]:
def change_answers_thanks(example: dict) -> dict:
    """
    Removes 'Thanks for choosing Propellyr!' from the assistant's response for specific user inputs.

    The response is edited only when both conditions hold:
      * the user message, lower-cased, is exactly 'yes' or
        'please finally book this ticket too with promo code';
      * the assistant message contains 'propellyr' (case-insensitive).

    The removal itself is an exact, case-sensitive match of the phrase, after
    which surrounding whitespace is stripped from the response.

    Args:
        example (dict): A dictionary representing an example from the dataset, with 'user' and 'assistant' keys.
    Returns:
        dict: The modified conversation example.
    """
    trigger_inputs = ('yes', 'please finally book this ticket too with promo code')
    user_matches = example['user'].lower() in trigger_inputs
    if user_matches and 'propellyr' in example['assistant'].lower():
        example["assistant"] = example["assistant"].replace('Thanks for choosing Propellyr!', '').strip()
    return example

In [14]:
# change data | remove 'Thanks for choosing Propellyr!'
# Applies change_answers_thanks to every row; only rows meeting its user/assistant
# conditions are edited.
dataset_ticket = dataset_ticket.map(change_answers_thanks)
# Bare expression so the notebook displays the resulting dataset summary.
dataset_ticket

Map: 100%|██████████| 2126/2126 [00:00<00:00, 29789.97 examples/s]


DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 2126
    })
})

In [15]:
def change_answers_another(example: dict)  -> dict:
    """
    Rewrite a handful of known examples, matched on the exact user text.

    For a matching row both the user message and the assistant reply are
    replaced; the match is case-sensitive and whitespace-sensitive.

    Args:
        example (dict): A dataset row with 'user' and 'assistant' keys.

    Returns:
        dict: The same row, rewritten if its user text is one of the known keys.
    """
    # original user text -> (corrected user text, replacement assistant reply)
    replacements = {
        'mithlesh upadhyay': (
            'mithlesh upadhyay',
            "Hello Mithlesh Upadhyay! It's great to meet you. I'm Travel Booking Bot, your friendly AI assistant for flight ticket and hotel rooms booking.I'm here to help you plan the perfect trip. Are you looking to book plane tickets or a hotel room today?",
        ),
        "Who are you'": (
            "Who are you?",
            "I'm a Travel Booking Bot designed to assist you in booking flights and plane tickets.",
        ),
        "hi, what you can do": (
            "hi, what you can do?",
            "I can assist you in searching for flights, comparing prices, and booking plane tickets for your travel needs.",
        ),
    }
    fixed_pair = replacements.get(example['user'])
    if fixed_pair is None:
        return example
    example['user'], example['assistant'] = fixed_pair
    return example


In [16]:
# change data | fix another
# Applies change_answers_another: rows whose user text exactly matches one of its
# known keys get a corrected user message and a new assistant reply.
dataset_ticket = dataset_ticket.map(change_answers_another)
# Bare expression so the notebook displays the resulting dataset summary.
dataset_ticket

Map: 100%|██████████| 2126/2126 [00:00<00:00, 27994.16 examples/s]


DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant'],
        num_rows: 2126
    })
})

In [17]:
# Opening and reading the file with additional data
# Entries are separated by a blank line. Whitespace-only entries (for example
# the one produced by trailing blank lines at the end of the file) are dropped
# so that downstream parsing never sees an empty entry. The encoding is fixed
# so the result does not depend on the platform default.
with open(path_additional_data, encoding='utf-8') as f:
    additional_data = [
        entry for entry in f.read().strip().split('\n\n') if entry.strip()
    ]

In [18]:
# Initialize dictionary to store additional data
# Each entry is expected to hold a question line followed by an answer line:
#     Q: <user message>
#     A: <assistant reply>
# For every entry one row is collected: the shared system prompt, the question
# without its leading 'Q:' marker, and the answer without its leading 'A:' marker.
add_rows = {'user': [], 'assistant': [], 'system': []}
for entry in additional_data:
    # Ignore blank lines inside an entry so a stray newline cannot shift the
    # question/answer positions.
    entry_lines = [line.strip() for line in entry.split('\n') if line.strip()]
    if not entry_lines:
        # Whitespace-only entry (e.g. trailing blank lines in the file).
        continue
    if len(entry_lines) < 2:
        # Fail with a readable message instead of an IndexError.
        raise ValueError(f"Additional-data entry has no answer line: {entry!r}")

    question, answer = entry_lines[0], entry_lines[1]
    # Remove only the leading marker; the same characters appearing later in
    # the text are part of the message and must be kept.
    if question.startswith('Q:'):
        question = question[2:].strip()
    if answer.startswith('A:'):
        answer = answer[2:].strip()

    add_rows['system'].append(content_system)
    add_rows['user'].append(question)
    add_rows['assistant'].append(answer)

# Creating a dataset from the collected data
add_data = Dataset.from_dict(add_rows)

In [20]:
# Concatenating the original ticket dataset with the additional data
# NOTE: after this cell dataset_ticket is rebound from the object indexed with ['train']
# to the concatenated result, so re-running this cell without re-running the cells above
# would index the new object with 'train' instead.
dataset_ticket = concatenate_datasets([dataset_ticket['train'], add_data])


In [24]:
# Mapping and transforming the hotel dataset
def _hotel_row_to_chat(example: dict) -> dict:
    """
    Turn one hotel row into system/user/assistant fields.

    The row's 'text' is split on '###'; segment 1 (minus 'Human:') becomes the
    user message and segment 2 (minus 'Assistant:') the assistant reply.
    """
    segments = example['text'].split('###')
    user_text = segments[1].replace('Human:', '').strip()
    assistant_text = segments[2].replace('Assistant:', '').strip()
    return {
        'system': content_system,
        'user': user_text,
        'assistant': assistant_text,
    }


# The raw 'text' column is dropped once the three fields have been extracted.
dataset_hotel = dataset_hotel.map(_hotel_row_to_chat, remove_columns=['text'])

In [25]:
# Concatenating the hotel and ticket datasets
# Hotel rows come first, followed by the ticket rows (which already include the additional data).
my_dataset = concatenate_datasets([dataset_hotel['train'], dataset_ticket])
# Bare expression so the notebook displays the combined dataset summary.
my_dataset

Dataset({
    features: ['system', 'user', 'assistant'],
    num_rows: 3355
})

In [26]:
def add_text(example: dict) -> dict:
    """
    Adds a 'text' field holding the conversation rendered with the tokenizer's chat template.

    The template is applied with ``tokenize=False``, so 'text' is a plain
    string rather than token ids. Every '<s>' and '</s>' marker is then
    removed from the rendered string and surrounding whitespace is stripped.

    Args:
        example (dict): A dictionary representing a conversation example with 'user', 'assistant', and 'system' keys.
    Returns:
        dict: The modified conversation example with an added 'text' field.
    """
    # Constructing chat format
    chat = [
        {"role": "system", "content": example['system']},
        {"role": "user", "content": example['user']},
        {"role": "assistant", "content": example['assistant']},
    ]
    # The conversation already ends with the assistant's reply, so no
    # generation prompt is requested: on templates that honor the flag it would
    # append an empty assistant header after the finished reply.
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
    example['text'] = rendered.replace('<s>', '').replace('</s>', '').strip()
    return example
    

In [27]:
# Mapping the add_text function to the dataset to add 'text' field
# Each row keeps its 'system', 'user' and 'assistant' fields and gains the rendered 'text' string.
my_dataset = my_dataset.map(add_text)


Map: 100%|██████████| 3355/3355 [00:00<00:00, 13901.79 examples/s]


In [31]:
# Saving the dataset
# Writes the combined dataset to the directory named by path_to_save_dataset.
my_dataset.save_to_disk(path_to_save_dataset)

Saving the dataset (1/1 shards): 100%|██████████| 3355/3355 [00:00<00:00, 30312.26 examples/s]
