In [3]:
! pip install datasets -q


In [4]:
# Imports
import torch
from datasets import load_dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import re

In [5]:
# Load the Bitext Customer Service Dataset
dataset = load_dataset("bitext/Bitext-travel-llm-chatbot-training-dataset")

# Explore the dataset
print(dataset['train'][0])  # Example conversation

split_ratio = 0.9
train_test_split = dataset["train"].train_test_split(test_size=(1 - split_ratio), seed=42)
dataset = DatasetDict({"train": train_test_split["train"], "validation": train_test_split["test"]})

print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.21k [00:00<?, ?B/s]

(…)-travel-llm-chatbot-training-dataset.csv:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31658 [00:00<?, ? examples/s]

{'instruction': 'I want to know about my checked carry-on baggage allowance, how can I get more information?', 'intent': 'check_baggage_allowance', 'category': 'BAGGAGE', 'tags': 'BCIL', 'response': 'To find out your checked baggage allowance, please follow these instructions:\n\n1. Visit {{WEBSITE_URL}} or access the {{APP_NAME}} application.\n2. Log into your account.\n3. Navigate to the {{BOOKINGS_OPTION}} section.\n4. Enter the necessary booking details or flight information.\n5. Your baggage allowance details will be displayed.\n\nFor additional help, reach out to our customer support via the {{APP_NAME}} application or at {{WEBSITE_URL}}.'}
DatasetDict({
    train: Dataset({
        features: ['instruction', 'intent', 'category', 'tags', 'response'],
        num_rows: 28492
    })
    validation: Dataset({
        features: ['instruction', 'intent', 'category', 'tags', 'response'],
        num_rows: 3166
    })
})


In [9]:
# Preprocessing
def normalize_input(text):
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\?\.,!]', '', text)
    return text

tokenizer = T5Tokenizer.from_pretrained("t5-small")
def preprocess_function(examples):
    inputs = [f"generate response for intent: {intent}: {normalize_input(instr)}"
              for intent, instr in zip(examples["intent"], examples["instruction"])]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["response"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_dataset.save_to_disk("./tokenized_dataset")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/28492 [00:00<?, ? examples/s]



Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/28492 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3166 [00:00<?, ? examples/s]

In [11]:
# Load T5-small model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments optimized for Colab memory usage
training_args = TrainingArguments(
    output_dir="./results",  # Directory for saving model checkpoints
    num_train_epochs=3,  # Three epochs for optimal balance of performance and efficiency
    per_device_train_batch_size=8,  # Batch size constrained by Colab memory
    per_device_eval_batch_size=16,  # Larger eval batch for faster validation
    warmup_steps=100,  # Gradual learning rate increase for stability
    weight_decay=0.01,  # Regularization to prevent overfitting
    learning_rate=2e-5,  # Baseline learning rate for initial training
    logging_dir="./logs",  # Directory for training logs
    logging_steps=10,  # Log progress every 10 steps
    save_steps=200,  # Save checkpoints every 200 steps
    save_total_limit=1,  # Keep only the best model
    eval_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save after each epoch
    load_best_model_at_end=True,  # Use the best model based on validation loss
    fp16=True if torch.cuda.is_available() else False,  # Mixed precision for memory efficiency
    gradient_accumulation_steps=2,  # Effective batch size of 16 for memory optimization
    report_to="none"  # Disable logging to external services
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train model
trainer.train()

# Evaluate and compare learning rates
baseline_loss = trainer.evaluate()["eval_loss"]



Epoch,Training Loss,Validation Loss
1,1.0687,0.836189
2,0.8755,0.670402
3,0.837,0.632903


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [None]:
# Fine-tune with a higher learning rate
training_args.learning_rate = 5e-5
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset["train"], eval_dataset=tokenized_dataset["validation"])
trainer.train()
tuned_loss = trainer.evaluate()["eval_loss"]


In [21]:
# Save final model
model.save_pretrained("./travel_assistant_model")
tokenizer.save_pretrained("./travel_assistant_model")

('./travel_assistant_model/tokenizer_config.json',
 './travel_assistant_model/special_tokens_map.json',
 './travel_assistant_model/spiece.model',
 './travel_assistant_model/added_tokens.json')

In [18]:
# Print training results
print(f"Initial Loss: {baseline_loss:.2f}")
print(f"Tuned Loss: {tuned_loss:.2f}")

Initial Loss: 0.63
Tuned Loss: 0.36


In [None]:

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import re

# Load model
tokenizer = T5Tokenizer.from_pretrained("./travel_assistant_model")
model = T5ForConditionalGeneration.from_pretrained("./travel_assistant_model")

def normalize_input(text):
    """Normalize text: lowercase, strip spaces, remove special chars."""
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\?\.,!]', '', text)
    return text

def capitalize_response(response):
    """Capitalize the first letter of sentences for readability."""
    sentences = response.split(". ")
    unique_sentences = []
    for s in sentences:
        if s and s not in unique_sentences:
            unique_sentences.append(s.capitalize())
    return ". ".join(unique_sentences)

def test_query(query):
    """Test chatbot response for a given query."""
    query_lower = normalize_input(query)
    input_text = f"generate response: Current query: {query_lower}"
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128).input_ids
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=200,
            temperature=0.8,
            top_k=70,
            repetition_penalty=1.5
        )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return capitalize_response(response)



In [20]:

# Qualitative testing with sample queries (including a small test query)
test_queries = [
    "how to apply for a travel visa",
    "I need to know cheapest flights"
]
for q in test_queries:
    print(f"Query: {q}\nResponse: {test_query(q)}\n")

Query: how to apply for a travel visa
Response: To apply for a travel visa, please follow these steps: 1. Visit website_url and navigate to the travel section. 2. Fill in your travel details including origin, destination, and the dates of travel. 3. Click on the search_button to view the available travel options. 4. Examine the available travel options and select the one that best fits your requirements. 5. Press the search_button to view

Query: I need to know cheapest flights
Response: To find the cheapest flights from origin to destination, please follow these steps: 1. Visit website_url. 2. Input origin as your point of departure and destination as your destination. 3. Choose your preferred travel dates along with the number of passengers. 4. Press the search_button to display the available flights. By following these steps, you will be presented with the available flights.



In [23]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.19.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [24]:
import gradio as gr

# Define the Gradio interface function
def generate_response(query):
    """Generate chatbot response for a given query."""
    return test_query(query)  # Use the test_query function defined earlier

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_response,  # Function to call
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask something...", lines=2),  # Input text box
    outputs=gr.Textbox(label="Response"),  # Output text box
    title="Travel Assistant Chatbot",  # Title of the interface
    description="This is a chatbot that helps with travel-related queries. Ask anything!",  # Description of the interface
    theme="default"  # Optional, to customize the theme
)

# Launch the Gradio interface
iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5a6ab499e7eff4f49f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


