<a href="https://colab.research.google.com/github/31adityakumar/StaySphere/blob/main/bookingAssistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets
!pip install accelerate



In [None]:
from google.colab import files

# Open Colab's upload widget and keep its return value in `uploaded`.
# A later cell opens 'hotel_data.json' by that exact name.
uploaded = files.upload()

# List the working directory to confirm the uploaded file is present.
!ls


Saving hotel_data.json to hotel_data.json
hotel_data.json  sample_data


In [None]:
import json

# Load the uploaded hotel records. The comprehension below expects an iterable
# of objects that each provide 'hotelName' and 'rooms'.
with open('hotel_data.json', 'r', encoding='utf-8') as file:
    hotel_data = json.load(file)

# Prepare data in a conversational format: a two-line snippet per hotel.
training_data = [{"text": f"Hotel: {hotel['hotelName']}\nRoom Details: {hotel['rooms']}\n"}
                 for hotel in hotel_data]

# Save as a text dataset.
# Every snippet already ends with "\n", so it is written as-is. Appending one
# more newline would leave a blank line after each hotel, and a line-based text
# loader turns every blank line into an empty training example.
# NOTE(review): the snippet still spans two lines, so a line-based loader sees
# the "Hotel:" and "Room Details:" lines as separate examples — confirm that
# this is intended.
with open('training_data.txt', 'w', encoding='utf-8') as f:
    for item in training_data:
        f.write(item['text'])


In [None]:
from transformers import BloomTokenizerFast, BloomForCausalLM

# The tokenizer and the causal-LM weights are both pulled from one checkpoint.
checkpoint = "bigscience/bloom-560m"
tokenizer = BloomTokenizerFast.from_pretrained(checkpoint)
model = BloomForCausalLM.from_pretrained(checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from datasets import load_dataset

# Load text dataset (the "text" builder yields one example per line of the file)
dataset = load_dataset("text", data_files={"train": "training_data.txt"})

# Drop blank lines BEFORE tokenizing. Filtering `dataset` after the map below
# would not change `tokenized_datasets`, so empty rows (which tokenize to pure
# padding) would still reach the trainer.
dataset = dataset.filter(lambda example: len(example['text'].strip()) > 0)


def tokenize_function(examples):
    """Tokenize a batch of text lines to fixed-length model inputs.

    Args:
        examples: batch dict provided by ``Dataset.map(batched=True)``;
            ``examples['text']`` is the list of raw strings.

    Returns:
        The tokenizer output for the batch, padded/truncated to 512 tokens.

    Raises:
        Exception: any tokenizer failure is reported and re-raised. Returning
            ``None`` instead would hide the failure from ``map`` rather than
            produce tokenized columns.
    """
    try:
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    except Exception as e:
        print(f"Tokenization error: {e}")
        raise


# Apply tokenization; the raw "text" column is dropped from the result
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

In [None]:
# Sanity check: tokenization pads/truncates to max_length=512, so any row whose
# input_ids has a different length is reported.
for row in tokenized_datasets['train']:
    if len(row['input_ids']) == 512:
        continue
    print("Invalid tokenized entry:", row)


In [None]:
# Print a summary of the tokenized dataset (splits, feature names, row counts).
print(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 33
    })
})


In [None]:
# Hold out 20% of the tokenized rows. The fixed seed (same value as the
# training seed) makes the shuffle, and so the membership of each split,
# reproducible across runs.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)

# Rename the splits
# NOTE: this rebinds `tokenized_datasets` to a plain dict, and re-running this
# cell would split the already-reduced "train" split again.
tokenized_datasets = {
    "train": split_datasets["train"],
    "validation": split_datasets["test"],
}

In [None]:
# Report every raw example whose text is empty
for record in dataset['train']:
    if record['text']:
        continue
    print("Empty entry found:", record)


Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}
Empty entry found: {'text': ''}


In [None]:
# Keep only examples that carry some text.
# NOTE(review): in the visible notebook `tokenized_datasets` is built before
# this filter and is not rebuilt afterwards, so this appears to affect only
# `dataset` — confirm whether the tokenized data should be regenerated.
def _has_text(example):
    return len(example['text']) > 0


dataset = dataset.filter(_has_text)


Filter:   0%|          | 0/33 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,        # Adjust batch size for memory constraints
    gradient_accumulation_steps=16,       # Effective batch size of 16 per optimizer step
    num_train_epochs=3,
    eval_strategy="no",            # No evaluation during training
    save_strategy="steps",
    save_steps=1000,
    learning_rate=5e-5,
    logging_dir="./logs",
    # Log every optimizer step. With a few dozen rows and 16-step accumulation
    # the whole run is only a handful of steps, so a large interval (e.g. 500)
    # would never be reached and no training loss would be reported.
    logging_steps=1,
    seed=42,
    fp16=False,                           # Disable mixed precision if GPU issues occur
    optim="adamw_torch",
    report_to="none",
)

# Define the data_collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,  # Ensure tokenizer is defined
    mlm=False  # Set to False for causal language modeling (GPT-like models)
)

# Subclass Trainer to override compute_loss
class CustomTrainer(Trainer):
    """Trainer that computes the causal language-modelling loss explicitly."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the next-token cross-entropy loss for one batch.

        Args:
            model: the model being trained; called as ``model(**inputs)`` and
                expected to expose ``.logits`` on its output.
            inputs: batch dict containing ``input_ids`` and optionally
                ``labels`` / ``attention_mask``. ``labels`` is removed from it.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for Trainer compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        # Labels are expected UN-shifted (a copy of input_ids with ignored
        # positions set to -100), which is what a causal-LM data collator
        # provides. The shift is applied once, below.
        labels = inputs.pop("labels", None)
        if labels is None:
            # No labels supplied: use the inputs as targets and ignore padded
            # positions when an attention mask is available.
            labels = inputs["input_ids"].clone()
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                labels = labels.masked_fill(attention_mask == 0, -100)

        # Forward pass (without labels, so the model does not compute its own loss)
        outputs = model(**inputs)
        logits = outputs.logits

        # The logits at position t predict the token at position t + 1, so drop
        # the last prediction and the first target to line them up. Comparing
        # logits[t] with labels[t] would train the model to echo its input.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].contiguous()

        # Compute loss; CrossEntropyLoss skips targets equal to -100 by default
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Wire the model, hyper-parameters, training split and collator into the trainer
trainer = CustomTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

# Kick off fine-tuning
trainer.train()

Step,Training Loss


In [None]:
output_dir = "./bloom_finetuned"

# Write the model and the tokenizer into the same directory so they can be
# reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./bloom_finetuned


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

# Load the saved model and tokenizer
model = AutoModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Generation below samples randomly; fixing the seed makes a re-run of this
# cell produce the same text.
set_seed(42)

# Test the loaded model with a better input text
input_text = (
    "You are an AI assistant for a luxury hotel. "
    "A customer asks: Hello! I'm looking for a luxury room with a beautiful view. "
    "Could you provide me with some options for a 3-day stay?"
)
# Calling the tokenizer (instead of .encode) also returns the attention mask,
# which is passed to generate() explicitly rather than left to be inferred.
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded["input_ids"]

# Generate the response using controlled decoding
outputs = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    max_length=150,          # Cap on prompt + generated tokens combined
    num_return_sequences=1,  # Generate a single response
    do_sample=True,          # Enable sampling-based generation
    temperature=0.6,         # Reduce randomness for coherence
    top_k=30,                # Further limit token choices
    top_p=0.8,               # Tighten nucleus sampling
    repetition_penalty=1.2,  # Penalize repeated phrases
    no_repeat_ngram_size=3   # Avoid repeating trigrams
)

# Decode and print the generated output (the prompt is included in the text)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


You are an AI assistant for a luxury hotel. A customer asks: Hello! I'm looking for a luxury room with a beautiful view. Could you provide me with some options for a 3-day stay? The guestroom RoomRoom is very large, and the bathroom has been recently renovatedand it needs to be cleaned up as wellas changed . I am not sure how much time will take us in this process , but we have already made several plans : 1) We needto make our house more comfortable - so that all visitors can enjoy their holiday without disturbing anyone else 2 )We want people who visit on vacation or stays at home during holidays (stay longer than 4 days).Hotel Hotel Manager(manager ofthe resorthotel managerial staff '
