# Mistral-7B QLoRA — Hospitality Assistant (FAQs, Dialogs)

## Setup

In [None]:
# Clone repo (or pull latest if already cloned)
# NOTE: the `!` and `%cd` lines below are IPython magics, so this cell only
# runs inside a notebook kernel, not as a plain Python script.
import os

# Absolute checkout location (the /content prefix presumably targets Colab).
REPO_DIR = "/content/mistral-hospitality-finetune"
if os.path.isdir(REPO_DIR):
    # The directory already exists (e.g. the cell was re-run): update it in place.
    !git -C {REPO_DIR} pull
else:
    !git clone https://github.com/DoubleH10/mistral-hospitality-finetune.git {REPO_DIR}
# Make the checkout the working directory so the relative paths used below
# ("." for the install here, "./outputs/..." in later cells) resolve inside it.
%cd {REPO_DIR}

# Install deps from pyproject.toml — single source of truth
# `uv` is installed with pip first and then used as the installer for the
# checked-out project together with its "all" extra.
!pip install -q uv
!uv pip install --system ".[all]"

In [None]:
# Verify A100 GPU is connected (Colab Pro → Runtime → Change runtime type → A100)
import torch

# Fail fast, before any model download or training, if no CUDA device is visible.
assert torch.cuda.is_available(), "No GPU detected! Change runtime: Runtime → Change runtime type → A100"
gpu_name = torch.cuda.get_device_name(0)
# total_memory is reported in bytes; dividing by 1e9 gives decimal gigabytes.
vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"GPU: {gpu_name} ({vram_gb:.0f} GB)")
# Hard requirement: any device whose name does not contain "A100" stops the notebook here.
assert "A100" in gpu_name, f"Expected A100, got: {gpu_name}. Change runtime type to A100."

In [None]:
# Print library versions so a run can be reproduced or debugged later.
# `torch` is not imported in this cell; it relies on an earlier cell having imported it.
import bitsandbytes as bnb
print(f"bitsandbytes: {bnb.__version__}")
print(f"torch: {torch.__version__}, CUDA: {torch.version.cuda}")

In [None]:
# Notebook-wide configuration shared by the data, training and evaluation cells.
import os, torch

# Hub id of the base checkpoint that is quantized and fine-tuned below.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Relative output directory: checkpoints, the saved adapter/tokenizer and
# metrics.json are written here (relative to the current working directory).
OUT_DIR = "./outputs/mistral-hotel-qlora"
os.makedirs(OUT_DIR, exist_ok=True)

# Informational only: the value is displayed as the cell output.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

## 1) Load, Normalize & Split Datasets

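Each dataset is split into train/validation independently **before** merging, so no formatted example is assigned to both sides; the merge cell in section 2 asserts that the two sets share no identical `text` strings.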

### SGD_Hotels

In [None]:
from datasets import load_dataset

# Load the SGD Hotels dataset from the Hugging Face Hub; only its "train"
# split is used by the processing further down.
ds1 = load_dataset("vidhikatkoria/SGD_Hotels")

def clean_context(s, max_chars=1200):
    """Normalize a dialog context and keep only its most recent turns.

    ``<SEP>`` markers become newlines, every line is stripped, and blank
    lines are dropped. If the result is longer than ``max_chars`` it is
    truncated from the top so the latest turns survive; a leading line that
    was cut mid-way is discarded so the context starts on a whole line.

    Args:
        s: Raw context string. ``None`` or ``""`` yields ``""``.
        max_chars: Maximum number of characters to keep. Values <= 0 yield
            ``""`` for any context that needs truncating.

    Returns:
        The cleaned, possibly truncated, context string.
    """
    if not s:
        return ""
    s = s.replace("<SEP>", "\n")
    s = "\n".join(line.strip() for line in s.splitlines() if line.strip())
    if len(s) <= max_chars:
        return s
    if max_chars <= 0:
        # Guard: s[-0:] would return the whole string instead of nothing.
        return ""

    cut = len(s) - max_chars  # >= 1 here, so s[cut - 1] is always valid
    tail = s[cut:]
    # Drop the first line only when the cut fell inside it. If the character
    # just before the cut is a newline, the tail already starts on a line
    # boundary and dropping its first line would discard a complete turn.
    # A tail without any newline (one very long line) is kept as-is.
    if s[cut - 1] != "\n" and "\n" in tail:
        tail = tail[tail.index("\n") + 1:]
    return tail

def sgd_to_text(ex):
    """Format one SGD row as an ``[INST] ... [/INST]response</s>`` string.

    Only rows whose ``speaker`` equals ``1`` (treated as assistant turns) are
    used as training targets. Such a row is formatted only if both its cleaned
    ``context`` and its stripped ``response`` are non-empty.

    Returns:
        ``{"text": <formatted string>}`` for usable rows, otherwise
        ``{"text": None}`` so the row can be filtered out afterwards.
    """
    skipped = {"text": None}

    is_assistant_turn = ex.get("speaker") == 1
    if not is_assistant_turn:
        return skipped

    context = clean_context(ex.get("context", ""))
    reply = (ex.get("response") or "").strip()
    if not (context and reply):
        return skipped

    # The fixed instruction and the dialog context together form the user turn.
    prompt = (
        "Continue this hotel booking conversation as a helpful assistant."
        f"\n\nContext:\n{context}"
    )
    return {"text": f"[INST] {prompt} [/INST]{reply}</s>"}

# Format every row, drop the ones the formatter rejected (text is None), and
# shuffle with a fixed seed so the run is repeatable.
sgd_all = ds1["train"].map(sgd_to_text).filter(lambda ex: ex["text"] is not None).shuffle(seed=42)

# FIX Bug 1: proper train/val split — zero overlap
# 85% / 15% split with a fixed seed; each row goes to exactly one side.
# NOTE(review): the split is per row. If several rows come from the same
# dialog, their contexts may share text across train and val even though the
# full "text" strings differ — confirm whether a dialog-level split is needed.
sgd_split = sgd_all.train_test_split(test_size=0.15, seed=42)
sgd_train, sgd_val = sgd_split["train"], sgd_split["test"]
print(f"SGD Hotels → Train: {len(sgd_train)}, Val: {len(sgd_val)}")

In [None]:
# Eyeball the first two formatted SGD training examples, truncated to 600 characters each.
for i in range(2):
    print("---")
    print(sgd_train[i]["text"][:600])

### Bitext

In [None]:
# Load the Bitext hospitality dataset from the Hugging Face Hub; only its
# "train" split is used below.
ds2 = load_dataset("bitext/Bitext-hospitality-llm-chatbot-training-dataset")

def bitext_to_text(ex):
    """Format one Bitext row as an ``[INST] ... [/INST]response</s>`` string.

    The instruction is prefixed with a bracketed tag built from the row's
    ``category`` and ``intent``: ``[CATEGORY | intent]`` when both exist,
    ``[CATEGORY]`` or ``[intent]`` when only one exists, and no tag when
    neither does.

    Args:
        ex: Mapping read with ``.get`` for ``instruction``, ``response``,
            ``intent`` and ``category``; missing or ``None`` values are
            treated as empty strings.

    Returns:
        ``{"text": <formatted string>}``, or ``{"text": None}`` when the
        instruction or the response is empty after stripping.
    """
    inst = (ex.get("instruction") or "").strip()
    resp = (ex.get("response") or "").strip()
    if not inst or not resp:
        return {"text": None}

    cat = (ex.get("category") or "").strip()
    intent = (ex.get("intent") or "").strip()
    # Join only the labels that are present, so a missing one does not leave
    # a dangling separator such as "[ | intent]" or "[CATEGORY | ]".
    tag = " | ".join(part for part in (cat, intent) if part)
    if tag:
        inst = f"[{tag}] {inst}"
    return {"text": f"[INST] {inst} [/INST]{resp}</s>"}

# Format every row, drop the ones the formatter rejected (text is None), and
# shuffle with a fixed seed so the run is repeatable.
bit_all = ds2["train"].map(bitext_to_text).filter(lambda ex: ex["text"] is not None).shuffle(seed=42)

# FIX Bug 1: proper train/val split — zero overlap
# 85% / 15% split with a fixed seed; each row goes to exactly one side.
bit_split = bit_all.train_test_split(test_size=0.15, seed=42)
bit_train, bit_val = bit_split["train"], bit_split["test"]
print(f"Bitext → Train: {len(bit_train)}, Val: {len(bit_val)}")

In [None]:
# Eyeball the first three formatted Bitext training examples, truncated to 800 characters each.
for i in range(3):
    print("---")
    print(bit_train[i]["text"][:800])

### Hotel Reviews — Dropped

The `ashraq/hotel-reviews` dataset was evaluated but excluded from training:
- **No gold sentiment labels** — the dataset only contains raw review text with no rating/sentiment column, so we would need to generate synthetic labels via heuristics or an external model.
- **Hardcoded responses** — the v1 processing assigned identical canned responses ("Mixed. Provide a one-sentence rationale.") to every review regardless of content, meaning the model learned nothing useful.

Keeping only SGD Hotels (dialog) and Bitext (FAQ) gives us two well-labeled, complementary task types with genuine training signal.

In [None]:
# (Hotel reviews dataset removed — see note above)

In [None]:
# (Hotel reviews examples removed)

## 2) Merge & Verify No Overlap

In [None]:
from datasets import concatenate_datasets, DatasetDict
from transformers import AutoTokenizer

# Merge the independently-split datasets
# Train parts are concatenated only with train parts and val only with val, so
# the per-dataset split is preserved; each side is then shuffled with its own seed.
merged_train = concatenate_datasets([sgd_train, bit_train]).shuffle(seed=7)
merged_val = concatenate_datasets([sgd_val, bit_val]).shuffle(seed=11)

# Cap total size for a manageable Colab run
# The cap keeps the first N rows of the already-shuffled data.
MAX_TRAIN, MAX_VAL = 2500, 500
if len(merged_train) > MAX_TRAIN:
    merged_train = merged_train.select(range(MAX_TRAIN))
if len(merged_val) > MAX_VAL:
    merged_val = merged_val.select(range(MAX_VAL))

# Verify zero overlap between train and val
# The check compares whole "text" strings, so it only detects examples that
# are exactly identical on both sides.
train_texts = set(merged_train["text"])
val_texts = set(merged_val["text"])
overlap = train_texts & val_texts
assert len(overlap) == 0, f"DATA LEAKAGE: {len(overlap)} examples in both train and val!"
print(f"Train: {len(merged_train)}, Val: {len(merged_val)}, Overlap: {len(overlap)}")

# Bundle both sides under the split names the trainer cell reads.
ds = DatasetDict({"train": merged_train, "validation": merged_val})

# Tokenizer
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tok.pad_token is None:
    # Avoid reusing EOS as the pad token when possible: the language-modeling
    # data collators set the label of every token equal to pad_token_id to
    # -100, which would also mask the real </s> that ends each example, so
    # the model would never be trained to stop. Fall back to EOS only if the
    # tokenizer defines no unknown token.
    tok.pad_token = tok.unk_token if tok.unk_token is not None else tok.eos_token
# Right padding keeps each prompt/response at the start of its row during training.
tok.padding_side = "right"

# Preview a formatted example
print("\n--- Sample training example ---")
print(ds["train"][0]["text"][:500])

## 3) QLoRA + SFTTrainer (completion-only loss)

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# --- QLoRA quantization config ---
# 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# --- Load base model (4-bit) ---
# NOTE(review): trust_remote_code=True permits running model code shipped in
# the Hub repository — confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True,
)
# The generation KV cache is switched off for training.
model.config.use_cache = False

# Trade compute for memory by recomputing activations in the backward pass,
# using the non-reentrant implementation.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
# NOTE(review): this helper may re-enable gradient checkpointing with its own
# defaults, which could override use_reentrant=False set just above — confirm
# against the installed peft version.
model = prepare_model_for_kbit_training(model)

# --- LoRA adapters ---
# Rank-8 adapters (alpha 16, dropout 0.05) on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down); bias terms are not trained.
peft_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, peft_cfg)
model.print_trainable_parameters()

# --- FIX Bug 2: completion-only loss ---
# Loss is computed ONLY on tokens after [/INST] (the response).
# This avoids inflating metrics by predicting instruction tokens.
# The response template is the same "[/INST]" delimiter that the dataset
# formatters place immediately before each response.
collator = DataCollatorForCompletionOnlyLM(
    response_template="[/INST]",
    tokenizer=tok,
)

# --- Training args ---
# Checkpoints are saved every 200 steps and evaluation runs every 100 steps,
# so every saved checkpoint coincides with an evaluation; the checkpoint with
# the lowest eval_loss is reloaded when training ends.
args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch size = 8
    learning_rate=2e-4,
    num_train_epochs=3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    bf16=True,
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_checkpointing=True,
    # The Trainer re-enables gradient checkpointing from these arguments, so
    # the non-reentrant mode chosen when the model was prepared has to be
    # repeated here or it is replaced by the library default.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    report_to="none",
)

# --- SFTTrainer ---
# Trains on the pre-formatted "text" column without packing, so every row is
# one prompt/response pair; the completion-only collator decides which tokens
# contribute to the loss.
trainer = SFTTrainer(
    model=model,
    tokenizer=tok,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=collator,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)

trainer.train()
# Persist the trained model and the tokenizer side by side in OUT_DIR, which
# is where the comparison cell loads the adapter from.
trainer.save_model(OUT_DIR)
tok.save_pretrained(OUT_DIR)

## 4) Before / After Comparison

Both base and adapter are tested with the **same** `[INST]...[/INST]` template so the comparison is fair.

In [None]:
from peft import PeftModel

# Load fresh base + adapter for fair comparison
# A second copy of the base model is loaded with the same 4-bit config instead
# of reusing the in-memory training model; the adapter saved in OUT_DIR is
# attached on top and the result is put in eval mode.
# NOTE(review): the training model is presumably still resident on the GPU at
# this point — confirm there is enough memory for both copies.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=bnb_cfg, device_map="auto", trust_remote_code=True
)
adapted = PeftModel.from_pretrained(base_model, OUT_DIR).eval()

def generate(model, prompt, max_new_tokens=160):
    """Greedy-decode a completion for ``prompt`` and return only the new text.

    Args:
        model: Causal LM (base or adapter-wrapped) exposing ``generate`` and
            ``device``.
        prompt: Fully formatted prompt string, including the
            ``[INST] ... [/INST]`` markers.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is never part of the result.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic output so base and adapter are comparable
            pad_token_id=tok.eos_token_id,
        )
    # Remove the prompt by token count instead of searching the decoded text:
    # decoding with skip_special_tokens=True can drop the [INST]/[/INST]
    # markers (which breaks both a "[/INST]" split and a len(prompt) slice),
    # and a response that itself contains "[/INST]" would be cut short.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out[0][prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()

# FIX Bug 3: All prompts use correct [INST]...[/INST] template
# Hand-written prompts that mirror the two training formats: an SGD-style
# dialog continuation, a plain question, and questions carrying a
# "[CATEGORY | intent]" tag. Each ends with "[/INST]" so the model continues
# with the response.
tests = [
    # SGD-style dialog
    "[INST] Continue this hotel booking conversation as a helpful assistant.\n\n"
    "Context:\n"
    "User: I need a room in Rome for 2 nights next week.\n"
    "Assistant: Certainly. What dates are you arriving and leaving?\n"
    "User: Arrive 12 May, leave 14 May. Prefer near Termini. [/INST]",

    # Bitext-style FAQ
    "[INST] What are the check-in and check-out times? [/INST]",

    # Intent-tagged FAQ
    "[INST] [BILLING | invoices] Where can I find my invoices? [/INST]",

    # Booking modification
    "[INST] [BOOKING | modify_booking] I want to change my reservation to a different date. [/INST]",
]

# Print the base-model answer and the adapter answer for every test prompt.
for prompt in tests:
    print("\n" + "=" * 70)
    print(f"PROMPT: {prompt[:100]}...")
    # Temporarily detach adapter for base comparison
    adapted.disable_adapter_layers()
    try:
        print(f"\nBASE:    {generate(adapted, prompt)}")
    finally:
        # Always switch the adapter back on, even if generation fails or the
        # cell is interrupted; otherwise later calls would silently run the
        # bare base model.
        adapted.enable_adapter_layers()
    print(f"ADAPTER: {generate(adapted, prompt)}")

In [None]:
import json
import math

# Final evaluation
# Runs the trainer's evaluation on the validation split; perplexity is the
# exponential of the reported eval loss.
eval_metrics = trainer.evaluate()
ppl = math.exp(eval_metrics["eval_loss"])
print(f"Eval loss (completion-only): {eval_metrics['eval_loss']:.4f}")
print(f"Perplexity: {ppl:.2f}")

# Save metrics to outputs/ for reproducibility
results = {
    "eval_loss": eval_metrics["eval_loss"],
    "perplexity": round(ppl, 2),
    "train_samples": len(ds["train"]),
    "val_samples": len(ds["validation"]),
    "base_model": BASE_MODEL,
    # Read the LoRA hyperparameters from the config that was actually used,
    # so the saved record cannot drift from the training setup.
    "lora_r": peft_cfg.r,
    "lora_alpha": peft_cfg.lora_alpha,
    "completion_only_loss": True,
    # The merge cell asserts zero train/val overlap, so reaching this point
    # means that check did not fail.
    "data_leakage_check": "passed",
}
metrics_path = os.path.join(OUT_DIR, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
print(f"\nMetrics saved to {metrics_path}")