<a href="https://colab.research.google.com/github/AnkitSingh10-hub/TinyLlama_PEFT/blob/main/LLMfinetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U torch transformers peft trl bitsandbytes accelerate datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m813.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m797.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m141.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import torch
import os
os.environ["WANDB_DISABLED"] = "true"
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset, load_dataset
from collections import defaultdict
import random

# 1. DATA ENGINEERING (Matches CV: "Balanced sampling strategy")
# ---------------------------------------------------------
ds = load_dataset('bitext/Bitext-travel-llm-chatbot-training-dataset', split="train")

random.seed(42)
intent_groups = defaultdict(list)
for record in ds:
    intent_groups[record["intent"]].append(record)

# Calculate samples needed per intent to ensure balance
total_desired_samples = 100
total_intents = len(intent_groups)
samples_per_intent = total_desired_samples // total_intents

balanced_subset = []
for intent, examples in intent_groups.items():
    # Sample without replacement up to the limit
    sampled = random.sample(examples, min(samples_per_intent, len(examples)))
    balanced_subset.extend(sampled)

# Create dataset and format it
travel_chat_ds = Dataset.from_list(balanced_subset)

def format_conversation(row):
    # Matches CV: "Travel-intent recognition" context
    return {
        "text": f"Query: {row['instruction']}\nIntent: {row['intent']}\nResponse: {row['response']}"
    }

travel_chat_ds = travel_chat_ds.map(format_conversation)

# 2. MODEL & QUANTIZATION (Matches CV: "Quantization" & "TinyLlama")
# ---------------------------------------------------------
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.1"

# Quantization Config (This was missing in your original code)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, # Key addition for CV accuracy
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.config.use_cache = False # Silence warnings for training
model = prepare_model_for_kbit_training(model)

# 3. PEFT CONFIGURATION (Matches CV: "LoRA", "Low-rank adaptation")
# ---------------------------------------------------------
lora_config = LoraConfig(
    r=16, # Slightly higher rank often helps with 1.1B models
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['q_proj', 'v_proj'] # TinyLlama specific target modules
)

# 4. TRAINING (Matches CV: "Custom SFT Pipeline")
# ---------------------------------------------------------
sft_config = SFTConfig(
    max_steps=60, # Increased slightly for demo purposes
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    fp16=True, # Use fp16 for faster training on GPU
    logging_steps=1,
    output_dir="./results",
    dataset_text_field="text" # Pointing to the formatted column
)

trainer = SFTTrainer(
    model=model,
    train_dataset=travel_chat_ds,
    peft_config=lora_config,
    args=sft_config
)

print("Starting Training...")
trainer.train()

# 5. INFERENCE (Matches CV: "Travel-intent recognition")
# ---------------------------------------------------------
print("\n--- Inference ---")
query = "Query: I need to buy a ticket to Kathmandu"
# Ensure inputs are moved to the same device as the model (GPU)
inputs = tokenizer(query, return_tensors="pt").to(model.device)

# Fix: specific max_new_tokens, removed conflicting max_length
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7
)

decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Adding EOS to train dataset:   0%|          | 0/99 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/99 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/99 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1, 'pad_token_id': 32000}.


Starting Training...


  return fn(*args, **kwargs)


Step,Training Loss
1,2.5733
2,2.5212
3,2.5042
4,2.3498
5,2.4121
6,2.4284
7,2.428
8,2.2932
9,2.4414
10,2.1527


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



--- Inference ---


  return fn(*args, **kwargs)


Query: I need to buy a ticket to Kathmandu, ma garfanglo Ap Ot War, who continued the side spread ground with red sn sn sn caught the prem covered last pow su swing inscipate spread techn Apellett garcrier al plane quickly snoked that spread, of all the late
