# Fine-Tuning LLaMA Task Agent (Self-Contained)

**For Colab GPU Extension - All code in notebook**

1. Select Kernel → Colab → T4 GPU
2. Run cells sequentially

## 1. Setup & Install

In [1]:
# Install dependencies
!pip install -q torch transformers peft bitsandbytes accelerate datasets huggingface-hub

# Verify GPU
!nvidia-smi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hMon Jan 19 09:23:13 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/

## 2. Upload Config File

Upload `configs/agent_config.json` when prompted, or run the cell below to create it:

In [2]:
# Create config
import json
import os

os.makedirs('configs', exist_ok=True)

config = {
    "model": {
        "base_model": "meta-llama/Llama-3.1-8B-Instruct",
        "adapter_path": "models/lora-adapter",
        "max_length": 512
    },
    "training": {
        "learning_rate": 2e-4,
        "num_epochs": 3,
        "batch_size": 4,
        "gradient_accumulation_steps": 2,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
    }
}

with open('configs/agent_config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("✓ Config created")

✓ Config created


## 3. Generate Training Dataset

In [3]:
# Dataset generation code (inline)
import random
from datetime import datetime, timedelta

os.makedirs('data', exist_ok=True)
random.seed(42)

def generate_date(days_offset=0):
    return (datetime.now() + timedelta(days=days_offset)).strftime("%Y-%m-%d")

# Generate samples
samples = []

# Tool invocations (add_task)
tasks = ["buy groceries", "finish report", "call dentist", "gym workout", "read book"]
for _ in range(120):
    task = random.choice(tasks)
    days = random.choice([0, 1, 3, 7])
    samples.append({
        "instruction": f"Add a task to {task}",
        "analysis": f"User wants to create a new task '{task}'.",
        "action": f'add_task(title="{task.capitalize()}", due_date="{generate_date(days)}")'
    })

# List tasks
for _ in range(60):
    samples.append({
        "instruction": random.choice(["What tasks do I have?", "Show my tasks", "List tasks"]),
        "analysis": "User is requesting a list of all tasks.",
        "action": "list_tasks()"
    })

# Summarize tasks
for _ in range(60):
    samples.append({
        "instruction": random.choice(["Summarize my tasks", "Task overview", "How many tasks?"]),
        "analysis": "User wants a summary of their tasks.",
        "action": "summarize_tasks()"
    })

# Direct responses
convs = [
    ("Hello", "Hello! How can I help you manage your tasks today?"),
    ("Hi", "Hi! I'm here to help you with your tasks."),
    ("Thanks", "You're welcome! Let me know if you need anything else.")
]
for instruction, response in convs * 40:
    samples.append({"instruction": instruction, "final": response})

# Shuffle and split
random.shuffle(samples)
split_idx = int(len(samples) * 0.9)

with open('data/train.json', 'w') as f:
    json.dump(samples[:split_idx], f)
    
with open('data/eval.json', 'w') as f:
    json.dump(samples[split_idx:], f)

print(f"✓ Generated {len(samples)} samples")
print(f"  Train: {split_idx}, Eval: {len(samples) - split_idx}")

✓ Generated 360 samples
  Train: 324, Eval: 36


## 4. HuggingFace Login

In [4]:
# Login (get token from https://huggingface.co/settings/tokens)
# Accept license: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from huggingface_hub import whoami
whoami()


{'type': 'user',
 'id': '67d9cbbb7a087207dfdaa2cd',
 'name': 'altruvi',
 'fullname': 'Aswani Sahoo',
 'email': 'aswanisahoo227@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'billingMode': 'prepaid',
 'periodEnd': 1769904000,
 'isPro': False,
 'avatarUrl': '/avatars/6637de6fb2c275008f01f7b64de54319.svg',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'taskai',
   'role': 'read',
   'createdAt': '2026-01-19T06:40:18.061Z'}}}

## 5. Fine-Tuning (1-2 hours)

In [6]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load config
with open('configs/agent_config.json') as f:
    config = json.load(f)

model_name = config["model"]["base_model"]
print(f"Model: {model_name}")

# 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model = prepare_model_for_kbit_training(model)

# LoRA config
lora_config = LoraConfig(
    r=config["training"]["lora_r"],
    lora_alpha=config["training"]["lora_alpha"],
    target_modules=config["training"]["target_modules"],
    lora_dropout=config["training"]["lora_dropout"],
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
print("✓ Model loaded with LoRA")

2026-01-19 09:26:19.291957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768814779.764393      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768814779.895483      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768814780.990256      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768814780.990291      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768814780.990294      55 computation_placer.cc:177] computation placer alr

Model: meta-llama/Llama-3.1-8B-Instruct


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

✓ Model loaded with LoRA


In [9]:
# Prepare dataset - UPDATED VERSION
def format_sample(sample):
    if "action" in sample:
        response = f"<analysis>\n{sample['analysis']}\n</analysis>\n\n<action>\n{sample['action']}\n</action>"
    else:
        response = f"<final>\n{sample['final']}\n</final>"
    return {"instruction": sample["instruction"], "response": response}

def prepare_dataset(path):
    with open(path) as f:
        data = [format_sample(s) for s in json.load(f)]
    
    dataset = Dataset.from_list(data)
    
    def apply_template(ex):
        messages = [
            {"role": "system", "content": "You are a task assistant. Use <analysis> and <action> for tools, <final> for conversation."},
            {"role": "user", "content": ex["instruction"]},
            {"role": "assistant", "content": ex["response"]}
        ]
        return {"text": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)}
    
    dataset = dataset.map(apply_template, remove_columns=["instruction", "response"])
    
    def tokenize(ex):
        # Add padding=True and return_tensors removed
        tokenized = tokenizer(
            ex["text"], 
            truncation=True, 
            max_length=512, 
            padding="max_length",  # <--- Changed: pad to max_length
            return_tensors=None  # <--- Changed: don't convert to tensors yet
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized
    
    return dataset.map(tokenize, remove_columns=["text"])

train_dataset = prepare_dataset('data/train.json')
eval_dataset = prepare_dataset('data/eval.json')
print(f"✓ Prepared {len(train_dataset)} train, {len(eval_dataset)} eval samples")

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

✓ Prepared 324 train, 36 eval samples


In [10]:
# Training - UPDATED VERSION
from transformers import default_data_collator

training_args = TrainingArguments(
    output_dir="models/lora-adapter",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    fp16=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator  # <--- Use default collator for pre-padded data
)

print("Starting training...")
trainer.train()

# Save
model.save_pretrained("models/lora-adapter")
tokenizer.save_pretrained("models/lora-adapter")
print("\n✓ Training complete! Model saved to models/lora-adapter")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.0584,0.052637
2,0.0256,0.022275
3,0.0092,0.009825


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)



✓ Training complete! Model saved to models/lora-adapter


## 6. Download Trained Model

Download to use locally (skip if using Drive)

In [11]:
# Zip and download
!zip -r lora-adapter.zip models/lora-adapter/

# For Colab extension, files should sync automatically
# Check your local models/lora-adapter/ folder
print("\n✓ Model ready! Check local folder: models/lora-adapter/")

  adding: models/lora-adapter/ (stored 0%)
  adding: models/lora-adapter/tokenizer.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 85%)
  adding: models/lora-adapter/checkpoint-123/ (stored 0%)
  adding: models/lora-adapter/checkpoint-123/rng_state.pth (deflated 26%)
  adding: models/lora-adapter/checkpoint-123/training_args.bin (deflated 53%)
  adding: models/lora-adapter/checkpoint-123/README.md (deflated 65%)
  adding: models/lora-adapter/checkpoint-123/adapter_model.safetensors (deflated 7%)
  adding: models/lora-adapter/checkpoint-123/optimizer.pt (deflated 11%)
  adding: models/lora-adapter/checkpoint-123/adapter_config.json (deflated 56%)
  adding: models/lora-adapter/checkpoint-123/trainer_state.json (deflated 73%)
  adding: models/lora-adapter/checkpoint-123/scaler.pt (deflated 64%)
  adding: models/lora-adapter/checkpoint-123/scheduler.pt (deflated 61%)
  adding: models/lora-adapter/README.md (deflated 65%)
  adding: models/lora-adapter/adapter_model.safetensors (deflated 7%)
  adding: models/lora-adapter/tokenizer_config.json (deflated 96%)
  adding: models/lora-adapter/special_tokens_map.jso

## Done!

Next: Deploy locally with `python serving/app.py`