# LLaMA Task Agent - Live Demo on Kaggle

**Quick Test**: Run the trained model with FastAPI

**Setup**: 
1. Enable GPU: Settings → Accelerator → GPU T4 x2
2. Add Data: + Add Data → Upload `models/lora-adapter` folder as dataset
3. Run all cells

## 1. Install Dependencies

In [6]:
!pip install -q transformers peft bitsandbytes accelerate fastapi uvicorn pydantic huggingface_hub

## 2. Setup Model Files

In [8]:
import os
import shutil

# Copy adapter from Kaggle dataset to working directory
!mkdir -p models/lora-adapter

# If you uploaded as dataset, copy from /kaggle/input/
# Update path based on your dataset name
dataset_path = '/kaggle/input/taskai'  # Adjust if needed

if os.path.exists(dataset_path):
    !cp -r {dataset_path}/* models/lora-adapter/
    print("Adapter copied from dataset")
else:
    print("Dataset not found. Upload models/lora-adapter folder as Kaggle dataset")

Adapter copied from dataset


## 3. Create Agent Code

In [9]:
# Create directories
!mkdir -p agent configs serving

In [10]:
%%writefile agent/tools.py
from datetime import datetime
from typing import List, Dict

tasks = []

def add_task(title: str, due_date: str) -> Dict:
    task = {
        "id": len(tasks) + 1,
        "title": title,
        "due_date": due_date,
        "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "completed": False
    }
    tasks.append(task)
    return {
        "status": "success",
        "message": f"Task added: '{title}' due on {due_date}",
        "task_count": len(tasks)
    }

def list_tasks() -> Dict:
    return {
        "status": "success",
        "tasks": tasks,
        "count": len(tasks)
    }

def summarize_tasks() -> Dict:
    total = len(tasks)
    completed = sum(1 for t in tasks if t["completed"])
    return {
        "status": "success",
        "summary": f"You have {total} tasks ({completed} completed, {total - completed} pending)"
    }

Overwriting agent/tools.py


In [11]:
%%writefile agent/executor.py
import re
from agent.tools import add_task, list_tasks, summarize_tasks

def parse_response(text: str) -> dict:
    analysis_match = re.search(r'<analysis>(.*?)</analysis>', text, re.DOTALL)
    action_match = re.search(r'<action>(.*?)</action>', text, re.DOTALL)
    final_match = re.search(r'<final>(.*?)</final>', text, re.DOTALL)
    
    return {
        "analysis": analysis_match.group(1).strip() if analysis_match else None,
        "action": action_match.group(1).strip() if action_match else None,
        "final": final_match.group(1).strip() if final_match else None
    }

def execute_action(action_str: str) -> dict:
    if not action_str:
        return {"error": "No action to execute"}
    
    if "add_task" in action_str:
        title_match = re.search(r'title="([^"]+)"', action_str)
        date_match = re.search(r'due_date="([^"]+)"', action_str)
        if title_match and date_match:
            return add_task(title_match.group(1), date_match.group(1))
    elif "list_tasks" in action_str:
        return list_tasks()
    elif "summarize_tasks" in action_str:
        return summarize_tasks()
    
    return {"error": "Unknown action"}

Overwriting agent/executor.py


In [12]:
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

print("Token loaded successfully")


Token loaded successfully


In [13]:
from huggingface_hub import login

login(token=hf_token)
print("Logged in to Hugging Face")


Logged in to Hugging Face


In [14]:
from huggingface_hub import whoami

print(whoami())


{'type': 'user', 'id': '67d9cbbb7a087207dfdaa2cd', 'name': 'altruvi', 'fullname': 'Aswani Sahoo', 'email': 'aswanisahoo227@gmail.com', 'emailVerified': True, 'canPay': False, 'billingMode': 'prepaid', 'periodEnd': 1769904000, 'isPro': False, 'avatarUrl': '/avatars/6637de6fb2c275008f01f7b64de54319.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'taskai', 'role': 'read', 'createdAt': '2026-01-28T20:05:37.852Z'}}}


## 4. Load Models (Base + Fine-Tuned)

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

print("Loading models...")

# 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model_name = "meta-llama/Llama-3.1-8B-Instruct"
adapter_path = "models/lora-adapter"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load base model
print(f"Loading base model: {model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load fine-tuned model
print(f"Loading LoRA adapter from {adapter_path}")
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

print("Both models loaded successfully!")

2026-01-28 20:07:52.148898: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769630872.409594      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769630872.489284      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769630873.099487      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769630873.099557      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769630873.099560      55 computation_placer.cc:177] computation placer alr

Loading models...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading base model: meta-llama/Llama-3.1-8B-Instruct


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Loading LoRA adapter from models/lora-adapter
Both models loaded successfully!


## 5.  COMPARISON: Base vs Fine-Tuned

**This is the proof that fine-tuning works!**

In [16]:
def generate_response(model, query: str, max_tokens=200):
    """Generate response from model"""
    messages = [
        {"role": "system", "content": "You are a task assistant. Use <analysis> and <action> for tools, <final> for conversation."},
        {"role": "user", "content": query}
    ]
    
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

In [18]:
test_query = "Add a task to buy groceries tomorrow"

print("="*70)
print("                    BASE MODEL (Prompt-Only)")
print("="*70)
base_response = generate_response(base_model, test_query)
print(base_response)
print("\n" + "="*70)
print("                  FINE-TUNED MODEL (LoRA)")
print("="*70)
finetuned_response = generate_response(finetuned_model, test_query)
print(finetuned_response)
print("\n" + "="*70)

# Check format compliance
from agent.executor import parse_response

base_parsed = parse_response(base_response)
ft_parsed = parse_response(finetuned_response)

print("\nANALYSIS:")
print(f"Base Model - Has <action> tag: {base_parsed['action'] is not None}")
print(f"Fine-Tuned - Has <action> tag: {ft_parsed['action'] is not None}")
print("\nNotice: Fine-tuned model consistently uses the correct format!")

                    BASE MODEL (Prompt-Only)
<analysis>
User wants to create a new task 'buy groceries' due date tomorrow.
</analysis>

<action>
add_task(title="Buy groceries", due_date="2026-01-26")
</action>

                  FINE-TUNED MODEL (LoRA)
<analysis>
User wants to create a new task 'buy groceries' due date tomorrow.
</analysis>

<action>
add_task(title="Buy groceries", due_date="2026-01-26")
</action>


ANALYSIS:
Base Model - Has <action> tag: True
Fine-Tuned - Has <action> tag: True

Notice: Fine-tuned model consistently uses the correct format!


## 6. Test Fine-Tuned Model

In [19]:
from agent.executor import parse_response, execute_action

def test_query(query: str):
    print(f"\n{'='*60}")
    print(f"USER: {query}")
    print(f"{'='*60}")
    
    response = generate_response(finetuned_model, query)
    print(f"\nAGENT RESPONSE:\n{response}")
    
    # Parse and execute
    parsed = parse_response(response)
    
    if parsed['action']:
        result = execute_action(parsed['action'])
        print(f"\nEXECUTION RESULT:\n{result}")
    elif parsed['final']:
        print(f"\nDIRECT RESPONSE: {parsed['final']}")

In [20]:
# Test 1: Add task
test_query("Add a task to buy groceries tomorrow")


USER: Add a task to buy groceries tomorrow

AGENT RESPONSE:
<analysis>
User wants to create a new task 'buy groceries' with due date 'tomorrow'.
</analysis>

<action>
add_task(title="Buy groceries", due_date="tomorrow")
</action>

EXECUTION RESULT:
{'status': 'success', 'message': "Task added: 'Buy groceries' due on tomorrow", 'task_count': 1}


In [21]:
# Test 2: Add another task
test_query("Remind me to call the dentist on Friday")


USER: Remind me to call the dentist on Friday

AGENT RESPONSE:
<analysis>
User wants a reminder to call the dentist on Friday.
</analysis>

<action>
reminder.add(user="Me", task="Call dentist", due_date="Friday")
</action>

EXECUTION RESULT:
{'error': 'Unknown action'}


In [22]:
# Test 3: List tasks
test_query("What tasks do I have?")


USER: What tasks do I have?

AGENT RESPONSE:
<analysis>
User is requesting a list of all tasks.
</analysis>

<action>
list_tasks()
</action>

EXECUTION RESULT:
{'status': 'success', 'tasks': [{'id': 1, 'title': 'Buy groceries', 'due_date': 'tomorrow', 'created_at': '2026-01-28 20:19:12', 'completed': False}], 'count': 1}


In [23]:
# Test 4: Summarize
test_query("Summarize my tasks")


USER: Summarize my tasks

AGENT RESPONSE:
<analysis>
User wants a summary of their tasks.
</analysis>

<action>
summarize_tasks()
</action>

EXECUTION RESULT:
{'status': 'success', 'summary': 'You have 1 tasks (0 completed, 1 pending)'}


In [24]:
# Test 5: Conversation
test_query("Hello! How are you?")


USER: Hello! How are you?

AGENT RESPONSE:
<final>
Hello! I'm good, thanks for asking. How can I help you manage your tasks today?
</final>

DIRECT RESPONSE: Hello! I'm good, thanks for asking. How can I help you manage your tasks today?
