## Init For Agents Testing

In [None]:
# Setup: Load environment variables and dependencies
import os
import sys
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

# Resolve the repository root whether the kernel was started in the repo root
# or in its "notebooks/" sub-directory (same rule the .env lookup below uses),
# so that "<root>/src" is the directory that ends up on sys.path.
_launch_dir = Path.cwd()
project_root = _launch_dir.parent if _launch_dir.name == "notebooks" else _launch_dir
src_path = project_root / "src"

# Prepend only once so re-running the cell does not stack duplicate entries.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
    print(f"✓ Added to sys.path: {src_path}")

from LabAgentSkill import skills_utils
from LabAgentSkill.SkillAwareAgent import SkillAwareAgent

# Locate the repository root (the parent directory when the kernel runs inside
# "notebooks/") and read simple KEY=VALUE pairs from its .env file.
root_dir = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
env_path = root_dir / ".env"
# Parsed .env entries get their own name; `env` is reserved for the Jinja2
# environment created further down, which later cells use to load templates.
dotenv_vars = {}

if env_path.exists():
    for line in env_path.read_text().splitlines():
        line = line.strip()
        # Skip blanks, comments and anything that is not KEY=VALUE.
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        value = value.strip()
        # Drop one pair of matching surrounding quotes (KEY="value" / KEY='value')
        # so the quotes do not become part of the secret.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        dotenv_vars[key.strip()] = value

# Set API key: a non-empty .env entry wins, otherwise keep whatever the process
# environment already holds. The variable is always defined (possibly empty).
os.environ["OPENAI_API_KEY"] = (
    dotenv_vars.get("OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY", "")
)

# Jinja2 environment for the prompt templates.
# NOTE(review): 'prompts/' is resolved against the current working directory,
# not root_dir — confirm this is intended when running from "notebooks/".
env = Environment(loader=FileSystemLoader('prompts/'))

# NOTE(review): absolute, machine-specific path — consider deriving it from root_dir.
skills_folder = Path("/home/snt/projects_lujun/LabAgentSkill/skillsHub/skills_finer")
all_skills = skills_utils.read_all_skills_metadata(skills_folder)
for skill in all_skills:
    print(f"  - {skill['name']}: {skill['description']}")

# Model configuration consumed by the agent constructors in later cells.
model_name = "claude-opus-4-6"
base_url = None
# Alternative models / endpoint kept as quick toggles:
# model_name = "google/gemma-3-270m-it"
# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# base_url = "http://127.0.0.1:8001/v1"

✓ Added to sys.path: /home/snt/projects_lujun/LabAgentSkill/src


  from .autonotebook import tqdm as notebook_tqdm


  - XBRL-tag-classification: Classify financial text into specific XBRL tags by analyzing semantic cues, context, and category boundaries.
  - algorithmic-art: Creating algorithmic art using p5.js with seeded randomness and interactive parameter exploration. Use this when users request creating art using code, generative art, algorithmic art, flow fields, or particle systems. Create original algorithmic art rather than copying existing artists' work to avoid copyright violations.
  - brand-guidelines: Applies Anthropic's official brand colors and typography to any sort of artifact that may benefit from having Anthropic's look-and-feel. Use it when brand colors or style guidelines, visual formatting, or company design standards apply.
  - canvas-design: Create beautiful visual art in .png and .pdf documents using design philosophy. You should use this skill when the user asks to create a poster, piece of art, design, or other static piece. Create original visual designs, never copying e

## Load Data - FiNER-139 XBRL Tag Classification

In [2]:
from datasets import load_dataset
# Fetch the "train" split of the dataset and expose it as a pandas DataFrame
# with a fresh 0..n-1 index.
dataset_name = "Volavion/finer-139-numeric-sampled"
loaded_dataset = load_dataset(dataset_name, split="train")
loaded_df = loaded_dataset.to_pandas().reset_index(drop=True)

## Run experiments: skill selection, discovery, and execution

In [None]:
import time
import json
from datetime import datetime
import pandas as pd
from tqdm import tqdm

from LabAgentSkill.evaluate import get_predicted_label, get_prediction_XBRL_TAGS

print(f"Using model: {model_name}")
# Initialize agents: one for skill selection, one for skill discovery and
# execution, and one used for the runs that do not go through skill selection.
agent_skill_aware = SkillAwareAgent(use_chat_history=True, use_trim_messages=True, model=model_name, base_url=base_url)
agent_skill_exec_agent = SkillAwareAgent(use_chat_history=True, use_trim_messages=True, model=model_name, base_url=base_url)
agent_simple = SkillAwareAgent(use_chat_history=True, use_trim_messages=False, model=model_name, base_url=base_url)

# Prompt templates, loaded through the Jinja2 environment `env`.
p_exec_finer_temp = env.get_template('p_exec_finer.jinja')
p_skill_select_temp = env.get_template('p_skill_select.jinja')
p_skill_discov_temp = env.get_template('p_skill_discov.jinja')
p_default_system_temp = env.get_template('p_default_system.jinja')
p_skill_exec_temp = env.get_template('p_skill_exec.jinja')

# JSONL output path
# NOTE(review): absolute, machine-specific directory.
output_dir = "/home/snt/projects_lujun/LabAgentSkill/assets/results/"
# Make sure the directory exists so the first append does not fail.
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
jsonl_path = output_dir+f"finer_standard_{model_name.split('/')[-1]}_{timestamp}.jsonl"
print(f"Results will be saved to: {jsonl_path}")

skill_count = 0
count_row = 0
# The processing loop writes one record per task type for every sample
# (agent_skill_based, agent_simple, agent_skill_full_context).
RECORDS_PER_SAMPLE = 3
# NOTE: jsonl_path embeds a fresh timestamp, so this branch only triggers when
# jsonl_path is pointed at an already existing results file.
if os.path.exists(jsonl_path):
    df_exist = pd.read_json(jsonl_path, lines=True)
    if {"index", "task_type"}.issubset(df_exist.columns):
        # Count samples, not lines: a sample is finished only once all of its
        # task types were written. A partially written sample is re-run.
        tasks_done_per_sample = df_exist.groupby("index")["task_type"].nunique()
        count_row = int((tasks_done_per_sample >= RECORDS_PER_SAMPLE).sum())
    print(f"Resume from row: {count_row}")


# Process each sample
#
# Every dataset row is classified three times, and each run is appended to the
# JSONL file as its own record:
#   1. "agent_skill_based"        - skill selection -> skill discovery -> execution
#   2. "agent_simple"             - plain prompt with the default system prompt
#   3. "agent_skill_full_context" - plain prompt with every skill pasted into it

# Upper bound on discovery iterations so a model that keeps proposing skills
# cannot keep the loop running forever.
MAX_DISCOVERY_ROUNDS = 10
# Skill expected to be picked in step 1 for this dataset (hard-coded).
TARGET_SKILL_NAME = "XBRL-tag-classification"


def _format_skill_entry(position, skill_meta):
    """Render one skill as a numbered entry of the skill-execution context.

    Args:
        position: 1-based number shown in the entry heading.
        skill_meta: mapping with at least "description" and "body".

    Returns:
        The entry text: heading, description, then the body without its first
        line (presumably the title line — TODO confirm).
    """
    # Built outside the f-string: a backslash inside an f-string expression is
    # a SyntaxError before Python 3.12.
    body_without_first_line = "\n".join(skill_meta["body"].split("\n")[1:])
    return (
        f"SKill {position}: \n"
        f"{skill_meta['description']}\n"
        f"{body_without_first_line}\n\n"
    )


def _labels_match(predicted_label, true_label):
    """Case-insensitive equality between the predicted and the true tag.

    A substring test on the raw response would also accept longer tags that
    merely contain the true one (e.g. "InterestExpenseDebt" for
    "InterestExpense"), so the extracted label is compared instead.
    """
    return str(predicted_label).strip().lower() == str(true_label).strip().lower()


def _make_record(idx, sentence, tag_name, tag_token, true_label, predicted_label,
                 raw_response, elapsed_seconds, exec_history, task_type,
                 selected_skills_step1="", hit_target_skill="",
                 new_skills_discovered="", discovery_rounds="",
                 skill_select_history=""):
    """Build one result record; skill-related fields default to "" for runs without skill selection."""
    return {
        "index": int(idx),
        "sentence": sentence,
        "tag_name": tag_name,
        "tag_token": tag_token,
        "true_label": true_label,
        "predicted_label": predicted_label,
        "raw_response": raw_response,
        "correct": _labels_match(predicted_label, true_label),
        "selected_skills_step1": selected_skills_step1,
        "hit_target_skill": hit_target_skill,
        "new_skills_discovered": new_skills_discovered,
        "discovery_rounds": discovery_rounds,
        "elapsed_seconds": round(elapsed_seconds, 4),
        "model": model_name,
        "timestamp": datetime.now().isoformat(),
        "chat_history_agent_skill_select": skill_select_history,
        "chat_history_agent_exec": exec_history,
        "task_type": task_type,
    }


def _append_record(record, path):
    """Append one record as a JSON line to `path`, creating the file on first use."""
    pd.DataFrame([record]).to_json(
        path, orient="records", lines=True, mode="a" if os.path.exists(path) else "w"
    )


# Skill listings depend only on all_skills, so they are built once up front.
skill_context = "\n".join(
    f"- **{skill['name']}**: {skill['description']}" for skill in all_skills
)
# The header sentence is a prefix of the listing (text kept verbatim); using it
# as the join separator would repeat it between skills and omit it at the top.
skill_context_all = (
    "The following are skills informaiton you can use as a reference for user request:\n"
    + "\n".join(
        f"- **{skill['name']}**:\n {skill['description']} **:\n {skill['body']}"
        for skill in all_skills
    )
)

for idx, row in tqdm(loaded_df.iterrows(), total=len(loaded_df), desc="Processing samples"):

    # Resume support: skip samples that were already written.
    if idx < count_row:
        continue

    sentence, tag_token = row.sentence, row.tag_token
    # Drop the first two characters of the stored tag
    # (presumably a BIO prefix such as "B-" — TODO confirm).
    tag_name = row.tag_name[2:]
    true_label = tag_name
    sample_fields = dict(
        idx=idx, sentence=sentence, tag_name=tag_name,
        tag_token=tag_token, true_label=true_label,
    )

    ##################################################################################################################################
    # Run 1: skill-based agent
    sample_start_time = time.time()

    # Step 1: Skill Selection
    p_skill_select = p_skill_select_temp.render(SKILL_CONTEXT=skill_context)
    p_exec_finer = p_exec_finer_temp.render(SENTENCE_CONTENT=sentence, NUMERIC_ENTITY=tag_token)
    skill_select_resp = agent_skill_aware.chat(user_input=p_exec_finer, custom_system_prompt=p_skill_select)
    selected_skills = skills_utils.parse_skills_from_json_response(json_response=skill_select_resp, skills_hub_dir=skills_folder)

    # Track whether the target skill was selected in Step 1.
    selected_skill_names_step1 = [s["name"] for s in selected_skills]
    hit_target_skill = TARGET_SKILL_NAME in selected_skill_names_step1

    # Numbering restarts for every sample so entries read "SKill 1", "SKill 2", ...
    skill_count = 0
    loaded_skill_names = set()
    skill_execution_context = ""
    for skill_meta in selected_skills:
        skill_count += 1
        skill_execution_context += _format_skill_entry(skill_count, skill_meta)
        loaded_skill_names.add(skill_meta["name"])

    skill_count_prev = skill_count

    # Step 2: Skill Discovery — keep asking while new skills keep turning up.
    discovery_rounds = 0
    while len(selected_skills) > 0 and discovery_rounds < MAX_DISCOVERY_ROUNDS:
        p_skill_discov = p_skill_discov_temp.render(SKILL_CONTEXT=skill_execution_context)
        skill_discov_resp = agent_skill_exec_agent.chat(user_input=p_skill_discov, custom_system_prompt=p_default_system_temp.render())
        discovered_skills = skills_utils.parse_skills_from_json_response(json_response=skill_discov_resp, skills_hub_dir=skills_folder)

        # Only skills not yet in the context count as newly discovered;
        # re-proposed ones are ignored so the loop can terminate.
        selected_skills = []
        for skill_meta in discovered_skills:
            if skill_meta["name"] in loaded_skill_names:
                continue
            skill_count += 1
            skill_execution_context += _format_skill_entry(skill_count, skill_meta)
            loaded_skill_names.add(skill_meta["name"])
            selected_skills.append(skill_meta)
        discovery_rounds += 1
    new_skills_found = skill_count - skill_count_prev

    # Step 3: Query Execution
    p_exec_finer_sys = p_skill_exec_temp.render(SKILL_CONTEXT=skill_execution_context)
    finer_exec_response = agent_skill_exec_agent.chat(user_input=p_exec_finer, custom_system_prompt=p_exec_finer_sys)
    message_classification = skills_utils.parse_message_from_json_response(finer_exec_response)
    predicted_label = get_prediction_XBRL_TAGS(message_classification)
    sample_elapsed = time.time() - sample_start_time

    # Build record and append to JSONL
    record = _make_record(
        **sample_fields,
        predicted_label=predicted_label,
        raw_response=message_classification,
        elapsed_seconds=sample_elapsed,
        exec_history=agent_skill_exec_agent.get_human_ai_message_history(),
        task_type="agent_skill_based",
        selected_skills_step1=selected_skill_names_step1,
        hit_target_skill=hit_target_skill,
        new_skills_discovered=new_skills_found,
        discovery_rounds=discovery_rounds,
        skill_select_history=agent_skill_aware.get_human_ai_message_history(),
    )
    _append_record(record, jsonl_path)
    agent_skill_aware.clear_history()
    agent_skill_exec_agent.clear_history()

    ##################################################################################################################################
    # Run 2: simple agent, same user prompt, default system prompt
    sample_start_time = time.time()
    p_exec_finer_sys = p_default_system_temp.render()
    finer_exec_response = agent_simple.chat(user_input=p_exec_finer, custom_system_prompt=p_exec_finer_sys)
    message_classification = skills_utils.parse_message_from_json_response(finer_exec_response)
    predicted_label = get_prediction_XBRL_TAGS(message_classification)
    sample_elapsed = time.time() - sample_start_time

    record = _make_record(
        **sample_fields,
        predicted_label=predicted_label,
        raw_response=message_classification,
        elapsed_seconds=sample_elapsed,
        exec_history=agent_simple.get_human_ai_message_history(),
        task_type="agent_simple",
    )
    _append_record(record, jsonl_path)
    agent_simple.clear_history()

    ##################################################################################################################################
    # Run 3: simple agent with every skill appended to the prompt
    sample_start_time = time.time()
    # f-string instead of `+` so a non-string tag_token cannot raise TypeError.
    p_exec_finer = p_exec_finer_temp.render(
        SENTENCE_CONTENT=sentence,
        NUMERIC_ENTITY=f"{tag_token}\n{skill_context_all}",
    )
    p_exec_finer_sys = p_default_system_temp.render()

    finer_exec_response = agent_simple.chat(user_input=p_exec_finer, custom_system_prompt=p_exec_finer_sys)
    message_classification = skills_utils.parse_message_from_json_response(finer_exec_response)
    predicted_label = get_prediction_XBRL_TAGS(message_classification)
    sample_elapsed = time.time() - sample_start_time

    record = _make_record(
        **sample_fields,
        predicted_label=predicted_label,
        raw_response=message_classification,
        elapsed_seconds=sample_elapsed,
        exec_history=agent_simple.get_human_ai_message_history(),
        task_type="agent_skill_full_context",
    )
    _append_record(record, jsonl_path)
    agent_simple.clear_history()

print(f"\n{'='*60}")
print(f"All {len(loaded_df)} samples processed. Results saved to: {jsonl_path}")
print(f"{'='*60}")

In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score  # F1 supports macro/weighted averages [web:2]

# Load one saved run (JSON Lines, one record per line) for evaluation.
# Note: this rebinds jsonl_path to a fixed results file.
jsonl_path = "/home/snt/projects_lujun/LabAgentSkill/assets/results/finer_standard_Qwen3-30B-A3B-Instruct-2507_20260212_121226.jsonl"
results_df = pd.read_json(path_or_buf=jsonl_path, lines=True)
TARGET = "XBRL-tag-classification"


def calc_hit_target_skill(row) -> bool:
    """Tell whether the skill-selection history of ``row`` mentions TARGET.

    Looks at the second entry (index 1) of ``chat_history_agent_skill_select``
    and checks, case-insensitively, whether its "content" contains TARGET.

    Args:
        row: any object with a dict-style ``.get`` (e.g. a DataFrame row).

    Returns:
        True when TARGET occurs in that content; False when the history is
        missing/empty, has no second entry, or is not shaped as expected.
    """
    history = row.get("chat_history_agent_skill_select", "")
    if history:
        try:
            second_entry = history[1]
            reply_text = (second_entry or {}).get("content", "")
        except (IndexError, TypeError, AttributeError):
            # Too short, not indexable, or the entry is not a mapping.
            pass
        else:
            return TARGET.lower() in str(reply_text).lower()
    return False

# Recompute the hit flag from the stored skill-selection chat history,
# replacing whatever value was saved in the file.
results_df["hit_target_skill"] = results_df.apply(calc_hit_target_skill, axis=1)
# GPU memory figure (GB) used to weight wall-clock time into "VRAM-minutes".
ESTIMATED_GPU_RAM_GB = 100

# One summary line per task type:
#   task_type  accuracy  weighted-F1  hit-rate  avg-minutes  avg-VRAM-minutes
for task_type, g in results_df.groupby("task_type", dropna=False):
    group_size = len(g)

    # --- ACC / F1: rows whose prediction is "unknown" are left out ---
    valid_mask = g["predicted_label"].astype(str).str.lower() != "unknown"
    valid_df = g[valid_mask]
    unknown_count = int((~valid_mask).sum())
    denom = len(valid_df)

    if denom == 0:
        # Nothing left to score: report zeros.
        acc = f1_macro = f1_weighted = 0.0
        correct = 0
    else:
        y_true = valid_df["true_label"].astype(str).str.lower()
        y_pred = valid_df["predicted_label"].astype(str).str.lower()

        acc = accuracy_score(y_true, y_pred)
        f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
        f1_weighted = f1_score(y_true, y_pred, average="weighted", zero_division=0)
        correct = int((y_true == y_pred).sum())

    # --- Hit rate: share of rows whose hit flag is True ---
    hit_count = int((g["hit_target_skill"].fillna("").astype(str) == "True").sum())
    hit_rate = hit_count / group_size if group_size > 0 else float("nan")

    # --- Time and VRAM-minutes (GB·min): minutes * ESTIMATED_GPU_RAM_GB ---
    total_minutes = g["elapsed_seconds"].fillna(0).astype(float).sum() / 60.0
    total_vram_minutes = total_minutes * ESTIMATED_GPU_RAM_GB
    if group_size > 0:
        avg_minutes = total_minutes / group_size
        avg_vram_minutes = total_vram_minutes / group_size
    else:
        avg_minutes = avg_vram_minutes = float("nan")

    print (f"{task_type} {acc:.4f} {f1_weighted:.4f} {hit_rate:.4f} {avg_minutes:.4f} {avg_vram_minutes:.4f}")


agent_simple 0.1667 0.1667 0.0000 0.9953 99.5261
agent_skill_based 0.6667 0.7222 1.0000 0.4374 43.7441
agent_skill_full_context 0.1667 0.1667 0.0000 1.2994 129.9419


In [5]:
# Show the loaded results table (bare last expression -> rich DataFrame display).
results_df


Unnamed: 0,index,sentence,tag_name,tag_token,true_label,predicted_label,raw_response,correct,selected_skills_step1,hit_target_skill,new_skills_discovered,discovery_rounds,elapsed_seconds,model,timestamp,chat_history_agent_skill_select,chat_history_agent_exec,task_type
0,0,"During the nine months ended September 30 , 20...",TreasuryStockAcquiredAverageCostPerShare,17.43,TreasuryStockAcquiredAverageCostPerShare,SaleOfStockPricePerShare,The target numeric entity '17.43' in the sente...,False,[XBRL-tag-classification],False,0.0,1.0,10.0762,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:12:36.705037,"[{'role': 'human', 'content': ' Given a senten...","[{'role': 'human', 'content': 'You are a skill...",agent_skill_based
1,0,"During the nine months ended September 30 , 20...",TreasuryStockAcquiredAverageCostPerShare,17.43,TreasuryStockAcquiredAverageCostPerShare,TreasuryStockAcquiredAverageCostPerShare,The target numeric entity in the sentence is *...,True,,False,,,20.1901,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:12:56.903496,,"[{'role': 'human', 'content': ' Given a senten...",agent_simple
2,0,"During the nine months ended September 30 , 20...",TreasuryStockAcquiredAverageCostPerShare,17.43,TreasuryStockAcquiredAverageCostPerShare,TreasuryStockAcquiredAverageCostPerShare,"The target numeric entity is **17.43**, which ...",True,,False,,,34.4634,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:13:31.369678,,"[{'role': 'human', 'content': ' Given a senten...",agent_skill_full_context
3,1,"During the nine months ended September 30 , 20...",TreasuryStockValueAcquiredCostMethod,14.6,TreasuryStockValueAcquiredCostMethod,TreasuryStockValueAcquiredCostMethod,The target numeric entity '14.6' in the senten...,True,[XBRL-tag-classification],False,0.0,1.0,28.7588,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:14:00.136515,"[{'role': 'human', 'content': ' Given a senten...","[{'role': 'human', 'content': 'You are a skill...",agent_skill_based
4,1,"During the nine months ended September 30 , 20...",TreasuryStockValueAcquiredCostMethod,14.6,TreasuryStockValueAcquiredCostMethod,StockRepurchaseProgramRemainingAuthorizedRepur...,"The target numeric entity is **14.6**, which r...",True,,False,,,42.534,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:14:42.676678,,"[{'role': 'human', 'content': ' Given a senten...",agent_simple
5,1,"During the nine months ended September 30 , 20...",TreasuryStockValueAcquiredCostMethod,14.6,TreasuryStockValueAcquiredCostMethod,StockRepurchaseProgramRemainingAuthorizedRepur...,"The target numeric entity is **14.6**, which r...",True,,False,,,55.8668,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:15:38.549370,,"[{'role': 'human', 'content': ' Given a senten...",agent_skill_full_context
6,2,The Company incurred interest expense from the...,InterestExpense,9109.0,InterestExpense,InterestExpenseDebt,The sentence refers to interest expense incurr...,True,[XBRL-tag-classification],False,0.0,1.0,28.2045,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:16:06.761678,"[{'role': 'human', 'content': ' Given a senten...","[{'role': 'human', 'content': 'You are a skill...",agent_skill_based
7,2,The Company incurred interest expense from the...,InterestExpense,9109.0,InterestExpense,LineOfCreditFacilityCurrentBorrowingCapacity,"The target numeric entity is **9,109**, which ...",True,,False,,,54.3868,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:17:01.154580,,"[{'role': 'human', 'content': ' Given a senten...",agent_simple
8,2,The Company incurred interest expense from the...,InterestExpense,9109.0,InterestExpense,LineOfCreditFacilityCurrentBorrowingCapacity,"The target numeric entity is **9,109**, which ...",True,,False,,,71.3702,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:18:12.530644,,"[{'role': 'human', 'content': ' Given a senten...",agent_skill_full_context
9,3,The Company incurred interest expense from the...,InterestExpense,2850.0,InterestExpense,InterestExpense,The sentence mentions 'interest expense from t...,True,[XBRL-tag-classification],False,0.0,1.0,36.2126,Qwen/Qwen3-30B-A3B-Instruct-2507,2026-02-12 12:18:48.751091,"[{'role': 'human', 'content': ' Given a senten...","[{'role': 'human', 'content': 'You are a skill...",agent_skill_based


hit_target_skill
         806
True     392
False     11
Name: count, dtype: int64