## Initialization for Agent Testing

In [None]:
# Setup: Load environment variables and dependencies
import datetime
import os
import sys
from pathlib import Path

from jinja2 import Environment, FileSystemLoader
from datasets import load_dataset
import pandas as pd
import time
import json
from datetime import datetime
import pandas as pd
from tqdm import tqdm


# ---------------------------------------------------------------------------
# Skill-selection scaling experiment.
#
# For every pool size in `num_skills_to_test_list`, build a skill pool made of
# (pool size - 1) randomly drawn skills plus the target skill, ask the agent to
# select skills for each e-mail in the dataset, and append one JSON line per
# sample to a per-run results file.
#
# Everything that does not depend on the pool size (paths, imports, API key,
# templates, dataset, skill catalogue) is set up once before the loop.
# ---------------------------------------------------------------------------
num_skills_to_test_list = [5,10,15,20,50,80,100]

# --- Run-invariant configuration -------------------------------------------
project_root = Path.cwd()
src_path = project_root / "src"
dataset_name = "/home/snt/projects_lujun/LabAgentSkill/assets/datasets/insureBench.jsonl"
skills_folder = Path("/home/snt/projects_lujun/LabAgentSkill/skillsHub/skills_scaling")

model_name = "gpt-4o-mini"
# JSONL output directory
output_dir = "/home/snt/projects_lujun/LabAgentSkill/assets/results/"
path_to_prompt = 'prompts/'
target_skill_names = "insurance-mail-triage"
base_url = None
# Alternative model / endpoint settings:
# model_name = "google/gemma-3-270m-it"
# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# base_url = "http://127.0.0.1:8001/v1"

# Make the project package importable before importing from it.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
    print(f"✓ Added to sys.path: {src_path}")

from LabAgentSkill import skills_utils
from LabAgentSkill.SkillAwareAgent import SkillAwareAgent
from LabAgentSkill.evaluate import get_insurBench_predicted_label_v2, get_insurBench_predicted_label, get_predicted_label, get_prediction_XBRL_TAGS

# --- API key: read KEY=VALUE pairs from the project's .env file -------------
root_dir = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
env_path = root_dir / ".env"
# Kept separate from `env` (the Jinja environment below) so the two never
# shadow each other.
dotenv_values = {}

if env_path.exists():
    for line in env_path.read_text().splitlines():
        line = line.strip()
        # Skip blanks, comments and anything that is not KEY=VALUE.
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        dotenv_values[key.strip()] = value.strip()

# The .env value wins; otherwise fall back to the already-exported variable.
os.environ["OPENAI_API_KEY"] = dotenv_values.get("OPENAI_API_KEY", os.environ.get("OPENAI_API_KEY", ""))

# --- Prompt templates --------------------------------------------------------
env = Environment(loader=FileSystemLoader(path_to_prompt))
p_exec_insurBench_temp = env.get_template('p_exec_insurBench.jinja')
p_skill_select_temp = env.get_template('p_skill_select.jinja')
p_skill_discov_temp = env.get_template('p_skill_discov.jinja')
p_default_system_temp = env.get_template('p_default_system.jinja')
p_skill_exec_temp = env.get_template('p_skill_exec.jinja')

# --- Dataset -----------------------------------------------------------------
loaded_df = pd.read_json(dataset_name, lines=True)
# A clean 0..N-1 index is required by the resume check (`idx < count_row`).
loaded_df = loaded_df.reset_index(drop=True)

# --- Skill catalogue and target skill ---------------------------------------
skills_catalog = list(skills_utils.read_all_skills_metadata(skills_folder))
matching_skills = [
    skill for skill in skills_catalog
    if target_skill_names.lower() in skill['name'].lower()
]
if not matching_skills:
    # Previously this surfaced later as a NameError (or silently reused a
    # stale value from an earlier notebook run).
    raise ValueError(
        f"No skill whose name contains '{target_skill_names}' found in {skills_folder}"
    )
# When several names match, the last one wins.
target_skill = matching_skills[-1]
# The target is appended explicitly to every pool, so it must not also be
# drawn at random; otherwise the pool could contain it twice.
distractor_skills = [skill for skill in skills_catalog if skill is not target_skill]

os.makedirs(output_dir, exist_ok=True)

for num_skills_to_test in num_skills_to_test_list:
    print (f"\n\nTesting with Number {num_skills_to_test} skills...\n")

    # NOTE(review): the file name embeds a fresh timestamp, so on a normal run
    # the file does not exist yet and the resume branch below leaves
    # count_row at 0. Resuming requires pointing jsonl_path at an old file.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    jsonl_path = output_dir+f"skill_scaling_{num_skills_to_test}_{model_name.split('/')[-1]}_{timestamp}.jsonl"

    # Pool = (n - 1) random distractors + the target skill. Copies are taken on
    # both sides so that appending the target can never mutate the catalogue.
    skill_random_sample = list(
        skills_utils.get_random_skills(list(distractor_skills), num_skills_to_test-1)
    )
    skill_random_sample.append(target_skill)
    all_skills = skill_random_sample

    print(f"Using model: {model_name}")
    # Initialize agents
    agent_skill_aware = SkillAwareAgent(use_chat_history=True, use_trim_messages=True, model=model_name, base_url=base_url)
    agent_skill_exec_agent = SkillAwareAgent(use_chat_history=True, use_trim_messages=True, model=model_name, base_url=base_url)
    agent_simple = SkillAwareAgent(use_chat_history=True, use_trim_messages=False, model=model_name, base_url=base_url)

    print(f"Results will be saved to: {jsonl_path}")

    skill_count = 0
    count_row = 0
    if os.path.exists(jsonl_path):
        df_exist = pd.read_json(jsonl_path, lines=True)
        count_row = len(df_exist)
        print(f"Resume from row: {count_row}")

    # The skill listing and the selection system prompt depend only on the
    # pool, not on the sample, so they are rendered once per pool size.
    skill_context = "\n".join([
        f"- **{skill['name']}**: {skill['description']}"
        for skill in all_skills
    ])
    p_skill_select = p_skill_select_temp.render(SKILL_CONTEXT=skill_context)

    # Process each sample
    for idx, row in tqdm( loaded_df.iterrows(), total=len(loaded_df), desc="Processing samples",):

        # Rows already present in the results file are skipped.
        if idx < count_row:
            continue

        sample_start_time, email_history, cls_label = time.time(), row.content, row.cls_label
        # Map the dataset's "Non"/"Oui" labels to "NO"/"YES"; any other value
        # is passed through unchanged.
        true_label = {"Non": "NO", "Oui": "YES"}.get(cls_label, cls_label)

        # Step 1: Skill Selection
        p_exec_insurBench = p_exec_insurBench_temp.render(EMAIL_HISTORY = email_history)
        skill_select_resp = agent_skill_aware.chat(user_input=p_exec_insurBench, custom_system_prompt=p_skill_select)
        selected_skills = skills_utils.parse_skills_from_json_response(json_response=skill_select_resp, skills_hub_dir=skills_folder)

        selected_skill_names_step1 = [s["name"] for s in selected_skills]
        # Substring check on the raw response: True when the target skill's
        # name appears anywhere in the model's answer.
        hit_target_skill = target_skill_names.lower() in skill_select_resp.lower()

        # Build record and append to JSONL
        record = {
            "index": int(idx),
            "email_history": email_history,
            "true_label": true_label,
            "selected_skills_step1": selected_skill_names_step1,
            "skill_select_response": skill_select_resp,
            "skill_number_to_test": num_skills_to_test,
            "hit_target_skill": hit_target_skill,
        }

        dataframe_record = pd.DataFrame([record])
        dataframe_record.to_json(jsonl_path, orient="records", lines=True, mode="a" if os.path.exists(jsonl_path) else "w")
        # Every sample is evaluated independently: drop the conversation state.
        agent_skill_aware.clear_history()
        agent_skill_exec_agent.clear_history()

    print(f"\n{'='*60}")
    print(f"All {len(loaded_df)} samples processed. Results saved to: {jsonl_path}")
    print(f"{'='*60}")

In [None]:
import pandas as pd
from pathlib import Path

# Summarise, for every skill-scaling result file, how often the target skill
# was among the skills selected in step 1.
results_dir = Path("/home/snt/projects_lujun/LabAgentSkill/assets/results")
# NOTE: this rebinds the notebook-global name `target_skill` to a plain string.
target_skill = "insurance-mail-triage"


def count_target_hits(df, target):
    """Return how many rows list *target* in their ``selected_skills_step1``.

    Rows whose ``selected_skills_step1`` value is not a list are counted as
    misses. A frame without that column yields 0.
    """
    if "selected_skills_step1" not in df.columns:
        return 0
    hits = df["selected_skills_step1"].apply(
        lambda skills: isinstance(skills, list) and target in skills
    )
    return int(hits.sum())


for file in sorted(results_dir.glob("skill_scaling_*.jsonl")):
    df = pd.read_json(file, lines=True)
    # A run that was interrupted before its first record leaves nothing to
    # summarise; skip it instead of failing on .iloc[0] or dividing by zero.
    if df.empty or "skill_number_to_test" not in df.columns:
        print(f"File: {file.name} | skipped (no usable records)")
        continue
    num_skills_to_test = df["skill_number_to_test"].iloc[0]
    # Count how many rows have the target skill in selected_skills_step1
    count_correct = count_target_hits(df, target_skill)
    count_total = len(df)
    print(f"File: {file.name} | Skills pool: {num_skills_to_test} | Hit rate: {count_correct}/{count_total} = {count_correct/count_total:.3f}")

Using model: gpt-4o-mini
✓ SkillAwareAgent initialized
  Model: gpt-4o-mini
  Chat History: ENABLED ✓
  Trim Messages: ENABLED ✓
✓ SkillAwareAgent initialized
  Model: gpt-4o-mini
  Chat History: ENABLED ✓
  Trim Messages: ENABLED ✓
✓ SkillAwareAgent initialized
  Model: gpt-4o-mini
  Chat History: ENABLED ✓
  Trim Messages: DISABLED ✗
Results will be saved to: /home/snt/projects_lujun/LabAgentSkill/assets/results/skill_scaling_80_gpt-4o-mini_20260212_223110.jsonl


Processing samples: 100%|██████████| 4/4 [00:39<00:00,  9.97s/it]


All 4 samples processed. Results saved to: /home/snt/projects_lujun/LabAgentSkill/assets/results/skill_scaling_80_gpt-4o-mini_20260212_223110.jsonl



