In [12]:
# Imports and Setup
import json
import os
from typing import Dict, List, Tuple

import adalflow as adal
from adalflow.components.model_client import AnthropicAPIClient
from adalflow.core import ModelClientType
from adalflow.datasets.types import Example
from adalflow.eval.answer_match_acc import AnswerMatchAcc

# Assuming you have the coa_agent function
from coa import coa_agent

# Model Kwargs

In [60]:
# Generation settings passed to coa_agent for every question.
# temperature 0.0 is used for the agent runs below.
claude_model_kwargs = dict(
    model="claude-3-5-sonnet-20241022",
    temperature=0.0,
    max_tokens=5000,
)

# Load Data

In [54]:
# Load Datasets Function
def load_datasets(path: str = 'eval_data.json') -> Tuple[List, List]:
    """Read question/answer pairs from a JSON file.

    The file must contain a JSON array of objects, each with a
    ``'question'`` and an ``'answer'`` key.

    Args:
        path: Location of the JSON file. Defaults to ``'eval_data.json'``
            in the current working directory.

    Returns:
        A ``(questions, answers)`` tuple of two parallel lists, in file order.
        Values are returned exactly as parsed from the JSON.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks ``'question'`` or ``'answer'``.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    # Split the list of pairs into two parallel lists.
    questions = [pair['question'] for pair in eval_data]
    answers = [pair['answer'] for pair in eval_data]

    return questions, answers

# Eval Function

In [74]:
def compute_llm_as_judge(questions, pred_answers, gt_answers,
                         model_client=None, model_kwargs=None):
    """Score predicted answers against ground truth with an LLM judge.

    Args:
        questions: The questions that were asked.
        pred_answers: The answers produced by the system under test,
            parallel to ``questions``.
        gt_answers: The reference answers, parallel to ``questions``.
        model_client: Model client used by the judge. Defaults to a new
            ``AnthropicAPIClient()``.
        model_kwargs: Model settings for the judge. Defaults to
            claude-3-5-sonnet-20241022 at temperature 0.0 with a
            500-token limit.

    Returns:
        The result of ``LLMasJudge.compute``. The notebook output shows an
        ``LLMJudgeEvalResult`` with ``avg_score``, ``judgement_score_list``
        and ``confidence_interval``.
    """
    from adalflow.eval.llm_as_judge import LLMasJudge, DefaultLLMJudge

    # Fall back to the notebook's original judge configuration.
    if model_client is None:
        model_client = AnthropicAPIClient()
    if model_kwargs is None:
        model_kwargs = {"model": "claude-3-5-sonnet-20241022", "temperature": 0.0, "max_tokens": 500}

    llm_judge = DefaultLLMJudge(
        model_client=model_client,
        model_kwargs=model_kwargs,
    )
    llm_evaluator = LLMasJudge(llm_judge=llm_judge)
    eval_rslt = llm_evaluator.compute(
        questions=questions, gt_answers=gt_answers, pred_answers=pred_answers
    )
    return eval_rslt

# CoA Agent Testing

In [76]:

# Load the evaluation questions and their ground-truth answers
questions, gt_answers = load_datasets()

# Get responses from the CoA agent for each question
pred_answers = []
for question in questions:
    coa_response = coa_agent(question, ModelClientType.ANTHROPIC(), claude_model_kwargs)
    # Store the agent's return value unchanged, in question order.
    pred_answers.append(coa_response)

blue response: Tactical Command and Control (TAC C2) plays a crucial role in air operations by providing real-time coordination, decision-making, and management of air assets. Here are the key aspects of TAC C2:

1. Mission Coordination
- Coordinates multiple aircraft and missions simultaneously
- Ensures deconfliction between different air assets
- Manages airspace integration and separation
- Synchronizes air operations with ground forces

2. Real-time Decision Making
- Provides tactical direction to aircraft
- Responds to emerging threats and situations
- Adjusts mission parameters as needed
- Prioritizes tasks and resources

3. Battle Management
- Maintains situational awareness
- Monitors threat environment
- Coordinates combat air patrol operations
- Manages air defense operations

4. Communication
- Maintains communication links between aircraft and ground forces
- Relays critical information and updates
- Ensures proper information flow between command elements
- Coordinates wi

In [75]:
# Score the CoA agent's answers against the ground truth with the LLM judge
evaluations = compute_llm_as_judge(
    questions=questions,
    pred_answers=pred_answers,
    gt_answers=gt_answers,
)
print(evaluations)

LLMJudgeEvalResult(avg_score=0.06666666666666667, judgement_score_list=[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], confidence_interval=(0, 0.19700981908390952))
