In [66]:
import os
import json
import time
import csv
import uuid
import random
import asyncio
import aiohttp
import re
import pandas as pd
import together
from langchain_openai import ChatOpenAI
import google.generativeai as genai
import math
import requests
from dotenv import load_dotenv
from typing import List, Dict, Any, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


In [95]:
load_dotenv()

True

In [96]:
GROQ_API_KEY = os.getenv("GROQ_API_KEY")            
OLLAMA_URL = os.getenv("OLLAMA_URL")               
GROQ_AVAILABLE = GROQ_API_KEY is not None  
OLLAMA_AVAILABLE = OLLAMA_URL is not None
os.environ["GEMINI_API_KEY"] = "GEMINI_API_KEY"
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Output CSV
OUTPUT_CSV = "agent_evaluation_results.csv"

In [107]:
# Agent list (6 agents). They will be treated as distinct 'agents' though we reuse the same model.
AGENTS = [
    {"agent_id": "Agent_1", "agent_name": "llama-3.1-8b-instant", "backend": "groq", "model": "llama-3.1-8b-instant"},
    {"agent_id": "Agent_2", "agent_name": "meta-llama/llama-4-maverick-17b-128e-instruct", "backend": "groq", "model": "meta-llama/llama-4-maverick-17b-128e-instruct"},
    {"agent_id": "Agent_3", "agent_name": "openai/gpt-oss-120b", "backend": "groq", "model": "openai/gpt-oss-120b"},
    {"agent_id": "Agent_4", "agent_name": "tinyllama", "backend": "ollama", "model": "tinyllama"},
    {"agent_id": "Agent_5", "agent_name": "gemma:2b", "backend": "ollama", "model": "gemma:2b"},
    {"agent_id": "Agent_6", "agent_name": "tinydolphin", "backend": "ollama", "model": "tinydolphin"},
]

In [98]:
# Restriction: approximate 50-100 words. Convert to tokens roughly (conservative).
# We'll set a max token budget to try to keep output in range.
WORDS_MIN = 50
WORDS_MAX = 100
TOKENS_PER_WORD = 1.33  # approximate: tokens = words * 1.33 (conservative)
MAX_TOKENS_FOR_RESPONSE = math.ceil(WORDS_MAX * TOKENS_PER_WORD)  # upper bound

# Rate / concurrency
PARALLEL_WORKERS = 4
CALL_DELAY = 0.2
MAX_TOKENS = 512  

In [99]:
async def generate_with_groq(session, prompt: str, agent: Dict[str, str]) -> str:
    """
    Generate response using Groq async call.
    Falls back if error occurs.
    """
    groq_key = os.environ.get("GROQ_API_KEY") 
    if not groq_key:
        return f"[Groq not configured for {agent['agent_name']}]"
    
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {groq_key}"}
    payload = {
        "model": agent["model"], 
        "messages": [
            {"role": "user", "content": f"{prompt}\n\nAnswer in {WORDS_MIN}-{WORDS_MAX} words."}
        ],
        "max_tokens": MAX_TOKENS,
        "temperature": 0.7
    }
    
    try:
        async with session.post(url, headers=headers, json=payload, timeout=60) as resp:
            data = await resp.json()
            # Groq follows OpenAI schema: choices -> message -> content
            return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"[Groq exception: {e}]"

In [100]:
async def generate_with_ollama(session, prompt: str, agent: Dict[str, str]) -> str:
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": agent["model"],
        "prompt": f"{prompt}\n\nAnswer in {WORDS_MIN}-{WORDS_MAX} words.",
        "stream": True
    }
    try:
        async with session.post(url, json=payload) as resp:
            response_text = ""
            async for line in resp.content:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line.decode("utf-8"))
                    if "response" in data:
                        response_text += data["response"]
                    if data.get("done"):
                        break
                except Exception:
                    continue
            return response_text.strip()
    except Exception as e:
        return f"[Ollama exception: {e}]"


In [101]:
async def generate_response(session, prompt: str, agent: Dict[str, str]) -> str:
    if agent["backend"] == "ollama":
        return await generate_with_ollama(session, prompt, agent)
    elif agent["backend"] == "groq":
        return await generate_with_groq(session, prompt, agent)
    return f"[Unknown backend for {agent['agent_name']}]"

In [102]:
JUDGE_RUBRIC_PROMPT = """
You are an impartial expert evaluator. Your task is to evaluate an AI agentâ€™s response
against the given prompt and ground_truth (if available).

You must return STRICTLY one JSON object following this schema:

{
  "agent_id": "<agent_id>",
  "agent_name": "<agent_name>",
  "prompt_id": "<prompt_id>",
  "scores": {
    "instruction_following": <-1 | 0 | 1>,
    "hallucination": <-1 | 0 | 1>,
    "assumption_control": <-1 | 0 | 1>,
    "coherence_accuracy": <-1 | 0 | 1>
  },
  "explanations": {
    "instruction_following": "<1 concise sentence explanation>",
    "hallucination": "<1 concise sentence explanation>",
    "assumption_control": "<1 concise sentence explanation>",
    "coherence_accuracy": "<1 concise sentence explanation>"
  },
  "total_score": <float from -1.0 to 1.0 rounded to 2 decimals>
}

--------------------
### Dimension Definitions & Scoring:

- instruction_following: Measures whether the agent follows the explicit user instructions.  
  - 1 = Perfectly followed instructions (format, count, constraints).  
  - 0 = Partially followed, minor deviation.  
  - -1 = Ignored or violated instructions.  

- hallucination: Measures factual grounding and avoidance of fabricated content.  
  - 1 = No fabricated/false claims (fully grounded in ground_truth if given).  
  - 0 = Slight speculation or unverifiable claim, but mostly correct.  
  - -1 = Contains major fabricated/false claims.  

- assumption_control: Measures whether the agent avoids or properly qualifies unjustified assumptions.  
  - 1 = No unjustified assumptions or assumptions clearly stated.  
  - 0 = Some minor assumptions not stated but not harmful.  
  - -1 = Many or major unjustified assumptions.  

- coherence_accuracy: Measures clarity, logical flow, and factual correctness.  
  - 1 = Clear, logically flowing, accurate.  
  - 0 = Somewhat clear but contains redundancy or mild confusion.  
  - -1 = Confusing, disorganized, or factually wrong.  

--------------------
Now evaluate the following case and return STRICT JSON only.
Prompt: '''{prompt}'''
Ground Truth: '''{ground_truth}'''
Agent Response: '''{agent_response}'''
"""


In [103]:
def score_with_gemini(prompt: str, response: str, ground_truth: str) -> Dict:
    full_prompt = JUDGE_RUBRIC_PROMPT + "\n\n"
    full_prompt += f"Prompt: '''{prompt}'''\n\n"
    full_prompt += f"Ground truth: '''{ground_truth}'''\n\n" if ground_truth else "Ground truth: None\n\n"
    full_prompt += f"Agent response: '''{response}'''\n\n"
    full_prompt += "Output EXACT JSON now."    
    
    try:
      model = genai.GenerativeModel("gemini-1.5-flash")  # safer default
      result = model.generate_content(full_prompt)

      # Grab raw text from Gemini
      if result.candidates and result.candidates[0].content.parts:
          raw_text = result.candidates[0].content.parts[0].text
      else:
          raw_text = result.text

      # Try to extract JSON safely
      json_str = raw_text.strip()
      if "{" in json_str:
          json_str = json_str[json_str.find("{"): json_str.rfind("}") + 1]

      return json.loads(json_str)

    except Exception as e:
        return {"scores": {}, "explanations": {"error": f"Gemini output parse failed: {e}"}}


In [81]:
# def simulate_judge(agent_record: Dict[str,str], ground_truth: Optional[str]) -> Dict[str,Any]:
#     """Deterministic simulation for judge outputs when Gemini is unavailable."""
#     resp = agent_record["response"]
#     aid = agent_record["agent_id"]
#     aname = agent_record["agent_name"]
#     pid = agent_record["prompt_id"]

#     seed = abs(hash(aid + resp)) % (2**31)
#     rnd = random.Random(seed)
#     # heuristics
#     instr = 5 if "-" in resp or "\n" in resp and resp.strip().startswith(("-", "1")) else 3
#     hall = 5 if ground_truth and any(w.lower() in resp.lower() for w in ground_truth.split()[:6]) else rnd.randint(2,5)
#     assumption = 5 if "I assume" in resp or "assuming" in resp else rnd.randint(2,5)
#     coherence = 5 if len(resp.split()) >= 40 else rnd.randint(3,5)
#     scores = {
#         "instruction_following": int(max(1, min(5, instr))),
#         "hallucination": int(max(1, min(5, hall))),
#         "assumption_control": int(max(1, min(5, assumption))),
#         "coherence_accuracy": int(max(1, min(5, coherence)))
#     }
#     total = round(sum(scores.values())/4.0, 2)
#     explanations = {
#         "instruction_following": "Simulated: format looks like requested." if scores["instruction_following"]>=4 else "Simulated: format mismatch.",
#         "hallucination": "Simulated: aligns with ground truth." if scores["hallucination"]>=4 else "Simulated: possible unsupported claims.",
#         "assumption_control": "Simulated: assumptions minimal or stated." if scores["assumption_control"]>=4 else "Simulated: assumptions present without statement.",
#         "coherence_accuracy": "Simulated: largely coherent." if scores["coherence_accuracy"]>=4 else "Simulated: somewhat terse."
#     }
#     return {
#         "agent_id": aid,
#         "agent_name": aname,
#         "prompt_id": pid,
#         "scores": scores,
#         "explanations": explanations,
#         "total_score": total,
#         "response": resp,
#         "ground_truth": ground_truth
#     }

In [104]:
async def evaluate_agents(prompt: str, ground_truth: str) -> List[Dict]:
    results = []
    async with aiohttp.ClientSession() as session:
        tasks = [generate_response(session, prompt, agent) for agent in AGENTS]
        responses = await asyncio.gather(*tasks)

    for agent, response in zip(AGENTS, responses):
        judge = score_with_gemini(prompt, response, ground_truth)
        scores = judge.get("scores", {})
        explanations = judge.get("explanations", {})
        total = sum(scores.values()) / len(scores) if scores else 0
        results.append({
            "agent_id": agent["agent_id"],
            "agent_name": agent["agent_name"],
            "prompt_id": f"Prompt_{uuid.uuid4().hex[:8]}",
            "response": response,
            "ground_truth": ground_truth,
            "scores": scores,
            "explanations": explanations,
            "total_score": total
        })
    return results

In [106]:

# === RUN ===
if __name__ == "__main__":
    # PROMPT = "What are the ethical concerns of deploying AI systems in society?"
    # GROUND_TRUTH = "Core concerns include algorithmic bias and fairness, data privacy and consent, transparency and accountability, and job displacement risks."
    PROMPT = "List exactly 3 programming languages used for data science. Do not explain."
    GROUND_TRUTH = "Python, R, Julia"

    print("Running multi-provider generation + Gemini judging pipeline...")
    # results = asyncio.run(evaluate_agents(PROMPT, GROUND_TRUTH))
    results = await evaluate_agents(PROMPT, GROUND_TRUTH)
    print(json.dumps(results, indent=2))

    # Write CSV
    rows = []
    for r in results:
        row = {
            "agent_id": r["agent_id"],
            "agent_name": r["agent_name"],
            "prompt_id": r["prompt_id"],
            "response": r["response"],
            "ground_truth": r["ground_truth"],
            **{f"{k}_score": v for k, v in r["scores"].items()},
            "total_score": r["total_score"]
        }
        rows.append(row)
    df = pd.DataFrame(rows)
    df.to_csv("agent_evaluation_results.csv", index=False)
    print("CSV written to agent_evaluation_results.csv")

Running multi-provider generation + Gemini judging pipeline...
[
  {
    "agent_id": "Agent_1",
    "agent_name": "GroqAgent-1",
    "prompt_id": "Prompt_955a50eb",
    "response": "Here are three programming languages used for data science:\n\n1. Python\n2. R\n3. Julia",
    "ground_truth": "Python, R, Julia",
    "scores": {
      "instruction_following": 0,
      "hallucination": 1,
      "assumption_control": 1,
      "coherence_accuracy": 1
    },
    "explanations": {
      "instruction_following": "The agent followed the instructions to list three languages but added extra formatting (numbers).",
      "hallucination": "The agent did not hallucinate; the response is factually correct.",
      "assumption_control": "The agent made no unjustified assumptions.",
      "coherence_accuracy": "The response is coherent, accurate, and well-organized."
    },
    "total_score": 0.75
  },
  {
    "agent_id": "Agent_2",
    "agent_name": "GroqAgent-2",
    "prompt_id": "Prompt_359cb3d1",
 