# Agent Colab: Research Paper Entity Extraction Benchmark

Evaluates **Gemini 3 Pro Preview** on an agentic, data-driven reasoning task.

**Setup:** Google Colab Pro with `google.colab.ai` (no API keys needed)

**Agent must:**
1. Load data from files (environment interaction)
2. Extract entities and resolve ambiguities (multi-step reasoning)
3. Analyze citation network and detect anomalies (graph reasoning)
4. Save `final_report.json` to disk (artifact generation)
5. Pass all unit tests

## Setup

In [None]:
import importlib
import inspect
import os
import shutil
import subprocess
import sys

REPO_URL = "https://github.com/EhsanKA/agentic_task.git"
REPO_DIR = "/content/agentic_task"

# Start from a fresh clone so re-running this cell does not reuse a checkout
# left on disk by an earlier run.
if os.path.exists(REPO_DIR):
    shutil.rmtree(REPO_DIR)
subprocess.run(["git", "clone", REPO_URL, REPO_DIR], check=True)
subprocess.run([sys.executable, "-m", "pip", "install", "-q", REPO_DIR], check=True)

# Verify correct version loaded.
# importlib.reload() on the top-level package re-executes only that package's
# own module; submodules that were already imported (for example
# benchmark.evaluation.agent) stay cached in sys.modules. Drop every cached
# benchmark module so the import below loads the freshly installed code.
for _mod_name in list(sys.modules):
    if _mod_name == "benchmark" or _mod_name.startswith("benchmark."):
        del sys.modules[_mod_name]
# The package may have been installed after the interpreter started, so make
# the import finders rescan for newly created files.
importlib.invalidate_caches()

from benchmark.evaluation.agent import build_agent_context

print("build_agent_context signature:", inspect.signature(build_agent_context))

In [None]:
import json

from google.colab import ai

# Ask google.colab.ai which models are available and echo the result;
# `available_models` is consumed later when the model is selected.
available_models = ai.list_models()
print("Available models:", available_models)

## Data Generation

In [None]:
from benchmark.data.loader import setup_data
from benchmark.evaluation.prompt import BENCHMARK_PROMPT
from benchmark.evaluation.agent import (
    build_agent_context,
    execute_agent_code,
    extract_variables,
    select_model,
)

# setup_data() is unpacked into four values; only the last one, the data
# directory, is needed by the rest of the notebook.
_, _, _, DATA_DIR = setup_data()

# Choose the model from the list obtained from google.colab.ai earlier.
MODEL_NAME = select_model(available_models)

print(f"Data directory: {DATA_DIR}")
print(f"Model: {MODEL_NAME}")

## Agent Execution

In [None]:
# Build the agent's input from the benchmark prompt and the data directory.
context = build_agent_context(BENCHMARK_PROMPT, DATA_DIR)

print("Sending task to agent...")
agent_response = ai.generate_text(
    model_name=MODEL_NAME,
    prompt=context,
)
print("Response received.")

# Preview only the leading 2000 elements of the response (characters,
# assuming the response is a string) to keep the cell output short.
print(agent_response[:2000])

In [None]:
# Run the code contained in the agent's response against the data directory.
exec_result = execute_agent_code(agent_response, DATA_DIR)

# Retry once if first attempt failed
if exec_result and "__error__" in exec_result:
    print(f"\nFirst attempt failed: {exec_result['__error__']}")
    print("Retrying with error context...")
    # Prefer the full traceback, but fall back to the short error message so a
    # result that carries only "__error__" cannot abort the retry with KeyError.
    if "__traceback__" in exec_result:
        error_details = exec_result["__traceback__"]
    else:
        error_details = exec_result["__error__"]
    retry_prompt = (
        f"Your previous code produced this error:\n{error_details}\n\n"
        f"Fix the bug and return the corrected complete code.\n\n{context}"
    )
    agent_response = ai.generate_text(prompt=retry_prompt, model_name=MODEL_NAME)
    exec_result = execute_agent_code(agent_response, DATA_DIR)
    # Only a single retry is made; report a second failure instead of letting
    # it pass silently into the later cells.
    if exec_result and "__error__" in exec_result:
        print(f"\nRetry also failed: {exec_result['__error__']}")

# An empty/falsy execution result yields no variables at all.
results = extract_variables(exec_result) if exec_result else {}

# Publish each extracted variable as a notebook-level global so later cells
# can refer to them by bare name (the output cell reads `validation_results`
# and `final_report`).
for k, v in results.items():
    globals()[k] = v

## Agent Output

In [None]:
# Print each agent output on its own so that one missing variable does not
# hide the other. Both use default=str so values json cannot serialize
# natively are rendered as strings instead of raising TypeError.
try:
    print(json.dumps(validation_results, indent=2, default=str))
except NameError as e:
    print(f"Missing: {e}")

try:
    print(json.dumps(final_report, indent=2, default=str))
except NameError as e:
    print(f"Missing: {e}")

## Unit Tests

In [None]:
from benchmark.evaluation.tests import run_all_tests, set_context

# Add the data directory to the extracted variables before handing them to
# the test module as its context.
results["_data_dir"] = DATA_DIR
set_context(results)

# Keep the returned result object; the summary cell reads its counters.
test_result = run_all_tests()

## Summary

In [None]:
import os

# Final scoreboard: model used, whether the report artifact exists on disk,
# and the unit-test tallies, framed by a 60-character rule.
_rule = "=" * 60
print(_rule)
print(f"Model: {MODEL_NAME}")

_report_path = os.path.join(DATA_DIR, 'final_report.json')
print(f"Artifact saved: {os.path.exists(_report_path)}")

print(f"Tests: {test_result.testsRun} run, {len(test_result.failures)} failures, {len(test_result.errors)} errors")

# The benchmark passes only when the test run reports overall success.
print("BENCHMARK PASSED" if test_result.wasSuccessful() else "BENCHMARK FAILED")
print(_rule)