# Model Evaluation: Fake News Detector Agent

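This notebook evaluates how different LLMs perform in each of the agent's nodes. One node at a time is switched to a test model while the remaining nodes keep the baseline model: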
- **Entry**: Relevance analysis
- **Planner**: Plan generation
- **Researcher**: Query generation
- **Analyst**: Final analysis and scoring

## Baseline: GPT-4
## Test Models: llama3.1:8b, qwen2.5:1.5b

In [1]:
# Setup and imports
import os
import json
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
from typing import Dict, List, Any
from copy import deepcopy

from src.graphs.v1 import graph
from src.models.context import ModelsRegistry, ModelConfig
from tavily import TavilyClient
from src.utils.observability import (
    UsageMetadataCallbackHandler,
    set_callback_handler,
)

# Load variables from a .env file into the process environment before any
# key is read from os.environ below.
load_dotenv()

# Initialize Tavily client. Indexing os.environ (instead of .get) raises
# KeyError right here when TAVILY_API_KEY is not set.
tavily = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

## Configuration

In [2]:
# Evaluation corpus - a mix of relevant and irrelevant content. Each entry
# carries a stable id, the raw post text and the expected relevance label.
TEST_POSTS = [
    {"id": post_id, "text": text, "expected_relevant": expected}
    for post_id, text, expected in (
        ("irrelevant_1", "Bom dia!", False),
        ("fake_news_1", "Cloroquina cura COVID", True),
        (
            "political_1",
            "Gastar dinheiro público com campanhas de incentivo ao aborto, ideologia de gênero ou troca de sexo de crianças? NÃO! Não vamos permitir!",
            True,
        ),
    )
]

# Reference model that every test model is compared against (GPT-4)
BASELINE_MODEL = "gpt-4"

# Connection settings shared by all local models served through Ollama
OLLAMA_CONFIG = dict(
    model_provider="ollama",
    base_url="http://localhost:11434",
)

# Local models to evaluate
TEST_MODELS = ["llama3.1:8b", "qwen2.5:1.5b"]

## Helper Functions

In [3]:
def create_baseline_registry() -> ModelsRegistry:
    """Build a registry in which every node runs the baseline model (GPT-4)."""
    # All four nodes share the same model name; temperature is pinned to 0.0.
    node_models = dict.fromkeys(
        ("entry", "planner", "researcher", "analyst"), BASELINE_MODEL
    )
    return ModelsRegistry(default_temperature=0.0, **node_models)


def create_test_registry(node_name: str, test_model: str) -> ModelsRegistry:
    """
    Create registry with test model for a specific node, baseline for others.

    Args:
        node_name: Node to use test model (entry, planner, researcher, analyst)
        test_model: Model name to test (e.g., llama3.1:8b)

    Returns:
        A ModelsRegistry where ``node_name`` is bound to an Ollama-backed
        ModelConfig for ``test_model`` and every other node uses the baseline.

    Raises:
        ValueError: If ``node_name`` is not one of the four known nodes.
    """
    node_names = ("entry", "planner", "researcher", "analyst")

    # Reject unknown names up front: otherwise a typo would add a stray key
    # (or overwrite "default_temperature") instead of swapping a node, and the
    # run could end up comparing the baseline against itself.
    if node_name not in node_names:
        raise ValueError(
            f"Unknown node {node_name!r}; expected one of {', '.join(node_names)}"
        )

    config: Dict[str, Any] = {name: BASELINE_MODEL for name in node_names}
    config["default_temperature"] = 0.0

    # Replace the specific node with test model config
    config[node_name] = ModelConfig(
        model=test_model,
        temperature=0.0,
        **OLLAMA_CONFIG,
    )

    return ModelsRegistry(**config)


def _to_plain(value: Any) -> Any:
    """Return ``value.model_dump()`` when the object offers it, else ``value`` unchanged."""
    return value.model_dump() if hasattr(value, "model_dump") else value


def run_evaluation(registry: ModelsRegistry, post: str) -> Dict[str, Any]:
    """Run the graph with a specific registry and return results.

    Args:
        registry: Models registry handed to the graph as runtime context.
        post: Text of the post to analyse.

    Returns:
        On success: ``{"success": True, "relevance_analysis": ..., "metrics": ...}``
        plus ``"plan"`` and ``"response"`` when the relevance analysis marks the
        post as relevant. On any exception: ``{"success": False, "error": "<ExceptionType>: <message>"}``.
    """
    # Create fresh callback handler for each run
    callback = UsageMetadataCallbackHandler()
    set_callback_handler(callback)

    config = {
        # The current timestamp gives each run its own thread id.
        "configurable": {"thread_id": str(datetime.now().timestamp())},
        "callbacks": [callback],
    }
    runtime_context = {"models_registry": registry, "tavily": tavily}

    initial_state = {
        "post": post,
        "max_revisions": 3,
    }

    try:
        resp = graph.invoke(initial_state, context=runtime_context, config=config)

        # Extract results as plain data so they can be compared and saved later
        relevance = _to_plain(resp["relevance_analysis"])
        result = {
            "success": True,
            "relevance_analysis": relevance,
            "metrics": {k: _to_plain(v) for k, v in resp["metrics"].items()},
        }

        # Add response if relevant
        if relevance.get("relevant", False):
            result["plan"] = resp.get("plan", "")
            result["response"] = _to_plain(resp["response"])

        return result
    except Exception as e:
        # Deliberate best-effort: one failing run must not abort the whole
        # evaluation. Keep the exception type, because str(e) alone can be
        # uninformative (e.g. KeyError('response') renders as "'response'").
        return {
            "success": False,
            "error": f"{type(e).__name__}: {e}",
        }

## 1. Run Baseline Evaluation (GPT-4)

In [4]:
# Baseline pass: run every test post through the all-baseline registry and
# keep the outcome keyed by post id for the comparisons further down.
print("Running baseline evaluation with GPT-4...\n")

baseline_results = {}
baseline_registry = create_baseline_registry()

for post_data in TEST_POSTS:
    post_id, post_text = post_data["id"], post_data["text"]

    print(f"Processing: {post_id}")
    # Long posts are cut to 50 characters in the progress log
    if len(post_text) > 50:
        print(f"  Post: {post_text[:50]}...")
    else:
        print(f"  Post: {post_text}")

    result = baseline_results[post_id] = run_evaluation(baseline_registry, post_text)

    if not result["success"]:
        print(f"  Error: {result['error']}")
    else:
        is_relevant = result["relevance_analysis"]["relevant"]
        print(f"  Relevant: {is_relevant}")
        # A score only exists when the post was judged relevant
        if is_relevant:
            print(f"  Score: {result['response']['score']}")
    print()

Running baseline evaluation with GPT-4...

Processing: irrelevant_1
  Post: Bom dia!




[entry] Execution time: 18.83s | Base model: gpt-4 | Tokens: 615 (585 prompt + 30 completion)
  Relevant: False

Processing: fake_news_1
  Post: Cloroquina cura COVID
[entry] Execution time: 2.41s | Base model: gpt-4 | Tokens: 615 (587 prompt + 28 completion)
[planner] Execution time: 11.50s | Base model: gpt-4 | Tokens: 602 (226 prompt + 376 completion)
[researcher] Execution time: 16.36s | Base model: gpt-4 | Tokens: 732 (612 prompt + 120 completion)
[analyst] Execution time: 4.35s | Base model: gpt-4 | Tokens: 1678 (1531 prompt + 147 completion)
  Relevant: True
  Score: 1.0

Processing: political_1
  Post: Gastar dinheiro público com campanhas de incentivo...
[entry] Execution time: 2.39s | Base model: gpt-4 | Tokens: 656 (620 prompt + 36 completion)
[planner] Execution time: 14.81s | Base model: gpt-4 | Tokens: 648 (259 prompt + 389 completion)
[researcher] Execution time: 7.57s | Base model: gpt-4 | Tokens: 686 (625 prompt + 61 completion)
[analyst] Execution time: 5.36s | Base m

## 2. Test Different Models per Node

In [5]:
# Results are nested as test_results[model][node][post_id]
nodes = ["entry", "planner", "researcher", "analyst"]
test_results = {}

for test_model in TEST_MODELS:
    print(f"\n{'='*60}")
    print(f"Testing model: {test_model}")
    print(f"{'='*60}")

    results_by_node = test_results[test_model] = {}

    for node in nodes:
        print(f"\n--- Node: {node} ---")
        results_by_post = results_by_node[node] = {}

        # Only `node` runs the test model; the other nodes stay on the baseline
        registry = create_test_registry(node, test_model)

        for post_data in TEST_POSTS:
            post_id, post_text = post_data["id"], post_data["text"]

            print(f"  Processing: {post_id}...", end=" ")

            result = results_by_post[post_id] = run_evaluation(registry, post_text)

            if not result["success"]:
                print(f"FAILED: {result['error'][:50]}")
                continue
            print(f"OK (relevant={result['relevance_analysis']['relevant']})")


Testing model: llama3.1:8b

--- Node: entry ---
  Processing: irrelevant_1... [entry] Execution time: 56.70s | Base model: llama3.1:8b | Tokens: 581 (548 prompt + 33 completion)
OK (relevant=False)
  Processing: fake_news_1... [entry] Execution time: 7.44s | Base model: llama3.1:8b | Tokens: 577 (550 prompt + 27 completion)
[planner] Execution time: 11.73s | Base model: gpt-4 | Tokens: 613 (226 prompt + 387 completion)




[researcher] Execution time: 8.28s | Base model: gpt-4 | Tokens: 721 (623 prompt + 98 completion)
[analyst] Execution time: 5.58s | Base model: gpt-4 | Tokens: 1498 (1354 prompt + 144 completion)
OK (relevant=True)
  Processing: political_1... [entry] Execution time: 11.74s | Base model: llama3.1:8b | Tokens: 612 (583 prompt + 29 completion)
OK (relevant=False)

--- Node: planner ---
  Processing: irrelevant_1... [entry] Execution time: 1.83s | Base model: gpt-4 | Tokens: 615 (585 prompt + 30 completion)
OK (relevant=False)
  Processing: fake_news_1... [entry] Execution time: 1.44s | Base model: gpt-4 | Tokens: 615 (587 prompt + 28 completion)
[planner] Execution time: 154.78s | Base model: llama3.1:8b | Tokens: 814 (230 prompt + 584 completion)
[researcher] Execution time: 8.42s | Base model: gpt-4 | Tokens: 885 (820 prompt + 65 completion)
[analyst] Execution time: 6.58s | Base model: gpt-4 | Tokens: 1629 (1525 prompt + 104 completion)
OK (relevant=True)
  Processing: political_1... 

## 3. Compare Results with Baseline

In [6]:
def compare_relevance(baseline: Dict, test: Dict) -> Dict:
    """Compare relevance analysis between baseline and test.

    Args:
        baseline: Result dict of the baseline run. May be empty when the run
            is missing.
        test: Result dict of the test run. May be empty when the run is missing.

    Returns:
        ``{"match": False, "reason": ...}`` when either run is missing or
        failed; otherwise a dict with ``match``, both relevance flags and the
        first 100 characters of each reasoning text.
    """
    # Callers pass {} for runs that were never recorded, so "success" has to
    # be read with .get() - indexing would raise KeyError on those.
    if not baseline.get("success") or not test.get("success"):
        return {"match": False, "reason": "One or both failed"}

    b_analysis = baseline["relevance_analysis"]
    t_analysis = test["relevance_analysis"]
    b_rel = b_analysis["relevant"]
    t_rel = t_analysis["relevant"]

    return {
        "match": b_rel == t_rel,
        "baseline_relevant": b_rel,
        "test_relevant": t_rel,
        # `or ""` also covers a reasoning that is present but None
        "baseline_reasoning": (b_analysis.get("reasoning") or "")[:100],
        "test_reasoning": (t_analysis.get("reasoning") or "")[:100],
    }


def compare_scores(baseline: Dict, test: Dict) -> Dict:
    """Compare final scores between baseline and test.

    Args:
        baseline: Result dict of the baseline run. May be empty when the run
            is missing.
        test: Result dict of the test run. May be empty when the run is missing.

    Returns:
        ``{"comparable": False, "reason": ...}`` when either run is missing,
        failed, was judged not relevant or carries no score; otherwise a dict
        with ``comparable=True``, both scores, ``difference`` (test minus
        baseline) and the first 200 characters of each justification.
    """
    # Callers pass {} for runs that were never recorded, so "success" has to
    # be read with .get() - indexing would raise KeyError on those.
    if not baseline.get("success") or not test.get("success"):
        return {"comparable": False, "reason": "One or both failed"}

    b_rel = baseline["relevance_analysis"]["relevant"]
    t_rel = test["relevance_analysis"]["relevant"]

    # Scores only exist for posts that both runs considered relevant
    if not b_rel or not t_rel:
        return {"comparable": False, "reason": "One or both not relevant"}

    b_response = baseline.get("response") or {}
    t_response = test.get("response") or {}
    b_score = b_response.get("score")
    t_score = t_response.get("score")

    # Report a missing score as "not comparable" instead of crashing on the
    # subtraction below.
    if b_score is None or t_score is None:
        return {"comparable": False, "reason": "One or both scores missing"}

    return {
        "comparable": True,
        "baseline_score": b_score,
        "test_score": t_score,
        "difference": t_score - b_score,
        "baseline_justification": (b_response.get("justification") or "")[:200],
        "test_justification": (t_response.get("justification") or "")[:200],
    }

In [7]:
def _build_comparison_row(test_model, node, post_id):
    """Assemble one comparison record for a (model, node, post) combination."""
    baseline = baseline_results.get(post_id, {})
    test = test_results.get(test_model, {}).get(node, {}).get(post_id, {})

    rel_comparison = compare_relevance(baseline, test)
    score_comparison = compare_scores(baseline, test)

    row = {
        "model": test_model,
        "node": node,
        "post_id": post_id,
        "success": test.get("success", False),
        "relevance_match": rel_comparison.get("match", False),
        "baseline_relevant": rel_comparison.get("baseline_relevant"),
        "test_relevant": rel_comparison.get("test_relevant"),
    }

    # Score columns are only filled in when both runs produced a score
    if score_comparison.get("comparable"):
        row.update(
            baseline_score=score_comparison["baseline_score"],
            test_score=score_comparison["test_score"],
            score_diff=score_comparison["difference"],
        )
    return row


# One row per model x node x post, in the same order as the evaluation loops
comparison_data = [
    _build_comparison_row(test_model, node, post_data["id"])
    for test_model in TEST_MODELS
    for node in nodes
    for post_data in TEST_POSTS
]

df_comparison = pd.DataFrame(comparison_data)
df_comparison

Unnamed: 0,model,node,post_id,success,relevance_match,baseline_relevant,test_relevant,baseline_score,test_score,score_diff
0,llama3.1:8b,entry,irrelevant_1,True,True,False,False,,,
1,llama3.1:8b,entry,fake_news_1,True,True,True,True,1.0,0.7,-0.3
2,llama3.1:8b,entry,political_1,True,False,True,False,,,
3,llama3.1:8b,planner,irrelevant_1,True,True,False,False,,,
4,llama3.1:8b,planner,fake_news_1,True,True,True,True,1.0,0.9,-0.1
5,llama3.1:8b,planner,political_1,True,True,True,True,0.9,0.9,0.0
6,llama3.1:8b,researcher,irrelevant_1,True,True,False,False,,,
7,llama3.1:8b,researcher,fake_news_1,True,True,True,True,1.0,1.0,0.0
8,llama3.1:8b,researcher,political_1,True,True,True,True,0.9,0.8,-0.1
9,llama3.1:8b,analyst,irrelevant_1,True,True,False,False,,,


## 4. Metrics Analysis

In [8]:
def _metric_rows(model_label, tested_node, results):
    """Yield one flat metrics record per (post, graph node) of each successful run.

    Args:
        model_label: Value for the "model" column.
        tested_node: Value for the "tested_node" column.
        results: Mapping of post id to a run result dict.
    """
    for post_id, result in results.items():
        # Failed runs carry no metrics; use the same guard for baseline and test
        if not (result.get("success") and "metrics" in result):
            continue
        for node_name, node_metrics in result["metrics"].items():
            yield {
                "model": model_label,
                "tested_node": tested_node,
                "metric_node": node_name,
                "post_id": post_id,
                "execution_time": node_metrics["execution_time"],
                "prompt_tokens": node_metrics["prompt_tokens"],
                "completion_tokens": node_metrics["completion_tokens"],
                "total_tokens": node_metrics["total_tokens"],
            }


# Extract metrics for comparison
# Baseline metrics ("all" = every node ran the baseline model)
metrics_data = list(_metric_rows("BASELINE (GPT-4)", "all", baseline_results))

# Test metrics: one batch per (model, swapped node)
for test_model in TEST_MODELS:
    for node in nodes:
        node_results = test_results.get(test_model, {}).get(node, {})
        metrics_data.extend(_metric_rows(test_model, node, node_results))

df_metrics = pd.DataFrame(metrics_data)
df_metrics

Unnamed: 0,model,tested_node,metric_node,post_id,execution_time,prompt_tokens,completion_tokens,total_tokens
0,BASELINE (GPT-4),all,entry,irrelevant_1,18.826879,585,30,615
1,BASELINE (GPT-4),all,entry,fake_news_1,2.408844,587,28,615
2,BASELINE (GPT-4),all,planner,fake_news_1,11.495560,226,376,602
3,BASELINE (GPT-4),all,researcher,fake_news_1,16.358005,612,120,732
4,BASELINE (GPT-4),all,analyst,fake_news_1,4.350441,1531,147,1678
...,...,...,...,...,...,...,...,...
73,qwen2.5:1.5b,analyst,analyst,fake_news_1,12.992837,1807,155,1962
74,qwen2.5:1.5b,analyst,entry,political_1,1.662532,620,37,657
75,qwen2.5:1.5b,analyst,planner,political_1,14.870713,259,442,701
76,qwen2.5:1.5b,analyst,researcher,political_1,16.247149,678,137,815


In [9]:
# Mean execution time per graph node (columns) for every
# (model, swapped node) combination (rows), rounded to 2 decimals
if not df_metrics.empty:
    print("Average Execution Time (seconds) by Model and Node:\n")
    mean_times = pd.pivot_table(
        df_metrics,
        index=["model", "tested_node"],
        columns="metric_node",
        values="execution_time",
        aggfunc="mean",
    )
    pivot_time = mean_times.round(2)
    display(pivot_time)

Average Execution Time (seconds) by Model and Node:



Unnamed: 0_level_0,metric_node,analyst,entry,planner,researcher
model,tested_node,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BASELINE (GPT-4),all,4.86,7.88,13.15,11.96
llama3.1:8b,analyst,56.32,1.47,11.91,6.86
llama3.1:8b,entry,5.58,25.3,11.73,8.28
llama3.1:8b,planner,5.18,1.6,135.17,9.03
llama3.1:8b,researcher,5.64,1.72,9.56,38.75
qwen2.5:1.5b,analyst,11.63,1.74,16.68,12.13
qwen2.5:1.5b,entry,4.42,5.97,10.56,6.43
qwen2.5:1.5b,planner,4.83,1.73,22.41,21.43
qwen2.5:1.5b,researcher,4.23,1.45,12.23,17.31


## 5. Summary and Recommendations

In [10]:
print("="*60)
print("EVALUATION SUMMARY")
print("="*60)

# Success rate: percentage of runs flagged successful per (model, node)
print("\n1. SUCCESS RATE BY MODEL AND NODE:\n")
if not df_comparison.empty:
    success_rate = df_comparison.groupby(["model", "node"])["success"].mean() * 100
    print(success_rate.round(1).to_string())

# Relevance match rate: computed over successful runs only
print("\n2. RELEVANCE MATCH WITH BASELINE:\n")
if not df_comparison.empty:
    relevance_match = df_comparison[df_comparison["success"]].groupby(["model", "node"])["relevance_match"].mean() * 100
    print(relevance_match.round(1).to_string())

# Score differences
print("\n3. AVERAGE SCORE DIFFERENCE FROM BASELINE:\n")
# The "score_diff" column only exists when at least one run was comparable.
# Treat a missing column like an empty result so the fallback message is
# printed instead of leaving the section blank.
if "score_diff" in df_comparison.columns:
    score_diff = df_comparison[df_comparison["score_diff"].notna()].groupby(["model", "node"])["score_diff"].mean()
else:
    score_diff = pd.Series(dtype=float)

if not score_diff.empty:
    print(score_diff.round(2).to_string())
else:
    print("No comparable scores available")

EVALUATION SUMMARY

1. SUCCESS RATE BY MODEL AND NODE:

model         node      
llama3.1:8b   analyst       100.0
              entry         100.0
              planner       100.0
              researcher    100.0
qwen2.5:1.5b  analyst       100.0
              entry         100.0
              planner       100.0
              researcher    100.0

2. RELEVANCE MATCH WITH BASELINE:

model         node      
llama3.1:8b   analyst       100.0
              entry          66.7
              planner       100.0
              researcher    100.0
qwen2.5:1.5b  analyst       100.0
              entry         100.0
              planner       100.0
              researcher    100.0

3. AVERAGE SCORE DIFFERENCE FROM BASELINE:

model         node      
llama3.1:8b   analyst       0.00
              entry        -0.30
              planner      -0.05
              researcher   -0.05
qwen2.5:1.5b  analyst      -0.25
              entry        -0.05
              planner      -0.10
             

In [11]:
def _preview(text, limit):
    """Shorten `text` to `limit` characters, appending '...' only when it was cut."""
    text = text or ""
    return f"{text[:limit]}..." if len(text) > limit else text


# Detailed comparison for each post
print("\n" + "="*60)
print("DETAILED RESPONSE COMPARISON")
print("="*60)

for post_data in TEST_POSTS:
    post_id = post_data["id"]
    print(f"\n--- Post: {post_id} ---")
    print(f"Text: {_preview(post_data['text'], 80)}")

    baseline = baseline_results.get(post_id, {})
    baseline_ok = bool(baseline.get("success"))
    if baseline_ok:
        print("\nBASELINE (GPT-4):")
        print(f"  Relevant: {baseline['relevance_analysis']['relevant']}")
        print(f"  Reasoning: {_preview(baseline['relevance_analysis'].get('reasoning', ''), 150)}")
        if baseline['relevance_analysis']['relevant']:
            print(f"  Score: {baseline['response']['score']}")
            print(f"  Justification: {_preview(baseline['response'].get('justification', ''), 150)}")

    for test_model in TEST_MODELS:
        print(f"\n{test_model}:")
        for node in nodes:
            test = test_results.get(test_model, {}).get(node, {}).get(post_id, {})
            if test.get("success"):
                rel = test['relevance_analysis']['relevant']
                score = test.get('response', {}).get('score', 'N/A') if rel else 'N/A'
                # Without a successful baseline there is nothing to agree or
                # disagree with, so show "?" rather than a misleading mismatch.
                if not baseline_ok:
                    baseline_match = "?"
                elif rel == baseline['relevance_analysis']['relevant']:
                    baseline_match = "✓"
                else:
                    baseline_match = "✗"
                print(f"  [{node}] relevant={rel} {baseline_match}, score={score}")
            else:
                print(f"  [{node}] FAILED: {test.get('error', 'Unknown')[:30]}")


DETAILED RESPONSE COMPARISON

--- Post: irrelevant_1 ---
Text: Bom dia!

BASELINE (GPT-4):
  Relevant: False
  Reasoning: Apenas uma saudação, sem afirmações factuais verificáveis...

llama3.1:8b:
  [entry] relevant=False ✓, score=N/A
  [planner] relevant=False ✓, score=N/A
  [researcher] relevant=False ✓, score=N/A
  [analyst] relevant=False ✓, score=N/A

qwen2.5:1.5b:
  [entry] relevant=False ✓, score=N/A
  [planner] relevant=False ✓, score=N/A
  [researcher] relevant=False ✓, score=N/A
  [analyst] relevant=False ✓, score=N/A

--- Post: fake_news_1 ---
Text: Cloroquina cura COVID

BASELINE (GPT-4):
  Relevant: True
  Reasoning: Contém afirmação verificável sobre eficácia médica...
  Score: 1.0
  Justification: A afirmação de que a cloroquina cura COVID-19 é falsa. As pesquisas indicam que a eficácia da cloroquina no tratamento da COVID-19 é insuficiente e a ...

llama3.1:8b:
  [entry] relevant=True ✓, score=0.7
  [planner] relevant=True ✓, score=0.9
  [researcher] relevant=True ✓, s

## 6. Save Results

In [12]:
# Persist the raw run data as JSON so it can be analysed outside the notebook
evaluation_output = dict(
    timestamp=datetime.now().isoformat(),
    baseline_model=BASELINE_MODEL,
    test_models=TEST_MODELS,
    test_posts=TEST_POSTS,
    baseline_results=baseline_results,
    test_results=test_results,
)

# ensure_ascii=False keeps accented characters readable in the file
with open("evaluation_results.json", "w", encoding="utf-8") as results_file:
    json.dump(evaluation_output, results_file, ensure_ascii=False, indent=2)

print("Results saved to evaluation_results.json")

# Export the comparison table as well, unless there is nothing to write
if not df_comparison.empty:
    df_comparison.to_csv("evaluation_comparison.csv", index=False)
    print("Comparison data saved to evaluation_comparison.csv")

Results saved to evaluation_results.json
Comparison data saved to evaluation_comparison.csv
