In [1]:
import asyncio
import sys
from pathlib import Path
from dotenv import load_dotenv
import os

project_root = Path.cwd().parents[0] # ajusta si tu notebook está más adentro
sys.path.append(str(project_root))
load_dotenv()



True

In [18]:
# Cargar datos y crear dataset
from data_processing.data_loader import DataLoader
from data_processing.incident_consolidator import IncidentConsolidator
from pathlib import Path

execution_date = "2025-09-10"

# Configurar rutas explícitamente para usar 'datos' en lugar de 'datos_ejemplo'
# Desde notebooks/, subir un nivel para llegar a la raíz del proyecto
project_root = Path().resolve().parent
datos_path = project_root / "datos"

loader = DataLoader(
    base_path=datos_path,
    cv_path=datos_path / "cv",
    daily_path=datos_path / "daily_files",
    feedback_path=datos_path / "feedback"
)

source_ids = loader.get_all_source_ids()

print(f"Fecha de ejecución: {execution_date}")
print(f"Usando rutas:")
print(f"- Base: {loader.base_path}")
print(f"- CV: {loader.cv_path}")
print(f"- Daily: {loader.daily_path}")
print(f"- Feedback: {loader.feedback_path}")
print(f"Fuentes encontradas: {len(source_ids)}")
print(f"Source IDs: {source_ids}")


Fecha de ejecución: 2025-09-10
Usando rutas:
- Base: C:\Users\EQUIPO\Documents\AI\Prueba tecnica agentes\agent_factory\datos
- CV: C:\Users\EQUIPO\Documents\AI\Prueba tecnica agentes\agent_factory\datos\cv
- Daily: C:\Users\EQUIPO\Documents\AI\Prueba tecnica agentes\agent_factory\datos\daily_files
- Feedback: C:\Users\EQUIPO\Documents\AI\Prueba tecnica agentes\agent_factory\datos\feedback
Fuentes encontradas: 18
Source IDs: ['195385', '195436', '195439', '196125', '199944', '207936', '207938', '209773', '211544', '220504', '220505', '220506', '224602', '224603', '228036', '228038', '239611', '239613']


In [19]:
# Crear el dataset con todos los incidentes detectados
consolidator = IncidentConsolidator(execution_date)
dataset = consolidator.build_dataset(source_ids, loader)

print(f"Dataset creado con {len(dataset)} fuentes")
print(f"Fuentes en dataset: {list(dataset.keys())}")

# Mostrar ejemplo de una fuente
sample_source = list(dataset.keys())[0] if dataset else None
if sample_source:
    print(f"\nEjemplo de fuente {sample_source}:")
    sample_data = dataset[sample_source]
    print(f"- CV disponible: {'cv_text' in sample_data}")
    print(f"- Archivos del día: {len(sample_data.get('daily_files', []))}")
    print(f"- Archivos semana pasada: {len(sample_data.get('last_week_files', []))}")
    print(f"- Incidentes detectados: {list(sample_data.get('incidents', {}).keys())}")


Dataset creado con 18 fuentes
Fuentes en dataset: ['195385', '195436', '195439', '196125', '199944', '207936', '207938', '209773', '211544', '220504', '220505', '220506', '224602', '224603', '228036', '228038', '239611', '239613']

Ejemplo de fuente 195385:
- CV disponible: True
- Archivos del día: 7
- Archivos semana pasada: 37
- Incidentes detectados: ['missing', 'duplicated', 'empty', 'volume_variation', 'schedule', 'historical']


In [20]:
# Crear las herramientas para el agente
from report_builder import build_incident_toolkit

tools = build_incident_toolkit(dataset, execution_date)
print(f"Herramientas creadas: {len(tools)}")
print(f"Nombres de herramientas: {[tool.__name__ for tool in tools]}")


Herramientas creadas: 3
Nombres de herramientas: ['list_sources', 'get_source_cv_and_data', 'get_execution_date_info']


In [21]:
# Probar las herramientas limpias
print("=== PROBANDO HERRAMIENTAS LIMPIAS ===")

# 1. list_sources
list_sources = next(tool for tool in tools if tool.__name__ == "list_sources")
print("\n1. list_sources():")
sources_result = list_sources()
print(sources_result)

# 2. get_source_cv_and_data para la fuente 195385
get_source_cv_and_data = next(tool for tool in tools if tool.__name__ == "get_source_cv_and_data")
print(f"\n2. get_source_cv_and_data('195385'):")
cv_result = get_source_cv_and_data('195385')
print(cv_result[:1000] + "..." if len(cv_result) > 1000 else cv_result)

print(f"\n✅ Toolkit limpio con {len(tools)} herramientas:")
for tool in tools:
    print(f"  - {tool.__name__}")


=== PROBANDO HERRAMIENTAS LIMPIAS ===

1. list_sources():
{"execution_date": "2025-09-10", "sources": [{"source_id": "195385", "display_name": "_Settlement_Layout_2", "files_today": 7, "files_last_weekday": 37, "expected_files": 32, "missing_estimate": 25, "has_cv": true, "first_upload_utc": "2025-09-10T08:08:36.758445+00:00", "last_upload_utc": "2025-09-10T08:08:43.241813+00:00"}, {"source_id": "195436", "display_name": "MyPal_DBR RX", "files_today": 2, "files_last_weekday": 1, "expected_files": 1, "missing_estimate": 0, "has_cv": true, "first_upload_utc": "2025-09-10T15:11:31.898388+00:00", "last_upload_utc": "2025-09-10T15:11:32.988282+00:00"}, {"source_id": "195439", "display_name": "MyPal_Activity report", "files_today": 1, "files_last_weekday": 1, "expected_files": 1, "missing_estimate": 0, "has_cv": true, "first_upload_utc": "2025-09-10T02:37:10.679913+00:00", "last_upload_utc": "2025-09-10T02:37:10.679913+00:00"}, {"source_id": "196125", "display_name": "_Settlement_Layout_1", 

In [6]:
# Configurar y ejecutar el agente
import asyncio
from google.adk.sessions import InMemorySessionService
from google.adk.runners import Runner
from google.genai import types
from adk_components.agent_definition import create_report_agent
from config import settings

async def run_agent_with_prompt(prompt: str):
    """Ejecuta el agente con un prompt específico"""
    agent = create_report_agent(tools)
    session_service = InMemorySessionService()
    session = await session_service.create_session(
        app_name=settings.APP_NAME, user_id=settings.USER_ID
    )
    runner = Runner(agent=agent, app_name=settings.APP_NAME, session_service=session_service)
    content = types.Content(role='user', parts=[types.Part(text=prompt)])
    
    async for event in runner.run_async(user_id=session.user_id, session_id=session.id, new_message=content):
        if event.is_final_response():
            if event.content and event.content.parts:
                return event.content.parts[0].text
            return ''
    return ''

print("Función run_agent_with_prompt definida y lista para usar")


Función run_agent_with_prompt definida y lista para usar


In [10]:
# PRUEBA DEL NUEVO ENFOQUE "LLM + CVs"
# El agente ahora usa get_source_cv_and_data() para análisis experto


prompt_cv_analysis = f"""
OBJECTIVE: Generate a precise report based solely on CV analysis and real data.

AVAILABLE TOOLS (USE ONLY THESE):
- list_sources(): general overview of all sources
- get_source_cv_and_data(source_id): complete CV and raw data for expert analysis

SPECIFIC INSTRUCTIONS:
1. USE ONLY list_sources() and get_source_cv_and_data() - DO NOT use other tools
2. For EACH source in list_sources(), use get_source_cv_and_data() for complete analysis
3. Read each source's CV completely to understand their normal patterns
4. Intelligently interpret whether events are normal according to the CV or true incidents
5. Determine what day of the week {execution_date} is and verify specific patterns for that day in each CV

SPECIAL CASE TO VALIDATE:
- Source 195385: Are the files that arrived normal according to its CV?
- Is the timing 08:06 UTC within expected windows?
- Is lag -1 (Saturday files arriving Sunday) normal according to the CV?

CRITICAL RULES FOR ANALYSIS:
- If the CV says something is normal, it is NOT an incident
- Only report true deviations from CV patterns
- Use real record numbers in "All Good"
- Each source appears only once in the highest severity section
- IGNORE "raw_incidents" data if it contradicts CV analysis

⚠️ CRITICAL RULE ABOUT VOLUME VARIATIONS:
- IF VOLUME DECREASE IS CAUSED BY MISSING FILES, DO NOT REPORT VOLUMNE VARIATION!
- Only report volume variation if files arrived but with fewer/more rows
- Example: If 0 files arrived and 0 rows → Only missing files (NOT volume variation)
- Example: If 2 files arrived but with 50% fewer rows → Volume variation

SEVERITY CLASSIFICATION:
🔴 URGENT: Critical missing files according to CV OR 3+ "needs attention" incidents
🟡 NEEDS ATTENTION: Volume deviations, timing outside CV windows
🟢 ALL GOOD: Everything within normal CV patterns

GENERATE THE EXECUTIVE REPORT IN ENGLISH for {execution_date}
"""

print("🚀 EJECUTANDO NUEVO ENFOQUE 'LLM + CVs'...")
print("="*80)

response = await run_agent_with_prompt(prompt_cv_analysis)



🚀 EJECUTANDO NUEVO ENFOQUE 'LLM + CVs'...


In [None]:
# Asegurate de reejecutar las celdas anteriores para que la nueva fecha sea tomada en cuenta
response_1 = await run_agent_with_prompt(prompt_cv_analysis)


In [22]:
response_2 = await run_agent_with_prompt(prompt_cv_analysis)

# Evaluación

In [30]:
def evaluate_3vs3(agent_responses, feedback_responses):
    """
    Compara 3 respuestas del agente vs 3 del feedback
    
    agent_responses = [response_1, response_2, response_3]
    feedback_responses = [feedback_sept8, feedback_sept9, feedback_sept10]
    """
    
    print("📊 AGENT vs FEEDBACK EVALUATION (3 vs 3)")
    print("=" * 50)
    
    results = []
    
    # Comparar cada par (agente vs feedback correspondiente)
    for i, (agent_resp, feedback_resp) in enumerate(zip(agent_responses, feedback_responses), 1):
        
        print(f"\n🗓️ COMPARISON {i}:")
        
        # Parsear ambas respuestas
        agent_parsed = parse_simple(agent_resp)
        feedback_parsed = parse_simple(feedback_resp)
        
        # Calcular métricas
        accuracy = calculate_accuracy(agent_parsed, feedback_parsed)
        
        # Mostrar detalles
        print(f"  📈 Accuracy: {accuracy:.1%}")
        print(f"  🔴 Agent Urgent: {len(agent_parsed['urgent'])} | Feedback: {len(feedback_parsed['urgent'])}")
        print(f"  🟡 Agent Needs Att: {len(agent_parsed['needs_attention'])} | Feedback: {len(feedback_parsed['needs_attention'])}")
        print(f"  🟢 Agent All Good: {len(agent_parsed['all_good'])} | Feedback: {len(feedback_parsed['all_good'])}")
        
        # Identificar diferencias específicas
        differences = find_differences(agent_parsed, feedback_parsed)
        if differences:
            print(f"  ❌ Differences:")
            for diff in differences:
                print(f"    {diff}")
        else:
            print(f"  ✅ Perfect match!")
        
        results.append({
            'comparison': i,
            'accuracy': accuracy,
            'differences': differences
        })
    
    # Resumen general
    print(f"\n📋 OVERALL SUMMARY:")
    avg_accuracy = sum(r['accuracy'] for r in results) / len(results)
    print(f"  📈 Average Accuracy: {avg_accuracy:.1%}")
    
        
    return results

def parse_simple(response_text):
    """Extrae IDs de fuentes por severidad"""
    import re
    
    result = {'urgent': [], 'needs_attention': [], 'all_good': []}
    
    # Patrones más flexibles para ambos formatos
    urgent_patterns = [
        r'\*\s*Urgent Action Required\*(.*?)(?:\*\s*Needs Attention\*|\*\s*No Action Needed\*|\*\s*All Good\*|$)',
        r'Urgent Action Required(.*?)(?:Needs Attention|No Action Needed|All Good|$)'
    ]
    
    needs_patterns = [
        r'\*\s*Needs Attention\*(.*?)(?:\*\s*All Good\*|\*\s*No Action Needed\*|$)',
        r'Needs Attention(.*?)(?:All Good|No Action Needed|$)'
    ]
    
    good_patterns = [
        r'\*\s*(?:All Good|No Action Needed)\*(.*?)$',
        r'(?:All Good|No Action Needed)(.*?)$'
    ]
    
    # Buscar sección URGENT
    for pattern in urgent_patterns:
        match = re.search(pattern, response_text, re.DOTALL | re.IGNORECASE)
        if match:
            result['urgent'] = re.findall(r'\(id:\s*(\d+)\)', match.group(1))
            break
    
    # Buscar sección NEEDS ATTENTION
    for pattern in needs_patterns:
        match = re.search(pattern, response_text, re.DOTALL | re.IGNORECASE)
        if match:
            result['needs_attention'] = re.findall(r'\(id:\s*(\d+)\)', match.group(1))
            break
    
    # Buscar sección ALL GOOD
    for pattern in good_patterns:
        match = re.search(pattern, response_text, re.DOTALL | re.IGNORECASE)
        if match:
            result['all_good'] = re.findall(r'\(id:\s*(\d+)\)', match.group(1))
            break
    
    return result

def calculate_accuracy(agent_parsed, feedback_parsed):
    """Calcula precisión comparando clasificaciones"""
    
    # Obtener todas las fuentes mencionadas en ambos
    all_sources = set()
    for parsed in [agent_parsed, feedback_parsed]:
        for sources in parsed.values():
            all_sources.update(sources)
    
    if not all_sources:
        return 1.0
    
    correct = 0
    
    for source_id in all_sources:
        # Determinar clasificación del agente
        agent_severity = None
        for severity, sources in agent_parsed.items():
            if source_id in sources:
                agent_severity = severity
                break
        
        # Determinar clasificación del feedback
        feedback_severity = None
        for severity, sources in feedback_parsed.items():
            if source_id in sources:
                feedback_severity = severity
                break
        
        # Comparar
        if agent_severity == feedback_severity:
            correct += 1
    
    return correct / len(all_sources)

def find_differences(agent_parsed, feedback_parsed):
    """Encuentra diferencias específicas"""
    
    differences = []
    
    # Obtener todas las fuentes
    all_sources = set()
    for parsed in [agent_parsed, feedback_parsed]:
        for sources in parsed.values():
            all_sources.update(sources)
    
    for source_id in all_sources:
        # Clasificación del agente
        agent_severity = 'missing'
        for severity, sources in agent_parsed.items():
            if source_id in sources:
                agent_severity = severity
                break
        
        # Clasificación del feedback
        feedback_severity = 'missing'
        for severity, sources in feedback_parsed.items():
            if source_id in sources:
                feedback_severity = severity
                break
        
        # Si son diferentes
        if agent_severity != feedback_severity:
            differences.append(f"Source {source_id}: Agent={agent_severity} vs Feedback={feedback_severity}")
    
    return differences




agent_responses = [response, response_1, response_2]


feedback_responses = [
    """
*Report generated at UTC HOUR*: 23:30 UTC 
*  Urgent Action Required* 
• * _Payments_Layout_1_V3 (id: 220504)* – 2025-09-07: 14 files missing past 08:08–08:18 UTC window — entities: Clien_CBK, WhiteLabel, Shop, Google, POC, Market, Innovation, Donation, Beneficios, ApplePay, Anota-ai, AddCard, Clien _payments, ClienX_Clube ? *Action:* Notify provider to generate/re-send; re-run ingestion and verify completeness 
• * _Payments_Layout_2_V3 (id: 220505)* – 2025-09-07: 2 files missing past 08:02–08:11 UTC — expected: *_Clien _Debito_payments_accounting_report_2025_09_07.csv; *_Clien _MVP_payments_accounting_report_2025_09_07.csv ? *Action:* Notify provider to generate/re-send; re-run ingestion and verify completeness 
• * _Payments_Layout_3_V3 (id: 220506)* – 2025-09-06: 1 file missing past 08:03–08:19 UTC — expected: [hash]_Clien _3DS_payments_accounting_report_2025_09_06.csv ? *Action:* Notify provider to generate/re-send; re-run ingestion and verify completeness 

*  Needs Attention* 
• * _Settlement_Layout_2 (id: 195385)* – 2025-09-08: Saipos file delivered early at 08:06 UTC (usual ~17:20) — Confirm schedule change; adjust downstream triggers if needed
• * _Sale_adjustments_3 (id: 239611)* – 2025-09-08: ClienX volume 61,639 (> usual Monday 40k–55k) — Confirm coverage/window; monitor next run 
• * _STL adjustments_3 (id: 239613)* – 2025-09-08: ClienX volume 56,277 (>95% bound 50,211) — Validate downstream completed; track if persists • * _STL payments_2 (id: 228038)* – 2025-09-08: ClienX 1,023,337 (>95% band 869,600) and lag shifted to near-real-time — Confirm intentional lag change; keep a short-term volume watch

*  No Action Needed* 
• *Desco Devoluções (id: 211544)* – 2025-09-08: `[6,798] records`
 • *Desco   (id: 209773)* – 2025-09-08: `[190,541] records` 
• *Itm Devolução (id: 224603)* – 2025-09-08: `[26,364] records` 
• *Itm Pagamentos (id: 224602)* – 2025-09-08: `[678,305] records` 
• * _Sale payments_2 (id: 228036)* – 2025-09-08: `[1,233,496] records` 
• *MyPal_Activity report (id: 195439)* – 2025-09-08: `[347,476] records`
 • *MyPal_DBR RX (id: 195436)* – 2025-09-08: `[0] records` 
• *Soop - Tipo 2 (id: 207936)* – 2025-09-08: `[4,060] records`
 • *Soop - Tipo 3 (id: 207938)* – 2025-09-08: `[4,066] records`
 • *Soop Transaction   3 (id: 199944)* – 2025-09-08: `[179,070] records` 
• All other recent files appear normal
""",
    
    """
Report generated at UTC HOUR*: 20:30 
*:círculo_rojo: Urgent Action Required* 
• * _Payments_Layout_1_V3 (id: 220504)* – 2025-09-10 (acct date 2025_09_09): 14 expected files missing; only 4 arrived (DataOnly, Saipos, Anotaai_Wallet [empty], safemode [empty]). Missing suffixes: *_Clien _CBK_..._2025_09_09.csv, *_Clien _WhiteLabel_..._2025_09_09.csv, *_Clien _Shop_..._2025_09_09.csv, *_Clien _PwGoogle_..._2025_09_09.csv, *_Clien _POC_..._2025_09_09.csv, *_Clien _Market_..._2025_09_09.csv, *_Clien _Innovation_..._2025_09_09.csv, *_Clien _Donation_..._2025_09_09.csv, *_Clien _Beneficios_..._2025_09_09.csv, *_Clien _ApplePay_..._2025_09_09.csv, *_Clien _Anota-ai_..._2025_09_09.csv, *_Clien _AddCard_..._2025_09_09.csv, *_Clien _payments_..._2025_09_09.csv, *_ClienX_Clube_..._2025_09_09.csv ? *Action:* Request re-delivery/backfill for the 14 files; verify the 08:08–08:18 UTC job and re-ingest if objects exist upstream 
• * _Payments_Layout_2_V3 (id: 220505)* – 2025-09-10 (acct date 2025_09_09): Expected pair missing. Not present: *_Clien _Debito_payments_accounting_report_2025_09_09.csv, *_Clien _MVP_payments_accounting_report_2025_09_09.csv ? *Action:* Request re-delivery and/or re-run ingestion; check the 08:02–08:11 UTC schedule and reprocess if files are available upstream • * _Payments_Layout_3_V3 (id: 220506)* – 2025-09-10: No daily Clien _3DS file; 08:03–08:19 UTC window missed ? *Action:* Confirm whether today's Clien _3DS report (expected by pattern) was produced; ingest if present upstream or request export and backfill • * _Settlement_Layout_1 (id: 196125)* – 2025-09-10 (settlement date 2025-09-09): No files uploaded; entire morning drop (08:04–08:12 UTC) absent ? *Action:* Re-trigger the scheduled ingestion; confirm upstream generation and request re-delivery; backfill upon receipt 
• * _Settlement_Layout_2 (id: 195385)* – 2025-09-10 (settlement date 2025-09-09): Major shortfall—only 7 files (DataOnly 4; Saipos 3) vs typical 32–41; categories not present today: Clube, Donation, Shop, WhiteLabel, CBK, Beneficios, Anota-ai ? *Action:* Escalate to provider/ops; check upstream listings and re-ingest or request re-export for the absent categories 

*:círculo_amarillo_grande: Needs Attention* 

*:círculo_verde_grande: No Action Needed* 
• *Desco Devoluções (id: 211544)* – 2025-09-10: `[5322] records` 
• *Desco   (id: 209773)* – 2025-09-10: `[156356] records` 
• *Itm Devolução (id: 224603)* – 2025-09-10: `[18612] records` 
• *Itm Pagamentos (id: 224602)* – 2025-09-10: `[488161] records`
• * _Sale payments_2 (id: 228036)* – 2025-09-10: `[295257] records` 
• * _Sale_adjustments_3 (id: 239611)* – 2025-09-10: `[15292] records` 
• * _STL adjustments_3 (id: 239613)* – 2025-09-10: `[13642] records` 
• * _STL payments_2 (id: 228038)* – 2025-09-10: `[239383] records` 
• *MyPal_Activity report (id: 195439)* – 2025-09-10: `[386320] records` 
• *MyPal_DBR RX (id: 195436)* – 2025-09-10: `[206708] records` 
• *Soop - Tipo 2 (id: 207936)* – 2025-09-10: `[4986] records` 
• *Soop - Tipo 3 (id: 207938)* – 2025-09-10: `[5207] records`
• *Soop Transaction   3 (id: 199944)* – 2025-09-10: `[113926] records` 
• All other recent files appear normal""",
    
    """
Report generated at UTC HOUR*: 20:30 
*:círculo_rojo: Urgent Action Required* 
• * _Payments_Layout_1_V3 (id: 220504)* – 2025-09-10 (acct date 2025_09_09): 14 expected files missing; only 4 arrived (DataOnly, Saipos, Anotaai_Wallet [empty], safemode [empty]). Missing suffixes: *_Clien _CBK_..._2025_09_09.csv, *_Clien _WhiteLabel_..._2025_09_09.csv, *_Clien _Shop_..._2025_09_09.csv, *_Clien _PwGoogle_..._2025_09_09.csv, *_Clien _POC_..._2025_09_09.csv, *_Clien _Market_..._2025_09_09.csv, *_Clien _Innovation_..._2025_09_09.csv, *_Clien _Donation_..._2025_09_09.csv, *_Clien _Beneficios_..._2025_09_09.csv, *_Clien _ApplePay_..._2025_09_09.csv, *_Clien _Anota-ai_..._2025_09_09.csv, *_Clien _AddCard_..._2025_09_09.csv, *_Clien _payments_..._2025_09_09.csv, *_ClienX_Clube_..._2025_09_09.csv ? *Action:* Request re-delivery/backfill for the 14 files; verify the 08:08–08:18 UTC job and re-ingest if objects exist upstream 
• * _Payments_Layout_2_V3 (id: 220505)* – 2025-09-10 (acct date 2025_09_09): Expected pair missing. Not present: *_Clien _Debito_payments_accounting_report_2025_09_09.csv, *_Clien _MVP_payments_accounting_report_2025_09_09.csv ? *Action:* Request re-delivery and/or re-run ingestion; check the 08:02–08:11 UTC schedule and reprocess if files are available upstream • * _Payments_Layout_3_V3 (id: 220506)* – 2025-09-10: No daily Clien _3DS file; 08:03–08:19 UTC window missed ? *Action:* Confirm whether today's Clien _3DS report (expected by pattern) was produced; ingest if present upstream or request export and backfill • * _Settlement_Layout_1 (id: 196125)* – 2025-09-10 (settlement date 2025-09-09): No files uploaded; entire morning drop (08:04–08:12 UTC) absent ? *Action:* Re-trigger the scheduled ingestion; confirm upstream generation and request re-delivery; backfill upon receipt 
• * _Settlement_Layout_2 (id: 195385)* – 2025-09-10 (settlement date 2025-09-09): Major shortfall—only 7 files (DataOnly 4; Saipos 3) vs typical 32–41; categories not present today: Clube, Donation, Shop, WhiteLabel, CBK, Beneficios, Anota-ai ? *Action:* Escalate to provider/ops; check upstream listings and re-ingest or request re-export for the absent categories 

*:círculo_amarillo_grande: Needs Attention* 

*:círculo_verde_grande: No Action Needed* 
• *Desco Devoluções (id: 211544)* – 2025-09-10: `[5322] records` 
• *Desco   (id: 209773)* – 2025-09-10: `[156356] records` 
• *Itm Devolução (id: 224603)* – 2025-09-10: `[18612] records` 
• *Itm Pagamentos (id: 224602)* – 2025-09-10: `[488161] records`
• * _Sale payments_2 (id: 228036)* – 2025-09-10: `[295257] records` 
• * _Sale_adjustments_3 (id: 239611)* – 2025-09-10: `[15292] records` 
• * _STL adjustments_3 (id: 239613)* – 2025-09-10: `[13642] records` 
• * _STL payments_2 (id: 228038)* – 2025-09-10: `[239383] records` 
• *MyPal_Activity report (id: 195439)* – 2025-09-10: `[386320] records` 
• *MyPal_DBR RX (id: 195436)* – 2025-09-10: `[206708] records` 
• *Soop - Tipo 2 (id: 207936)* – 2025-09-10: `[4986] records` 
• *Soop - Tipo 3 (id: 207938)* – 2025-09-10: `[5207] records`
• *Soop Transaction   3 (id: 199944)* – 2025-09-10: `[113926] records` 
• All other recent files appear normal
"""
]

In [31]:
evaluate_3vs3(agent_responses, feedback_responses)

📊 AGENT vs FEEDBACK EVALUATION (3 vs 3)

🗓️ COMPARISON 1:
  📈 Accuracy: 66.7%
  🔴 Agent Urgent: 8 | Feedback: 3
  🟡 Agent Needs Att: 1 | Feedback: 4
  🟢 Agent All Good: 11 | Feedback: 10
  ❌ Differences:
    Source 228036: Agent=urgent vs Feedback=all_good
    Source 196125: Agent=urgent vs Feedback=missing
    Source 239613: Agent=urgent vs Feedback=needs_attention
    Source 195385: Agent=all_good vs Feedback=needs_attention
    Source 228038: Agent=urgent vs Feedback=needs_attention
    Source 239611: Agent=urgent vs Feedback=needs_attention

🗓️ COMPARISON 2:
  📈 Accuracy: 38.9%
  🔴 Agent Urgent: 8 | Feedback: 5
  🟡 Agent Needs Att: 3 | Feedback: 0
  🟢 Agent All Good: 8 | Feedback: 13
  ❌ Differences:
    Source 207936: Agent=urgent vs Feedback=all_good
    Source 228036: Agent=urgent vs Feedback=all_good
    Source 220505: Agent=needs_attention vs Feedback=urgent
    Source 239613: Agent=urgent vs Feedback=all_good
    Source 220506: Agent=needs_attention vs Feedback=urgent
    Sou

[{'comparison': 1,
  'accuracy': 0.6666666666666666,
  'differences': ['Source 228036: Agent=urgent vs Feedback=all_good',
   'Source 196125: Agent=urgent vs Feedback=missing',
   'Source 239613: Agent=urgent vs Feedback=needs_attention',
   'Source 195385: Agent=all_good vs Feedback=needs_attention',
   'Source 228038: Agent=urgent vs Feedback=needs_attention',
   'Source 239611: Agent=urgent vs Feedback=needs_attention']},
 {'comparison': 2,
  'accuracy': 0.3888888888888889,
  'differences': ['Source 207936: Agent=urgent vs Feedback=all_good',
   'Source 228036: Agent=urgent vs Feedback=all_good',
   'Source 220505: Agent=needs_attention vs Feedback=urgent',
   'Source 239613: Agent=urgent vs Feedback=all_good',
   'Source 220506: Agent=needs_attention vs Feedback=urgent',
   'Source 195385: Agent=all_good vs Feedback=urgent',
   'Source 199944: Agent=urgent vs Feedback=all_good',
   'Source 228038: Agent=urgent vs Feedback=all_good',
   'Source 239611: Agent=urgent vs Feedback=all_g