In [None]:
!pip install --force-reinstall ../dist/alquimia_fair_forge-0.0.1.tar.gz -q

In [None]:
from fair_forge.metrics import BestOf
from pydantic import SecretStr
from helpers.retriever import LocalRetriever

In [None]:
from getpass import getpass

# Ask for the judge API key interactively (input is not echoed) and wrap it in
# SecretStr right away, so the plaintext is never bound to a notebook variable.
judge_api_key = SecretStr(
    getpass(prompt="Please enter your Judge API key: ")
)

In [None]:
# Judge settings, named so they are easy to spot and tweak.
JUDGE_MODEL = "llama-3.3-70b-versatile"
JUDGE_CRITERIA = "Decide which assistant is more helpful and accurate if I want a high level of emotions in the response"

# Run the BestOf metric. The retriever *class* (not an instance) is handed over,
# together with the judge credentials, model name and comparison criteria.
# The result is indexable; the last cell displays its first element.
metrics = BestOf.run(
    LocalRetriever,
    judge_api_key=judge_api_key,
    judge_model=JUDGE_MODEL,
    verbose=False,
    criteria=JUDGE_CRITERIA,
)

In [None]:
def _print_contestant_points(contestant_id, reasoning):
    """Print the judge's strengths and weaknesses for a single contestant.

    The lists are looked up in ``reasoning`` under the keys
    ``"<contestant_id>_strengths"`` and ``"<contestant_id>_weaknesses"``.
    A missing key prints the header with no bullets beneath it.
    """
    print(f"\n{contestant_id.upper()}:")
    print("  ✅ Strengths:")
    for strength in reasoning.get(f"{contestant_id}_strengths", []):
        print(f"     • {strength}")
    print("  ⚠️  Weaknesses:")
    for weakness in reasoning.get(f"{contestant_id}_weaknesses", []):
        print(f"     • {weakness}")


def print_bestof_metric_simple(metric, width=80) -> None:
    """Pretty-print one BestOf tournament result to stdout.

    The report has four parts: a header with the session and winner, a compact
    bracket (one entry per contest), a detailed per-contest analysis, and a
    champion summary that aggregates the verdicts, strengths and weaknesses
    from every contest the overall winner won.

    Args:
        metric: Object exposing ``session_id``, ``bestof_winner_id`` and
            ``bestof_contests``. Each contest must expose ``round``,
            ``left_id``, ``right_id``, ``winner_id``, ``confidence``,
            ``verdict`` and ``reasoning`` (a mapping supporting ``.get``).
            ``confidence`` is multiplied by 100 for display, so it is assumed
            to be a 0-1 fraction -- confirm against the metric schema.
        width: Column width used for rules and centred titles. Defaults to 80,
            the width that was previously hard-coded.

    Returns:
        None. All output goes to stdout via ``print``.
    """
    heavy_rule = "=" * width
    light_rule = "-" * width
    box_rule = "─" * width
    winner_id = metric.bestof_winner_id
    contests = metric.bestof_contests

    print("\n" + heavy_rule)
    print(f"{'BEST OF TOURNAMENT RESULTS':^{width}}")
    print(heavy_rule)
    print(f"Session ID: {metric.session_id}")
    print(f"Winner: {winner_id}")
    print(heavy_rule)

    # Tournament bracket: one short entry per contest.
    print(f"\n{'TOURNAMENT BRACKET':^{width}}")
    print(light_rule)
    for contest in contests:
        left_mark = "✓" if contest.winner_id == contest.left_id else "✗"
        right_mark = "✓" if contest.winner_id == contest.right_id else "✗"

        print(f"\nRound {contest.round}:")
        print(f"  [{left_mark}] {contest.left_id}")
        print(f"  [{right_mark}] {contest.right_id}")
        print(f"  → Winner: {contest.winner_id} (Confidence: {contest.confidence*100:.0f}%)")

    # Detailed rounds: verdict plus per-contestant reasoning.
    print(f"\n\n{'DETAILED ANALYSIS':^{width}}")
    print(heavy_rule)
    for contest in contests:
        print(f"\n{box_rule}")
        print(f"ROUND {contest.round}: {contest.left_id} vs {contest.right_id}")
        print(box_rule)
        print(f"Winner: {contest.winner_id}")
        print(f"Confidence: {contest.confidence*100:.0f}%")

        print("\n📝 VERDICT:")
        print(f"   {contest.verdict}\n")

        _print_contestant_points(contest.left_id, contest.reasoning)
        _print_contestant_points(contest.right_id, contest.reasoning)

    # Final winner summary.
    champion_banner = f"🏆 CHAMPION: {winner_id} 🏆"
    print("\n" + heavy_rule)
    print(f"{champion_banner:^{width}}")
    print(heavy_rule)

    # Only contests won by the overall winner feed the summary.
    won_contests = [c for c in contests if c.winner_id == winner_id]
    all_strengths = []
    all_weaknesses = []
    for contest in won_contests:
        all_strengths.extend(contest.reasoning.get(f"{winner_id}_strengths", []))
        all_weaknesses.extend(contest.reasoning.get(f"{winner_id}_weaknesses", []))

    why_title = f"WHY {str(winner_id).upper()} WON"
    print(f"\n{why_title:^{width}}")
    print(light_rule)

    if won_contests:
        print("\n📋 Key Verdicts:")
        for contest in won_contests:
            # Label with the contest's real round: a running counter is wrong
            # whenever the champion did not win a contest in every round.
            print(f"\n  Round {contest.round}:")
            print(f"  {contest.verdict}")

    if all_strengths:
        print("\n✅ Overall Strengths:")
        for strength in all_strengths:
            print(f"  • {strength}")

    if all_weaknesses:
        print("\n⚠️  Overall Weaknesses:")
        for weakness in all_weaknesses:
            print(f"  • {weakness}")

In [None]:
# Show the report for the first metric returned by the BestOf run.
first_metric = metrics[0]
print_bestof_metric_simple(first_metric)