### Qwen vs Grok
This notebook compares the accuracy and speed of Qwen (served via Together.ai) against Grok.

In [1]:
import asyncio
import time
import os
from dataclasses import dataclass
from typing import Literal, Union
import httpx
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel




from dotenv import load_dotenv
# Load provider credentials from the shared .env file; override=True lets the
# file win over values already present in the process environment.
load_dotenv('../envs/.env', override=True)

# Read the keys and actually verify they are present. Only the *names* of
# missing keys are reported, so no secret ever ends up in the cell output.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
GROK_API_KEY = os.environ.get("GROK_API_KEY")

_missing_keys = [
    key_name
    for key_name, key_value in (
        ("TOGETHER_API_KEY", TOGETHER_API_KEY),
        ("GROK_API_KEY", GROK_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    # Warn instead of raising so a reader holding only one key can still run
    # the cells that do not need the other provider.
    print(
        f"WARNING: missing API key(s): {', '.join(_missing_keys)}. "
        "Check ../envs/.env before running the benchmark cells."
    )



@dataclass
class TestDeps:
    """Dependency container handed to the benchmark agent on each run."""

    # The topic/prompt text the quiz is being generated for.
    query: str


class QuizItemAnswer(BaseModel):
    # One answer option of a quiz question: the option text, the reasoning
    # behind it, and a flag telling whether this option is the right one.
    answer: str = Field(
        description="The answer text",
    )
    explanation: str = Field(
        description="The explanation text",
    )
    correct: bool = Field(
        description="Whether the answer is correct",
    )


class QuizItemTest(BaseModel):
    # A single quiz question together with its answer options.
    question: str = Field(
        description="The quiz question text",
    )
    # The list length is pinned to exactly 4. The "only one correct" rule lives
    # in the description text only; nothing in this model validates it.
    answers: list[QuizItemAnswer] = Field(
        min_length=4,
        max_length=4,
        description="4 answers with only one correct",
    )


class TestOutput(BaseModel):
    # Structured result the benchmark agent must return: a fixed "quiz" mode
    # tag plus a list constrained to exactly 5 quiz items.
    mode: Literal["quiz"] = "quiz"
    quiz_items: list[QuizItemTest] = Field(
        min_length=5,
        max_length=5,
        description="Array of quiz items",
    )


async def test_llm_performance(
    model: Union[str, OpenAIChatModel],
    query: str,
    num_runs: int = 3
) -> dict:
    """
    Benchmark a single model by repeatedly requesting a structured quiz.

    One ``Agent`` is built for this call (output validated against
    ``TestOutput``) and is then run ``num_runs`` times, one run after the
    other. Every run sends the prompt
    ``"Generate 5 quiz questions about: {query}"``, so ``query`` should be a
    topic rather than a full instruction.

    Args:
        model: The model identifier (e.g., "gpt-4o-mini", "qwen-2.5-72b-instruct")
               or an OpenAIChatModel instance for custom configuration
        query: The topic interpolated into the quiz-generation prompt
        num_runs: Number of times to run the test for averaging

    Returns:
        Dictionary with performance metrics:
            - "model": display name of the model
            - "query", "num_runs": the inputs, echoed back
            - "successful_runs" / "failed_runs": counts of each outcome
            - "timings": elapsed seconds of each successful run
            - "avg_time" / "min_time" / "max_time": statistics over "timings",
              or None when no run succeeded
            - "results": the outputs of the successful runs
            - "errors": ``str(exception)`` for every failed run

    Notes:
        Exceptions raised by a run are caught, recorded in "errors" and do not
        abort the remaining runs. Failed runs do not contribute to "timings".
    """
    agent = Agent(
        model=model,
        output_type=TestOutput,
        deps_type=TestDeps,
    )

    # Get model name for display
    model_name = model if isinstance(model, str) else f"{model.model_name}"

    timings = []
    results = []
    errors = []

    for i in range(num_runs):
        print(f"Run {i+1}/{num_runs} for {model_name}...")
        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted, which would corrupt the measured
        # interval.
        start_time = time.perf_counter()

        try:
            result = await agent.run(
                f"Generate 5 quiz questions about: {query}",
                deps=TestDeps(query=query)
            )
            # The interval covers the whole awaited agent.run() call.
            elapsed = time.perf_counter() - start_time

            timings.append(elapsed)
            # Use .output instead of .data for newer pydantic-ai versions
            output = result.output if hasattr(result, 'output') else result.data
            results.append(output)
            print(f"  ✓ Completed in {elapsed:.2f}s")

        except Exception as e:
            elapsed = time.perf_counter() - start_time
            errors.append(str(e))
            print(f"  ✗ Failed after {elapsed:.2f}s: {e}")

    return {
        "model": model_name,
        "query": query,
        "num_runs": num_runs,
        "successful_runs": len(results),
        "failed_runs": len(errors),
        "timings": timings,
        "avg_time": sum(timings) / len(timings) if timings else None,
        "min_time": min(timings) if timings else None,
        "max_time": max(timings) if timings else None,
        "results": results,
        "errors": errors
    }


async def compare_models(
    models: list[Union[str, OpenAIChatModel]],
    query: str,
    num_runs: int = 3
) -> dict:
    """
    Compare multiple models on the same query.

    The models are benchmarked one after another with
    ``test_llm_performance`` and a per-model summary is printed at the end.

    Args:
        models: List of model identifiers or OpenAIChatModel instances to compare
        query: The query/prompt to test with
        num_runs: Number of times to run each model

    Returns:
        Dictionary with comparison results:
            - "query": the query that was used
            - "models": the per-model metric dicts, in the order of ``models``
            - "fastest": name of the model with the lowest average time among
              the models that had at least one successful run, or None when
              no model succeeded (or ``models`` is empty)
    """
    print(f"\n{'='*60}")
    print(f"Comparing {len(models)} models on query: '{query}'")
    print(f"{'='*60}\n")

    results = []
    for model in models:
        model_name = model if isinstance(model, str) else f"{model.model_name}"
        print(f"\nTesting {model_name}...")
        result = await test_llm_performance(model, query, num_runs)
        results.append(result)

    # Print summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f"\n{r['model']}:")
        # avg/min/max are all None exactly when no run succeeded. Compare
        # against None explicitly so a legitimate 0.0 is never read as a
        # failure, and so failed models do not emit stray blank lines.
        if r['avg_time'] is not None:
            print(f"  Average time: {r['avg_time']:.2f}s")
            print(f"  Min time: {r['min_time']:.2f}s")
            print(f"  Max time: {r['max_time']:.2f}s")
        else:
            print("  Failed")
        print(f"  Success rate: {r['successful_runs']}/{r['num_runs']}")

    # Only models with at least one timed run can be ranked.
    timed = [r for r in results if r['avg_time'] is not None]
    fastest = min(timed, key=lambda r: r['avg_time'])['model'] if timed else None

    return {
        "query": query,
        "models": results,
        "fastest": fastest
    }


# Example invocation cells follow. API keys are never hard-coded in the notebook; they are read from environment variables.


In [2]:
# Example: create a custom model instance for Qwen served via Together.ai (Grok is referenced by its string identifier in the next cell)
from pydantic_ai.providers.openai import OpenAIProvider
from pydantic_ai.providers.grok import GrokProvider

# Qwen model via Together.ai: point the OpenAI provider at Together's base URL
# and authenticate with the key taken from the environment.
together_provider = OpenAIProvider(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ.get("TOGETHER_API_KEY"),
)

qwen_together = OpenAIChatModel(
    "Qwen/Qwen3-235B-A22B-Instruct-2507-tput",
    provider=together_provider,
)



In [3]:
# Example 1: Mix of string models and custom OpenAIModel instances
results = await compare_models(
    models=[
       

        qwen_together,          # Custom OpenAIModel instance
         "grok:grok-4-fast-non-reasoning",  # String model identifier
    ],
    query="Generate me 1 quiz questions about Python programming basics. ",
    num_runs=2
)


Comparing 2 models on query: 'Generate me 1 quiz questions about Python programming basics. '


Testing Qwen/Qwen3-235B-A22B-Instruct-2507-tput...
Run 1/2 for Qwen/Qwen3-235B-A22B-Instruct-2507-tput...
  ✓ Completed in 33.00s
Run 2/2 for Qwen/Qwen3-235B-A22B-Instruct-2507-tput...
  ✓ Completed in 29.22s

Testing grok:grok-4-fast-non-reasoning...
Run 1/2 for grok:grok-4-fast-non-reasoning...
  ✓ Completed in 3.82s
Run 2/2 for grok:grok-4-fast-non-reasoning...
  ✓ Completed in 4.09s

SUMMARY

Qwen/Qwen3-235B-A22B-Instruct-2507-tput:
  Average time: 31.11s
  Min time: 29.22s
  Max time: 33.00s
  Success rate: 2/2

grok:grok-4-fast-non-reasoning:
  Average time: 3.95s
  Min time: 3.82s
  Max time: 4.09s
  Success rate: 2/2
