In [None]:
%pip install anthropic

Collecting anthropic
  Downloading anthropic-0.60.0-py3-none-any.whl.metadata (27 kB)
Downloading anthropic-0.60.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.1/293.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.60.0


# Importing Libraries

In [None]:
import asyncio
import json
import math
import os
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple

import anthropic
import openai
from openai import AsyncOpenAI

# Class EvaluationResult

In [None]:
@dataclass
class EvaluationResult:
    """Data class to hold evaluation results.

    The ``@dataclass`` decorator turns the annotated fields below into
    constructor arguments and generates ``__repr__`` / ``__eq__``; without it
    the class has no way to receive field values.
    """
    # Identifier of the model the result belongs to.
    model_id: str
    # Score for the model (presumably the 0.0-1.0 compliance ratio — confirm with callers).
    score: float
    # Count of instructions reported as followed.
    followed_instructions: int
    # Count of instructions that were evaluated.
    total_instructions: int

# Class APIKeyManager

In [None]:
class APIKeyManager:
    """Stores API keys for the OpenAI and Claude services.

    Keys set explicitly on the manager take precedence; when none is stored,
    the getters fall back to the ``OPENAI_API_KEY`` / ``ANTHROPIC_API_KEY``
    environment variables.
    """

    # Internal service name -> environment variable that may hold its key.
    _ENV_VARS = {'openai': 'OPENAI_API_KEY', 'claude': 'ANTHROPIC_API_KEY'}

    def __init__(self):
        # Explicitly configured keys, indexed by service name ('openai' / 'claude').
        self._api_keys = {}

    def configure_from_env(self) -> None:
        """Copy keys from the environment into the manager.

        Only services without an explicitly stored key are filled in, so keys
        set through ``set_openai_key`` / ``set_claude_key`` are never overwritten.
        Empty environment values are ignored.
        """
        for service, env_var in self._ENV_VARS.items():
            env_value = os.getenv(env_var)
            if env_value and not self._api_keys.get(service):
                self._api_keys[service] = env_value

    def set_openai_key(self, api_key: str) -> None:
        """Set OpenAI API key (also assigned to the module-level ``openai.api_key``)."""
        self._api_keys['openai'] = api_key
        # Also set for openai module
        openai.api_key = api_key

    def set_claude_key(self, api_key: str) -> None:
        """Set Claude API key."""
        self._api_keys['claude'] = api_key

    def get_openai_key(self) -> Optional[str]:
        """Get OpenAI API key with fallback to environment variable."""
        return self._api_keys.get('openai') or os.getenv('OPENAI_API_KEY')

    def get_claude_key(self) -> Optional[str]:
        """Get Claude API key with fallback to environment variable."""
        return self._api_keys.get('claude') or os.getenv('ANTHROPIC_API_KEY')

    def is_openai_configured(self) -> bool:
        """Check if a non-empty OpenAI API key is available."""
        # bool() rather than `is not None`: an empty-string key is unusable.
        return bool(self.get_openai_key())

    def is_claude_configured(self) -> bool:
        """Check if a non-empty Claude API key is available."""
        return bool(self.get_claude_key())

    def get_available_services(self) -> List[str]:
        """Get list of services with configured API keys ('openai' first, then 'claude')."""
        services = []
        if self.is_openai_configured():
            services.append('openai')
        if self.is_claude_configured():
            services.append('claude')
        return services


# Class BaseJudge

In [None]:
class BaseJudge(ABC):
    """Abstract base class for instruction-following judges.

    Subclasses implement :meth:`evaluate`; building the prompts and parsing
    the numeric reply are shared here.
    """

    def __init__(self, name: str):
        # Used as the "[name]" prefix of diagnostic messages.
        self.name = name

    @abstractmethod
    async def evaluate(self, instructions: List[str], original_paragraph: str, completion: str) -> int:
        """
        Evaluate how many instructions were followed exactly.

        Args:
            instructions: List of editing instructions to evaluate
            original_paragraph: The original text before editing
            completion: The edited text to evaluate

        Returns:
            Number of instructions followed exactly (0 to len(instructions))
        """
        pass

    def _create_evaluation_prompt(self, instructions: List[str], original_paragraph: str, completion: str) -> Tuple[str, str]:
        """Create system and user prompts for evaluation.

        Returns:
            Tuple of (system_prompt, user_prompt). The user prompt lists the
            instructions numbered from 1, followed by the original paragraph,
            the edited output and the counting question.
        """
        system_prompt = (
            "You are an expert evaluator of instruction-following in text editing tasks. "
            "You will be given a set of editing instructions, the original paragraph, and a modified paragraph. "
            "You must return ONLY the number of instructions that were followed exactly. "
            "Do not explain or justify. Just return a single number (0 to N)."
        )

        # Built outside the f-string so no backslash/chr(10) trick is needed inside it.
        numbered_instructions = "\n".join(
            f"{position}. {instruction}"
            for position, instruction in enumerate(instructions, start=1)
        )

        user_prompt = f"""
Instructions:
{numbered_instructions}

Original Paragraph:
{original_paragraph}

Edited Output:
{completion}

How many of the above {len(instructions)} instructions were followed exactly?
Return just the number, nothing else.
"""
        return system_prompt, user_prompt

    def _extract_score(self, response_text: Optional[str], max_instructions: int) -> int:
        """Extract numerical score from response text.

        Args:
            response_text: Raw model reply; may be None when the API returned
                no text content.
            max_instructions: Upper bound the parsed value is clamped to.

        Returns:
            The first integer found in the reply, clamped to
            [0, max_instructions]; 0 when the reply is missing or has no digits.
        """
        # A missing (non-string) reply is handled like any other unparseable answer
        # instead of raising AttributeError on .strip().
        match = re.search(r'\d+', response_text.strip()) if isinstance(response_text, str) else None
        if match:
            value = int(match.group(0))
            return max(0, min(value, max_instructions))

        print(f"[{self.name}] Unexpected format: {response_text}")
        return 0


# Class OpenAIJudge

In [None]:
class OpenAIJudge(BaseJudge):
    """Judge that asks an OpenAI chat model how many instructions were followed."""

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4"):
        """
        Build the async OpenAI client.

        Args:
            api_key: Key to use; when None, the OPENAI_API_KEY environment
                variable is read instead.
            model: Chat model name sent with every request.

        Raises:
            ValueError: If no non-empty key is available.
        """
        super().__init__("OpenAI")

        # Only a missing (None) argument triggers the environment lookup;
        # an explicitly passed empty string is rejected below.
        resolved_key = os.getenv('OPENAI_API_KEY') if api_key is None else api_key
        if not resolved_key:
            raise ValueError("OpenAI API key is required. Set it via parameter, environment variable OPENAI_API_KEY, or APIKeyManager.")

        self.client = AsyncOpenAI(api_key=resolved_key)
        self.model = model

    async def evaluate(self, instructions: List[str], original_paragraph: str, completion: str) -> int:
        """Evaluate using OpenAI GPT model.

        Returns the parsed count of followed instructions, or 0 when the
        request or the parsing raises.
        """
        system_prompt, user_prompt = self._create_evaluation_prompt(instructions, original_paragraph, completion)
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=chat_messages,
                temperature=0,
                max_tokens=50,
            )
            return self._extract_score(response.choices[0].message.content, len(instructions))
        except Exception as e:
            # Best-effort: any failure is reported and counted as zero followed instructions.
            print(f"[{self.name}] Error: {e}")
            return 0

# Class ClaudeJudge

In [None]:
class ClaudeJudge(BaseJudge):
    """Judge that asks a Claude model how many instructions were followed."""

    def __init__(self, api_key: Optional[str] = None, model: str = "claude-3-5-sonnet-20240620"):
        """
        Build the async Anthropic client.

        Args:
            api_key: Key to use; when None, the ANTHROPIC_API_KEY environment
                variable is read instead.
            model: Model name sent with every request.

        Raises:
            ValueError: If no non-empty key is available.
        """
        super().__init__("Claude")

        # Only a missing (None) argument triggers the environment lookup;
        # an explicitly passed empty string is rejected below.
        resolved_key = os.getenv('ANTHROPIC_API_KEY') if api_key is None else api_key
        if not resolved_key:
            raise ValueError("Claude API key is required. Set it via parameter, environment variable ANTHROPIC_API_KEY, or APIKeyManager.")

        self.client = anthropic.AsyncAnthropic(api_key=resolved_key)
        self.model = model

    async def evaluate(self, instructions: List[str], original_paragraph: str, completion: str) -> int:
        """Evaluate using Claude model.

        Returns the parsed count of followed instructions, or 0 when the
        request or the parsing raises.
        """
        system_prompt, user_prompt = self._create_evaluation_prompt(instructions, original_paragraph, completion)
        user_messages = [{"role": "user", "content": user_prompt}]

        try:
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=50,
                temperature=0,
                system=system_prompt,
                messages=user_messages,
            )
            return self._extract_score(response.content[0].text, len(instructions))
        except Exception as e:
            # Best-effort: any failure is reported and counted as zero followed instructions.
            print(f"[{self.name}] Error: {e}")
            return 0

## Class InstructionProcessor

In [None]:
class InstructionProcessor:
    """Utility class for processing instructions and prompts."""

    @staticmethod
    def extract_instructions_and_paragraph(prompt: str) -> Tuple[List[str], str]:
        """
        Extract instructions and paragraph from a formatted prompt.

        The instructions are the non-empty lines between the "Instructions:"
        marker and the closing "Return the final output ..." sentence; the
        paragraph is everything after that sentence.

        Args:
            prompt: The formatted prompt containing instructions and paragraph

        Returns:
            Tuple of (instructions_list, paragraph)

        Raises:
            ValueError: If markers are not found or content is empty
        """
        start_marker = "Instructions:"
        end_marker = "Return the final output as plain text with line breaks between paragraphs."

        marker_position = prompt.find(start_marker)
        if marker_position == -1:
            raise ValueError("Start or end marker not found in prompt.")
        block_start = marker_position + len(start_marker)

        # Look for the end marker only after the start marker so an earlier
        # occurrence of the closing sentence cannot produce an empty block.
        block_end = prompt.find(end_marker, block_start)
        if block_end == -1:
            raise ValueError("Start or end marker not found in prompt.")

        # One instruction per non-blank line, surrounding whitespace removed.
        instructions_list = [
            line.strip()
            for line in prompt[block_start:block_end].splitlines()
            if line.strip()
        ]

        # Everything following the end marker is the paragraph to edit.
        paragraph = prompt[block_end + len(end_marker):].strip()

        if not instructions_list:
            raise ValueError("No instructions found between markers.")
        if not paragraph:
            raise ValueError("No paragraph found after the instructions block.")

        return instructions_list, paragraph

    @staticmethod
    def chunk_instructions(instructions: List[str], n_parts: int = 2) -> List[List[str]]:
        """
        Split instructions into approximately equal chunks.

        Always returns exactly ``n_parts`` chunks in original order; when there
        are fewer instructions than parts, the trailing chunks are empty.

        Args:
            instructions: List of instructions to split
            n_parts: Number of parts to split into (must be >= 1)

        Returns:
            List of instruction chunks

        Raises:
            ValueError: If n_parts is smaller than 1
        """
        # Zero would divide by zero and a negative count would silently return
        # no chunks at all, dropping every instruction.
        if n_parts < 1:
            raise ValueError(f"n_parts must be at least 1, got {n_parts}")

        base_size, remainder = divmod(len(instructions), n_parts)

        chunks = []
        start = 0
        for i in range(n_parts):
            # First 'remainder' chunks get one extra item
            end = start + base_size + (1 if i < remainder else 0)
            chunks.append(instructions[start:end])
            start = end

        return chunks

# Class InstructionFollowingEvaluator

In [None]:
class InstructionFollowingEvaluator:
    """
    Main evaluator class that coordinates multiple judges to score instruction-following.
    """

    def __init__(self, judges: Optional[List[BaseJudge]] = None, api_key_manager: Optional[APIKeyManager] = None, n_chunks: int = 2):
        """
        Initialize the evaluator.

        Args:
            judges: List of judge instances to use for evaluation. If None, auto-creates based on available API keys.
            api_key_manager: APIKeyManager instance for handling API keys
            n_chunks: Number of chunks to split instructions into for parallel processing

        Raises:
            ValueError: If judges is None and no judge can be created from the available keys.
        """
        self.n_chunks = n_chunks
        self.processor = InstructionProcessor()
        # A fresh manager needs no separate "load from environment" step:
        # its getters already fall back to the environment variables.
        self.api_key_manager = api_key_manager or APIKeyManager()

        # Auto-create judges if none provided
        if judges is None:
            self.judges = self._create_default_judges()
        else:
            self.judges = judges

    def _create_default_judges(self) -> List[BaseJudge]:
        """Create judges based on available API keys.

        Raises:
            ValueError: If neither an OpenAI nor a Claude judge could be created.
        """
        judges = []

        if self.api_key_manager.is_openai_configured():
            try:
                judges.append(OpenAIJudge(api_key=self.api_key_manager.get_openai_key()))
            except ValueError as e:
                print(f"Failed to create OpenAI judge: {e}")

        if self.api_key_manager.is_claude_configured():
            try:
                judges.append(ClaudeJudge(api_key=self.api_key_manager.get_claude_key()))
            except ValueError as e:
                print(f"Failed to create Claude judge: {e}")

        if not judges:
            raise ValueError("No judges could be created. Please configure API keys.")

        return judges

    def add_judge(self, judge: BaseJudge) -> None:
        """Add a judge to the evaluator."""
        self.judges.append(judge)

    def remove_judge(self, judge_name: str) -> bool:
        """Remove the first judge with the given name. Returns True if removed, False if not found."""
        for i, judge in enumerate(self.judges):
            if judge.name == judge_name:
                del self.judges[i]
                return True
        return False

    def get_judge_names(self) -> List[str]:
        """Get names of all configured judges."""
        return [judge.name for judge in self.judges]

    async def _evaluate_single_completion(
        self,
        instructions: List[str],
        paragraph: str,
        completion: str
    ) -> List[float]:
        """
        Evaluate a single completion using all judges.

        Args:
            instructions: List of instructions to evaluate against
            paragraph: Original paragraph
            completion: Completion to evaluate

        Returns:
            List of scores from each judge (normalized 0.0 to 1.0)
        """
        # Split instructions into chunks for parallel processing. Empty chunks
        # (fewer instructions than n_chunks) can only ever score 0, so they are
        # dropped rather than spending a judge call on them.
        instruction_chunks = [
            chunk
            for chunk in self.processor.chunk_instructions(instructions, self.n_chunks)
            if chunk
        ]
        total_instructions = len(instructions)

        judge_scores = []

        # Evaluate with each judge
        for judge in self.judges:
            # Process chunks in parallel for this judge
            chunk_results = await asyncio.gather(
                *(judge.evaluate(chunk, paragraph, completion) for chunk in instruction_chunks),
                return_exceptions=True,
            )

            # Sum up followed instructions across chunks; a failed chunk
            # contributes nothing but still counts in the denominator.
            total_followed = 0
            for result in chunk_results:
                if isinstance(result, BaseException):
                    print(f"[{judge.name}] Chunk evaluation failed: {result}")
                    continue
                total_followed += result

            # Normalize score
            normalized_score = total_followed / total_instructions if total_instructions > 0 else 0.0
            judge_scores.append(normalized_score)

        return judge_scores

    async def score(
        self,
        prompts: List[str],
        completions: List[str],
        models: List[str]
    ) -> Dict[str, float]:
        """
        Score prompt-completion pairs using multiple judges.

        Each completion is evaluated against the instructions and paragraph
        parsed from its own prompt; all completions are evaluated concurrently.

        Args:
            prompts: List of full prompts (including instructions + paragraph)
            completions: List of completions corresponding to the prompts
            models: List of model identifiers for each completion

        Returns:
            Dictionary: model_id → average compliance score (0.0 to 1.0)

        Raises:
            ValueError: If the lists differ in length or a prompt cannot be parsed.
        """
        if not (len(prompts) == len(completions) == len(models)):
            raise ValueError("Input lists must be equal length")

        if not prompts:
            return {}

        # Parse every distinct prompt once; identical prompts share the result.
        parsed_prompts = {}
        for prompt in prompts:
            if prompt in parsed_prompts:
                continue
            try:
                parsed_prompts[prompt] = self.processor.extract_instructions_and_paragraph(prompt)
            except ValueError as e:
                raise ValueError(f"Failed to parse prompt format: {e}") from e

        # Execute all evaluations concurrently; failures come back as exception
        # objects so one bad completion does not abort the others.
        results = await asyncio.gather(
            *(
                self._evaluate_single_completion(*parsed_prompts[prompt], completion)
                for prompt, completion in zip(prompts, completions)
            ),
            return_exceptions=True,
        )

        model_scores = defaultdict(list)
        for model_id, judge_scores in zip(models, results):
            if isinstance(judge_scores, BaseException):
                print(f"Evaluation failed for model {model_id}: {judge_scores}")
                model_scores[model_id].append(0.0)
            elif judge_scores:
                # Average across judges for this completion
                model_scores[model_id].append(sum(judge_scores) / len(judge_scores))
            else:
                print(f"No valid scores for model {model_id}")
                model_scores[model_id].append(0.0)

        # Average scores per model
        avg_scores = {
            model_id: sum(scores) / len(scores) if scores else 0.0
            for model_id, scores in model_scores.items()
        }

        return avg_scores

# Convenience and Helper Functions

In [None]:
async def create_evaluator(
    openai_api_key: Optional[str] = None,
    claude_api_key: Optional[str] = None,
    use_openai: bool = True,
    use_claude: bool = True,
    config_file: Optional[str] = None,
    api_config: Optional[Dict[str, str]] = None
) -> InstructionFollowingEvaluator:
    """
    Create an evaluator with specified judges and flexible API key configuration.

    Key precedence, highest first: direct parameters, ``api_config``,
    ``config_file``, then the environment variables consulted by APIKeyManager.
    In both configuration sources the OpenAI key is read from
    ``"openai_api_key"`` (or ``"openai"``) and the Claude key from
    ``"claude_api_key"`` (or ``"claude"``).

    Args:
        openai_api_key: OpenAI API key (overrides other sources)
        claude_api_key: Claude API key (overrides other sources)
        use_openai: Whether to include OpenAI judge
        use_claude: Whether to include Claude judge
        config_file: Path to JSON config file with API keys
        api_config: Dictionary with API key configuration

    Returns:
        Configured InstructionFollowingEvaluator instance

    Raises:
        OSError: If config_file is given but cannot be opened.
        ValueError: If config_file is not valid JSON / not a JSON object, or
            if no judge could be created.
    """
    # Setup API key manager
    api_manager = APIKeyManager()

    # Merge the optional configuration sources; api_config overrides the file.
    merged_config = {}
    if config_file:
        with open(config_file, 'r', encoding='utf-8') as f:
            file_config = json.load(f)
        if not isinstance(file_config, dict):
            raise ValueError(f"Config file {config_file} must contain a JSON object.")
        merged_config.update(file_config)
    if api_config:
        merged_config.update(api_config)

    config_openai_key = merged_config.get('openai_api_key') or merged_config.get('openai')
    config_claude_key = merged_config.get('claude_api_key') or merged_config.get('claude')

    # Direct API key parameters take highest priority
    resolved_openai_key = openai_api_key or config_openai_key
    resolved_claude_key = claude_api_key or config_claude_key
    if resolved_openai_key:
        api_manager.set_openai_key(resolved_openai_key)
    if resolved_claude_key:
        api_manager.set_claude_key(resolved_claude_key)

    # Create judges based on preferences and availability
    judges = []

    if use_openai and api_manager.is_openai_configured():
        try:
            judges.append(OpenAIJudge(api_key=api_manager.get_openai_key()))
        except ValueError as e:
            print(f"OpenAI judge creation failed: {e}")

    if use_claude and api_manager.is_claude_configured():
        try:
            judges.append(ClaudeJudge(api_key=api_manager.get_claude_key()))
        except ValueError as e:
            print(f"Claude judge creation failed: {e}")

    if not judges:
        available = api_manager.get_available_services()
        raise ValueError(f"No judges could be created. Available services: {available}")

    return InstructionFollowingEvaluator(judges, api_manager)


def create_api_manager() -> APIKeyManager:
    """Build a fresh APIKeyManager with no keys stored yet and hand it back."""
    manager = APIKeyManager()
    return manager


async def evaluate_completions(
    prompts: List[str],
    completions: List[str],
    models: List[str],
    openai_api_key: Optional[str] = None,
    claude_api_key: Optional[str] = None,
    config_file: Optional[str] = None
) -> Dict[str, float]:
    """
    One-call helper: build an evaluator via create_evaluator, then score the inputs.

    Args:
        prompts: Full prompts, one per completion
        completions: Completions to score
        models: Model identifier for each completion
        openai_api_key: OpenAI API key forwarded to create_evaluator
        claude_api_key: Claude API key forwarded to create_evaluator
        config_file: Config file path forwarded to create_evaluator

    Returns:
        Mapping of model identifier to its score, as returned by the evaluator
    """
    # Collect the key-related options first, then forward them unchanged.
    evaluator_options = {
        "openai_api_key": openai_api_key,
        "claude_api_key": claude_api_key,
        "config_file": config_file,
    }
    evaluator = await create_evaluator(**evaluator_options)

    model_scores = await evaluator.score(prompts, completions, models)
    return model_scores

# Main

In [None]:
# creating data
import re
from typing import List
def read_prompt_as_text(file_path):
    """Return the entire contents of the UTF-8 text file at file_path as one string."""
    with open(file_path, mode='r', encoding='utf-8') as prompt_file:
        contents = prompt_file.read()
    return contents

In [None]:


def load_completions(file_path):
    """Read a UTF-8 file and return the text following each "Completion N:" header.

    Each entry runs up to the next "Completion N:" header (or the end of the
    text) and is returned with surrounding whitespace removed.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()

    # DOTALL lets a single completion span several lines.
    pattern = re.compile(r"Completion \d+:\s*(.*?)\s*(?=Completion \d+:|$)", re.DOTALL)
    cleaned = []
    for found in pattern.finditer(raw_text):
        cleaned.append(found.group(1).strip())
    return cleaned

def load_models(file_path):
    """Return the non-blank lines of a UTF-8 file, stripped, in file order."""
    model_names = []
    with open(file_path, mode='r', encoding='utf-8') as source:
        for raw_line in source:
            name = raw_line.strip()
            # Blank or whitespace-only lines carry no model id.
            if name:
                model_names.append(name)
    return model_names


In [None]:
# Load the evaluation inputs from the hard-coded /content paths into
# module-level names.
# NOTE(review): main() below re-reads the same three files itself, and nothing
# else in this notebook reads `prompt`, `completions` or `model_ids` — confirm
# whether this cell is still needed.
prompt = read_prompt_as_text("/content/Prompts.txt")
completions = load_completions("/content/Completions.txt")
model_ids = load_models("/content/Models.txt")

In [None]:
async def main():
    """Load the prompt, completions and model ids from disk, score them and print per-model results."""
    # Input files
    base_prompt = read_prompt_as_text("/content/Prompts.txt")
    completions = load_completions("/content/Completions.txt")
    models = load_models("/content/Models.txt")
    # Every completion answers the same prompt, so repeat it once per completion.
    prompts = [base_prompt for _ in completions]

    # The empty-string keys are falsy, so create_evaluator does not store them
    # and key lookup falls through to the environment variables.
    evaluator = await create_evaluator(openai_api_key="", claude_api_key="")
    scores = await evaluator.score(prompts, completions, models)

    print("Model Scores:")
    for model_id in scores:
        score = scores[model_id]
        print(f"{model_id}: {score:.3f}")

In [None]:
import asyncio

# Top-level `await` works here because the notebook kernel runs cells inside an
# event loop; in a plain Python script use `asyncio.run(main())` instead.
await main()

Model Scores:
Gemini: 0.536
Mistral: 0.500
OpenAi: 0.536
