In [5]:
import os

from openai import OpenAI
from alloprompt import Prompt

# The API key is read from the environment so it is never stored in the
# notebook. Set ALLOBRAIN_API_KEY before running this cell; a missing variable
# raises KeyError here rather than failing later on the first request.
# NOTE(review): the key that used to be hardcoded on this line has been
# committed in plain text and should be treated as leaked and rotated.
client_or = OpenAI(
    api_key=os.environ["ALLOBRAIN_API_KEY"],
    base_url="https://llm-api.allobrain.com/",
)

In [6]:
from typing import Dict, Any
from AlloLLMEval.core.base import PromptExecutorBase

class SentimentExecutor(PromptExecutorBase[str, str]):
    """Classify a movie review as positive, negative or neutral with a chat model.

    The config schema passed to the base class declares the two keys that
    `execute` reads: "model" (str) and "temperature" (float). Requests go
    through the module-level `client_or` client.
    """

    def __init__(self):
        super().__init__(config_schema={
            "model": str,
            "temperature": float
        })

    def execute(self, input_data: str, config: Dict[str, Any]) -> str:
        """Send one review to the model and normalise its reply to a label.

        Args:
            input_data: Review text, sent as the user message.
            config: Must contain "model" and "temperature"; both are forwarded
                to the chat completion request.

        Returns:
            "positive" if that word occurs in the lower-cased reply, else
            "negative" if that word occurs, else "neutral". A reply with no
            text content also yields "neutral".
        """
        response = client_or.chat.completions.create(
            model=config["model"],
            temperature=config["temperature"],
            messages=[
                {"role": "system", "content": "You are a sentiment classifier. You are given a movie review and you need to classify it as positive, negative or neutral. You must only return the sentiment."},
                {"role": "user", "content": input_data}
            ]
        )
        # The message content can be None in an API response; treat that as an
        # empty reply so it falls through to "neutral" instead of raising
        # AttributeError on None.lower().
        model_output = (response.choices[0].message.content or "").lower()
        # Substring checks tolerate extra words or punctuation around the label.
        if "positive" in model_output:
            return "positive"
        elif "negative" in model_output:
            return "negative"
        return "neutral"

from typing import Dict, Any
from AlloLLMEval.core.metrics import MetricEvaluatorBase, MetricOutput, MetricStatus

class StrictGroundTruthEvaluator(MetricEvaluatorBase):
    """Pass/fail metric: the executor output must equal the expected value exactly."""

    def evaluate(
        self,
        executor: PromptExecutorBase,
        input_data: Any,
        base_output: Any,
        executor_config: Dict[str, Any],
        test_config: Dict[str, Any],
        evaluation_params: Dict[str, Any]
    ) -> MetricOutput:
        """Compare `base_output` with `evaluation_params["ground_truth"]` using `==`.

        Only `base_output` and `evaluation_params` are read; the remaining
        parameters are accepted but unused here.

        Returns:
            MetricOutput with score 1.0 and status PASSED on an exact match,
            otherwise score 0.0 and status FAILED. `details` carries the
            expected and received values.

        Raises:
            ValueError: If no ground truth is supplied in `evaluation_params`.
        """
        # A missing params mapping is reported the same way as a missing key.
        ground_truth = (evaluation_params or {}).get("ground_truth")
        # Test for None explicitly: falsy values such as 0, False or "" are
        # legitimate expected outputs and must not be rejected as "missing".
        if ground_truth is None:
            raise ValueError("Ground truth must be provided in evaluation_params")

        score = 1.0 if base_output == ground_truth else 0.0
        status = MetricStatus.PASSED if score == 1.0 else MetricStatus.FAILED

        return MetricOutput(
            score=score,
            status=status,
            visualization=None,
            details={
                "expected": ground_truth,
                "received": base_output
            },
            threshold={"min_score": 1.0}
        )

from typing import Dict, Any, List
from AlloLLMEval.core.metrics import MetricEvaluatorBase, MetricOutput, MetricStatus

class StabilityEvaluator(MetricEvaluatorBase):
    """Measure how often re-running the executor reproduces the base output."""

    def evaluate(
        self,
        executor: PromptExecutorBase,
        input_data: Any,
        base_output: Any,
        executor_config: Dict[str, Any],
        test_config: Dict[str, Any],
        evaluation_params: Dict[str, Any]
    ) -> MetricOutput:
        """Re-run the executor at temperature 1.0 and score agreement with `base_output`.

        `test_config["num_runs"]` (default 5) is the total number of outputs
        considered, counting `base_output` itself, so the executor is called
        `num_runs - 1` additional times. `evaluation_params` is not used.

        Returns:
            MetricOutput whose score is the fraction of outputs equal to
            `base_output`; status is PASSED when that fraction is at least
            0.6. `details` lists every output and the distinct outputs in
            first-seen order.
        """
        # Single source for the pass threshold used in both status and report.
        min_stability = 0.6
        num_runs = test_config.get("num_runs", 5)
        outputs: List[Any] = [base_output]

        # Run multiple times with temperature=1; work on a copy so the
        # caller's executor_config is not mutated.
        high_temp_config = dict(executor_config)
        high_temp_config["temperature"] = 1.0

        for _ in range(num_runs - 1):
            outputs.append(executor.execute(input_data, high_temp_config))

        # Calculate how many outputs are the same as the base output
        matching_outputs = sum(1 for out in outputs if out == base_output)
        stability_score = matching_outputs / len(outputs)

        # De-duplicate by equality instead of set(): outputs are typed Any, and
        # set() raises TypeError on unhashable values (dicts, lists). This also
        # keeps a stable first-seen order in the report.
        unique_outputs: List[Any] = []
        for out in outputs:
            if out not in unique_outputs:
                unique_outputs.append(out)

        return MetricOutput(
            score=stability_score,
            status=MetricStatus.PASSED if stability_score >= min_stability else MetricStatus.FAILED,
            visualization=None,
            details={
                "outputs": outputs,
                "unique_outputs": unique_outputs
            },
            threshold={"min_stability": min_stability}
        )

In [8]:
from AlloLLMEval.core.test_runner import TestRunner, TestConfig

# Labelled sample data: (review text, expected sentiment)
reviews = [
    ("The movie was fantastic! Great acting and storyline.", "positive"),
    ("Terrible waste of time and money. Awful plot.", "negative"),
    ("It was okay, nothing special but not bad either.", "neutral"),
    ("Best film I've seen all year! A masterpiece!", "positive"),
    ("I fell asleep halfway through. Very boring.", "negative"),
]

# One shared executor, one evaluator instance per metric
executor = SentimentExecutor()
ground_truth_evaluator = StrictGroundTruthEvaluator({})
stability_evaluator = StabilityEvaluator({})

# Ground truth runner: executor configured with temperature 0.0
ground_truth_test = TestRunner(
    executor=executor,
    evaluator=ground_truth_evaluator,
    config=TestConfig(
        executor_config={"model": "gpt-4", "temperature": 0.0},
        metric_config={},
    ),
)

# Score every review against its label and report one block per review
print("Ground Truth Test Results:")
for review, ground_truth in reviews:
    gt_result = ground_truth_test.run(
        input_data=review,
        evaluation_params={"ground_truth": ground_truth},
    )
    print(
        f"\nReview: {review}",
        f"Expected: {ground_truth}",
        f"Received: {gt_result.executor_output}",
        f"Score: {gt_result.metric_output.score}",
        sep="\n",
    )

# Stability runner: executor configured with temperature 1, five runs requested
stability_test = TestRunner(
    executor=executor,
    evaluator=stability_evaluator,
    config=TestConfig(
        executor_config={"model": "gpt-4", "temperature": 1},
        metric_config={"num_runs": 5},
    ),
)

# Labels are not needed here; only the review text is passed to the runner
print("\nStability Test Results:")
for review, _ in reviews:
    stability_result = stability_test.run(input_data=review)
    print(
        f"\nReview: {review}",
        f"Stability Score: {stability_result.metric_output.score}",
        f"Unique outputs: {stability_result.metric_output.details['unique_outputs']}",
        sep="\n",
    )

Ground Truth Test Results:

Review: The movie was fantastic! Great acting and storyline.
Expected: positive
Received: Positive
Score: 0.0

Review: Terrible waste of time and money. Awful plot.
Expected: negative
Received: Negative
Score: 0.0

Review: It was okay, nothing special but not bad either.
Expected: neutral
Received: Neutral
Score: 0.0

Review: Best film I've seen all year! A masterpiece!
Expected: positive
Received: Positive
Score: 0.0

Review: I fell asleep halfway through. Very boring.
Expected: negative
Received: Negative
Score: 0.0

Stability Test Results:


KeyboardInterrupt: 