### Benchmarking (running in direct, non-framework mode)

Demonstrates how to benchmark LLM responses directly against a given semantic expectation. Later functionality will allow you to run in framework mode, in which benchmarks are auto-executed using defined fixtures as parameters.

#### Requires:
- .env file in project directory configured with `OPENAI_API_KEY`, `BASE_URL`, `DEFAULT_EMBEDDING_MODEL` (or using defaults)

In [1]:
import os
# Move the working directory up one level before importing semtest.
# NOTE(review): presumably this notebook lives in a subdirectory and the parent
# is the project directory holding the package and the .env file — confirm.
os.chdir('..')
import semtest

#### Defined semantic expectation

In [2]:
# Target statement passed as `semantic_expectation` to both benchmarks below.
expectation = "A dog is in the background of the photograph"

In [3]:
def mock_llm_response_prompt1():
    """Yield, in order, three canned strings standing in for LLM output (prompt 1)."""
    yield from (
        "There's a dog in the background of the photo",
        "In the background of the photo is a dog",
        "There's an animal in the background of the photo and it's a dog.",
    )

# Single shared generator instance: every next() call consumes one canned
# response, so it supplies at most three values before raising StopIteration.
mock_llm_response_generator_prompt_1 = mock_llm_response_prompt1()

In [4]:
def mock_llm_response_prompt2():
    """Yield, in order, three canned strings standing in for LLM output (prompt 2)."""
    yield from (
        "In the background of the photograph there is a furry animal",
        "In the foreground there is a human, and I see a dog in the background of the photograph",
        "There are two dogs in the background of the image",
    )

# Single shared generator instance: every next() call consumes one canned
# response, so it supplies at most three values before raising StopIteration.
mock_llm_response_generator_prompt_2 = mock_llm_response_prompt2()

#### Decorate the function to act as a benchmark

In [5]:
@semtest.benchmark(semantic_expectation=expectation, iterations=3)
def mock_prompt_benchmark_prompt_1():
    """Benchmark case standing in for a better prompt/temperature/config.

    Returns the next canned response from the shared prompt-1 generator in
    place of a real LLM call.
    """
    # Any intermediary logic (prompt construction, model call, ...) would go
    # here; this demo just pulls the next mocked LLM response.
    # User validations on the response would also run before returning.
    return next(mock_llm_response_generator_prompt_1)

In [6]:
@semtest.benchmark(semantic_expectation=expectation, iterations=3)
def mock_prompt_benchmark_prompt_2():
    """Benchmark case standing in for a slightly worse prompt/temperature/config.

    Returns the next canned response from the shared prompt-2 generator in
    place of a real LLM call.
    """
    # Any intermediary logic (prompt construction, model call, ...) would go
    # here; this demo just pulls the next mocked LLM response.
    # User validations on the response would also run before returning.
    return next(mock_llm_response_generator_prompt_2)

In [7]:
# Invoke each decorated function to run its benchmark; the results are
# annotated as semtest.BenchmarkRunner objects.
prompt_1_result: semtest.BenchmarkRunner = mock_prompt_benchmark_prompt_1()
prompt_2_result: semtest.BenchmarkRunner = mock_prompt_benchmark_prompt_2()

In [8]:
# Print the metrics object collected for each benchmark run.
print(prompt_1_result.metrics)
print(prompt_2_result.metrics)

func='mock_prompt_benchmark_prompt_1' iterations=3 comparator='cosine_similarity' expectation_input='A dog is in the background of the photograph' benchmarks=SemanticMetrics(responses=["There's a dog in the background of the photo", 'In the background of the photo is a dog', "There's an animal in the background of the photo and it's a dog."], semantic_distances=[np.float64(0.8688688326426504), np.float64(0.8313293293059392), np.float64(0.7697516222847918)], mean_semantic_distance=np.float64(0.8233165947444604), median_semantic_distance=np.float64(0.8313293293059392))
func='mock_prompt_benchmark_prompt_2' iterations=3 comparator='cosine_similarity' expectation_input='A dog is in the background of the photograph' benchmarks=SemanticMetrics(responses=['In the background of the photograph there is a furry animal', 'In the foreground there is a human, and I see a dog in the background of the photograph', 'There are two dogs in the background of the image'], semantic_distances=[np.float64(0.

In [9]:
# Print each run's metrics through the `metrics_json` attribute.
print(prompt_1_result.metrics_json)
print(prompt_2_result.metrics_json)

{
  "func": "mock_prompt_benchmark_prompt_1",
  "iterations": 3,
  "comparator": "cosine_similarity",
  "expectation_input": "A dog is in the background of the photograph",
  "benchmarks": {
    "responses": [
      "There's a dog in the background of the photo",
      "In the background of the photo is a dog",
      "There's an animal in the background of the photo and it's a dog."
    ],
    "semantic_distances": [
      0.8688688326426504,
      0.8313293293059392,
      0.7697516222847918
    ],
    "mean_semantic_distance": 0.8233165947444604,
    "median_semantic_distance": 0.8313293293059392
  }
}
{
  "func": "mock_prompt_benchmark_prompt_2",
  "iterations": 3,
  "comparator": "cosine_similarity",
  "expectation_input": "A dog is in the background of the photograph",
  "benchmarks": {
    "responses": [
      "In the background of the photograph there is a furry animal",
      "In the foreground there is a human, and I see a dog in the background of the photograph",
      "There