### Benchmarking (running in direct, non-framework mode)

Demonstrates how to benchmark LLM responses directly against a given semantic expectation. Later functionality will allow you to run in framework mode, in which benchmarks are auto-executed using defined fixtures as parameters.

#### Requires:
- A `.env` file in the project directory configured with `OPENAI_API_KEY`, `BASE_URL`, and `DEFAULT_EMBEDDING_MODEL` (or rely on the defaults)

In [1]:
import os
# Step up one directory before importing semtest -- presumably so the
# project-level .env and the package resolve from the project directory
# (TODO confirm against the repository layout).
os.chdir('..')

import semtest

#### Defined semantic expectation

In [2]:
# Reference sentence passed as `semantic_expectation` to both benchmarks below.
expectation = "A dog is in the background of the photograph"

In [3]:
def mock_llm_response_prompt1():
    """Yield, in order, three canned replies standing in for LLM output to prompt 1."""
    yield from (
        "There's a dog in the background of the photo",
        "In the background of the photo is a dog",
        "There's an animal in the background of the photo and it's a dog.",
    )

# Single shared generator instance: each next() hands out the following canned reply.
mock_llm_response_generator_prompt_1 = mock_llm_response_prompt1()

In [4]:
def mock_llm_response_prompt2():
    """Yield, in order, three canned replies standing in for LLM output to prompt 2."""
    yield from (
        "In the background of the photograph there is a furry animal",
        "In the foreground there is a human, and I see a dog in the background of the photograph",
        "There are two dogs in the background of the image",
    )

# Single shared generator instance: each next() hands out the following canned reply.
mock_llm_response_generator_prompt_2 = mock_llm_response_prompt2()

#### Decorate the function to act as a benchmark

In [5]:
@semtest.benchmark(
    iterations=3,
    semantic_expectation=expectation,
)
def mock_prompt_benchmark_prompt_1():
    """Benchmark target representing the better prompt/temperature/config.

    Returns the next canned reply from the prompt-1 generator. The decorator
    is configured for 3 iterations, matching the three replies that
    generator can supply.
    """
    # intermediary logic would go here ...

    # Stand-in for a real LLM call: take the next mocked reply.
    llm_reply = next(mock_llm_response_generator_prompt_1)

    # user validations would go here ...
    return llm_reply

In [6]:
@semtest.benchmark(
    iterations=3,
    semantic_expectation=expectation,
)
def mock_prompt_benchmark_prompt_2():
    """Benchmark target representing the slightly worse prompt/temperature/config.

    Returns the next canned reply from the prompt-2 generator. The decorator
    is configured for 3 iterations, matching the three replies that
    generator can supply.
    """
    # intermediary logic would go here ...

    # Stand-in for a real LLM call: take the next mocked reply.
    llm_reply = next(mock_llm_response_generator_prompt_2)

    # user validations would go here ...
    return llm_reply

In [7]:
# Calling a decorated function yields a benchmark result object (annotated
# here as semtest.BenchmarkMetadata) rather than the raw response string.
prompt_1_result: semtest.BenchmarkMetadata = mock_prompt_benchmark_prompt_1()
prompt_2_result: semtest.BenchmarkMetadata = mock_prompt_benchmark_prompt_2()

In [8]:
# Plain print shows every field, including the raw embedding vectors,
# so the output is very long.
print(prompt_1_result)
print(prompt_2_result)

func='mock_prompt_benchmark_prompt_1' iterations=3 comparator='cosine_similarity' expectation_input='A dog is in the background of the photograph' benchmarks=SemanticMetrics(responses=["There's a dog in the background of the photo", 'In the background of the photo is a dog', "There's an animal in the background of the photo and it's a dog."], result_embeddings=[[-0.01653723604977131, 0.03403007611632347, -0.01244938001036644, 0.02832300402224064, 0.024620043113827705, -0.03684379532933235, 0.0013073176378384233, -0.01871388778090477, 0.009303854778409004, 0.018435169011354446, -0.007259926293045282, -0.011832219548523426, 0.01458621397614479, -0.04085201770067215, -0.010405451990664005, 0.016895586624741554, -0.03968405723571777, 0.019523493945598602, -0.0020870629232376814, -0.013053268194198608, -0.023014098405838013, -0.030738554894924164, -0.0027141771279275417, 0.02014729008078575, -0.029331695288419724, 0.006782124750316143, -0.011872036382555962, 0.009542754851281643, 0.00286846

In [9]:
# Emit each result as JSON; the recorded output lists the responses, the
# per-response semantic distances, and their mean and median.
print(prompt_1_result.model_dump_json(indent=2))
print(prompt_2_result.model_dump_json(indent=2))

{"func":"mock_prompt_benchmark_prompt_1","iterations":3,"comparator":"cosine_similarity","expectation_input":"A dog is in the background of the photograph","benchmarks":{"responses":["There's a dog in the background of the photo","In the background of the photo is a dog","There's an animal in the background of the photo and it's a dog."],"semantic_distances":[0.8688760239791908,0.831344642790557,0.7697577683647548],"mean_semantic_distance":0.8233261450448341,"median_semantic_distance":0.831344642790557}}
{"func":"mock_prompt_benchmark_prompt_2","iterations":3,"comparator":"cosine_similarity","expectation_input":"A dog is in the background of the photograph","benchmarks":{"responses":["In the background of the photograph there is a furry animal","In the foreground there is a human, and I see a dog in the background of the photograph","There are two dogs in the background of the image"],"semantic_distances":[0.7213289868782804,0.7030560435471228,0.7529891407174136],"mean_semantic_distanc