In [None]:
import html
import os

import ipywidgets as widgets
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display
from openai import OpenAI

In [None]:
# Read variables from the local .env file into the process environment,
# then build the API client from the key found there.
load_dotenv(".env")
api_key = os.environ.get("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

In [None]:
def get_completion(prompt, model="gpt-4o-mini", max_tokens=100, temperature=0.9):
    """Send one user message to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the single ``user`` message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of tokens generated for the reply.
        temperature: Sampling temperature passed to the API.

    Returns:
        The ``content`` of the first choice's message.
        NOTE(review): the SDK types this field as optional, so it may be
        ``None`` when the model returns no text - confirm against the SDK docs.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        response_format={
            "type": "text",
        },
    )

    # Only the first choice is used; the request does not ask for more than one.
    return completion.choices[0].message.content

#### 4. **Evaluate Quality**

* So far, there has been no feedback loop to judge the quality of the responses other than checking the results manually (a.k.a. a "vibe check").

* This is called **blind prompting**, and it's only fine when our prompts are used temporarily for a single task and rarely revisited.

* If we are planning to use the same prompt for an application, then we need to be more rigorous with measuring the results. 

* This is because when a new LLM is released, or a new version of the same LLM is released, there is no guarantee that a prompt that works on the older model/version will work on the new model/version.

How to evaluate?

* **Step 1**: Generate a number of prompts for a task
* **Step 2**: Get a response over *multiple* runs for each prompt
* **Step 3**: Save the results in a file
* **Step 4**: Evaluate each of the responses using one of the strategies below; this is best done blind and randomised to avoid favouring one prompt over another

**Strategies**:

* Simple thumbs-up/down rating system
* 3-, 5-, or 10-point rating system
* Use a ground-truth
* Use an LLM as judge

<span style="color:green">
    <p>Task:</p>
    <ul>
        <li>Iterate through <strong>2 prompts</strong></li>
        <li>For each prompt, iterate through <strong>3 runs</strong></li>
        <li>Give each prompt and run a name</li>
        <li>Get a response from the model</li>
        <li>Convert responses into a dataframe</li>
        <li>Save the dataframe as a CSV file</li>
    </ul>
</span>

In [None]:
# Zero-shot prompt: only the task itself, with no worked examples.
prompt_1 = "\n".join(
    [
        "Product description: A pair of shoes that can fit any foot size.",
        "Seed words: adaptable, fit, omni-fit.",
        "Product names:",
    ]
)

# Few-shot prompt: two worked examples separated by blank lines, followed by
# exactly the same task text as prompt_1.
prompt_2 = "\n\n".join(
    [
        "\n".join(
            [
                "Product description: A home milkshake maker.",
                "Seed words: fast, healthy, compact.",
                "Product names: HomeShaker, Fit Shaker, QuickShake, Shake Maker",
            ]
        ),
        "\n".join(
            [
                "Product description: A watch that can tell accurate time in space.",
                "Seed words: astronaut, space-hardened, eliptical orbit",
                "Product names: AstroTime, SpaceGuard, Orbit-Accurate, EliptoTime.",
            ]
        ),
        prompt_1,
    ]
)

In [None]:
# Prompts under comparison and the number of completions requested for each.
prompts = [prompt_1, prompt_2]
responses = []
no_runs = 3

# Each prompt is queried several times so that it is judged on more than a
# single completion.
for ind, prompt in enumerate(prompts):
    for run in range(no_runs):
        # "<prompt index>_<run index>" names this particular completion
        variant = f"{ind}_{run}"
        response = get_completion(prompt)

        data = {
            "variant": variant,
            "prompt": prompt,
            "response": response,
        }

        responses.append(data)

# One row per (prompt, run) pair
df = pd.DataFrame(responses)

# Create the output folder first: to_csv does not create missing directories
os.makedirs("data", exist_ok=True)
df.to_csv("data/responses.csv", index=False)

In [None]:
# Render the `df` DataFrame as rich output for a quick visual check
display(df)

In [None]:
# load the responses.csv file
df = pd.read_csv("data/responses.csv")

# Shuffle the dataframe so the rows are rated in random order; the index is
# reset so that row labels 0..n-1 match row positions
df = df.sample(frac=1).reset_index(drop=True)

# df is your dataframe and 'response' is the column with text to test
# position of the row currently being rated
response_index = 0
# add a new column to store feedback: 1 / 0 votes, missing until rated.
# A nullable integer dtype is used because the stored values are integers,
# not strings, and they are averaged later.
df["feedback"] = pd.Series(pd.NA, index=df.index, dtype="Int64")


def on_button_clicked(b):
    """Record a thumbs-up/down vote for the current response and move on.

    The vote is stored as 1 (thumbs up) or 0 (anything else) in the
    ``feedback`` column of ``df`` at row ``response_index``. After the last
    row is rated, ``df`` is written to ``results.csv`` and a summary per
    prompt is printed.

    Args:
        b: The clicked button; only its ``description`` is read.
    """
    global response_index

    # Ignore clicks once every row has been rated: writing to a label past the
    # end of the frame would append a new row and re-save the results.
    if response_index >= len(df):
        return

    #  convert thumbs up / down to 1 / 0
    user_feedback = 1 if b.description == "\U0001f44d" else 0

    # update the feedback column
    df.at[response_index, "feedback"] = user_feedback

    response_index += 1
    if response_index < len(df):
        update_response()
        return

    df.to_csv("results.csv", index=False)

    print("A/B testing completed. Here's the results:")
    # Variants are named "<prompt>_<run>", so grouping on the full variant
    # would put every response in its own group. Group on the part before the
    # last underscore instead; a variant without an underscore is kept as is.
    summary_df = (
        pd.DataFrame(
            {
                "prompt": df["variant"].astype(str).str.rsplit("_", n=1).str[0],
                # make sure the votes are numeric before averaging
                "feedback": pd.to_numeric(df["feedback"]),
            }
        )
        .groupby("prompt")
        .agg(count=("feedback", "count"), score=("feedback", "mean"))
        .reset_index()
    )
    print(summary_df)


def update_response():
    """Show the response at ``response_index`` and refresh the progress label.

    Reads the ``response`` column of ``df`` at position ``response_index``,
    writes it into the ``response`` HTML widget, and sets ``count_label`` to
    "Response: <current>/<total>". A missing value is shown as "No response".
    """
    new_response = df.iloc[response_index]["response"]
    if pd.notna(new_response):
        # The text comes from the model and is placed inside an HTML widget:
        # escape it so characters such as < and & are shown literally, and
        # turn newlines into <br> so multi-line answers keep their line breaks.
        # str() guards against non-string values read back from the CSV.
        safe_text = html.escape(str(new_response)).replace("\n", "<br>")
        new_response = "<p>" + safe_text + "</p>"
    else:
        new_response = "<p>No response</p>"
    response.value = new_response
    count_label.value = f"Response: {response_index + 1}/{len(df)}"


# Widgets that show the text being rated and the progress counter.
# Note: `response` is also the name update_response() writes to.
response = widgets.HTML()
count_label = widgets.Label()

# Fill both widgets with the first row before anything is displayed
update_response()

# Both voting buttons share a single click handler, which tells them apart by
# their description.
thumbs_up_button = widgets.Button(description="\U0001f44d")
thumbs_down_button = widgets.Button(description="\U0001f44e")
for vote_button in (thumbs_up_button, thumbs_down_button):
    vote_button.on_click(on_button_clicked)

# Thumbs-down is placed on the left, thumbs-up on the right
button_box = widgets.HBox([thumbs_down_button, thumbs_up_button])

display(response, button_box, count_label)