# Step 2 of CAFFE (EXAMPLE)
The second step of CAFFE takes a human-in-the-loop approach: the prompts generated as test data in Step 1 are submitted to the LLM under test in order to collect its responses. This step therefore depends heavily on the user's needs. As a guideline, we provide an example of how we carried out Step 2 of CAFFE in our experiments.


-----------------


## Example with GPT APIs

In [None]:
import os
import time
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from google.colab import userdata
import sys

# Copy the secret stored under "OPENAI_API_KEY" in the Colab user data into
# the process environment, then build the OpenAI client from that same value.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
def gpt_reply(prompt: str, client: OpenAI, model: str = "gpt-4o-mini") -> str:
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Text sent as the only message of the conversation.
        client: Client object used to issue the chat-completion request.
        model: Name of the chat model to query.

    Returns:
        The content of the first returned choice with surrounding whitespace
        removed, or an empty string if the request or the extraction of the
        reply fails for any reason (the error is printed, not raised).
    """
    try:
        res = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return res.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # carry on with an empty response. The prompt is passed through str()
        # so that a non-string value (e.g. NaN read from an empty CSV cell)
        # cannot make this handler itself raise while building the preview.
        print(f"Error with prompt «{str(prompt)[:50]}…»: {e}")
        return ""


In [None]:
def get_responses(filename):
    """Fill in the missing GPT responses for the prompt pairs stored in a CSV.

    The CSV must contain the columns ``sentence_1`` and ``sentence_2``. The
    columns ``response_1`` and ``response_2`` are created when absent. Every
    row in which at least one response is empty (or whitespace only) is sent
    to ``gpt_reply`` for the missing side(s) and then appended to
    ``Responses_<name>``, written next to the input file. The header is
    written only if that output file did not exist when the run started.

    Args:
        filename: Path of the CSV file produced by Step 1.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not available, or if a required
            column is missing from the input file.
    """
    # === Load API Key ===
    # python-dotenv is optional here: when it is installed, a local .env file
    # is honoured; otherwise the key must already be in the environment (for
    # example, set from the Colab user data in the setup cell).
    try:
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not set in .env file")
    client = OpenAI(api_key=api_key)

    # === File Paths ===
    input_csv = filename
    # Prefix only the file name so that an input such as "data/x.csv" yields
    # "data/Responses_x.csv" rather than a path in a non-existent directory.
    directory, basename = os.path.split(filename)
    output_csv = os.path.join(directory, f"Responses_{basename}")

    # === Load Data ===
    df = pd.read_csv(input_csv)

    required = {"sentence_1", "sentence_2"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    for col in ("response_1", "response_2"):
        if col not in df.columns:
            df[col] = ""

    # === Generate Responses ===
    df["response_1"] = df["response_1"].fillna("").astype(str)
    df["response_2"] = df["response_2"].fillna("").astype(str)

    # A single definition of "missing" (empty after stripping) is used for the
    # report, the progress total and the loop, so they cannot disagree.
    needs_1 = df["response_1"].str.strip() == ""
    needs_2 = df["response_2"].str.strip() == ""
    pending = needs_1 | needs_2

    print(f"Rows to process: {int(pending.sum())}")
    first_write = not os.path.exists(output_csv)
    print(f"Missing response_1: {needs_1.sum()}")
    print(f"Missing response_2: {needs_2.sum()}")

    total_missing = int(needs_1.sum() + needs_2.sum())

    with tqdm(total=total_missing, desc="Generating Responses") as pbar:
        for idx in df.index[pending.to_numpy()]:
            if needs_1.at[idx]:
                df.at[idx, "response_1"] = gpt_reply(df.at[idx, "sentence_1"], client)
                pbar.update(1)

            if needs_2.at[idx]:
                df.at[idx, "response_2"] = gpt_reply(df.at[idx, "sentence_2"], client)
                pbar.update(1)

            # idx is an index label, so select the row by label; the row is
            # appended immediately so that finished work survives a crash.
            df.loc[[idx]].to_csv(output_csv, mode="a", header=first_write, index=False)
            first_write = False
            time.sleep(1)

    print(f"Done. Responses saved to: {output_csv}")


In [None]:
# Set this to the CSV file produced by Step 1 before running the cell; the
# file must provide the "sentence_1" and "sentence_2" columns.
filename = "" #name of the file containing the test data generated by Step 1

# Query the GPT model for every missing response in that file.
get_responses(filename)

## Example with LLAMA on LMSTUDIO

In [None]:
import os
import time
import sys
import pandas as pd
import requests
from tqdm import tqdm
from dotenv import load_dotenv

In [None]:
# === LLM Local Chat Completion ===
def llama_reply(prompt: str, api_url: str, model: str = "local-model", timeout: float = 300.0) -> str:
    """Send ``prompt`` to a chat-completion HTTP endpoint and return the reply.

    The request body carries a single user message, the given ``model`` name
    and a fixed temperature of 0.5.

    Args:
        prompt: Text sent as the only message of the conversation.
        api_url: URL the JSON request is POSTed to.
        model: Value of the ``model`` field of the request body.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled server cannot block the whole run indefinitely.

    Returns:
        The ``choices[0].message.content`` field of the JSON answer with
        surrounding whitespace removed, or an empty string if the request,
        the HTTP status or the parsing of the answer fails (the error is
        printed, not raised).
    """
    try:
        response = requests.post(
            api_url,
            headers={"Content-Type": "application/json"},
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.5,
            },
            timeout=timeout,
        )
        # Surface HTTP errors with their status line instead of the opaque
        # KeyError that indexing an error payload would produce.
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberately best-effort: report the failure and return an empty
        # response. str() keeps the preview safe for non-string prompts
        # (e.g. NaN read from an empty CSV cell).
        print(f"Error with prompt «{str(prompt)[:50]}…»: {e}")
        return ""

In [None]:
def get_responses(filename, api_url="lmstudio_local_URL", model_name="local-model"):
    """Fill in the missing local-LLM responses for the prompt pairs in a CSV.

    The CSV must contain the columns ``sentence_1`` and ``sentence_2``. The
    columns ``response_1`` and ``response_2`` are created when absent. A copy
    of the loaded data is first saved as ``Backup_<name>``. Every row in
    which at least one response is empty (or whitespace only) is then sent to
    ``llama_reply`` for the missing side(s) and appended to
    ``Responses_<name>``. Both files are written next to the input file; the
    header of the output is written only if it did not exist when the run
    started.

    Args:
        filename: Path of the CSV file produced by Step 1.
        api_url: URL of the chat-completion endpoint. The default is a
            placeholder and must be replaced by the real server address.
        model_name: Model identifier sent with each request.

    Raises:
        ValueError: If a required column is missing from the input file.
    """
    # === Load .env ===
    load_dotenv()

    # === File Paths ===
    input_csv = filename
    # Prefix only the file name so that inputs located in another directory
    # get their companion files created next to them.
    directory, basename = os.path.split(filename)
    output_csv = os.path.join(directory, f"Responses_{basename}")
    backup_csv = os.path.join(directory, f"Backup_{basename}")

    # === Load Data ===
    df = pd.read_csv(input_csv)
    required = {"sentence_1", "sentence_2"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    for col in ("response_1", "response_2"):
        if col not in df.columns:
            df[col] = ""

    # Backup original
    df.to_csv(backup_csv, index=False)

    # === Clean up existing responses ===
    df["response_1"] = df["response_1"].fillna("").astype(str)
    df["response_2"] = df["response_2"].fillna("").astype(str)

    # A single definition of "missing" (empty after stripping) is used for the
    # report, the progress total and the loop, so they cannot disagree.
    needs_1 = df["response_1"].str.strip() == ""
    needs_2 = df["response_2"].str.strip() == ""
    pending = needs_1 | needs_2

    print(f"Rows to process: {int(pending.sum())}")
    first_write = not os.path.exists(output_csv)
    print(f"Missing response_1: {needs_1.sum()}")
    print(f"Missing response_2: {needs_2.sum()}")

    total_missing = int(needs_1.sum() + needs_2.sum())

    with tqdm(total=total_missing, desc="Generating Responses") as pbar:
        for idx in df.index[pending.to_numpy()]:
            if needs_1.at[idx]:
                df.at[idx, "response_1"] = llama_reply(df.at[idx, "sentence_1"], api_url, model=model_name)
                pbar.update(1)

            if needs_2.at[idx]:
                df.at[idx, "response_2"] = llama_reply(df.at[idx, "sentence_2"], api_url, model=model_name)
                pbar.update(1)

            # idx is an index label, so select the row by label; the row is
            # appended immediately so that finished work survives a crash.
            df.loc[[idx]].to_csv(output_csv, mode="a", header=first_write, index=False)
            first_write = False
            time.sleep(0.5)  # Optional delay

    print(f"Done. Responses saved to: {output_csv}")


In [None]:
# Set this to the CSV file produced by Step 1 before running the cell; the
# file must provide the "sentence_1" and "sentence_2" columns.
filename = "" #name of the file containing the test data generated by Step 1

# Query the local LLM for every missing response in that file.
get_responses(filename)