# In [None]:
import pandas as pd
import os
from deepeval import evaluate
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import PromptAlignmentMetric, GEval

#os.environ["OPENAI_API_KEY"] = "API_KEY"

# Judge model handed to every DeepEval metric in this script.
evaluation_model = "gpt-4o"

# Table of LLM-generated explanations to be scored, one explanation per row.
df = pd.read_csv(filepath_or_buffer="FL_LLM_Explanations.csv")

def make_geval(name, steps, params):
    """Build a GEval metric that is judged by the module-level ``evaluation_model``.

    Args:
        name: Forwarded to GEval as ``name``.
        steps: Forwarded to GEval as ``evaluation_steps``.
        params: Forwarded to GEval as ``evaluation_params``.

    Returns:
        The newly constructed ``GEval`` instance.
    """
    geval_kwargs = {
        "name": name,
        "evaluation_steps": steps,
        "evaluation_params": params,
        "model": evaluation_model,
    }
    return GEval(**geval_kwargs)

# Correctness is the only metric here that is also given the expected output.
correctness_metric = make_geval(
    name="Correctness",
    steps=[
        "Does the output contain factual errors?",
        "Heavily penalize exclusion of necessary information.",
    ],
    params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

# Helpfulness is judged from the actual output alone.
helpfulness_metric = make_geval(
    name="Helpfulness",
    steps=[
        "Does it provide useful information for understanding the agent's failure?",
        "Does it assist the user in drawing conclusions?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Conciseness is judged from the actual output alone.
conciseness_metric = make_geval(
    name="Conciseness",
    steps=[
        "Is it concise without losing meaning?",
        "Is there repetition?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Relevance is judged from the actual output alone.
# The first step previously contained a mis-decoded apostrophe ("agentâ€™s");
# it is written with a plain ASCII apostrophe so the judge model receives
# clean text, matching the wording used by the Helpfulness steps.
relevance_metric = make_geval(
    "Relevance",
    [
        "Does the output focus on the agent's failure context?",
        "Does it avoid unrelated responses?",
    ],
    [LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Coherence is judged from the actual output alone.
coherence_metric = make_geval(
    name="Coherence",
    steps=[
        "Is the response logically structured?",
        "Are sentences grammatically correct?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

def _cell_text(value):
    """Return a CSV cell as text, mapping a missing value (NaN/None) to ""."""
    # pandas represents empty cells as float NaN; str() alone would turn that
    # into the literal text "nan" and passing it through unconverted would
    # hand a float to code that expects a string.
    return "" if pd.isna(value) else str(value)


# Column-name prefixes for the metrics, listed in the same order in which the
# metrics are passed to evaluate() below; results are read back by position.
metric_labels = [
    "Prompt Alignment",
    "Correctness",
    "Helpfulness",
    "Conciseness",
    "Relevance",
    "Coherence",
]

# One dict per evaluated CSV row: the row's identifying columns plus a
# "<metric> Score" / "<metric> Reason" pair for every metric.
results = []

for _, row in df.iterrows():
    # Normalise the text fields so the test case always receives strings.
    prompt = _cell_text(row["Prompt"])
    output = _cell_text(row["Explanation"]).strip()
    # "Reference" is optional: a missing column or an empty cell both yield "".
    reference = _cell_text(row.get("Reference", ""))

    test_case = LLMTestCase(input=prompt, actual_output=output, expected_output=reference)

    evaluation_result = evaluate(
        [test_case],
        [
            # Rebuilt for every row because its instructions are the row's own prompt.
            PromptAlignmentMetric(prompt_instructions=[prompt], model=evaluation_model, include_reason=True),
            correctness_metric,
            helpfulness_metric,
            conciseness_metric,
            relevance_metric,
            coherence_metric,
        ],
    )

    # Only one test case was submitted, so its results are at index 0.
    metrics_data = evaluation_result.test_results[0].metrics_data

    record = {
        "Episode": row["Episode"],
        "PromptType": row["PromptType"],
        "Prompt": prompt,
        "Reference": reference,
        "SecondToLastStateMap": row["SecondToLastStateMap"],
        "LastStateMap": row["LastStateMap"],
        "LastAction": row["LastAction"],
        "LLM": row["LLM"],
        "Explanation": output,
    }
    # Indexing (rather than zip) keeps the IndexError if fewer results than
    # metrics come back, instead of silently dropping columns.
    for position, label in enumerate(metric_labels):
        metric_result = metrics_data[position]
        record[f"{label} Score"] = metric_result.score
        record[f"{label} Reason"] = metric_result.reason

    results.append(record)

# Column layout of the exported table: identifying columns first, then a
# Score/Reason pair for each metric.
desired_order = [
    "Episode", "PromptType", "Prompt", "Reference",
    "SecondToLastStateMap", "LastStateMap", "LastAction",
    "LLM", "Explanation",
    "Prompt Alignment Score", "Prompt Alignment Reason",
    "Correctness Score", "Correctness Reason",
    "Helpfulness Score", "Helpfulness Reason",
    "Conciseness Score", "Conciseness Reason",
    "Relevance Score", "Relevance Reason",
    "Coherence Score", "Coherence Reason",
]

# Passing columns= fixes the layout at construction time. It also yields a
# correctly-headed empty frame when no rows were evaluated; selecting the
# columns afterwards would raise KeyError on a frame that has no columns.
eval_df = pd.DataFrame(results, columns=desired_order)

# Persist the per-row scores and reasons without the positional index.
eval_df.to_csv("FL_DeepEval.csv", index=False)

# Descriptive statistics of every score column, grouped by prompt type and LLM.
score_columns = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score",
]
summary = eval_df.groupby(["PromptType", "LLM"])[score_columns].describe()
print(summary)