# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import os
from deepeval import evaluate
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import PromptAlignmentMetric, GEval

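# The "gpt-4o" judge model used below needs an OpenAI API key. Export
# OPENAI_API_KEY in the environment, or uncomment and fill in the next line:
# os.environ["OPENAI_API_KEY"] = "API_KEY"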

# Name of the judge model handed to every deepeval metric in this script.
evaluation_model = "gpt-4o"

# Explanations to score. Later code reads the columns Episode, PromptType,
# Prompt, Reference, LLM and Explanation from this frame.
df = pd.read_csv("LL_LLM_Explanations.csv")

def make_geval(name, steps, params):
    """Build a GEval metric that is judged by the module-level model.

    Args:
        name: Value passed to GEval as ``name``.
        steps: Value passed to GEval as ``evaluation_steps``.
        params: Value passed to GEval as ``evaluation_params``.

    Returns:
        The constructed GEval instance, using ``evaluation_model`` as judge.
    """
    geval_kwargs = {
        "name": name,
        "evaluation_steps": steps,
        "evaluation_params": params,
        "model": evaluation_model,
    }
    return GEval(**geval_kwargs)

# --- GEval rubrics -----------------------------------------------------------
# Each metric below is a GEval judge built by make_geval(). `steps` is the
# rubric text and `params` selects which test-case fields are passed to GEval
# as evaluation_params.

# Correctness is the only rubric that compares the explanation against the
# reference (EXPECTED_OUTPUT).
correctness_metric = make_geval(
    name="Correctness",
    steps=[
        "Does the explanation correctly describe why the agent failed?",
        "Does it include the correct cause of failure based on velocity, angle, and contact states?",
        "Heavily penalize factual errors or missing necessary details.",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
)

# The remaining rubrics are given the explanation text only.
helpfulness_metric = make_geval(
    name="Helpfulness",
    steps=[
        "Does the explanation provide insights that would help someone understand why the agent failed?",
        "Does it mention key contributing factors clearly?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

conciseness_metric = make_geval(
    name="Conciseness",
    steps=[
        "Is the explanation concise but still informative?",
        "Does it avoid redundancy or unnecessary elaboration?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# NOTE(review): this rubric asks about "the failure context" but only
# ACTUAL_OUTPUT is supplied, so the judge presumably never sees the prompt.
# Consider adding LLMTestCaseParams.INPUT -- TODO confirm; it would change
# the scores, so it is left as-is here.
relevance_metric = make_geval(
    name="Relevance",
    steps=[
        "Is the explanation focused on the failure and not on unrelated aspects?",
        "Does it directly address the failure context?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

coherence_metric = make_geval(
    name="Coherence",
    steps=[
        "Is the explanation logically structured and grammatically sound?",
        "Are cause-and-effect relationships clear?",
    ],
    params=[LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Output-column prefix for each metric, in the same order the metrics are
# passed to evaluate() below. Scores are matched to these labels by position,
# so this list and the metric list must be kept in sync.
metric_labels = [
    "Prompt Alignment",
    "Correctness",
    "Helpfulness",
    "Conciseness",
    "Relevance",
    "Coherence",
]

# Metrics that are identical for every row; only PromptAlignmentMetric has to
# be rebuilt per row because its instructions are the row's own prompt.
shared_metrics = [
    correctness_metric,
    helpfulness_metric,
    conciseness_metric,
    relevance_metric,
    coherence_metric,
]

# One dict per input row: the row's metadata plus a "<label> Score" and
# "<label> Reason" entry for every metric.
results = []

for _, row in df.iterrows():
    prompt = row["Prompt"]
    # NOTE(review): str() turns a missing (NaN) cell into the literal text
    # "nan", which is then scored like a real explanation/reference.
    # Confirm the input CSV has no empty Explanation/Reference cells.
    output = str(row["Explanation"]).strip()
    reference = str(row["Reference"]).strip()

    test_case = LLMTestCase(
        input=prompt,
        actual_output=output,
        expected_output=reference
    )

    metrics = [
        PromptAlignmentMetric(prompt_instructions=[prompt], model=evaluation_model, include_reason=True),
        *shared_metrics,
    ]

    evaluation_result = evaluate([test_case], metrics)

    # A single test case was submitted, so only the first result is relevant.
    metrics_data = evaluation_result.test_results[0].metrics_data or []

    # Fail loudly instead of writing misaligned columns (or hitting a bare
    # IndexError) when a metric result is missing.
    if len(metrics_data) != len(metric_labels):
        raise ValueError(
            f"Expected {len(metric_labels)} metric results for episode "
            f"{row['Episode']!r}, got {len(metrics_data)}"
        )

    record = {
        "Episode": row["Episode"],
        "PromptType": row["PromptType"],
        "Prompt": prompt,
        "Reference": reference,
        "LLM": row["LLM"],
        "Explanation": output,
    }
    for label, data in zip(metric_labels, metrics_data):
        record[f"{label} Score"] = data.score
        record[f"{label} Reason"] = data.reason

    results.append(record)

# Numeric score columns summarised at the end of the script.
score_cols = [
    "Prompt Alignment Score", "Correctness Score", "Helpfulness Score",
    "Conciseness Score", "Relevance Score", "Coherence Score"
]

# Column order of the exported CSVs: row metadata first, then a Score/Reason
# pair per metric.
cols = [
    "Episode", "PromptType", "Prompt", "Reference", "LLM", "Explanation",
    "Prompt Alignment Score", "Prompt Alignment Reason",
    "Correctness Score", "Correctness Reason",
    "Helpfulness Score", "Helpfulness Reason",
    "Conciseness Score", "Conciseness Reason",
    "Relevance Score", "Relevance Reason",
    "Coherence Score", "Coherence Reason"
]

# Passing columns= both orders the columns and keeps them present when
# `results` is empty; indexing a column-less frame with `cols` would raise
# KeyError.
eval_df = pd.DataFrame(results, columns=cols)

# Rows produced with the "ShortSpecificModified" prompt type are exported and
# summarised separately from all other prompt types.
is_modified = eval_df["PromptType"] == "ShortSpecificModified"
regular_df = eval_df[~is_modified]
modified_df = eval_df[is_modified]

regular_df.to_csv("LL_DeepEval.csv", index=False)
modified_df.to_csv("Modified_LL_DeepEval.csv", index=False)

# Per (PromptType, LLM) descriptive statistics of every score column.
for frame in (regular_df, modified_df):
    if frame.empty:
        # Nothing to describe for this split (e.g. no modified-prompt rows).
        print("No rows to summarize.")
        continue
    print(frame.groupby(["PromptType", "LLM"])[score_cols].describe())