In [5]:
import json
import pandas as pd
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval import assert_test

# Load model outputs — each element is a JSON string
with open("/Users/zhangran/Desktop/BP@UnitedStates/Code/D2D_Data2Dashboard/exp_result/exp01_d2insight_sys_stimhartnow_result.json", "r") as f:
    output_sys = json.load(f)

# Load model outputs — each element is a JSON string
with open("/Users/zhangran/Desktop/BP@UnitedStates/Code/D2D_Data2Dashboard/exp_result/exp01_d2insight_gpt4o_domain_stimhartnow_result.json", "r") as f:
    output_gpt4o = json.load(f)

insight_sys = (
    output_sys["analysis"]["analysis"]["descriptive"] + " " +
    output_sys["analysis"]["analysis"]["predictive"] + " " +
    output_sys["analysis"]["analysis"]["domain_related"]
)

insight_gpt4o = (
    output_gpt4o["insights"]["customer_retention"]["insight"] + " " +
    output_gpt4o["insights"]["acquisition_channels"]["insight"] + " " +
    output_gpt4o["insights"]["customer_demographics"]["insight"] + " " +
    output_gpt4o["insights"]["financial_analysis"]["insight"] + " " +
    output_gpt4o["insights"]["contract_analysis"]["insight"] + " " +
    output_gpt4o["insights"]["premium_customers"]["insight"]
)

# 3. Create a test case without reference output
test_case_sys = LLMTestCase(
    input="Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc.",
    actual_output=insight_sys,
)

test_case_gpt4o = LLMTestCase(
    input="Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc.",
    actual_output=insight_gpt4o,
)

In [6]:
# 4. Define GEval metrics (self-evaluation — no expected_output)
insightful = GEval(
    name="Insightful",
    criteria="Does the output offer a deep or non-obvious understanding? Does it connect patterns or trends that aren't immediately apparent?",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

novelty = GEval(
    name="Novelty",
    criteria="Does the output go beyond generic interpretation? Would it surprise or teach something new to a domain expert?",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

domain_relevance = GEval(
    name="Domain Relevance",
    criteria="Is the output specific to the CRM domain? Does it reference domain-specific terms or relationships?",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [None]:
# # 5. Run evaluation (assertion-based print)
# print("\n=== Insight Evaluation Report ===")
# assert_test(test_case_sys, [insightful, novelty, domain_relevance])

In [7]:
from deepeval import evaluate

results_sys = evaluate(
    test_cases=[test_case_sys],
    metrics=[insightful, novelty, domain_relevance],
)

# ── pretty‑print ───────────────────────────────
# Each element of `results` is a tuple: (test_case_sys, metric_results)
for test_case, metric_results in results_sys:
    print(f"\nInput:  {test_case.input}")
    for name, m_res in metric_results.items():
        print(f"{name:<17}: {m_res.score:.2f}  |  {m_res.reasoning}")




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.34s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.8771895692767624, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output effectively connects patterns in the CRM dataset, such as acquisition channels and customer demographics, to insights on customer retention and CLV. It offers deeper understanding by suggesting predictive analysis and strategies for optimizing customer engagement, which are not immediately obvious from the input. The analysis is creative, revealing new perspectives on leveraging CRM data for targeted marketing and resource allocation., error: None)
  - ✅ Novelty (GEval) (score: 0.8262005544713571, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides insights into customer engagement and financial contributions, identifies potential predictive analysis areas, and suggests strategies for optimizing customer profitability, which are not obvious from the input data. It offers novel perspectives on acquisition cha




AttributeError: 'str' object has no attribute 'input'

In [8]:
results_gpt4o = evaluate(
    test_cases=[test_case_gpt4o],
    metrics=[insightful, novelty, domain_relevance],
)

# ── pretty‑print ───────────────────────────────
# Each element of `results` is a tuple: (test_case_sys, metric_results)
for test_case, metric_results in results_gpt4o:
    print(f"\nInput:  {test_case.input}")
    for name, m_res in metric_results.items():
        print(f"{name:<17}: {m_res.score:.2f}  |  {m_res.reasoning}")

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.61s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.7819834938204584, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides insights beyond surface-level information by identifying potential issues with retention strategies and reliance on external channels. It highlights hidden patterns like the focus on emerging businesses and the potential for growth in premium customers. The connection between CLV and revenue offers a unique perspective. However, it doesn't significantly challenge conventional wisdom or present unexpected interpretations., error: None)
  - ✅ Novelty (GEval) (score: 0.645442406712409, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides insights such as the reliance on external channels for acquisition and the potential for growth in premium customers, which are not immediately obvious. However, it lacks a novel perspective that would significantly intrigue a domain expert, as it mostly describes stand




AttributeError: 'str' object has no attribute 'input'