In [2]:
import json
import os
from pathlib import Path

import pandas as pd
from deepeval import assert_test
from deepeval import evaluate
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams

# ── Configuration ──────────────────────────────
# Directory that holds the experiment result files. Set the D2D_RESULT_DIR
# environment variable to run the notebook on another machine; the default
# is the location the notebook was originally written against.
RESULT_DIR = Path(
    os.environ.get(
        "D2D_RESULT_DIR",
        "/Users/zhangran/Desktop/BP@UnitedStates/Code/D2D_Data2Dashboard/exp_result",
    )
)

# Both outputs are judged against the same dataset description, so it is
# defined once instead of being repeated in each test case.
DATASET_DESCRIPTION = "Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc."


def load_result(filename):
    """Read one result file from RESULT_DIR and return the parsed JSON object."""
    # Explicit encoding so parsing does not depend on the platform default.
    with open(RESULT_DIR / filename, "r", encoding="utf-8") as f:
        return json.load(f)


# 1. Load model outputs — each file parses to a nested mapping that is
#    indexed by string keys below.
output_sys = load_result("exp01_d2insight_sys_stimhartnow_result.json")
output_gpt4o = load_result("exp01_d2insight_gpt4o_domain_stimhartnow_result.json")

# 2. Flatten each output into one space-separated insight text.
#    A missing key raises KeyError, exactly as direct indexing would.
analysis_sys = output_sys["analysis"]["analysis"]
insight_sys = " ".join(
    analysis_sys[section]
    for section in ("descriptive", "predictive", "domain_related")
)

insights_gpt4o = output_gpt4o["insights"]
insight_gpt4o = " ".join(
    insights_gpt4o[topic]["insight"]
    for topic in (
        "customer_retention",
        "acquisition_channels",
        "customer_demographics",
        "financial_analysis",
        "contract_analysis",
        "premium_customers",
    )
)

# 3. Create the test cases without a reference output (no expected_output).
test_case_sys = LLMTestCase(
    input=DATASET_DESCRIPTION,
    actual_output=insight_sys,
)

test_case_gpt4o = LLMTestCase(
    input=DATASET_DESCRIPTION,
    actual_output=insight_gpt4o,
)

In [20]:
# 4. Define GEval metrics (self-evaluation — no expected_output)
def _build_geval(name, criteria):
    """Create a GEval metric that is given the test case's input and actual output."""
    return GEval(
        name=name,
        criteria=criteria,
        # A fresh list per metric, so no metric shares its params list with another.
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    )


insightful = _build_geval(
    "Insightful",
    "Does the output offer a deep or non-obvious understanding? Does it connect patterns or trends that aren't immediately apparent?",
)

novelty = _build_geval(
    "Novelty",
    "Does the output go beyond generic interpretation? Would it surprise or teach something new to a domain expert?",
)

# NOTE(review): the variable is called `domain_relevance` but the metric is
# reported under the name "Depth" — confirm which label is intended.
domain_relevance = _build_geval(
    "Depth",
    "Does the analysis demonstrate deep domain expertise in the specific domain?",
)

In [None]:
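# 5. (Disabled) Assertion-style alternative to the `evaluate()` run below.
# Kept for reference only; uncomment to check the metrics via `assert_test` instead.
# print("\n=== Insight Evaluation Report ===")
# assert_test(test_case_sys, [insightful, novelty, domain_relevance])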

In [21]:
# 6. Score the system-generated insight with all three GEval metrics.
#    (`evaluate` is imported in the import cell at the top of the notebook.)
results_sys = evaluate(
    test_cases=[test_case_sys],
    metrics=[insightful, novelty, domain_relevance],
)

# ── pretty‑print ───────────────────────────────
# Iterating the result object yields (field name, value) pairs. Only
# list-valued fields are treated as per-test-case results.
for label, field_value in results_sys:
    print(f"\n==== Label: {label} ====")

    if field_value is None:
        print("⚠️  No results for this label.\n")
        continue

    if not isinstance(field_value, (list, tuple)):
        # Scalar field (presumably something like a report link — TODO confirm).
        # Print it as-is; iterating it as test results would raise.
        print(field_value)
        continue

    for test_result in field_value:
        print(f"Input:  {test_result.input}")
        # Guard against a missing output before slicing the preview.
        output_preview = test_result.actual_output or ""
        print(f"Output: {output_preview[:300]}...")

        for metric in test_result.metrics_data or []:
            # A metric may carry no score (e.g. if it errored), which the
            # float format spec cannot render.
            score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
            print(f"{metric.name:<25}: {score_text}  |  {metric.reason}")





Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.55s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.8916081046866233, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output demonstrates a deep understanding by identifying non-obvious insights such as the impact of acquisition channels and premium status on retention and CLV. It synthesizes information to suggest predictive analysis and strategies for optimizing contracts and customer engagement, showing connections not immediately visible in the input., error: None)
  - ✅ Novelty (GEval) (score: 0.5990577392908683, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides insights into customer engagement and financial contributions, and suggests predictive analysis for CLV and retention, which are not immediately obvious from the input. However, it lacks novel concepts or analysis that challenge conventional thinking, and the insights may not be unexpected for experts in CRM., error: None)
  - ✅ Depth (GEval) (score: 0.9423169658535





==== Label: test_results ====
Input:  Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc.
Output: The dataset provides a comprehensive view of customer interactions and lifecycle within a CRM context, capturing 3,117 unique customer records across 19 attributes. Key metrics include 'Periods Active', 'Revenue, Total', and 'Expected CLV', which offer insights into customer engagement and financial...
Insightful (GEval)       : 0.89  |  The output demonstrates a deep understanding by identifying non-obvious insights such as the impact of acquisition channels and premium status on retention and CLV. It synthesizes information to suggest predictive analysis and strategies for optimizing contracts and customer engagement, showing connections not immediately visible in the input.
Novelty (GEval)          : 0.60  |  The output provides insights into customer engagement and financial contributions, and suggests predictive anal

In [22]:
# 7. Score the GPT-4o baseline insight with the same three GEval metrics.
results_gpt4o = evaluate(
    test_cases=[test_case_gpt4o],
    metrics=[insightful, novelty, domain_relevance],
)

# ── pretty‑print ───────────────────────────────
# Iterating the result object yields (field name, value) pairs. Only
# list-valued fields are treated as per-test-case results.
for label, field_value in results_gpt4o:
    print(f"\n==== Label: {label} ====")

    if field_value is None:
        print("⚠️  No results for this label.\n")
        continue

    if not isinstance(field_value, (list, tuple)):
        # Scalar field (presumably something like a report link — TODO confirm).
        # Print it as-is; iterating it as test results would raise.
        print(field_value)
        continue

    for test_result in field_value:
        print(f"Input:  {test_result.input}")
        # Guard against a missing output before slicing the preview.
        output_preview = test_result.actual_output or ""
        print(f"Output: {output_preview[:300]}...")

        for metric in test_result.metrics_data or []:
            # A metric may carry no score (e.g. if it errored), which the
            # float format spec cannot render.
            score_text = "n/a" if metric.score is None else f"{metric.score:.2f}"
            print(f"{metric.name:<25}: {score_text}  |  {metric.reason}")

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.14s/test case]



Metrics Summary

  - ✅ Insightful (GEval) (score: 0.8651354864666055, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output identifies unique insights such as the reliance on external channels for acquisition and the potential focus on emerging businesses, which are not explicitly mentioned in the input. It connects patterns like the high CLV versus average revenue, suggesting future revenue potential, and addresses complexities like the low number of premium customers as a growth area., error: None)
  - ❌ Novelty (GEval) (score: 0.39037252806207723, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The output provides a standard analysis of the dataset, such as customer retention issues and acquisition channels, which are expected insights. It lacks novel perspectives or unexpected insights for domain experts, such as innovative strategies or unique patterns in customer behavior., error: None)
  - ✅ Depth (GEval) (score: 0.8031840415870386, thres





==== Label: test_results ====
Input:  Customer Relationship Management dataset with features like acquisition channel, retention, churn, periods active, etc.
Output: The majority of customers have terminated their contracts, indicating potential issues with customer retention strategies. Most customers are acquired through external channels, suggesting a reliance on external marketing or partnerships for customer acquisition. The majority of customers are small ...
Insightful (GEval)       : 0.87  |  The output identifies unique insights such as the reliance on external channels for acquisition and the potential focus on emerging businesses, which are not explicitly mentioned in the input. It connects patterns like the high CLV versus average revenue, suggesting future revenue potential, and addresses complexities like the low number of premium customers as a growth area.
Novelty (GEval)          : 0.39  |  The output provides a standard analysis of the dataset, such as customer reten