In [1]:
import json
import time
import math
from typing import List

def cosine_similarity(vec1, vec2):
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = math.sqrt(sum(a * a for a in vec1))
    norm2 = math.sqrt(sum(b * b for b in vec2))
    return dot / (norm1 * norm2 + 1e-8)

def split_sentences(text: str) -> List[str]:
    return [s.strip() for s in text.split('.') if s.strip()]

def get_embedding(text: str) -> List[float]:
    return [float((hash(text) >> i) & 1) for i in range(64)]

def llm_completeness_judge(question: str, answer: str) -> float:
    return 0.75 if len(answer) > len(question) else 0.4

def llm_fact_checker(answer: str, context: str) -> float:
    return 0.1

def evaluate_relevance(user_query, ai_response):
    emb_q = get_embedding(user_query)
    emb_a = get_embedding(ai_response)
    return cosine_similarity(emb_q, emb_a)

def evaluate_completeness(user_query, ai_response):
    keywords = set(user_query.lower().split())
    answer_words = set(ai_response.lower().split())

    coverage = len(keywords & answer_words) / max(len(keywords), 1)
    length_ratio = len(ai_response) / max(len(user_query), 1)

    heuristic_score = min(1.0, 0.5 * coverage + 0.5 * min(1, length_ratio / 2))

    if heuristic_score >= 0.7:
        return heuristic_score, "heuristic"
    else:
        return llm_completeness_judge(user_query, ai_response), "llm"

def evaluate_hallucination(ai_response, context_text):
    sentences = split_sentences(ai_response)
    unsupported = 0

    emb_context = get_embedding(context_text)

    for s in sentences:
        emb_s = get_embedding(s)
        sim = cosine_similarity(emb_s, emb_context)
        if sim < 0.55:
            unsupported += 1

    hallucination_ratio = unsupported / max(len(sentences), 1)

    if hallucination_ratio > 0.2:
        return llm_fact_checker(ai_response, context_text), "llm"
    else:
        return hallucination_ratio, "embedding"

def estimate_cost(embedding_calls, llm_calls):
    EMBED_COST = 0.00001
    LLM_COST = 0.001
    return embedding_calls * EMBED_COST + llm_calls * LLM_COST

def evaluate_response(conversation_json, context_json):
    start_time = time.time()

    user_query = ""
    ai_response = ""

    for msg in conversation_json["conversation"]:
        if msg["role"] == "user":
            user_query = msg["content"]
        elif msg["role"] == "assistant":
            ai_response = msg["content"]

    context_text = " ".join(context_json["context_chunks"])

    embedding_calls = 0
    llm_calls = 0

    relevance_score = evaluate_relevance(user_query, ai_response)
    embedding_calls += 2

    if relevance_score < 0.6:
        return {
            "final_verdict": "FAIL",
            "reason": "Irrelevant answer"
        }

    completeness_score, comp_method = evaluate_completeness(user_query, ai_response)
    if comp_method == "llm":
        llm_calls += 1

    hallucination_score, hall_method = evaluate_hallucination(ai_response, context_text)
    embedding_calls += len(split_sentences(ai_response)) + 1
    if hall_method == "llm":
        llm_calls += 1

    latency_ms = int((time.time() - start_time) * 1000)
    cost_usd = estimate_cost(embedding_calls, llm_calls)

    if hallucination_score > 0.2:
        verdict = "FAIL"
    elif completeness_score < 0.6:
        verdict = "WARN"
    else:
        verdict = "PASS"

    return {
        "relevance_score": round(relevance_score, 2),
        "completeness_score": round(completeness_score, 2),
        "hallucination_score": round(hallucination_score, 2),
        "latency_ms": latency_ms,
        "cost_usd": round(cost_usd, 5),
        "final_verdict": verdict
    }


In [2]:
conversation_json = {
    "conversation": [
        {
            "turn": 1,
            "sender_id": 1,
            "role": "AI/Chatbot",
            "content": "I know what you're going through. Let's talk. I'm here to support you for all your infertility and IVF needs. Please tell me what's on your mind.",
            "created_at": "2025-11-16T17:04:44.000000Z"
        },
        {
            "turn": 5,
            "sender_id": 77096,
            "role": "user",
            "content": "I and my wife are planning to come to India early next month. I am still trying to convince her that since I started with communicating with you eversince, I may be a good idea to come to you in an attempt to solve our infertility problems.",
            "created_at": "2025-11-16T17:06:58.000000Z"
        },
        {
            "turn": 6,
            "sender_id": 1,
            "role": "assistant",
            "content": "It's wonderful to hear you are planning to come to India and are considering us. We are here to support you.",
            "created_at": "2025-11-16T17:07:30.000000Z"
        }
    ]
}

context_json = {
    "context_chunks": [
        "Infertility treatment options in India include IVF, ICSI, IUI, and other assisted reproductive technologies.",
        "Many couples travel to India for fertility treatments due to advanced medical facilities and experienced specialists.",
        "Success rates for IVF in India are comparable to international standards.",
        "Planning your trip involves understanding visa requirements, accommodation, and local transport."
    ]
}

# Now, call the evaluate_response function with these inputs
result = evaluate_response(conversation_json, context_json)
print(result)


{'final_verdict': 'FAIL', 'reason': 'Irrelevant answer'}
