In [None]:
!pip install -r requirements.txt -q

In [None]:
from helpers.runtime import Judge
from helpers.dataset import Conversation
from helpers.fair_forge import FairForge

In [None]:
from elasticsearch import Elasticsearch, helpers
from pydantic import BaseModel
import logging
from tqdm import tqdm
from typing import List
import os

In [None]:
# Elasticsearch connection settings come from the environment; each value is
# None when the corresponding variable is unset.
ELASTIC_URL = os.getenv('ELASTIC_URL')
ELASTIC_AUTH = [
    os.getenv(variable)
    for variable in ('ELASTIC_AUTH_USER', 'ELASTIC_AUTH_PASSWORD')
]
# The dataset name (default "asb") prefixes the index that receives the metrics.
dataset = os.getenv("dataset", "asb")
conversation_index = dataset + "-conversational"

# Judge prompt used when a batch has no observation: the actual answer is
# compared with a reference answer.
# Placeholders: {preferred_language}, {assistant_answer}, {ground_truth_assistant}.
# Literal JSON braces are doubled ({{ }}), i.e. the template follows str.format
# escaping rules (presumably rendered by Judge.reason; confirm in helpers.runtime).
# Step 7 states the maximum as 10 so it agrees with the 0-10 scale of step 10.
reasoning_system_prompt = """
You are an expert evaluator of conversational dialogue quality. Your task is to evaluate the assistant's response with a focus on its ability to recall and reference past details mentioned earlier in the conversation. Follow these steps:

1. Analyze the provided observation for clarity, relevance, and accuracy regarding the dialogue performance.
2. Evaluate the assistant's response in the context of the observation.
3. Determine if the assistant's answer effectively addresses or aligns with the points raised in the observation.
4. Assess the overall consistency, accuracy, and contextual relevance of the assistant's answer.
5. Clearly explain under 'insight' anything you thought about.
6. The answer from Assistant (Actual Answer) must be {preferred_language} , otherwise give it a low score even though the question from the human and the answer are in the same language.
7. The memory score must be 10 (the maximum) if the question is not referring to past events.
8. Use the Grice's Maxims to check the Assistant Actual Answer, the Maxims are:

- The maxim of quantity, where one tries to be as informative as one possibly can, and gives as much information as is needed, and no more.

- The maxim of quality, where one tries to be truthful, and does not give information that is false or that is not supported by evidence.

- The maxim of relation, where one tries to be relevant, and says things that are pertinent to the discussion.

- The maxim of manner, when one tries to be as clear, as brief, and as orderly as one can in what one says, and where one avoids obscurity and ambiguity.

9. For the sensibleness metric you should take into account how much does the Actual Answer makes sense from the question. Based on SSA (Sensibleness and specificity Average) metric
10. Score values must go from a range between 0 and 10, 0 Is the lowest and 10 the highest.

After your internal reasoning, provide only the final answer strictly in the following JSON format. Do not include any additional text or explanation:

```json
{{ 
     "memory": <score value>, 
     "language": <score value>, 
     "insight": "<your insight>",
     "quality_maxim": <score value>,
     "quantity_maxim": <score value>,
     "relation_maxim": <score value>,
     "manner_maxim": <score value>,
     "sensibleness": <score value>
}}
```

Assistant (Actual Answer):
{assistant_answer}

Ground Truth Assistant (Reference Answer):
{ground_truth_assistant}
"""

# Judge prompt used when a batch carries an observation: the actual answer is
# evaluated against that observation instead of a reference answer.
# Placeholders: {preferred_language}, {observation}, {assistant_answer}.
# Literal JSON braces are doubled ({{ }}), i.e. the template follows str.format
# escaping rules (presumably rendered by Judge.reason; confirm in helpers.runtime).
# Step 7 states the maximum as 10 so it agrees with the 0-10 scale of step 10,
# and every score placeholder is an unquoted number because the scores are
# stored in float fields.
reasoning_system_prompt_observation="""
You are an expert evaluator of conversational dialogue quality. Your task is to evaluate the performance of an assistant based on the given observation and the corresponding assistant response. Follow a detailed chain-of-thought reasoning process, taking into account the following:

1. Analyze the provided observation for clarity, relevance, and accuracy regarding the dialogue performance.
2. Evaluate the assistant's response in the context of the observation.
3. Determine if the assistant's answer effectively addresses or aligns with the points raised in the observation.
4. Assess the overall consistency, accuracy, and contextual relevance of the assistant's answer.
5. Clearly explain under 'insight' anything you thought about.
6. The answer from Assistant (Actual Answer) must be {preferred_language} , otherwise give it a low score even though the question from the human and the answer are in the same language.
7. The memory score must be 10 (the maximum) if the question is not referring to past events.
8. Use the Grice's Maxims to check the Assistant Actual Answer, the Maxims are:

- The maxim of quantity, where one tries to be as informative as one possibly can, and gives as much information as is needed, and no more.

- The maxim of quality, where one tries to be truthful, and does not give information that is false or that is not supported by evidence.

- The maxim of relation, where one tries to be relevant, and says things that are pertinent to the discussion.

- The maxim of manner, when one tries to be as clear, as brief, and as orderly as one can in what one says, and where one avoids obscurity and ambiguity.

9. For the sensibleness metric you should take into account how much does the Actual Answer makes sense from the question. Based on SSA (Sensibleness and specificity Average) metric
10. Score values must go from 0 to 10. Being 0 the lowest and 10 the highest

After your internal reasoning, provide only the final answer strictly in the following JSON format. Do not include any additional text or explanation:

```json
{{ 
     "memory": <score value>, 
     "language": <score value>, 
     "insight": "<your insight>",
     "quality_maxim": <score value>,
     "quantity_maxim": <score value>,
     "relation_maxim": <score value>,
     "manner_maxim": <score value>,
     "sensibleness": <score value>
}}
```

Observation:
{observation}

Assistant (Actual Answer):
{assistant_answer}
"""

In [None]:
class ConversationalBatch(BaseModel):
    """Judge verdict for a single conversation turn.

    One instance is built per batch in ``ConversationAnalyzer.process`` from the
    JSON returned by the judge and is later dumped with ``model_dump()`` as the
    ``_source`` of an Elasticsearch document. The prompts ask the judge for
    scores on a 0-10 scale; this model does not validate that range.
    """
    # Scores and free-text insight taken from the judge's JSON answer.
    conversational_memory: float
    conversational_insight: str
    conversational_language: float
    # Grice's maxims, one score each.
    conversational_quality_maxim: float
    conversational_quantity_maxim: float
    conversational_relation_maxim: float
    conversational_manner_maxim: float
    conversational_sensibleness: float
    # First value returned by Judge.reason (the judge's reasoning text).
    conversational_thinkings: str
    # Identifiers copied from the evaluated thread / batch.
    session_id: str
    assistant_id: str
    qa_id: str

In [None]:
# Root logger: INFO and above, prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
def recreate_index(index_name: str, mapping: dict):
    """Drop ``index_name`` if it exists, then create it again from ``mapping``.

    Uses the module-level ``es`` client. Any documents stored in an existing
    index of that name are lost.

    Args:
        index_name: Name of the Elasticsearch index to (re)create.
        mapping: Request body passed unchanged to ``indices.create``.
    """
    indices = es.indices
    already_present = indices.exists(index=index_name)
    if already_present:
        indices.delete(index=index_name)
        print(f"Index '{index_name}' deleted.")
    indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created.")

In [None]:
class ConversationAnalyzer(FairForge):
    """Scores every turn of a conversation with an LLM judge.

    Inherits from ``FairForge``, the base class this notebook imports from
    ``helpers.fair_forge`` (the previous base name, ``Logos``, is not defined
    anywhere in the notebook and raised ``NameError`` on a fresh kernel).
    ``self.metrics`` and ``pipeline()`` are expected to come from that base
    class; confirm against helpers.fair_forge.
    """

    def process(self, thread: Conversation):
        """Judge each batch of ``thread`` and collect one metric per batch.

        For every batch the judge is asked to reason with one of two prompts:
        the observation prompt when the batch has an observation, otherwise the
        reference-answer prompt. The judge's scores are wrapped in a
        ``ConversationalBatch`` and appended to ``self.metrics``.

        Args:
            thread: Conversation providing ``conversation`` (the batches),
                ``preferred_language``, ``session_id`` and ``assistant_id``.

        Raises:
            KeyError: If the judge's answer lacks one of the expected keys.
        """
        judge = Judge()
        for batch in tqdm(thread.conversation, desc="Processing conversation batches", leave=False):
            query = batch.question
            logging.info("Processing query: %s", query)
            data = {"preferred_language": thread.preferred_language, "assistant_answer": batch.assistant}
            if batch.observation:
                logging.info("Observation found; invoking reasoning with observation")
                thinking, scores = judge.reason(
                    reasoning_system_prompt_observation,
                    query,
                    {"observation": batch.observation, **data},
                )
            else:
                logging.info("No observation; invoking standard reasoning")
                # The reference must differ from the answer under evaluation,
                # otherwise the judge compares the answer with itself.
                # NOTE(review): assumes the batch exposes the reference answer as
                # `ground_truth_assistant`; confirm against helpers.dataset. Falls
                # back to the previous behaviour when it is missing or empty.
                reference = getattr(batch, "ground_truth_assistant", None) or batch.assistant
                thinking, scores = judge.reason(
                    reasoning_system_prompt,
                    query,
                    {"ground_truth_assistant": reference, **data},
                )

            # Separate name so the loop variable `batch` is not rebound.
            metric = ConversationalBatch(
                conversational_insight=scores['insight'],
                conversational_memory=scores['memory'],
                conversational_language=scores['language'],
                conversational_quality_maxim=scores['quality_maxim'],
                conversational_quantity_maxim=scores['quantity_maxim'],
                conversational_relation_maxim=scores['relation_maxim'],
                conversational_manner_maxim=scores['manner_maxim'],
                conversational_sensibleness=scores['sensibleness'],
                conversational_thinkings=thinking,
                session_id=thread.session_id,
                assistant_id=thread.assistant_id,
                qa_id=batch.qa_id,
            )
            self.metrics.append(metric)
        logging.info("Finished processing thread for session_id: %s", thread.session_id)

In [None]:
# Module-level Elasticsearch client, used by recreate_index and by the bulk
# indexing step; authenticates with the user/password pair from ELASTIC_AUTH.
es = Elasticsearch(ELASTIC_URL, basic_auth=tuple(ELASTIC_AUTH))

In [None]:
# Build the analyzer and run the pipeline it inherits from its base class.
# NOTE(review): `pipeline()` is not defined in this notebook; presumably it
# feeds each conversation to `process` and returns the collected metrics.
# Confirm in helpers.fair_forge.
conversational = ConversationAnalyzer()
metrics = conversational.pipeline()

In [None]:
# Index mapping: identifiers are exact-match keywords, scores are floats and
# the judge's free text is full-text searchable.
mapping_conversational = {
    "mappings": {
        "properties": {
            field_name: {"type": field_type}
            for field_name, field_type in (
                ("session_id", "keyword"),
                ("conversational_memory", "float"),
                ("conversational_insight", "text"),
                ("conversational_language", "float"),
                ("conversational_quality_maxim", "float"),
                ("conversational_quantity_maxim", "float"),
                ("conversational_relation_maxim", "float"),
                ("conversational_manner_maxim", "float"),
                ("conversational_sensibleness", "float"),
                ("conversational_thinkings", "text"),
                ("qa_id", "keyword"),
                ("assistant_id", "keyword"),
            )
        }
    }
}
# Destructive: recreate_index deletes any existing index with this name before
# creating it again, so previously indexed results for this dataset are lost.
recreate_index(conversation_index, mapping_conversational)

In [None]:
# One bulk action per collected metric, all targeting the conversational index.
docs = [
    {"_index": conversation_index, "_source": metric.model_dump()}
    for metric in metrics
]

helpers.bulk(es, docs)
print(f"Indexed {len(docs)} documents.")