In [None]:
!pip install -r requirements.txt -q

In [None]:
from helpers.runtime import Judge
from helpers.dataset import Conversation
from helpers.fair_forge import FairForge

In [None]:
from elasticsearch import Elasticsearch, helpers
from pydantic import BaseModel
import nltk
from nltk.tokenize import word_tokenize, MWETokenizer,sent_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases, Phraser
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import time
import requests
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import logging
from tqdm import tqdm
import os
import json

In [None]:
# Root logger: INFO and above, each record prefixed with timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# Fetch the NLTK 'punkt_tab' and 'stopwords' resources.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Elasticsearch endpoint, credentials and dataset name come from the environment;
# only the dataset name has a fallback ("asb").
ELASTIC_URL = os.getenv('ELASTIC_URL')
ELASTIC_AUTH = [os.getenv(key) for key in ('ELASTIC_AUTH_USER', 'ELASTIC_AUTH_PASSWORD')]
dataset = os.getenv("dataset", "asb")

In [None]:
reasoning_system_prompt = """
You are a Contextual Compliance Analyzer. Your role is to evaluate whether the interaction between a human and an AI adheres strictly to the provided context. Carefully follow these steps:

1. **Examine the Context:**
   Analyze the supplied context in detail (denoted by Context). Identify key topics, boundaries, and any implicit or explicit limitations.

2. **Review the Assistant’s Answer:**  
   Analyze the assistant's answer (denoted by Assistant) and assess whether it strictly aligns with the context. Check for any extraneous or unrelated information, ensuring the answer does not deviate from or extend beyond the provided context.

3. **Determine Alignment:**
   Reason carefully and provide a quantitative assessment (a probability or score) that reflects how closely the answer follows the context. Consider both direct alignment and any nuances where the answer might partially extend beyond the intended scope.

4. **Generate Insights:**
   Offer any relevant insights regarding the interaction. This could include observations on how well the answer fits the context, potential areas of improvement, or any discrepancies noted.

After your internal reasoning, provide only the final answer strictly in the following JSON format. Do not include any additional text or explanation:
```json
{{
"score": <probability value>,
"insight": "<your insight>"
}}
```

Context:
{context}

Ground Truth Assistant:
{ground_truth_assistant}

Assistant:
{assistant_answer}
"""

In [None]:
# Judge prompt template used for conversation turns that carry an observation.
# Placeholders: {context}, {observation}, {assistant_answer}.
# The JSON example uses plain doubled braces (no backslashes): "\{" is an invalid
# escape that leaves a literal backslash in the prompt, and a trailing "\" at the
# end of a line is a line continuation that would glue "}" to the closing fence.
reasoning_system_prompt_observation = """
You are a Contextual Compliance Analyzer. Your role is to evaluate whether the interaction between a human and an AI adheres strictly to the provided context, taking into account any additional observations provided. Carefully follow these steps:

1. **Examine the Context:**  
   Analyze the supplied context in detail (denoted by Context). Identify key topics, boundaries, and any implicit or explicit limitations.

2. **Review the Assistant’s Answer:**  
   Analyze the assistant's answer (denoted by Assistant) and assess whether it strictly aligns with the context. Check for any extraneous or unrelated information, ensuring the answer does not deviate from or extend beyond the provided context.

3. **Consider the Observation:**  
   Review the provided Observation (denoted by Observation) and incorporate these points into your evaluation. Use this information as an additional factor when assessing the assistant's answer.

4. **Determine Alignment:**  
   Reason carefully and provide a quantitative assessment (a probability or score) that reflects how closely the answer follows the context. Consider both direct alignment and any nuances where the answer might partially extend beyond the intended scope.

5. **Generate Insights:**  
   Offer any relevant insights regarding the interaction. In your reasoning, include relevant points from the Observation to support your evaluation.

After your internal reasoning, provide only the final answer strictly in the following JSON format. Do not include any additional text or explanation:

```json
{{
"score": <probability value>,
"insight": "<your insight>"
}}
```

Context:
{context}

Observation:
{observation}

A:
{assistant_answer}
"""

In [None]:
# Word2Vec settings; the first two can be overridden through environment variables.
ngram_w2v = int(os.getenv("ngram_w2v", '1'))
outfile_w2v = os.getenv("outfile_w2v", 'w2v.bin')
replacements_w2v = dict()

# Names of the Elasticsearch indices this notebook writes to, derived from the dataset name.
context_index = "{}-context".format(dataset)
w2v_index = "{}-w2v".format(dataset)

In [None]:
class ContextBatch(BaseModel):
    """One context-evaluation record for a single conversation turn.

    Instances are collected in ``metrics`` and later dumped with ``model_dump()``
    into the context Elasticsearch index.
    """
    # Insight text; Context.process fills it from the judge reply's 'insight' key.
    context_insight: str
    # Alignment score; Context.process fills it from the judge reply's 'score' key.
    context_awareness: float
    # First element of the pair returned by Judge.reason (the judge's reasoning).
    context_thinkings: str
    # Identifier of the conversation thread the turn belongs to.
    session_id: str
    # Context of the thread the turn was evaluated against.
    context: str
    # Identifier of the question/answer turn.
    qa_id: str
    # Identifier of the assistant that produced the answer.
    assistant_id: str

In [None]:
class Context(FairForge):
    """Scores every turn of a conversation for adherence to the thread's context."""

    def process(self, thread: Conversation):
        """Judge each batch of ``thread`` and append one ``ContextBatch`` per turn.

        Turns that carry an observation are judged with
        ``reasoning_system_prompt_observation``; all others with
        ``reasoning_system_prompt``. Nothing is returned: results accumulate in
        ``self.metrics`` (presumably initialised by ``FairForge`` — confirm).

        Args:
            thread: Conversation providing ``context``, ``session_id``,
                ``assistant_id`` and the iterable ``conversation`` of batches.
        """
        judge = Judge()
        # Use tqdm to display progress for conversation batches within a thread
        for batch in tqdm(thread.conversation, desc="Processing conversation batches", leave=False):
            query = batch.question
            logging.info(f"Processing query: {query}")
            data = {"context": thread.context, "assistant_answer": batch.assistant}
            # The judge's parsed reply is kept in `verdict` (not `json`) so the
            # imported `json` module is not shadowed inside this method.
            if batch.observation:
                logging.info("Observation found; invoking reasoning with observation")
                thinking, verdict = judge.reason(
                    reasoning_system_prompt_observation,
                    query,
                    {"observation": batch.observation, **data},
                )
            else:
                logging.info("No observation; invoking standard reasoning")
                # NOTE(review): the "ground truth" slot is filled with the assistant's
                # own answer (batch.assistant), so the judge sees the same text twice.
                # This looks unintended — confirm which batch field holds the
                # reference answer and pass that instead.
                thinking, verdict = judge.reason(
                    reasoning_system_prompt,
                    query,
                    {"ground_truth_assistant": batch.assistant, **data},
                )
            self.metrics.append(
                ContextBatch(
                    context_insight=verdict['insight'],
                    context_awareness=verdict['score'],
                    context_thinkings=thinking,
                    session_id=thread.session_id,
                    context=thread.context,
                    qa_id=batch.qa_id,
                    assistant_id=thread.assistant_id,
                )
            )
        logging.info(f"Finished processing thread for session_id: {thread.session_id}")

In [None]:
def recreate_index(index_name: str, mapping: dict):
    """Drop ``index_name`` if it already exists, then create it with ``mapping``.

    Uses the module-level ``es`` client and prints a line for each action taken.
    Any documents in an existing index of that name are lost.
    """
    already_present = es.indices.exists(index=index_name)
    if already_present:
        es.indices.delete(index=index_name)
        print(f"Index '{index_name}' deleted.")

    es.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created.")

In [None]:
def recreate_index(index_name: str, mapping: dict):
    """Delete the index named ``index_name`` (when present) and create it anew.

    NOTE(review): a same-named, behaviourally equivalent function is already
    defined in the preceding cell; this later definition is the one in effect
    after a top-to-bottom run, so one of the two cells can be removed.
    """
    index_api = es.indices  # module-level Elasticsearch client
    if index_api.exists(index=index_name):
        index_api.delete(index=index_name)
        print(f"Index '{index_name}' deleted.")
    index_api.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created.")

In [None]:
# Shared Elasticsearch client, authenticated with the (user, password) pair
# held in ELASTIC_AUTH.
es = Elasticsearch(ELASTIC_URL, basic_auth=tuple(ELASTIC_AUTH))

In [None]:
# Instantiate the context evaluator and run it.
# NOTE(review): pipeline() is not defined on Context, so it is inherited from the
# FairForge base; presumably it calls process() for each conversation and returns
# the collected metrics — confirm in helpers.fair_forge.
context = Context()
metrics = context.pipeline()

In [None]:
# Index mapping for the projected word vectors: the word itself, its three
# coordinates, and the identifiers used to filter by session / assistant.
mapping_w2v = {
    "mappings": {
        "properties": {
            "word": {"type": "text"},
            **{axis: {"type": "float"} for axis in ("x", "y", "z")},
            **{field: {"type": "keyword"} for field in ("session_id", "assistant_id")},
        }
    }
}

# Start from an empty index on every run.
recreate_index(w2v_index, mapping_w2v)

In [None]:
# Per-session lookup of the context text, preferred language and assistant id,
# keyed by session_id (a later item with the same session_id overwrites an earlier one).
session_contexts = {}
for item in context.dataset:
    session_contexts[item.session_id] = {
        "context": item.context,
        "language": item.preferred_language,
        "assistant_id": item.assistant_id,
    }

In [None]:
class ContextAnalyzer(FairForge):
    """Records a blank ``ContextBatch`` for every turn of a conversation."""

    def process(self, thread: Conversation):
        """Append one placeholder ``ContextBatch`` per turn of ``thread`` to ``self.metrics``.

        Only the identifiers (session, turn, assistant) are filled in; every text
        field is the empty string and the awareness score is 0.0.
        """
        for turn in thread.conversation:
            placeholder = ContextBatch(
                session_id=thread.session_id,
                qa_id=turn.qa_id,
                assistant_id=thread.assistant_id,
                context="",
                context_insight="",
                context_awareness=0.0,
                context_thinkings="",
            )
            self.metrics.append(placeholder)

In [None]:
class LineIterator:
    """Iterable yielding one token list per sentence of the given context texts."""

    def __init__(self, contexts):
        """Store the texts to iterate over.

        Args:
            contexts: Either a single context string or an iterable of context
                strings. A bare string is wrapped in a one-element list;
                otherwise ``__iter__`` would walk it character by character and
                produce single-character "sentences".
        """
        self.contexts = [contexts] if isinstance(contexts, str) else contexts

    def replace_all(self, line):
        """Apply every substitution from the module-level ``replacements_w2v`` to ``line``."""
        for word, rep in replacements_w2v.items():
            line = line.replace(word, rep)
        return line

    def __iter__(self):
        """Yield the tokens of each sentence of each context, in order.

        Sentences are split with ``sent_tokenize``, substitutions are applied,
        words are split with ``word_tokenize`` and the result is passed through
        the module-level ``tokenizer``.
        """
        for context in self.contexts:
            for sentence in sent_tokenize(context):
                yield tokenizer.tokenize(word_tokenize(self.replace_all(sentence)))

tokenizer = MWETokenizer(separator=" ")

# Upper bound for the t-SNE perplexity (scikit-learn's default value). The
# perplexity must stay below the number of samples, and a value close to the
# sample count makes every point a neighbour of every other point.
TSNE_MAX_PERPLEXITY = 30
# Fewest words worth projecting: t-SNE needs perplexity < n_samples, and a PCA
# initialisation into 3 components needs at least 3 samples.
TSNE_MIN_WORDS = 3


def _stopwords_for(language):
    """Return the NLTK stop-word set for ``language``, or an empty set if none is available."""
    if not language:
        return set()
    try:
        return set(stopwords.words(str(language).lower()))
    except (OSError, LookupError):
        # Unknown language name or corpus not installed: keep every word
        # rather than abort the whole run.
        logging.warning(f"No NLTK stopword list for language '{language}'; keeping all words.")
        return set()


for session_id, contexts in session_contexts.items():
    logging.info(f"Processing session_id: {session_id}")

    # The context is a single string; hand it over as a one-element list so it
    # is tokenised sentence by sentence instead of character by character.
    raw_context = contexts['context']
    texts = [raw_context] if isinstance(raw_context, str) else raw_context
    sentences = list(LineIterator(texts))
    if not sentences:
        # Word2Vec cannot build a vocabulary from an empty corpus.
        logging.warning(f"No sentences found for session_id {session_id}. Skipping.")
        continue

    if ngram_w2v > 1:
        phraser = Phraser(Phrases(sentences))
        sentences = [phraser[sentence] for sentence in sentences]

    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

    # Drop stop words of the session's language. The language name is looked up
    # in the NLTK stop-word corpus; testing membership in the name itself would
    # only be a substring check against e.g. "english".
    stop_words = _stopwords_for(contexts['language'])
    words = [word for word in model.wv.key_to_index if word.lower() not in stop_words]
    if not words:
        logging.warning(f"No words left after filtering stopwords for session_id {session_id}. Skipping TSNE and indexing.")
        continue
    if len(words) < TSNE_MIN_WORDS:
        logging.warning(f"Only {len(words)} word(s) left for session_id {session_id}; too few for TSNE. Skipping.")
        continue

    vectors = np.array([model.wv[word] for word in words])
    perplexity = min(TSNE_MAX_PERPLEXITY, len(words) - 1)

    # The iteration count is left at the library default (1000, the value used
    # before): the keyword was renamed from n_iter to max_iter in newer
    # scikit-learn, so omitting it keeps this cell working on both.
    tsne = TSNE(n_components=3, perplexity=perplexity, random_state=42)
    result = tsne.fit_transform(vectors)

    docs = [
        {
            "_index": w2v_index,
            "_source": {
                "session_id": session_id,
                "word": word,
                "x": float(coords[0]),
                "y": float(coords[1]),
                "z": float(coords[2]),
                "assistant_id": contexts["assistant_id"],
            },
        }
        for word, coords in zip(words, result)
    ]

    helpers.bulk(es, docs)
    logging.info(f"TSNE coordinates for session_id {session_id} loaded into Elasticsearch.")

logging.info("All session models and TSNE coordinates have been processed and indexed.")

In [None]:
# Index mapping for the per-turn context evaluation records; one property per
# ContextBatch field.
mapping_contextualizer = {
    "mappings": {
        "properties": dict(
            session_id={"type": "keyword"},
            context={"type": "text"},
            context_insight={"type": "text"},
            context_awareness={"type": "float"},
            context_thinkings={"type": "text"},
            qa_id={"type": "keyword"},
            assistant_id={"type": "keyword"},
        )
    }
}

# Start from an empty index on every run.
recreate_index(context_index, mapping_contextualizer)

In [None]:
# Wrap every collected metric as a bulk action targeting the context index,
# then send them all in one bulk request.
docs = [
    {"_index": context_index, "_source": metric.model_dump()}
    for metric in metrics
]

helpers.bulk(es, docs)
print(f"Indexed {len(docs)} documents.")