In [None]:
!pip install -r requirements.txt -q

In [None]:
from helpers.dataset import Conversation
from elasticsearch import Elasticsearch, helpers
from pydantic import BaseModel
import numpy as np
import logging
from typing import List
import os
from helpers.fair_forge import FairForge
import pandas as pd
import math
import re
from collections import defaultdict
from scipy.stats import spearmanr

In [None]:
# Connection settings and the dataset name are read from the environment.
# ELASTIC_URL and both ELASTIC_AUTH entries are None when their variable is unset.
ELASTIC_URL = os.getenv('ELASTIC_URL')
ELASTIC_AUTH = [os.getenv(var_name) for var_name in ('ELASTIC_AUTH_USER', 'ELASTIC_AUTH_PASSWORD')]
# The dataset name defaults to "asb" and prefixes the index name.
dataset = os.getenv("dataset", "asb")
humanity_index = dataset + "-humanity"

TODO:
- Implement Emotion matching
- Implement Language Style Matching (LSM)
- Implement Agreeableness
- Implement Empathy, Empathic Concern

## Emotional entropy

Based on the NRC Emotion Lexicon and Plutchik's eight basic emotions.

In [None]:
# Names of the emotion flag columns read from the lexicon CSV; also the fixed
# order used whenever a distribution is turned into a vector.
emotion_columns = ['Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']

def load_emotion_lexicon(language: str):
    """Build a ``word -> [emotions]`` lookup from ``artifacts/lexicon.csv``.

    The CSV is ``;``-separated. The column named ``language`` holds the word and
    each column listed in ``emotion_columns`` holds a flag; an emotion is
    attached to a word when its flag equals 1.

    Args:
        language: Name of the CSV column that contains the words to index.

    Returns:
        dict mapping each lower-cased word to the list of emotions flagged for
        it (possibly empty), in the order the emotions were first seen.

    Raises:
        KeyError: if ``language`` or one of ``emotion_columns`` is not a column
            of the CSV.
    """
    nrc = pd.read_csv("artifacts/lexicon.csv", sep=';')

    # A missing word is parsed as NaN; str(NaN) would otherwise register a
    # bogus "nan" entry in the lexicon, so such rows are dropped.
    nrc = nrc[nrc[language].notna()]

    words = nrc[language].astype(str).str.lower()
    flags = nrc[emotion_columns].eq(1).to_numpy()

    lexicon = {}
    for word, row_flags in zip(words, flags):
        # Several rows can share the same word (after lower-casing). Merge
        # their emotions instead of letting the last row overwrite the others.
        tagged = lexicon.setdefault(word, [])
        for emotion, is_set in zip(emotion_columns, row_flags):
            if is_set and emotion not in tagged:
                tagged.append(emotion)
    return lexicon
    
def tokenize(text):
    """Lower-case ``text`` and return its word tokens (runs of ``\\w`` characters) in order."""
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

def get_emotion_distribution(text, lexicon, emotion_list):
    """Return the share of emotion hits in ``text`` for each emotion in ``emotion_list``.

    Every token of ``text`` found in ``lexicon`` contributes one hit per emotion
    attached to it. Each returned value is that emotion's hits divided by the
    total number of hits; when nothing matched, every emotion maps to 0.
    """
    tally = defaultdict(int)  # missing emotions read as 0
    for token in tokenize(text):
        for label in lexicon.get(token, ()):
            tally[label] += 1

    hits = sum(tally.values())
    if not hits:
        return {label: 0 for label in emotion_list}

    return {label: tally[label] / hits for label in emotion_list}

def emotional_entropy(distribution):
    """Shannon entropy, in bits, of the values of ``distribution``.

    Only strictly positive values contribute; an empty or all-zero
    distribution yields 0.
    """
    bits = 0
    for prob in distribution.values():
        if not prob > 0:
            continue  # zero-probability terms contribute nothing
        bits -= prob * math.log2(prob)
    return bits

In [None]:
# Result record for a single question/answer pair, as built by
# HumanityAnalyzer.process. Field names match the keys of mapping_humanity.
class HumanityBatch(BaseModel):
    # Identifier of the conversation thread the QA pair belongs to.
    session_id: str
    # Entropy (bits) of the assistant answer's emotion distribution.
    humanity_assistant_emotional_entropy: float
    # Spearman correlation between the ground-truth and assistant emotion
    # vectors, rounded to 3 decimals by the analyzer; 0 when there is no
    # ground truth or either vector is constant.
    humanity_ground_truth_spearman: float
    # Per-emotion share of lexicon hits in the assistant answer.
    humanity_assistant_anger: float
    humanity_assistant_anticipation: float
    humanity_assistant_disgust: float
    humanity_assistant_fear: float
    humanity_assistant_joy: float
    humanity_assistant_sadness: float
    humanity_assistant_surprise: float
    humanity_assistant_trust: float
    # Identifiers copied from the batch (qa_id) and the thread (assistant_id).
    qa_id: str
    assistant_id: str

# Groups the HumanityBatch records of one session.
# Not referenced anywhere else in the visible part of this notebook.
class HumanityMetric(BaseModel):
    session_id: str
    # NOTE(review): mutable [] default — pydantic is expected to copy field
    # defaults per instance; confirm for the pydantic version in use.
    conv_thread: List[HumanityBatch] = []

In [None]:
def recreate_index(index_name: str, mapping: dict):
    """Drop ``index_name`` if it already exists, then create it with ``mapping``.

    Uses the module-level ``es`` client and prints a line for each action taken.
    """
    indices = es.indices
    already_there = indices.exists(index=index_name)
    if already_there:
        indices.delete(index=index_name)
        print(f"Index '{index_name}' deleted.")

    indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created.")

In [None]:
# Module-level Elasticsearch client; recreate_index() reads this global.
es = Elasticsearch(ELASTIC_URL, basic_auth=tuple(ELASTIC_AUTH))

In [None]:
class HumanityAnalyzer(FairForge):
    """Scores each assistant answer of a conversation with lexicon-based emotion metrics."""

    def process(self, thread: Conversation):
        """Append one ``HumanityBatch`` to ``self.metrics`` per QA pair in ``thread``.

        For every batch in ``thread.conversation`` this computes:
          * the emotion distribution of ``batch.assistant`` and its entropy;
          * the Spearman correlation between that distribution and the one of
            ``batch.ground_truth_assistant``. The value is 0 when no ground
            truth is available or when either vector is constant (correlation
            undefined).

        ``self.metrics`` is not defined in this notebook; it is presumably
        provided by ``FairForge`` — confirm.
        """
        # The lexicon only depends on the thread's language, so load it once
        # instead of re-reading and re-parsing the CSV for every QA pair.
        lexicon = load_emotion_lexicon(thread.preferred_language)

        for batch in thread.conversation:
            query = batch.question

            # Emotion distribution of the assistant's actual answer, as a dict
            # and as a vector in emotion_columns order.
            assistant_distribution = get_emotion_distribution(batch.assistant, lexicon, emotion_columns)
            generated_vec = [assistant_distribution[e] for e in emotion_columns]
            ent = emotional_entropy(assistant_distribution)

            if batch.ground_truth_assistant is None:
                # Nothing to compare against.
                spearman_val = 0
            else:
                ground_truth_assistant_distribution = get_emotion_distribution(
                    batch.ground_truth_assistant, lexicon, emotion_columns
                )
                expected_vec = [ground_truth_assistant_distribution[e] for e in emotion_columns]
                logging.info(f"Query: {query}")

                # Rank correlation is undefined when either vector has zero
                # variance (e.g. no lexicon hits at all), so fall back to 0.
                if np.std(generated_vec) == 0 or np.std(expected_vec) == 0:
                    logging.error("Spearman undefined due to constant vector.")
                    spearman_val = 0
                else:
                    spearman_val, _ = spearmanr(expected_vec, generated_vec)

            logging.info(f"Spearman value: {round(spearman_val, 3)}")

            # Use a distinct name for the result so the loop variable `batch`
            # is not rebound while its attributes are still being read.
            record = HumanityBatch(
                humanity_assistant_emotional_entropy=ent,
                humanity_ground_truth_spearman=round(spearman_val, 3),
                session_id=thread.session_id,
                qa_id=batch.qa_id,
                assistant_id=thread.assistant_id,
                **{f"humanity_assistant_{key.lower()}": assistant_distribution[key] for key in emotion_columns}
            )
            self.metrics.append(record)

In [None]:
# Instantiate the analyzer and run it. pipeline() is not defined in this
# notebook (presumably inherited from FairForge and expected to call process()
# for each conversation — confirm); its return value is kept in `metrics`.
humanity = HumanityAnalyzer()
metrics = humanity.pipeline()

In [None]:
# Elasticsearch mapping whose properties mirror the fields of HumanityBatch:
# identifiers are keywords, every metric is a float.
mapping_humanity = {
    "mappings": {
        "properties": {
            "session_id": {"type": "keyword"},
            **{
                metric_field: {"type": "float"}
                for metric_field in (
                    "humanity_assistant_emotional_entropy",
                    "humanity_ground_truth_spearman",
                    "humanity_assistant_anger",
                    "humanity_assistant_anticipation",
                    "humanity_assistant_disgust",
                    "humanity_assistant_fear",
                    "humanity_assistant_joy",
                    "humanity_assistant_sadness",
                    "humanity_assistant_surprise",
                    "humanity_assistant_trust",
                )
            },
            "assistant_id": {"type": "keyword"},
            "qa_id": {"type": "keyword"},
        }
    }
}