# In [None]:
import asyncio
import json
import os
import re
import time
from typing import Literal

import nltk
import numpy as np
from numpy.linalg import norm
from openai import OpenAI
from pydantic import BaseModel, Field
from pylatexenc.latex2text import LatexNodes2Text
from rake_nltk import Rake

# Shared OpenAI client used by the evaluation agent and the embedding requests;
# the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Set-up for keyphrase extraction: fetch the NLTK corpora used below.
nltk.download('stopwords')
nltk.download('words')

# Vocabulary of the NLTK "words" corpus; used to spot single-word "names"
# that are not ordinary English words.
english_words = set(nltk.corpus.words.words())

# English stopwords handed to Rake, minus the logical comparatives listed here:
# those words are important for RAG analysis and must stay in the keyphrases.
custom_stopwords = set(nltk.corpus.stopwords.words('english')).difference({
    "no", "not", "than", "more", "same",
    "before", "after", "now", "then",
    "above", "below", "over", "under",
    "like", "other", "such", "few", "most", "some", "between",
})

# Structured-output schema for the evaluation agent's reply: eval_agent passes
# eval_format.model_json_schema() as the assistant's JSON response format.
# NOTE(review): documented with comments rather than a class docstring on purpose;
# pydantic presumably copies a model docstring into the generated schema, which
# would change what is sent to the model - confirm before adding one.
class eval_format(BaseModel):
    # Verdict comparing the generated answer with the ideal answer. The description
    # text is part of the schema, i.e. it is runtime data and not just documentation.
    Evaluation: Literal["Same", "Similar", "Different"] = Field(
    description=r"""If a point is conveyed in both answers, as responses to the associated question, output "Same".
    If a similar points is conveyed in both answers, as responses to the associated question, output "Similar".
    If all of the points are different in both answers, as responses to the associated question, output "Different".""")

#function for the evaluation agent
async def eval_agent(question, answer, ideal, eval_model) -> str:
    """
    Runs the OpenAI Evaluation AI

    A temporary assistant is created for the call, a run on a fresh thread is
    polled until it leaves the "queued"/"in_progress" states, and the assistant
    is deleted again before returning (also when an API call raises).

    Args:
        question: Question that the two answers are answering (included for context)
        answer: Generated answer to the question
        ideal: "Ideal" answer the generated answer is to be compared to.
        eval_model: OpenAI model to power the agent

    Returns:
        Evaluation in the form of "Same", "Similar" or "Different". If the run does
        not complete, or no evaluation can be parsed from the reply, returns "N/A"

    Raises:
        Whatever the OpenAI client raises for a failed API request.
    """

    # NOTE(review): this prompt mentions an "Unsure" output that eval_format does not
    # allow. The text is kept byte-identical because changing it would change
    # evaluation results; confirm which of the two is intended.
    eval_message="""
    You are an evaluation agent tasked with comparing the given two different answers to the same question. 
    Focus on the meaning of both answers, in the context of the question, when formulating your evaluation.
    If you are unsure about the above criteria for the answers to the associated question, output "Unsure".
    Ensure that differences between numerical values and results between the two answers are emphasised in your analysis, unless the question specifically allows for approximations/inexact numerical values. 
    Then, if the question specifically allows for approximations/inexact numerical values, only compare the numerical values approximately.
    """
    # Default result; only overwritten when a valid evaluation is parsed.
    evaluation = "N/A"

    eval_assistant = client.beta.assistants.create(
        name="eval_test",
        instructions=eval_message,
        model=eval_model,
        temperature=0.0,
        top_p=0.2,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "answer",
                "schema": eval_format.model_json_schema(),
            },
        },
    )

    try:
        thread = client.beta.threads.create(messages=[])

        # NOTE(review): question, answer and ideal are concatenated without any
        # separator or label. Kept as-is to preserve the evaluation behaviour.
        client.beta.threads.messages.create(
            thread_id=thread.id,
            content=question+answer+str(ideal),
            role='user',
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=eval_assistant.id,
            # pass the latest system message as instructions
            instructions=eval_message,
        )
        while run.status in ("queued", "in_progress"):
            # Yield to the event loop while waiting instead of blocking it.
            # (The OpenAI client calls themselves are still synchronous.)
            await asyncio.sleep(0.1)
            run = client.beta.threads.runs.retrieve(run.id, thread_id=thread.id)

        if run.status == "completed":
            response_messages = client.beta.threads.messages.list(thread.id, order="asc")
            for message in response_messages.data:
                # Only the assistant's reply can carry the evaluation; the user
                # message is our own input and must not be parsed as JSON.
                if message.role != "assistant":
                    continue
                for content in message.content:
                    text_part = getattr(content, "text", None)
                    if text_part is None:
                        # Not a text content part.
                        continue
                    output = text_part.value
                    if not output.startswith("{"):
                        continue
                    try:
                        found = json.loads(output).get("Evaluation")
                    except (json.JSONDecodeError, AttributeError):
                        found = None
                    if found is None:
                        print("Evaluation not found", end="\r", flush=True)
                    else:
                        evaluation = found
    finally:
        # Always remove the temporary assistant, even if a request failed.
        client.beta.assistants.delete(assistant_id=eval_assistant.id)
    return evaluation
    
def preprocess_text(text):
    """
    Preprocesses text for keyphrase extraction

    Rewrites the few constructs that the sentence tokenizer in Rake doesn't
    seem to handle well:
      * a decimal point between two digits becomes an underscore ("3.14" -> "3_14")
      * a comma between two digits is removed ("1,000,000" -> "1000000")
      * a backtick becomes the letter "l"
      * a hyphen becomes an underscore, unless it is directly followed by a
        digit (e.g. negative numbers) or is a trailing hyphen after whitespace

    Args:
        text: The text to preprocess.

    Returns:
        The preprocessed text.
    """
    # Lookarounds (rather than consuming the digits) so that every separator in
    # a number is handled, e.g. "1,000,000" or "1.2.3", not just the first one.
    text = re.sub(r'(?<=\d)\.(?=\d)', '_', text)
    text = re.sub(r'(?<=\d),(?=\d)', '', text)
    #cursive l in text is formatted strangely in ChatGPT output
    text = text.replace("`", "l")
    # Pattern explanation:
    # (?<!\s)-(?!\d) - matches hyphens not preceded by whitespace and not followed by a digit
    # | - OR
    # (?<=\s)-(?=\D) - matches hyphens preceded by whitespace and followed by a non-digit
    text = re.sub(r'(?<!\s)-(?!\d)|(?<=\s)-(?=\D)', '_', text)
    return text
    
#function for the embedding answers algorithm
async def embedding_answers(answer, ideal, custom_stopwords, english_words, *,
                            embedding_model="text-embedding-ada-002",
                            name_miss_score=0.7) -> float:
    """
    Novel part of AI evaluation algorithm.

    This algorithm extracts the keyphrases from the generated and "ideal" answers and
    then compares the cosine similarity of the vector embeddings between the keyphrases
    of the "ideal" answer and the generated answer. It gets the maximum cosine
    similarity for each keyphrase in the "ideal" answer and takes the mean of all of
    them. This mean is the returned "score". There is some additional preprocessing due
    to formatting and additional handling of "names" that may not have a meaningful
    vector embedding, but that is the main idea.

    Args:
        answer: Generated answer to the question
        ideal: "Ideal" answer the generated answer is to be compared to (LaTeX is
            converted to plain text first).
        custom_stopwords: Common words for the keyphrase extractor to automatically ignore.
        english_words: Collection of words in english, used to detect "names".
        embedding_model: OpenAI embedding model used for the keyphrase embeddings.
        name_miss_score: Score given to a "name" from the ideal answer that does not
            occur in the generated answer. 0.7 works well as "incorrect" cosine
            similarity for text-embedding-ada-002; for text-embedding-3-large,
            0.3 works better.

    Returns:
        A mean score between 0 and 1 (in practice, between ~0.7 and 1 for
        text-embedding-ada-002). Generated answer considered "correct" if
        mean score >=0.8 (0.4 works better for text-embedding-3-large).
        NaN if the ideal answer yields no keyphrases at all.
    """
    def embed(phrase):
        # One embedding request for a single keyphrase, returned as a numpy vector.
        resp = client.embeddings.create(
            input=phrase,
            model=embedding_model,
            encoding_format="float",
        )
        return np.array(resp.data[0].embedding)

    #tell Rake to leave logical comparatives alone
    r = Rake(stopwords=custom_stopwords)
    #Extraction given the text.
    text1 = preprocess_text(answer)
    #ideal is formatted using latex for CosmoPaperQA
    ideal = LatexNodes2Text().latex_to_text(str(ideal))
    text2 = preprocess_text(ideal)
    r.extract_keywords_from_text(text1)
    key_phrases1 = r.get_ranked_phrases()
    r.extract_keywords_from_text(text2)
    key_phrases2 = r.get_ranked_phrases()

    if not key_phrases2:
        # No ideal keyphrases: the mean is undefined. Return NaN explicitly
        # (np.mean of an empty array gives NaN too, but with a RuntimeWarning).
        return float("nan")

    answer_phrases = set(key_phrases1)
    # Underscores are stripped on both sides of the "name" comparison, so that
    # e.g. "H_0" matches whether or not either text carries the underscore.
    answer_text_for_names = text1.lower().replace("_", "")
    # Embeddings of the answer keyphrases; computed at most once, and only if
    # some ideal keyphrase actually needs an embedding comparison.
    answer_vectors = None

    result_1 = []
    for string_ideal in key_phrases2:
        #check for "names" that need to be matched exactly
        #checks that string_ideal is one word with at least one letter and that is not in english
        is_name = (
            " " not in string_ideal
            and any(char.isalpha() for char in string_ideal)
            and string_ideal not in english_words
        )
        if is_name:
            if string_ideal.replace("_", "") in answer_text_for_names:
                #we have a match!
                result_1.append(1)
            else:
                #if not, no match, therefore "incorrect"
                result_1.append(name_miss_score)
            continue

        if string_ideal in answer_phrases:
            # Identical keyphrase in the answer: similarity is exactly 1.
            result_1.append(1)
            continue

        if answer_vectors is None:
            answer_vectors = [embed(phrase) for phrase in dict.fromkeys(key_phrases1)]
        if not answer_vectors:
            # The answer has no keyphrases to compare against.
            result_1.append(0)
            continue

        a = embed(string_ideal)
        max_cos = 0
        for b in answer_vectors:
            cos = np.dot(a, b)/(norm(a)*norm(b))
            if cos > max_cos:
                max_cos = cos
        result_1.append(max_cos)
    #mean is a crude way to combine these scores.
    #will consider "correct" if mean >=0.8, otherwise "incorrect" (also a crude metric)
    #0.8 value is designed for text-embedding-ada-002 model. If using text-embedding-3-large model, 0.4 works better
    return float(np.mean(np.array(result_1)))

"""
Demonstration of how eval_agent and embedding_answers are used to perform the Embed_AI performance evaluation algorithm

eval_ai = await eval_agent(this_question, this_answer, this_ideal, eval_model)
embedding_eval = await embedding_answers(this_answer, this_ideal, custom_stopwords, english_words)

if (embedding_eval >= 0.8 and (eval_ai in ["Same", "Similar"])):
    # The Embed_AI evaluation algorithm considers a generated answer "correct" only if
    # both the embedding_eval score is >=0.8 and the AI evaluation returns "Same" or "Similar".
    ...
"""