In [None]:
import os
from openai import OpenAI
from pydantic import BaseModel, Field, conlist
import numpy as np
from numpy.linalg import norm
import re
from rake_nltk import Rake
from pylatexenc.latex2text import LatexNodes2Text
import nltk
import json
from typing import Literal
from typing import Any

from inspect_ai.solver import (
    TaskState,
    solver,
)
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import (
    CORRECT,
    INCORRECT,
    Score,
    Target,
    accuracy,
    stderr,
    scorer,
)
from inspect_ai.solver import bridge
from inspect_ai import eval

import pandas as pd

#poor man's pass by ref
class CSVHolder:
    """Mutable wrapper around a single object.

    Code that receives the holder can rebind ``value`` and have the new
    object be visible to every other piece of code sharing the same holder.
    """

    def __init__(self, value: Any) -> None:
        # The wrapped object; callers read and reassign this attribute directly.
        self.value = value

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Vector store holding the files used for RAG. Change the ID to that of your own OpenAI vector store.
# See Create_Vector_Store_Example.ipynb in the repository to see how to create one.
vector_store = client.vector_stores.retrieve(
    vector_store_id="vs_67da9f09a6b48191a32189befe73c49e"
)

# File path to the (tab-delimited) csv file containing the evaluation dataset.
lit = pd.read_csv('../cmbagent_dataset/cmbagent_dataset.csv', delimiter="\t")

# Parallel lists: question[k] and ideal[k] both come from row k of the dataset.
question = [lit.loc[row, "question"] for row in range(lit.shape[0])]
ideal = [lit.loc[row, "ideal"] for row in range(lit.shape[0])]

def inspect_ai_eval(rag_agent, eval_agent, embedding_answers):
    """
    Run an inspect_ai evaluation of a RAG agent, scored with the Embed_AI scheme.

    One sample is built per (question, ideal) pair from the module-level
    ``question`` and ``ideal`` lists. Each generated answer is scored as CORRECT
    only if the embedding similarity is >= 0.8 AND the AI evaluation is "Same"
    or "Similar"; otherwise it is scored as INCORRECT.

    Besides its arguments, this function reads the module-level names
    ``question``, ``ideal``, ``vector_store``, ``custom_stopwords`` and
    ``english_words``.
    NOTE(review): ``custom_stopwords`` and ``english_words`` are not defined in
    the visible part of this file -- confirm they exist before calling.

    Args:
        rag_agent: function that implements the RAG for the generated answers.
            Called as ``rag_agent(question_text, vector_store, "gpt-4o-mini")``.
        eval_agent: async function that implements the AI evaluation agent for
            Embed_AI. Awaited as ``eval_agent(question, answer, ideal, eval_model)``;
            its result is compared against "Same" / "Similar".
        embedding_answers: async function that implements the cosine-similarity
            component of Embed_AI. Awaited as
            ``embedding_answers(answer, ideal, custom_stopwords, english_words)``;
            its result is compared against 0.8.
    Returns:
        None. As side effects, the per-sample results are written to
        ``output_CosmoPaperQA_OpenAI_eval.csv`` (rewritten after every scored
        sample) and the evaluation logs are printed.
    """
    # Single place for the results path, which is written from several spots below.
    output_csv = "output_CosmoPaperQA_OpenAI_eval.csv"

    # Setup mytasks for evaluation: one {"input", "target"} record per dataset row.
    mytasks = [{"input": q, "target": a} for q, a in zip(question, ideal)]

    # Setup output DataFrame. It lives in a holder so the scorer closure can
    # rebind it (pd.concat returns a new frame) and the change stays shared.
    new_output_holder = CSVHolder(pd.DataFrame({
        'question': pd.Series(dtype='object'),
        'answer': pd.Series(dtype='object'),
        'ideal': pd.Series(dtype='object'),
        'AI_eval': pd.Series(dtype='object'),
        'embedding_eval': pd.Series(dtype='float'),
        'evaluation': pd.Series(dtype='object')
    }))

    async def my_agent(task_input: list[dict[str, Any]]) -> str:
        # Replace rag_agent function if needed, to implement custom RAG agent.
        # Can put OpenAI RAG agent or PaperQA2 RAG agent here.
        # Only the content of the first message is sent to the RAG agent.
        return rag_agent(task_input[0]["content"], vector_store, "gpt-4o-mini")

    @solver
    def my_solver():
        # NOTE(review): `state` is indexed like a mapping and a plain dict is
        # returned; under bridge() this is presumably the sample dict rather
        # than a TaskState object -- confirm against the inspect_ai bridge docs.
        async def solve(state: TaskState) -> TaskState:
            result = await my_agent(state["input"])
            return {"output": result}
        return solve

    @scorer(metrics=[accuracy(), stderr()])
    def my_scorer(eval_model: str, custom_stopwords: set, english_words: set, new_output_holder):
        async def score(state: TaskState, target: Target) -> Score:
            this_question = state.input_text
            this_answer = state.output.completion
            this_ideal = target.text

            # Replace eval_agent function if needed, to implement custom evaluation agent.
            eval_ai = await eval_agent(this_question, this_answer, this_ideal, eval_model)
            embedding_eval = await embedding_answers(this_answer, this_ideal, custom_stopwords, english_words)

            # Embed_AI evaluation algorithm will consider a generated answer "correct"
            # if both the embedding_eval score is >=0.8 and the AI evaluation
            # returns "Same" or "Similar".
            is_correct = bool(embedding_eval >= 0.8 and eval_ai in ("Same", "Similar"))

            # Record the sample and rewrite the CSV immediately, so partial
            # results are on disk even if a later sample fails.
            new_entry = pd.DataFrame({
                "question": this_question,
                "answer": this_answer,
                "ideal": this_ideal,
                "AI_eval": eval_ai,
                "embedding_eval": embedding_eval,
                "evaluation": "CORRECT" if is_correct else "INCORRECT",
            }, index=[0])
            new_output_holder.value = pd.concat([new_output_holder.value, new_entry], ignore_index=True)
            new_output_holder.value.to_csv(output_csv, index=False)

            return Score(value=CORRECT if is_correct else INCORRECT)
        return score

    @task
    def my_task(tasks):
        return Task(
            dataset=[
                Sample(input=entry["input"], target=entry["target"])
                for entry in tasks
            ],
            solver=bridge(my_solver()),
            # Replace "gpt-4o-mini" with model you want to use for evaluation AI.
            scorer=my_scorer("gpt-4o-mini", custom_stopwords, english_words, new_output_holder),
        )

    # Write the header-only CSV up front so the output file exists before any sample is scored.
    new_output_holder.value.to_csv(output_csv, index=False)

    # `eval` here is inspect_ai's eval (imported at file level), not the Python builtin.
    logs = eval(
        my_task(mytasks)
    )
    print(logs)
    for log in logs:
        print(log.results)
    return None