In [3]:
import json
import os
import re
import sys
import time

import nltk
import numpy as np
from IPython.display import clear_output
from matplotlib import pyplot as plt
from numpy.linalg import norm
from openai import OpenAI
from pydantic import BaseModel, Field, conlist
from pylatexenc.latex2text import LatexNodes2Text
from rake_nltk import Rake
from scipy.interpolate import UnivariateSpline

# Shared OpenAI client used by the embedding/assistant helpers in this notebook.
# The key is read from the OPENAI_API_KEY environment variable; indexing
# os.environ raises KeyError if the variable is not set.
client = OpenAI(api_key = os.environ["OPENAI_API_KEY"])

# Structured-output schema for the question-generation assistant: its JSON
# schema is passed as `response_format` in embedding_questions().
# `conlist(..., min_length=5, max_length=5)` pins the list to exactly five items.
# NOTE(review): documented with comments rather than a class docstring on purpose;
# a docstring would presumably be emitted into model_json_schema() - confirm
# before adding one.
class gen_format(BaseModel):
    Question: conlist(str, min_length=5, max_length=5) = Field(description="A list of five questions that could be answered by the given answer")
    
# Structured-output schema for the retrieval assistant: its JSON schema is
# passed as `response_format` in embedding_search(), which then reads the
# "Ranked_Relevant_Information" key from the assistant's JSON reply.
# NOTE(review): documented with comments rather than a class docstring on purpose;
# a docstring would presumably be emitted into model_json_schema() - confirm
# before adding one.
class rag_format(BaseModel):
    Ranked_Relevant_Information: list[str] = Field(description="The ranked list of information relevant to the query, with no paraphrasing or changing of the text from the files.")

def preprocess_text(text):
    """Normalise punctuation that the RAKE sentence tokenizer handles badly.

    Transformations, applied in order:
      1. A decimal point between two digits becomes an underscore ("3.14" -> "3_14").
      2. A comma between two digits is removed ("2,500" -> "2500").
      3. A backtick becomes the letter "l" (cursive l is rendered as a backtick
         in cmbagent output).
      4. Hyphens become underscores, except where they look like a minus sign
         or sit between a non-space character and a digit.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Lookarounds (rather than consuming the surrounding digits) so that every
    # separator in a run is handled: "1,234,567" -> "1234567", "1.2.3" -> "1_2_3".
    text = re.sub(r'(?<=\d)\.(?=\d)', '_', text)
    text = re.sub(r'(?<=\d),(?=\d)', '', text)
    # cursive l in text is formatted strangely in cmbagent output
    text = text.replace("`", "l")
    # Pattern explanation:
    # (?<!\s)-(?!\d) - hyphen NOT preceded by whitespace AND NOT followed by a digit
    # | - OR
    # (?<=\s)-(?=\D) - hyphen preceded by whitespace AND followed by a non-digit
    # A hyphen directly followed by a digit (e.g. "-5", "3-5") is left untouched.
    text = re.sub(r'(?<!\s)-(?!\d)|(?<=\s)-(?=\D)', '_', text)
    return text
    
def embedding_answers(answer, ideal, custom_stopwords, english_words) -> np.ndarray:
    """Score how well the key phrases of ``ideal`` are covered by ``answer``.

    Both texts are cleaned with ``preprocess_text`` and split into key phrases
    with RAKE. Each key phrase of the ideal answer receives one score:

    * "Name"-like phrases (a single token containing a letter that is not in
      ``english_words``) must appear verbatim in the cleaned answer after
      underscores are stripped: 1 on a match, 0.7 otherwise.
    * Any other phrase scores 1 if it is also a key phrase of the answer;
      otherwise it scores the highest cosine similarity between its embedding
      and the embedding of any answer key phrase (0 if the answer has none).

    Args:
        answer: Generated answer text.
        ideal: Reference answer; LaTeX markup is converted to plain text first.
        custom_stopwords: Stopwords handed to RAKE.
        english_words: Container of known English words used for name detection.

    Returns:
        1-D ``numpy.ndarray`` with one score per key phrase of ``ideal``.
    """
    embedding_model = "text-embedding-ada-002"
    # Score given to a name that is missing from the answer ("incorrect").
    name_mismatch_score = 0.7

    def _embed(phrase):
        # One embeddings call; returns the vector as a numpy array.
        response = client.embeddings.create(
            input=phrase,
            model=embedding_model,
            encoding_format="float",
        )
        return np.array(response.data[0].embedding)

    # tell Rake to leave logical comparatives alone
    rake = Rake(stopwords=custom_stopwords)
    answer_text = preprocess_text(answer)
    # ideal is formatted using latex for CMBagent
    ideal_text = preprocess_text(LatexNodes2Text().latex_to_text(ideal))
    rake.extract_keywords_from_text(answer_text)
    answer_phrases = rake.get_ranked_phrases()
    rake.extract_keywords_from_text(ideal_text)
    ideal_phrases = rake.get_ranked_phrases()
    print(answer_phrases)
    print(ideal_phrases)

    answer_lower = answer_text.lower()
    answer_phrase_set = set(answer_phrases)
    # Embeddings of the answer phrases, computed once and only if some ideal
    # phrase actually needs a similarity comparison (previously every answer
    # phrase was re-embedded for every ideal phrase).
    answer_vectors = None

    scores = []
    for ideal_phrase in ideal_phrases:
        looks_like_name = (
            " " not in ideal_phrase
            and any(char.isalpha() for char in ideal_phrase)
            and ideal_phrase not in english_words
        )
        if looks_like_name:
            # sort out odd formatting issues surrounding underscores in "names"
            bare_name = ideal_phrase.replace("_", "")
            scores.append(1 if bare_name in answer_lower else name_mismatch_score)
            continue

        if ideal_phrase in answer_phrase_set:
            # Identical key phrase in both texts: perfect score, no API call.
            scores.append(1)
            continue

        if answer_vectors is None:
            answer_vectors = {
                phrase: _embed(phrase) for phrase in dict.fromkeys(answer_phrases)
            }
        best_cos = 0
        if answer_vectors:
            ideal_vector = _embed(ideal_phrase)
            for answer_vector in answer_vectors.values():
                cos = np.dot(ideal_vector, answer_vector) / (norm(ideal_vector) * norm(answer_vector))
                if cos > best_cos:
                    best_cos = cos
        scores.append(best_cos)
    return np.array(scores)

def embedding_questions(question, answer, question_model) -> list:
    """Compare ``question`` with questions an assistant generates from ``answer``.

    A temporary assistant is asked to produce a list of questions that the
    given answer could be answering (schema: ``gen_format``). Each generated
    question is embedded and compared with the embedding of the original
    question.

    Args:
        question: The original question text.
        answer: The answer text sent to the assistant as the user message.
        question_model: Model name used for the question-generating assistant.

    Returns:
        List of cosine similarities, one per generated question.

    Raises:
        RuntimeError: If the assistant run ends in a state other than "completed".
        ValueError: If no JSON reply containing the questions is found.
    """
    def _embed(text):
        # One embeddings call; returns the vector as a numpy array.
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002",
            encoding_format="float",
        )
        return np.array(response.data[0].embedding)

    question_vector = _embed(question)
    gen_message="""
    You are a question generating agent. Your task is to generate a list of questions for the given answer.
    """
    gen_assistant = client.beta.assistants.create(
        name="gen_test",
        instructions=gen_message,
        model=question_model,
        temperature=0.0,
        top_p=0.2,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "answer",
                "schema": gen_format.model_json_schema()
            },
        }
    )
    # try/finally so the temporary assistant is removed even if the run fails.
    try:
        thread = client.beta.threads.create(messages=[])
        client.beta.threads.messages.create(
            thread_id=thread.id,
            content=answer,
            role='user',
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=gen_assistant.id,
            # pass the latest system message as instructions
            instructions=gen_message,
        )
        # Poll until completion. Sleep between polls instead of spinning, and
        # stop on states that can never reach "completed" (previously an
        # infinite loop).
        while run.status != "completed":
            if run.status in ("failed", "cancelled", "expired", "incomplete", "requires_action"):
                raise RuntimeError(f"Question-generation run ended with status '{run.status}'")
            time.sleep(0.5)
            run = client.beta.threads.runs.retrieve(run.id, thread_id=thread.id)
        response_messages = client.beta.threads.messages.list(thread.id, order="asc")
    finally:
        client.beta.assistants.delete(assistant_id=gen_assistant.id)

    generated = None
    for message in response_messages.data:
        for content in message.content:
            output = content.text.value
            # Only the assistant's structured reply is JSON; later matches win.
            if output.startswith("{"):
                generated = json.loads(output)["Question"]
    if generated is None:
        raise ValueError("No JSON reply with generated questions was found in the thread")

    result = []
    for gen_question in generated:
        gen_vector = _embed(gen_question)
        result.append(np.dot(gen_vector, question_vector) / (norm(gen_vector) * norm(question_vector)))
    return result
    
def embedding_search(vector_store, question, answer, search_model) -> list:
    """Retrieve passages for ``question`` by file search and score them against ``answer``.

    A temporary file-search assistant bound to ``vector_store`` is asked to
    quote and rank the passages relevant to ``question`` (schema:
    ``rag_format``). Each returned passage is embedded and compared with the
    embedding of ``answer``. The ranked scores are then truncated at the first
    large jump between consecutive scores.

    Args:
        vector_store: Object whose ``id`` identifies the vector store to search.
        question: Query sent to the assistant as the user message.
        answer: Text whose embedding the retrieved passages are compared with.
        search_model: Model name used for the retrieval assistant.

    Returns:
        List of cosine similarities for the leading passages that were kept.

    Raises:
        RuntimeError: If the assistant run ends in a state other than "completed".
    """
    rag_message="""You are a retrieval agent tasked with performing file searches to find information for the purpose of providing answers.
        Find pieces of information that will be directly relevant for answering the query and rank these pieces of information from most relevant to least relevant.
        You must quote the passages from the files directly. Do not paraphrase or change the text in any way.
        Do not add anything else to the passage quotations, including sources and filenames.
        If no information is relevant, you must return a single piece of information, where you state "No information found".
        Ideally, these pieces of information will be sentences, phrases, data points or sets of data points, but you have limited flexiblility to include other pieces of information if you think they are appropriate.
        
        You must use tool call (i.e., file search).
        
        You know about the content of the code-base.
        """
    rag_assistant = client.beta.assistants.create(
        name="rag_test",
        instructions=rag_message,
        tools=[
            {"type": "file_search",
                "file_search":{
                    'max_num_results': 10,
                    "ranking_options": {
                        "ranker": "auto",
                        "score_threshold": 0.6
                    }
                }
            }
        ],
        tool_resources={"file_search": {"vector_store_ids":[vector_store.id]}},
        model=search_model,
        temperature=0,
        top_p=0.2,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "answer",
                "schema": rag_format.model_json_schema()
            },
        }
    )
    # try/finally so the temporary assistant is always removed; it used to be
    # left behind after every call (embedding_questions deletes its assistant).
    try:
        thread = client.beta.threads.create(messages=[])
        client.beta.threads.messages.create(
            thread_id=thread.id,
            content=question,
            role='user',
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=rag_assistant.id,
            # pass the latest system message as instructions
            instructions=rag_message,
        )
        # Poll until completion. Sleep between polls instead of spinning, and
        # stop on states that can never reach "completed" (previously an
        # infinite loop).
        while run.status != "completed":
            if run.status in ("failed", "cancelled", "expired", "incomplete", "requires_action"):
                raise RuntimeError(f"Retrieval run ended with status '{run.status}'")
            time.sleep(0.5)
            run = client.beta.threads.runs.retrieve(run.id, thread_id=thread.id)
        response_messages = client.beta.threads.messages.list(thread.id, order="asc")

        information = None
        for message in response_messages.data:
            for content in message.content:
                output = content.text.value
                # Only the assistant's structured reply is JSON; later matches win.
                if output.startswith("{"):
                    data = json.loads(output)
                    # The model sometimes spells the key with spaces instead of underscores.
                    if "Ranked_Relevant_Information" in data:
                        information = data["Ranked_Relevant_Information"]
                    else:
                        information = data["Ranked Relevant Information"]

        if information is None:
            information = ["No information."]
        # else:
        #     # uncomment for hallucination guarding
        #     run_steps = client.beta.threads.runs.steps.list(
        #         thread_id=thread.id,
        #         run_id=run.id
        #     )
        #     j=0
        #     for step in run_steps.data:
        #         #wait until the runs.steps.list has finished
        #         while step.status!="completed":
        #             run_steps = client.beta.threads.runs.steps.list(
        #                 thread_id=thread.id,
        #                 run_id=run.id
        #             )
        #         if (j!=0):
        #             retrieved_step = client.beta.threads.runs.steps.retrieve(
        #                 thread_id=step.thread_id,
        #                 run_id=run.id,
        #                 step_id=step.id,
        #                 include=["step_details.tool_calls[*].file_search.results[*].content"]
        #             )
        #             #check for hallucinations and flag all "offending text" from the passed information list
        #             information=hallucination_check(retrieved_step, information, question, 20)
        #         j+=1
    finally:
        client.beta.assistants.delete(assistant_id=rag_assistant.id)

    def _embed(text):
        # One embeddings call; returns the vector as a numpy array.
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002",
            encoding_format="float",
        )
        return np.array(response.data[0].embedding)

    answer_vector = _embed(answer)
    results = []
    for passage in information:
        passage_vector = _embed(passage)
        results.append(np.dot(passage_vector, answer_vector) / (norm(passage_vector) * norm(answer_vector)))

    # Keep the leading run of scores, stopping at the first score whose gap to
    # the previously kept score is at least twice the gap between the first two
    # kept scores. Nothing after that point is kept.
    # NOTE(review): when the first two scores are identical the reference gap
    # is 0, so the second score already fails the test and only the first score
    # is returned - confirm this is intended.
    mean_score = []
    ref_diff = None
    for score in results:
        if mean_score:
            my_diff = np.absolute(score - mean_score[-1])
            if len(mean_score) == 1:
                ref_diff = my_diff
            if my_diff >= 2 * ref_diff:
                break
        mean_score.append(score)
    return mean_score
def hallucination_check(retrieved_step, information, question, char_match):
    """Interactively verify that quoted passages really occur in the retrieved chunks.

    The text of every file-search result in ``retrieved_step`` is concatenated,
    stripped of reference markers, and reduced to its alphanumeric characters.
    Each entry of ``information`` is split on "..." and every fragment is
    reduced the same way; a fragment that is not a substring of the file text
    is shown to the user, who may accept it ('y'), reject it ('n', replaced by
    "Null information.") or type a replacement.

    Args:
        retrieved_step: Run step exposing
            ``step_details.tool_calls[0].file_search.results[*].content[0].text``.
        information: List of quoted passages; updated in place and returned.
        question: The question being answered (only displayed to the user).
        char_match: Number of leading/trailing alphanumeric characters used to
            locate the nearest file text to show alongside a suspect fragment.

    Returns:
        The ``information`` list with user corrections applied.
    """
    raw_information = "".join(
        result.content[0].text
        for result in retrieved_step.step_details.tool_calls[0].file_search.results
    )
    #regex expression to remove all references from the text before "cleaning".
    raw_information = re.sub(r"\(\d+\)|\[\d+\]|\(.*Fig.*\)|\(.*Table.*\)", "", raw_information)
    file_information = ''.join(ch for ch in raw_information if ch.isalnum())
    for i in range(len(information)):
        fragments = information[i].split("...")
        for position, fragment in enumerate(fragments):
            filtered_information = ''.join(ch for ch in fragment if ch.isalnum())
            if filtered_information in file_information:
                continue
            clear_output(wait=True)
            print("AI is answering this question:\n"+question)
            print("AI understood:\n"+fragment)
            length = len(filtered_information)
            # Per-fragment match length: a short fragment must not shrink the
            # window used for every later fragment.
            match_len = min(char_match, length)
            index = file_information.find(filtered_information[:match_len])
            if index != -1:
                print("The file contained:\n"+file_information[index:index+length])
            else:
                index = file_information.find(filtered_information[length-match_len:])
                if index != -1:
                    # Clamp so a match near the start of the file cannot turn
                    # into a negative (wrap-around) slice start.
                    start = max(0, index-length+match_len)
                    print("The file contained:\n"+file_information[start:index+match_len])
                else:
                    print("The file contained:\n"+raw_information)
            print("Waiting for user input...")
            print("Correct the potential hallucination. If the AI is correct, type 'y'. If information given by the AI is not in the file segment, type 'n'.", end='', flush=True)
            user = input()
            print("User entered:", user)
            if user != 'y':
                print("The file contained:\n"+raw_information)
                print("Correct the potential hallucination. If the AI is correct, type 'y'. If no information from these chunks is relevant, type 'n'. Otherwise, input what the information should be.", end='', flush=True)
                user = input()
                if user != 'y':
                    # Write the correction back into the fragment list; assigning
                    # to the loop variable alone discarded it.
                    fragments[position] = "Null information." if user == 'n' else user
        information[i] = "...".join(fragments)
    return information

In [7]:
import pandas as pd

# Make sure the NLTK corpora used below are available locally.
nltk.download('stopwords')
nltk.download('words')
english_words = set(nltk.corpus.words.words())

# Keep logical comparatives - important for RAG analysis - by removing them
# from the English stopword list before it is handed to RAKE.
custom_stopwords = set(nltk.corpus.stopwords.words('english')).difference({
    "no", "not", "than", "more", "same", "before", "after", "now", "then",
    "above", "below", "over", "under", "like", "other", "such", "few", "most",
    "some", "between",
})

assistant_data = "/home/adrian/Documents/University Work/Part III Project/cmbagent_dataset/Source_Papers"
lit = pd.read_csv('../cmbagent_dataset/cmbagent_dataset.csv')

# One question and one reference ("ideal") answer per dataset row, in row order.
question = [lit.loc[row, "question"] for row in range(lit.shape[0])]
ideal = [lit.loc[row, "ideal"] for row in range(lit.shape[0])]
# Extract the agent's answers from output_cmbagent.txt. The reference ("ideal")
# answers come from the dataset CSV instead, so the ideal-related lines below
# are left commented out.
# NOTE(review): presumably each record in the file looks like
# "<token> <token> <answer text>\t<ideal text>\n" - confirm against the file.
with open("output_cmbagent.txt", 'r', encoding='utf-8') as file:
    file_content = file.read()
# Character-level state machine:
#   spaces - spaces seen so far in the current record (capped at 2); the answer
#            text starts right after the second one
#   tab    - number of tabs seen in the current record; 0 while still in the
#            answer part
#   i      - index of the record currently being filled in `answer`
spaces=0
tab=0
i=0
answer=[]
#ideal=[]
for char in file_content:
    if (spaces==2 and char != "\t" and tab==0):
        # Inside the answer part: collect every character (newlines included).
        answer[i]+=char
    elif (char =="\t"):
        # A tab ends the answer part of the record.
        # NOTE(review): a second tab before the newline makes tab==2, after
        # which neither tab==1 branch fires and tab is never reset.
        spaces=0
        #ideal.append("")
        tab+=1
    elif (char != "\n" and tab==1):
        # Text after the tab is skipped.
        #ideal[i]+=char
        pass
    elif (char == "\n" and tab==1):
        # End of record: move on to the next answer slot.
        i+=1
        tab=0
    if (char == " " and spaces<2 and tab==0):
        spaces+=1
        if (spaces==2):
            # Second space reached: open a new, empty answer to be filled.
            answer.append("")
# Drop whitespace-separated tokens containing ".pdf" or the dagger marker and
# re-join the rest with single spaces (this also collapses runs of whitespace).
for i in range(len(answer)):
    answer[i]=" ".join(word for word in answer[i].split() if (".pdf" not in word) and ("†" not in word))

[nltk_data] Downloading package stopwords to /home/adrian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/adrian/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [29]:
# Score every generated answer against its reference answer. A percentage
# progress marker is printed with a trailing carriage return, followed by a
# per-item report: answer, ideal, per-phrase scores and their mean.
answer_store = []
print("Embedding Answers")
for i, generated_answer in enumerate(answer):
    print(i / len(answer) * 100, end="\r")
    phrase_scores = embedding_answers(generated_answer, ideal[i], custom_stopwords, english_words)
    answer_store.append(phrase_scores)
    score_summary = str(phrase_scores) + " " + str(np.mean(phrase_scores))
    print("\n".join([generated_answer, ideal[i], score_summary]) + "\n")

Embedding Answers
['including act data enhances', 'reconstructed recombination histories', 'no significant departures', 'method examines deviations', 'standard recombination history', 'standard recombination history', 'planck 2018 analysis', 'standard scenario', 'standard prediction', 'principal_component analysis', 'z ),', 'semi_blind eigen_analysis', 'results indicate', 'high redshifts', 'free_electron fraction', 'free_electron fraction', 'xe', 'using', 'tested', 'particularly', 'form', 'constraints', 'consistent', 'additionally']
['semi_blind eigen_analysis', 'principal_component analysis', 'often referred']
The standard recombination history is tested in the Planck 2018 analysis by using a semi-blind eigen-analysis, which is a form of principal-component analysis. This method examines deviations of the free-electron fraction, xe(z), from the standard recombination history. The results indicate that there are no significant departures from the standard prediction, and the reconstruc