In [1]:
import warnings
warnings.filterwarnings('ignore')

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import numpy as np
import pandas as pd

from typing import Any, Dict, List, Optional, Callable

from langchain.docstore.document import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

import os
import sys
from pathlib import Path

# Make the project packages importable from the notebook's working directory.
# sys.path entries must be plain strings: other objects (such as a Path) are
# ignored by the import system, so the current directory is converted to str.
sys.path.append("../eval_backend/")
sys.path.append("../")
sys.path.append(str(Path.cwd()))

import json
# Read the stored evaluation data (UTF-8 encoded JSON) into gt_dataset.
gt_dataset = json.loads(Path("../resources/eval_data.json").read_text(encoding="utf-8"))

# Load and split data into chunks

In [3]:
from backend.utils import load_and_chunk_doc, get_qa_llm, get_retriever
from backend.commons.configurations import Hyperparameters
import glob

# Raw hyperparameter settings for a single run; converted into a
# Hyperparameters object further down in this cell.
hp_dict = {
    "id": 0,
    # chunking
    "chunk_size": 512,
    "chunk_overlap": 10,
    # retrieval
    "num_retrieved_docs": 3,
    "use_llm_grader": False,
    "search_type": "mmr",
    "length_function_name": "text-embedding-ada-002",
    # grading prompts
    "grade_answer_prompt": "few_shot",
    "grade_docs_prompt": "default",
    # model names
    "embedding_model": "text-embedding-ada-002",
    "retrieval_llm": "gpt-3.5-turbo",
    "grader_llm": "gpt-3.5-turbo",
}

# Build the typed hyperparameter object from the raw dict above.
hp = Hyperparameters.from_dict(hp_dict)


# Chat model (temperature 0) and embedding model used in this notebook.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
emb = OpenAIEmbeddings()

# glob.glob returns matches in arbitrary order, so sort the paths first;
# otherwise index 1 could select a different PDF from one run to the next.
pdf_paths = sorted(glob.glob("../resources/document_store/*.pdf"))
chunks = load_and_chunk_doc(hp, pdf_paths[1])
retriever = get_retriever(chunks, emb, 3)
qa_llm = get_qa_llm(retriever, llm)

# Tokenise every chunk once and reuse the counts for both the total and the
# average (previously each chunk was tokenised twice).
token_counts = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]

print(
    f"number of tokens in document: {sum(token_counts)}"
    f"\nnumber of chunks: {len(chunks)}"
    f"\naverage number of tokens per chunk: {np.average(token_counts)}"
)

number of tokens in document: 16634
number of chunks: 37
average number of tokens per chunk: 449.56756756756755


In [111]:
# Ad-hoc look at ../x.csv; index_col=False tells pandas not to use the first
# column as the index. The frame is displayed as the cell output.
x = pd.read_csv("../x.csv", index_col=False)
x

Unnamed: 0.1,Unnamed: 0,predicted_answer,source_documents,hp_id,source,qa_id,predicted_answer.1,source_documents.1,hp_id.1,source.1,qa_id.1
0,0,The Supervisory Board met five times in the 20...,Retrieved document 0: ACTIVITIES OF THE SUPERV...,0.0,,,The Supervisory Board met five times in the 20...,Retrieved document 0: ACTIVITIES OF THE SUPERV...,1.0,,
1,1,The Supervisory Board received detailed report...,Retrieved document 0: The Supervisory Board re...,0.0,,,The Supervisory Board received detailed report...,Retrieved document 0: The Supervisory Board re...,1.0,,
2,0,,,,./resources/document_store\msg_life-gb-2021-EN...,7610ff41-1e9d-4bca-8ac5-557dcf3ca98c,,,,./resources/document_store\msg_life-gb-2021-EN...,7610ff41-1e9d-4bca-8ac5-557dcf3ca98c
3,1,,,,./resources/document_store\msg_life-gb-2021-EN...,e2a435ce-2332-4077-82fb-9489cd9b7979,,,,./resources/document_store\msg_life-gb-2021-EN...,e2a435ce-2332-4077-82fb-9489cd9b7979


In [122]:
# Load the stored results table from ../resources/hp_runs_data.csv.
df = pd.read_csv("../resources/hp_runs_data.csv")

In [124]:
df

Unnamed: 0,hp_id,predicted_answer,retrieved_docs,qa_id,source
0,0,The Supervisory Board met five times in the 20...,Retrieved document 0: 1 2 0 2 d r a o B y r o ...,7610ff41-1e9d-4bca-8ac5-557dcf3ca98c,./resources/document_store\msg_life-gb-2021-EN...
1,0,The Supervisory Board received detailed report...,Retrieved document 0: All significant matters ...,e2a435ce-2332-4077-82fb-9489cd9b7979,./resources/document_store\msg_life-gb-2021-EN...
2,1,The Supervisory Board met five times in the 20...,Retrieved document 0: ACTIVITIES OF THE SUPERV...,7610ff41-1e9d-4bca-8ac5-557dcf3ca98c,./resources/document_store\msg_life-gb-2021-EN...
3,1,The Supervisory Board received detailed report...,Retrieved document 0: The Supervisory Board re...,e2a435ce-2332-4077-82fb-9489cd9b7979,./resources/document_store\msg_life-gb-2021-EN...


In [88]:
df

Unnamed: 0,predicted_answer,retrieved_docs,qa_id
0,The Supervisory Board met five times in the 20...,Retrieved document 0: 1 2 0 2 d r a o B y r o ...,7610ff41-1e9d-4bca-8ac5-557dcf3ca98c
1,The Supervisory Board received detailed report...,Retrieved document 0: All significant matters ...,e2a435ce-2332-4077-82fb-9489cd9b7979
2,The text does not provide information about th...,Retrieved document 0: To prepare for their dec...,fa3300d1-88dd-4057-a1fb-57d1d445a3de
3,The context does not provide information about...,Retrieved document 0: The auditor audited the ...,563e90a7-d20d-4b39-adde-d3445e32cdfe
4,The control agreement signed by msg life ag an...,Retrieved document 0: As of the reporting date...,96251a76-5044-4b42-afb5-52fa1ecf8248
...,...,...,...
61,The independent auditor's report stated that t...,Retrieved document 0: We have audited the cons...,7d297161-4487-46a3-ad64-6c5008248662
62,The legal representatives are responsible for ...,"Retrieved document 0: Furthermore, the legal r...",bb8823b0-7c0f-459f-9970-52c41d74d077
63,The legal representatives are responsible for ...,Retrieved document 0: — otherwise appears to b...,803cca11-b041-43f1-a30b-59214f642225
64,The responsibilities of the auditor are to obt...,Retrieved document 0: Responsibility of the au...,39e401b8-95fd-4081-bb62-29a755e85aaf


In [82]:
from backend.commons.chroma import ChromaClient
from datetime import datetime

with ChromaClient() as client:
    # strftime's %f yields microseconds; dropping the last three digits
    # leaves a millisecond-precision timestamp.
    created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
    collection = client.create_collection(name="mycol", metadata={"timestamp": created_at})

    

In [83]:
collection.dict()

{'name': 'mycol',
 'id': UUID('b619c529-8277-4a91-a577-f0298e002dc5'),
 'metadata': {'timestamp': '2023-10-06 18:42:21,831'}}

# Generate Q/A pairs

In [None]:
# generate a Q/A pair for a document chunk
# result = generate_eval_set(llm, chunks)
# from eval_backend.utils import write_json
# write_json(gt_dataset, filename="./eval_set.json")

In [None]:
from langchain.chains import QAGenerationChain
from commons import QA_PROMPT_SELECTOR

# List handed to get_qa_from_chunk; presumably filled in place with the
# generated question/answer pairs — TODO confirm.
eval_set = []
# Q/A generation chain using the prompt that QA_PROMPT_SELECTOR returns for hp.retrieval_llm.
qa_chain = QAGenerationChain.from_llm(hp.retrieval_llm, prompt=QA_PROMPT_SELECTOR.get_prompt(hp.retrieval_llm))

# NOTE(review): get_qa_from_chunk is neither imported nor defined in the cells
# shown here — confirm where it comes from before running on a fresh kernel.
await get_qa_from_chunk(chunks[7], qa_chain, eval_set)

In [None]:
eval_set

In [None]:
# Same chunk (index 7) as the previous experiment, but the chain is built
# without an explicit prompt argument; results are collected in a separate list.
eval_set1 = []
qa_chain = QAGenerationChain.from_llm(hp.retrieval_llm)

# NOTE(review): get_qa_from_chunk is neither imported nor defined in the cells
# shown here — confirm where it comes from before running on a fresh kernel.
await get_qa_from_chunk(chunks[7], qa_chain, eval_set1)

In [None]:
pprint(eval_set1)

In [None]:
def pprint(eval):
    """Print each question/answer pair followed by a divider line.

    Args:
        eval: Iterable of mappings that each provide a "question" and an
            "answer" entry.
    """
    divider = "-" * 50
    for entry in eval:
        # Fields are looked up and printed one at a time, question first.
        for field in ("question", "answer"):
            print(entry[field])
        print(divider)

In [None]:
# List every ground-truth pair whose answer is longer than 200 characters.
for qa in gt_dataset:
    if len(qa['answer']) <= 200:
        continue
    print(qa["question"], f"-> {qa['answer']}", "-"*50, sep="\n")

# Set up vectorstore and retriever

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from eval_backend.utils import get_retriever

# Rebuild the chat model and a retriever that returns 3 documents per query.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
# NOTE(review): neither split_data nor data is defined or imported in the cells
# shown here — confirm their origin before running on a fresh kernel.
chunks_vs = split_data(data=data, chunk_size=512)
retriever = get_retriever(chunks_vs, OpenAIEmbeddings(model="text-embedding-ada-002"), 3)

# LLM chain for query answering based on document chunks

In [None]:
from eval_backend.utils import get_qa_llm
# NOTE(review): called with the retriever only, whereas the setup cell near the
# top calls backend.utils.get_qa_llm(retriever, llm) — confirm the
# eval_backend signature.
qa_llm = get_qa_llm(retriever)#, retrieval_llm=ChatOpenAI(temperature=0, model="gpt-4"))

# Smoke test: send a trivial query through the chain asynchronously.
await qa_llm.acall("hi")

In [None]:
await qa_llm.acall(gt_dataset[0])

# QA_LLM grading functions

In [None]:
from eval_backend.eval_metrics import grade_model_answer

In [None]:
gt_dataset[10]

In [None]:
# Run the QA chain on the question of ground-truth pair 10 and display the result.
qa_llm_answer = qa_llm(gt_dataset[10]["question"])
qa_llm_answer

In [None]:
# Grade the chain's answer for pair 10; the same ground-truth entry and the
# same prediction are each passed twice, giving two-element input lists.
res = grade_model_answer([gt_dataset[10], gt_dataset[10]],
                   [qa_llm_answer, qa_llm_answer],
                   grade_answer_prompt="GPT4")

In [None]:
res

In [None]:
# test for incorrect answer
qa_predicted = qa_llm("What is the Life Factory?")
qa_predicted["question"] = gt_dataset[0]["question"]

In [None]:
qa_predicted

In [None]:
# Grade qa_predicted against the first ground-truth pair.
grade_model_answer([gt_dataset[0]],
                   [qa_predicted],
                   grade_answer_prompt="GPT4")

# Calculate quality of retrieved documents

In [None]:
from eval_backend.eval_metrics import grade_model_retrieval

# Index of the ground-truth pair whose retrieval quality is graded; every
# lookup below derives from it so changing this one value is sufficient.
qa_pair_index = 10
qa_pair = gt_dataset[qa_pair_index]
query = qa_pair["question"]

# Retrieve for the selected question (previously hard-coded to gt_dataset[10],
# which would silently diverge from qa_pair_index if the index were changed).
docs_retrieved = retriever.get_relevant_documents(query)

# Concatenate the retrieved chunks into one text, each prefixed with its rank.
retrieved_doc_text = "\n\n".join(f"Retrieved document {i}: {doc.page_content}" for i, doc in enumerate(docs_retrieved))

# The retrieved text is stored under "result" next to the reference question/answer.
retrieved_dict = {"question": query,
             "answer": qa_pair["answer"], "result": retrieved_doc_text}

grade_model_retrieval([qa_pair, qa_pair],
                      [retrieved_dict, retrieved_dict],
                      grade_docs_prompt="x")

In [None]:
gt_dataset[qa_pair_index]

# Calculate Embedding similarities

In [None]:
def grade_embedding_similarity(gt_dataset: List[Dict[str,str]], predictions: List[str],
                                retriever: "VectorStoreRetriever", embedding_model: "Embeddings") -> np.ndarray:
    """Score each prediction by the dot product of its embedding with the
    embedding of the corresponding reference answer.

    Args:
        gt_dataset: Ground-truth entries; the "answer" value of each is embedded.
        predictions: Predicted answers, aligned one-to-one with ``gt_dataset``.
        retriever: Not used by this function; kept so existing calls still work.
        embedding_model: Object exposing ``embed_documents(list_of_texts)``.

    Returns:
        1-D array with one similarity per (reference, prediction) pair; empty
        when ``gt_dataset`` is empty.

    Raises:
        ValueError: If ``gt_dataset`` and ``predictions`` differ in length.
    """
    # The retriever / embedding annotations are written as strings because the
    # classes are not imported in this notebook; unquoted they would raise a
    # NameError as soon as the function is defined.
    m = len(gt_dataset)
    if len(predictions) != m:
        raise ValueError(
            f"gt_dataset and predictions must have the same length, got {m} and {len(predictions)}"
        )
    if m == 0:
        # Nothing to embed; reshape(0, -1) on an empty array would raise.
        return np.array([], dtype=float)

    target_embeddings = np.array(embedding_model.embed_documents([qa_pair["answer"] for qa_pair in gt_dataset])).reshape(m, -1)
    predicted_embeddings = np.array(embedding_model.embed_documents(predictions)).reshape(m, -1)

    # Original author's note: similarities between OpenAI embeddings range
    # from about 0.7 to 1.0 only.
    # Row-wise dot products give the diagonal of target @ predicted.T without
    # materialising the full m x m matrix.
    return (target_embeddings * predicted_embeddings).sum(axis=1)

In [None]:
# Score two predictions against the same reference answer (gt_dataset[0]):
# the chain's answer to that pair's own question and its answer to the
# question of pair 15.
grade_embedding_similarity([gt_dataset[0], gt_dataset[0]],
                            [qa_llm.run(gt_dataset[0]["question"]), qa_llm.run(gt_dataset[15]["question"])],
                            retriever=retriever,
                            embedding_model=OpenAIEmbeddings(model="text-embedding-ada-002"))

In [None]:
# Reference answer and the chain's answer for the first ground-truth pair,
# displayed together as a tuple.
true_answer = gt_dataset[0]["answer"]
pred_answer = qa_llm.run(gt_dataset[0]["question"])

true_answer, pred_answer

In [None]:
import evaluate

# Load the "rouge" metric through the evaluate library.
rouge_metric = evaluate.load("rouge")

# Compare the chain's answer with the reference answer for one pair.
rouge_metric.compute(references=[true_answer], predictions=[pred_answer])

In [None]:
# Toy sentence pair for inspecting the metric's output on a small, readable input.
predictions = ["my cat ate the cake"]
references = ["my dog wants the cake too"]
results = rouge_metric.compute(predictions=predictions, references=references)
print(results)

In [None]:
# Two values of the form 2*P*R / (P + R); presumably a hand check of the
# ROUGE-1 and ROUGE-2 F-scores for the toy sentences — TODO confirm the counts.
2 * (3/6 * 3/5) / (3/6 + 3/5), 2 * (1/5 * 1/4) / (1/5 + 1/4), 

In [None]:
import os

# Check that the evaluation data file is reachable from the working directory.
Path("../resources/eval_data.json").exists()