In [1]:
import warnings
warnings.filterwarnings('ignore')

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import numpy as np

from typing import Any, Dict, List, Optional, Callable

from langchain.docstore.document import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chat_models import ChatOpenAI

import os
import sys
from pathlib import Path

# Extend the import search path with the relative project directories and the
# current working directory so the `eval_backend` package can be imported below.
sys.path.append("../eval_backend/")
sys.path.append("../")
# sys.path entries must be plain strings; a pathlib.Path object is ignored during import.
sys.path.append(str(Path.cwd()))

from eval_backend.utils import *

import json
# Read the stored ground-truth evaluation set (question/answer records) from disk.
with open("./eval_set.json", mode="r", encoding="utf-8") as eval_set_file:
    gt_dataset = json.loads(eval_set_file.read())

# Load and split data into chunks

In [2]:
# Load the source PDF and split it into large chunks.
data = load_document("../resources/msg_life-gb-2021-EN_final.pdf")
chunks = split_data(data=data, chunk_size=3400)

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Tokenize every chunk exactly once and reuse the counts for both the total and the average.
chunk_token_counts = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]

print(
    f"number of tokens in document: {sum(chunk_token_counts)}\n"
    f"number of elements in data: {len(data)}\n"
    f"number of chunks: {len(chunks)}\n"
    f"average number of tokens per chunk: {np.average(chunk_token_counts)}"
)

number of tokens in document: 45390
number of elements in data: 1
number of chunks: 14
average number of tokens per chunk: 3242.1428571428573


# Generate Q/A pairs

In [3]:
# Generate Q/A pairs from the document chunks (disabled: the evaluation set is loaded from ./eval_set.json in the first cell)
# result = generate_eval_set(llm, chunks)
# from eval_backend.utils import write_json
# write_json(gt_dataset, filename="./eval_set.json")

In [None]:
# Print every ground-truth pair whose answer is longer than 200 characters.
separator = "-" * 50
for pair in gt_dataset:
    answer = pair['answer']
    if len(answer) <= 200:
        continue
    print(pair["question"])
    print(f"-> {answer}")
    print(separator)

# Set up vectorstore and retriever

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from eval_backend.utils import get_retriever

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Re-split the document into smaller chunks for the vector store.
chunks_vs = split_data(data=data, chunk_size=512)

# NOTE(review): the trailing 3 is presumably the number of chunks returned per query — confirm against get_retriever.
retriever_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
retriever = get_retriever(chunks_vs, retriever_embeddings, 3)

# LLM chain for query answering based on document chunks

In [30]:
from eval_backend.utils import get_qa_llm
# Build the query-answering chain on top of the retriever.
# Alternative left by the author: get_qa_llm(retriever, retrieval_llm=ChatOpenAI(temperature=0, model="gpt-4"))
qa_llm = get_qa_llm(retriever)

# Smoke test: send a trivial prompt through the asynchronous call path.
await qa_llm.acall("hi")

{'question': 'hi',
 'result': "I'm sorry, but I don't have enough information to provide a helpful answer."}

In [23]:
# Pass a whole ground-truth record; per the output below, its 'answer' is echoed next to the chain's 'result'.
await qa_llm.acall(gt_dataset[0])

{'question': 'Who are the members of the Management Board of msg life ag as of 31 December 2021?',
 'answer': 'The members of the Management Board of msg life ag as of 31 December 2021 are Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Holger Gorissen, Robert Hess, Milenko Radic, Jens Stäcker, and Dr Wolf Wiedmann.',
 'result': 'The members of the Management Board of msg life ag as of 31 December 2021 are Holger Gorissen, Robert Hess, and Jens Stäcker.'}

# QA_LLM grading functions

In [25]:
from eval_backend.eval_metrics import grade_model_answer

In [26]:
# Inspect the ground-truth pair at index 10, which the following cells query and grade.
gt_dataset[10]

{'question': 'What are some of the challenges facing the German insurance industry?',
 'answer': 'The digitisation of the economy and society, customer centricity and individualisation, industrialisation and automation, analytics and data effectiveness, and standardisation and integration.'}

In [27]:
# Ask the chain the question of ground-truth pair 10 and keep the returned dict for grading.
qa_llm_answer = qa_llm(gt_dataset[10]["question"])
qa_llm_answer

{'question': 'What are some of the challenges facing the German insurance industry?',
 'result': 'Some of the challenges facing the German insurance industry include the difficult market conditions, the trend towards internationalization and consolidation, and the need for modern and flexible IT systems to increase efficiency and corporate success.'}

In [28]:
# Grade the chain's answer for pair 10; the same pair is submitted twice to exercise the batch interface.
res = grade_model_answer(
    [gt_dataset[10]] * 2,
    [qa_llm_answer] * 2,
    grade_answer_prompt="GPT4",
)

In [None]:
# Display the value returned by grade_model_answer for pair 10.
res

In [None]:
# Test for an incorrect answer: answer an unrelated question, then relabel the
# prediction with the question of ground-truth pair 0 so the content no longer matches.
qa_predicted = qa_llm("What is the Life Factory?")
qa_predicted["question"] = gt_dataset[0]["question"]

In [None]:
# Display the relabelled prediction before it is graded.
qa_predicted

In [None]:
# Grade the mismatched prediction against ground-truth pair 0.
grade_model_answer(
    [gt_dataset[0]],
    [qa_predicted],
    grade_answer_prompt="GPT4",
)

# Calculate quality of retrieved documents

In [11]:
from eval_backend.eval_metrics import grade_model_retrieval

# Select the ground-truth pair to evaluate; everything below follows this single index.
qa_pair_index = 10
query = gt_dataset[qa_pair_index]["question"]

# Retrieve documents for the selected question (previously hard-coded to index 10,
# which would silently diverge from qa_pair_index if the index were changed).
docs_retrieved = retriever.get_relevant_documents(query)

# Concatenate the retrieved chunks into one labelled text block.
retrieved_doc_text = "\n\n".join(
    f"Retrieved document {i}: {doc.page_content}" for i, doc in enumerate(docs_retrieved)
)

# The retrieved text is placed under "result", next to the ground-truth question and answer.
retrieved_dict = {
    "question": query,
    "answer": gt_dataset[qa_pair_index]["answer"],
    "result": retrieved_doc_text,
}

# The same pair is submitted twice to exercise the batch interface.
grade_model_retrieval([gt_dataset[qa_pair_index], gt_dataset[qa_pair_index]],
                      [retrieved_dict, retrieved_dict],
                      grade_docs_prompt="x")

([{'results': 'GRADE: 4'}, {'results': 'GRADE: 4'}], 1.0)

In [None]:
# Display the ground-truth pair that was used for the retrieval grading.
gt_dataset[qa_pair_index]

# Calculate Embedding similarities

In [None]:
def grade_embedding_similarity(gt_dataset: List[Dict[str,str]], predictions: List[str],
                                retriever: "VectorStoreRetriever", embedding_model: "Embeddings") -> np.ndarray:
    """Score each prediction against its ground-truth answer by embedding dot product.

    Args:
        gt_dataset: Ground-truth records; only the "answer" entry of each record is read.
        predictions: Predicted answers, aligned by position with ``gt_dataset``.
        retriever: Unused; kept so existing callers that pass it keep working.
        embedding_model: Object whose ``embed_documents(texts)`` returns one vector per text.

    Returns:
        1-D array holding one dot product per (answer, prediction) pair; empty for empty input.

    Raises:
        ValueError: If ``gt_dataset`` and ``predictions`` differ in length.
    """
    # The retriever/embedding annotations are quoted so that defining this function
    # does not require those class names to be in scope.
    m = len(gt_dataset)

    if len(predictions) != m:
        raise ValueError(
            f"gt_dataset has {m} entries but predictions has {len(predictions)}; they must be aligned one-to-one"
        )
    if m == 0:
        # Nothing to embed; skip the embedding calls entirely.
        return np.array([], dtype=float)

    target_embeddings = np.array(embedding_model.embed_documents([qa_pair["answer"] for qa_pair in gt_dataset])).reshape(m, -1)
    predicted_embeddings = np.array(embedding_model.embed_documents(predictions)).reshape(m, -1)

    # Author's note: similarities between openai embeddings ranges from 0.7 - 1.0 only.
    # The dot product equals cosine similarity only when the embeddings are unit-length.
    # Only matching rows are multiplied, avoiding the full m x m similarity matrix.
    return np.sum(target_embeddings * predicted_embeddings, axis=1)

In [None]:
# Compare ground-truth answer 0 against the chain's answers to question 0 and to question 15.
reference_pairs = [gt_dataset[0], gt_dataset[0]]
predicted_answers = [qa_llm.run(gt_dataset[index]["question"]) for index in (0, 15)]
grade_embedding_similarity(
    reference_pairs,
    predicted_answers,
    retriever=retriever,
    embedding_model=OpenAIEmbeddings(model="text-embedding-ada-002"),
)

In [None]:
# Ground-truth answer and the chain's answer for pair 0, reused by the ROUGE cell below.
true_answer = gt_dataset[0]["answer"]
pred_answer = qa_llm.run(gt_dataset[0]["question"])

# Display both side by side as a tuple.
true_answer, pred_answer

In [None]:
import evaluate

# Load the ROUGE metric through the `evaluate` library.
rouge_metric = evaluate.load("rouge")

# Score the chain's answer against the ground-truth answer for pair 0.
rouge_metric.compute(references=[true_answer], predictions=[pred_answer])

In [None]:
# Toy sentences small enough to verify the ROUGE scores by hand.
predictions = ["my cat ate the cake"]
references = ["my dog wants the cake too"]
results = rouge_metric.compute(predictions=predictions, references=references)
print(results)

In [None]:
# Hand-computed F1 = 2*P*R / (P + R) for the toy sentences
# ("my cat ate the cake" vs. "my dog wants the cake too"):
#   unigrams: 3 shared (my, the, cake) -> recall 3/6, precision 3/5
#   bigrams:  1 shared ("the cake")    -> recall 1/5, precision 1/4
2 * (3/6 * 3/5) / (3/6 + 3/5), 2 * (1/5 * 1/4) / (1/5 + 1/4), 