In [1]:
import warnings
warnings.filterwarnings('ignore')

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import numpy as np

from typing import Any, Dict, List, Optional, Callable

from langchain.docstore.document import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chat_models import ChatOpenAI

import os
import sys
from pathlib import Path

# Make the project packages importable from the notebook directory.
# sys.path entries must be plain strings (the import system ignores other
# types), so the working directory is converted with str(). Entries already
# present are skipped so re-running the cell does not pile up duplicates.
for extra_path in ("../eval_backend/", "../", str(Path.cwd())):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from eval_backend.utils import load_and_chunk_doc
from eval_backend.testsetgen.test_set_generator import generate_eval_set, get_qa_from_chunk
from eval_backend.commons import Hyperparameters

import json
# Read the existing evaluation set from disk and parse it as JSON.
gt_dataset = json.loads(Path("./eval_set.json").read_text(encoding="utf-8"))

# Path of the PDF that the later cells chunk and query.
doc = "../resources/msg_life-gb-2021-EN_final.pdf"

ImportError: cannot import name 'generate_eval_set' from 'eval_backend.testsetgen.test_set_generator' (C:\Users\andre\OneDrive\Desktop\ML\myRepos\RAG-Evaluation\notebooks\..\eval_backend\testsetgen\test_set_generator.py)

In [5]:
import numpy as np

# Quick sanity check: unweighted average of 1 and 2.
np.average([1,2])

1.5

# Load and split data into chunks

In [2]:
# Settings used to build the Hyperparameters object that drives chunking.
qa_gen_configs = {
    "chunk_size": 3400,
    "chunk_overlap": 0,
    "num_retrieved_docs": 3,
    "use_llm_grader": False,
    "length_function_name": "text-embedding-ada-002",
    "grade_answer_prompt": None,
    "grade_docs_prompt": None,
    "embedding_model": "text-embedding-ada-002",
    "retrieval_llm": "gpt-3.5-turbo",
    "grader_llm": "gpt-3.5-turbo",
}
hp = Hyperparameters.from_dict(qa_gen_configs)

chunks = load_and_chunk_doc(hp, doc)

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Tokenize every chunk exactly once and reuse the counts for both statistics
# (previously each chunk was tokenized twice inside the print call).
token_counts = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]

print(
    f"number of tokens in document: {sum(token_counts)}\n"
    f"number of chunks: {len(chunks)}\n"
    f"average number of tokens per chunk: {np.average(token_counts)}"
)

number of tokens in document: 45390
number of chunks: 14
average number of tokens per chunk: 3242.1428571428573


# Generate Q/A pairs

In [None]:
# generate a Q/A pair for a document chunk
# NOTE(review): this cell is disabled. If it is re-enabled, the last line would
# write `gt_dataset` (the set loaded from ./eval_set.json) rather than the
# freshly generated `result` — confirm which one is intended.
# result = generate_eval_set(llm, chunks)
# from eval_backend.utils import write_json
# write_json(gt_dataset, filename="./eval_set.json")

In [3]:
d =     {
        "chunk_size": 2048,
        "chunk_overlap": 0,
        "num_retrieved_docs": 3,
        "use_llm_grader": False,
        "length_function_name": "text-embedding-ada-002",
        "grade_answer_prompt": "3cats_zero_shot",
        "grade_docs_prompt": "default",
        "embedding_model": "text-embedding-ada-002",
        "retrieval_llm": "gpt-3.5-turbo",
        "grader_llm": "gpt-4"
    }


res = await generate_eval_set(d, doc)

In [11]:
# `pprint` is this notebook's own helper, defined in a later cell (not the
# stdlib pprint); that cell has to be executed before this one.
pprint(res)

What activities did the Supervisory Board perform in the 2021 financial year?
The Supervisory Board diligently performed its duties in the 2021 financial year by monitoring the activities of the Management Board, comparing actual business performance against targets, and scrutinizing the operational and strategic performance of the company. They also discussed the current indicators of earnings, financial and assets position of the msg life Group, as well as important projects and the general development of the market climates.
--------------------------------------------------
What reports did the Supervisory Board receive from the Management Board?
The Supervisory Board received detailed reports from the Management Board in preparation for each meeting. These reports included information on the current business situation of the company and its individual segments, major sales projects, risk management, compliance, auditing, data protection, and relationships with affiliated companies

In [None]:
from langchain.chains import QAGenerationChain
from commons import QA_PROMPT_SELECTOR

eval_set = []
qa_chain = QAGenerationChain.from_llm(hp.retrieval_llm, prompt=QA_PROMPT_SELECTOR.get_prompt(hp.retrieval_llm))

await get_qa_from_chunk(chunks[7], qa_chain, eval_set)

In [None]:
# Inspect eval_set after the get_qa_from_chunk call above.
eval_set

In [None]:
# Same chunk as in the custom-prompt run, but no prompt is passed here, so
# the chain uses whatever QAGenerationChain.from_llm defaults to.
eval_set1 = []
qa_chain = QAGenerationChain.from_llm(hp.retrieval_llm)

await get_qa_from_chunk(chunks[7], qa_chain, eval_set1)

In [None]:
# Print the pairs from the default-prompt run (requires the notebook's pprint
# helper cell to have been executed).
pprint(eval_set1)

In [10]:
def pprint(eval):
    """Print every entry's question and answer, each on its own line.

    A line of 50 dashes is printed after each entry. `eval` is any iterable
    of mappings with "question" and "answer" keys.
    """
    divider = "-"*50
    for entry in eval:
        for field in ("question", "answer"):
            print(entry[field])
        print(divider)

In [None]:
# Print only the ground-truth pairs whose answer has a length greater than 200.
rule = "-"*50
for entry in gt_dataset:
    if not len(entry['answer'])>200:
        continue
    print(entry["question"])
    print(f"-> {entry['answer']}")
    print(rule)

# Set up vectorstore and retriever

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from eval_backend.utils import get_retriever

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# The earlier `split_data(data=data, chunk_size=512)` call used two names that
# are neither defined nor imported anywhere in this notebook, so the cell
# failed on a fresh kernel. Chunking now goes through the same
# load_and_chunk_doc helper as the generation step, with chunk_size set to 512.
# NOTE(review): all other chunking settings are taken from qa_gen_configs —
# confirm that this matches what split_data used to do.
hp_vs = Hyperparameters.from_dict({**qa_gen_configs, "chunk_size": 512})
chunks_vs = load_and_chunk_doc(hp_vs, doc)
retriever = get_retriever(chunks_vs, OpenAIEmbeddings(model="text-embedding-ada-002"), 3)

# LLM chain for query answering based on document chunks

In [None]:
from eval_backend.utils import get_qa_llm
# Build the question-answering chain from the retriever. The commented-out
# tail shows how a different retrieval_llm could be passed instead.
qa_llm = get_qa_llm(retriever)#, retrieval_llm=ChatOpenAI(temperature=0, model="gpt-4"))

# Smoke test: call the chain asynchronously with a trivial input.
await qa_llm.acall("hi")

In [None]:
# Call the chain with the first ground-truth entry; the whole entry is passed
# as input, not only its question.
await qa_llm.acall(gt_dataset[0])

# QA_LLM grading functions

In [None]:
from eval_backend.eval_metrics import grade_model_answer

In [None]:
# Ground-truth entry used by the grading cells that follow.
gt_dataset[10]

In [None]:
# Answer the question of ground-truth entry 10 with the chain (synchronous
# call) and display the returned value.
qa_llm_answer = qa_llm(gt_dataset[10]["question"])
qa_llm_answer

In [None]:
# The same ground-truth entry and the same chain answer are passed twice, so
# the grader receives two identical items.
# Note: this rebinds `res`, which an earlier cell used for the generated eval set.
res = grade_model_answer([gt_dataset[10], gt_dataset[10]],
                   [qa_llm_answer, qa_llm_answer],
                   grade_answer_prompt="GPT4")

In [None]:
# Show what grade_model_answer returned for the two identical items.
res

In [None]:
# test for incorrect answer
# Ask the chain a fixed question, then overwrite the "question" field of the
# result with the question of gt_dataset[0], so that the prediction can be
# graded against that entry. Presumably the fixed question differs from
# gt_dataset[0]'s question — confirm against the data.
qa_predicted = qa_llm("What is the Life Factory?")
qa_predicted["question"] = gt_dataset[0]["question"]

In [None]:
# Show the prediction after its "question" field was overwritten.
qa_predicted

In [None]:
# Grade the relabelled prediction against ground-truth entry 0.
grade_model_answer([gt_dataset[0]],
                   [qa_predicted],
                   grade_answer_prompt="GPT4")

# Calculate quality of retrieved documents

In [None]:
from eval_backend.eval_metrics import grade_model_retrieval

# Index of the ground-truth pair to evaluate. Everything below follows this
# value; the retrieval call used to hard-code index 10 and ignore `query`.
qa_pair_index = 10
qa_pair = gt_dataset[qa_pair_index]
query = qa_pair["question"]
docs_retrieved = retriever.get_relevant_documents(query)

# Concatenate the retrieved chunks into one labelled text block for the grader.
retrieved_doc_text = "\n\n".join(
    f"Retrieved document {i}: {retrieved.page_content}"
    for i, retrieved in enumerate(docs_retrieved)
)

retrieved_dict = {"question": query,
             "answer": qa_pair["answer"], "result": retrieved_doc_text}

# The same pair is passed twice, so the grader receives two identical items.
grade_model_retrieval([qa_pair, qa_pair],
                      [retrieved_dict, retrieved_dict],
                      grade_docs_prompt="x")

In [None]:
# Ground-truth entry selected by qa_pair_index in the retrieval grading cell.
gt_dataset[qa_pair_index]

# Calculate Embedding similarities

In [None]:
def grade_embedding_similarity(gt_dataset: List[Dict[str,str]], predictions: List[str],
                                retriever: "VectorStoreRetriever", embedding_model: "Embeddings") -> np.ndarray:
    """Score each prediction against its ground-truth answer by embedding dot product.

    Args:
        gt_dataset: Ground-truth entries; only the "answer" value of each entry
            is embedded.
        predictions: Predicted answer texts, aligned by position with
            ``gt_dataset``.
        retriever: Not used by this function; kept so existing call sites
            continue to work.
        embedding_model: Object exposing ``embed_documents(texts)`` that
            returns one vector per text.

    Returns:
        1-D numpy array whose i-th value is the dot product of the i-th
        ground-truth answer embedding and the i-th prediction embedding.
        An empty array is returned for empty input.

    Raises:
        ValueError: If ``gt_dataset`` and ``predictions`` differ in length.
    """
    # The two type names are written as strings because this notebook never
    # imports them; unquoted, the `def` itself raises NameError.
    m = len(gt_dataset)
    if len(predictions) != m:
        raise ValueError(
            f"gt_dataset has {m} entries but predictions has {len(predictions)}"
        )
    if m == 0:
        # reshape(0, -1) cannot infer the second dimension, so short-circuit
        # before calling the embedding model.
        return np.empty(0)

    target_embeddings = np.array(
        embedding_model.embed_documents([qa_pair["answer"] for qa_pair in gt_dataset])
    ).reshape(m, -1)
    predicted_embeddings = np.array(embedding_model.embed_documents(predictions)).reshape(m, -1)

    # similarities between openai embeddings ranges from 0.7 - 1.0 only
    # Row-wise dot products: same values as the diagonal of
    # target @ predicted.T without building the full m x m matrix.
    return np.sum(target_embeddings * predicted_embeddings, axis=1)

In [None]:
# Compare the answer of ground-truth entry 0 with two predictions: the chain's
# answer to question 0 and its answer to question 15.
# NOTE(review): the second pair looks like a deliberate mismatch for contrast — confirm.
grade_embedding_similarity([gt_dataset[0], gt_dataset[0]],
                            [qa_llm.run(gt_dataset[0]["question"]), qa_llm.run(gt_dataset[15]["question"])],
                            retriever=retriever,
                            embedding_model=OpenAIEmbeddings(model="text-embedding-ada-002"))

In [None]:
# Reference answer and chain prediction for entry 0; both are reused by the
# ROUGE computation in the next cell.
true_answer = gt_dataset[0]["answer"]
pred_answer = qa_llm.run(gt_dataset[0]["question"])

true_answer, pred_answer

In [None]:
import evaluate

# Load the ROUGE metric through the `evaluate` package and score the single
# reference/prediction pair built in the previous cell.
rouge_metric = evaluate.load("rouge")

rouge_metric.compute(references=[true_answer], predictions=[pred_answer])

In [None]:
# Toy example with two short sentences; the next cell recomputes two of the
# scores by hand.
predictions = ["my cat ate the cake"]
references = ["my dog wants the cake too"]
results = rouge_metric.compute(predictions=predictions, references=references)
print(results)

In [None]:
# Hand-computed F1 = 2 * (a * b) / (a + b) for the toy sentences
# "my cat ate the cake" / "my dog wants the cake too":
#   first value:  3 shared words over 6 reference words and 5 prediction words
#   second value: 1 shared bigram over 5 reference bigrams and 4 prediction bigrams
# Presumably a cross-check of the rouge1 / rouge2 values printed above — confirm.
2 * (3/6 * 3/5) / (3/6 + 3/5), 2 * (1/5 * 1/4) / (1/5 + 1/4), 

In [None]:
import os

# Check whether the evaluation data file is present (path is relative to the
# working directory).
Path("../resources/eval_data.json").exists()