In [1]:
import warnings
warnings.filterwarnings('ignore')

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import numpy as np

from typing import Any, Dict, List, Optional, Callable

from langchain.docstore.document import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chat_models import ChatOpenAI

# Load and split data into chunks

In [2]:
# loading PDF, DOCX and TXT files as LangChain Documents
import tiktoken

def load_document(file):
    """Load a PDF, TXT or DOCX file with the matching LangChain document loader.

    The loader is selected from the file extension, compared case-insensitively:
    ``.pdf`` -> ``UnstructuredPDFLoader``, ``.txt`` -> ``TextLoader`` (UTF-8),
    ``.docx`` -> ``Docx2txtLoader``.

    Args:
        file: Path of the file to load.

    Returns:
        The value returned by the selected loader's ``load()`` method.

    Raises:
        ValueError: If the file extension is not ``.pdf``, ``.txt`` or ``.docx``.
    """
    import os
    _, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension == '.pdf':
        # Loader notes kept from the original author:
        #   PyPDFLoader            - already splits data during loading
        #   UnstructuredPDFLoader  - returns only 1 Document (the one used here)
        #   PyMuPDFLoader          - returns 1 Document per page
        from langchain.document_loaders import UnstructuredPDFLoader
        print(f'Loading {file}')
        loader = UnstructuredPDFLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file, encoding="utf-8")
    elif extension == ".docx":
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        # Fail with a clear message instead of hitting an unbound `loader` below.
        raise ValueError(
            f"Unsupported file extension {extension!r} for {file!r}; "
            "expected one of .pdf, .txt, .docx"
        )

    return loader.load()

def split_data(data: List[Document],
               chunk_size: Optional[int] = 4096,
               chunk_overlap: Optional[int] = 0,
               length_function: Optional[Callable] = lambda x: len(tiktoken.encoding_for_model("text-embedding-ada-002").encode(x))
              ):
    """Split documents into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        length_function: Callable used by the splitter to measure text length.
            The default returns the number of tokens produced by the tiktoken
            encoding for "text-embedding-ada-002".

    Returns:
        The result of ``split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": length_function,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

# Chat model reused by later cells for token counting and Q/A pair generation.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Load the report and split it with chunk_size=3400, measured by split_data's
# length function (tiktoken tokens by default).
data = load_document("../resources/msg_life-gb-2021-EN_final.pdf")
chunks = split_data(data=data, chunk_size=3400)

Loading ../resources/msg_life-gb-2021-EN_final.pdf


In [3]:
# Count the tokens of every chunk once and reuse the counts for both the total
# and the average, instead of tokenizing all chunks twice.
chunk_token_counts = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]

print("\n".join([
    f"number of tokens in document: {sum(chunk_token_counts)}",
    f"number of elements in data: {len(data)}",
    f"number of chunks: {len(chunks)}",
    f"average number of tokens per chunk: {np.average(chunk_token_counts)}",
]))

number of tokens in document: 45390
number of elements in data: 1
number of chunks: 14
average number of tokens per chunk: 3242.1428571428573


# Generate Q/A pairs

In [4]:
# generate a Q/A pair for a document chunk
def generate_eval_set(llm: BaseLanguageModel, chunks: List[Document]) -> List[Dict[str,str]]:
    """Generate question/answer pairs from document chunks with a ``QAGenerationChain``.

    The chain is run once per chunk on ``chunk.page_content``. A chunk whose run
    raises ``JSONDecodeError`` is reported on stdout and skipped; it is not retried.

    Args:
        llm: Language model handed to ``QAGenerationChain.from_llm``.
        chunks: Document chunks to generate Q/A pairs from.

    Returns:
        The per-chunk chain results flattened into a single list.
    """
    from langchain.chains import QAGenerationChain
    from json import JSONDecodeError
    import itertools

    qa_generator_chain = QAGenerationChain.from_llm(llm, k=1)
    eval_set = []

    for i, chunk in enumerate(chunks):
        try:
            eval_set.append(qa_generator_chain.run(chunk.page_content))
        except JSONDecodeError:
            # Generation failed for this chunk: report it and move on.
            print(f"Error occurred inside QAChain in chunk {i}/{len(chunks)}")

    # Each successful run contributes an iterable of results; merge them into one list.
    return list(itertools.chain.from_iterable(eval_set))

result = generate_eval_set(llm, chunks)

In [5]:
result

[{'question': 'How many times did the Supervisory Board meet in the 2021 financial year?',
  'answer': 'The Supervisory Board met five times in the 2021 financial year.'},
 {'question': 'What reports did the Supervisory Board receive from the Management Board?',
  'answer': 'The Supervisory Board received detailed reports from the Management Board in preparation for each meeting.'},
 {'question': 'Who was appointed as Chairman of the Supervisory Board?',
  'answer': 'Johann Zehetmaier'},
 {'question': 'What is the primary income potential for the holding company?',
  'answer': 'Earnings from participating interests'},
 {'question': 'Where are the headquarters of msg life ag located?',
  'answer': 'The headquarters of msg life ag are located in Leinfelden-Echterdingen near Stuttgart, Germany.'},
 {'question': 'Who are the members of the Management Board of msg life ag as of 31 December 2021?',
  'answer': 'The members of the Management Board of msg life ag as of 31 December 2021 are Rol

In [6]:
# Print every generated pair whose answer is longer than 150 characters,
# followed by a separator line.
long_answer_pairs = (pair for pair in result if len(pair["answer"]) > 150)
for pair in long_answer_pairs:
    print(pair["question"], f"-> {pair['answer']}", "-" * 50, sep="\n")

Who are the members of the Management Board of msg life ag as of 31 December 2021?
-> The members of the Management Board of msg life ag as of 31 December 2021 are Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Holger Gorissen, Robert Hess, Milenko Radic, Jens Stäcker, and Dr Wolf Wiedmann.
--------------------------------------------------
What is the purpose of the German Pension Overview Act (RentÜG)?
-> The purpose of the German Pension Overview Act (RentÜG) is to provide customers with a quick overview of their statutory, company, and private pension entitlements.
--------------------------------------------------
What were the financial key performance indicators of the msg life Group in the reporting period?
-> The msg life Group recorded gross Group revenue from its own business under German GAAP of 176.1 million euros and Group earnings before interest, taxes, depreciation and amortisation (EBITDA) under German GAAP of 17.9 million euros.
--

In [7]:
# Number of generated pairs whose answer is longer than 150 characters.
sum(len(qa["answer"]) > 150 for qa in result)

17

In [8]:
# Ground-truth set used by the grading cells below: only the generated pairs
# whose answer is longer than 150 characters are kept.
gt_dataset = [qa_pair for qa_pair in result if len(qa_pair["answer"]) > 150]

# Set up vectorstore and retriever

In [9]:
from langchain.schema.vectorstore import VectorStoreRetriever 
from langchain.schema.embeddings import Embeddings
from langchain.embeddings import OpenAIEmbeddings

def get_retriever(splits: List[Document], embedding_model: Embeddings,
                  num_retrieved_docs: int) -> VectorStoreRetriever:
    """Build a FAISS vector store from `splits` and return a retriever over it.

    Args:
        splits: Document chunks to embed and index.
        embedding_model: Embedding model passed to ``FAISS.from_documents``.
        num_retrieved_docs: Number of documents the retriever should return per query.

    Returns:
        The retriever created by ``vectorstore.as_retriever``.
    """
    from langchain.vectorstores import FAISS

    vectorstore = FAISS.from_documents(splits, embedding_model)
    # The result count must be passed through `search_kwargs`; a bare `k=...`
    # keyword is not a retriever setting, so the requested count was not applied.
    return vectorstore.as_retriever(search_kwargs={"k": num_retrieved_docs})

# Re-split the document with chunk_size=512 for the vector store, then build the
# retriever with text-embedding-ada-002 embeddings, passing num_retrieved_docs=3.
chunks_vs = split_data(data=data, chunk_size=512)
retriever = get_retriever(chunks_vs, OpenAIEmbeddings(model="text-embedding-ada-002"), 3)

# LLM chain for query answering based on document chunks

In [10]:
import os
import sys
from pathlib import Path

# Make the project packages importable from the notebook's directory.
# sys.path entries must be strings, so the working directory is converted with
# str(); entries already present are skipped so re-running the cell adds nothing.
for extra_path in ("../eval_backend/", "../", str(Path.cwd())):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [11]:
from langchain.chains import RetrievalQA
from eval_backend.prompts import QA_CHAIN_PROMPT

def get_qa_llm(retriever: VectorStoreRetriever,
               retrieval_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo")) -> RetrievalQA:
    """Create a "stuff" ``RetrievalQA`` chain that answers from retrieved documents.

    Args:
        retriever: Retriever supplying the documents for each query.
        retrieval_llm: Model that writes the answer. The default instance is
            created once, when this function is defined.

    Returns:
        A ``RetrievalQA`` chain using ``QA_CHAIN_PROMPT`` and reading its input
        from the "question" key.
    """
    return RetrievalQA.from_chain_type(
        llm=retrieval_llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
        input_key="question",
    )

qa_llm = get_qa_llm(retriever)

In [12]:
qa_llm("What is TRAIL.X?")

{'question': 'What is TRAIL.X?',
 'result': 'TRAIL.X is a project by msg life that involves the development of deep neural networks (DNNs) for the actuarial computation module in collaboration with the Ludwig Maximilian University of Munich. These DNNs will enable life insurers to replace old system generations and integrate artificial intelligence into their core functions.'}

# QA_LLM grading functions

In [13]:
from eval_backend.prompts import (GRADE_ANSWER_PROMPT_FAST,
GRADE_ANSWER_PROMPT_BIAS_CHECK, 
GRADE_ANSWER_PROMPT_OPENAI,
GRADE_ANSWER_PROMPT,
GRADE_ANSWER_PROMPT_WITH_GRADING,
GRADE_ANSWER_PROMPT_WITH_GRADING_JUSTIFICATION)

def grade_model_answer(gt_datase: List[Dict[str,str]],
                       predictions: List[str],
                       grade_answer_prompt: str,
                       grader_llm: Optional[BaseLanguageModel] = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)):
    """Grade predicted answers against ground-truth pairs with a ``QAEvalChain``.

    Args:
        gt_datase: Ground-truth question/answer pairs (the parameter name is
            kept as spelled for backward compatibility).
        predictions: Predictions to grade; the chain reads each one through the
            "result" key.
        grade_answer_prompt: Name of the grading prompt. Recognised names are
            "Fast", "Descriptive w/ bias check", "OpenAI grading prompt",
            "Grade w/o Reasoning" and "Grade w/ Reasoning"; any other value
            selects ``GRADE_ANSWER_PROMPT``.
        grader_llm: Model that performs the grading.

    Returns:
        The graded outputs returned by ``QAEvalChain.evaluate``.
    """
    from langchain.evaluation.qa import QAEvalChain

    prompts_by_name = {
        "Fast": GRADE_ANSWER_PROMPT_FAST,
        "Descriptive w/ bias check": GRADE_ANSWER_PROMPT_BIAS_CHECK,
        "OpenAI grading prompt": GRADE_ANSWER_PROMPT_OPENAI,
        "Grade w/o Reasoning": GRADE_ANSWER_PROMPT_WITH_GRADING,
        "Grade w/ Reasoning": GRADE_ANSWER_PROMPT_WITH_GRADING_JUSTIFICATION,
    }
    selected_prompt = prompts_by_name.get(grade_answer_prompt, GRADE_ANSWER_PROMPT)

    # Note: GPT-4 grader is advised by OAI model_name="gpt-4"
    grader_chain = QAEvalChain.from_llm(llm=grader_llm,
                                        prompt=selected_prompt,
                                        verbose=False)

    return grader_chain.evaluate(gt_datase,
                                 predictions,
                                 question_key="question",
                                 prediction_key="result")

In [14]:
gt_dataset[0]

{'question': 'Who are the members of the Management Board of msg life ag as of 31 December 2021?',
 'answer': 'The members of the Management Board of msg life ag as of 31 December 2021 are Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Holger Gorissen, Robert Hess, Milenko Radic, Jens Stäcker, and Dr Wolf Wiedmann.'}

In [15]:
qa_llm_answer = qa_llm(gt_dataset[0]["question"])
qa_llm_answer

{'question': 'Who are the members of the Management Board of msg life ag as of 31 December 2021?',
 'result': 'As of 31 December 2021, the members of the Management Board of msg life ag were Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Milenko Radic, and Dr Wolf Wiedmann.'}

In [16]:
grade_model_answer([gt_dataset[0]],
                   [qa_llm_answer],
                   grade_answer_prompt="Grade w/ Reasoning")

[{'results': 'SCORE: 2\n\nJUSTIFICATION: The student answer is mostly correct as it includes several members of the Management Board of msg life ag as of 31 December 2021. However, it is missing some members mentioned in the true answer. The question does not appear to have any bias. The true answer may have bias as it includes additional members not mentioned in the student answer.'}]

In [17]:
# test for incorrect answer: ask the chain an unrelated question, then overwrite
# the "question" field with gt_dataset[0]'s question so the mismatched answer
# can be graded against that ground-truth pair
qa_predicted = qa_llm("What is the Life Factory?")
qa_predicted["question"] = gt_dataset[0]["question"]

In [18]:
qa_predicted

{'question': 'Who are the members of the Management Board of msg life ag as of 31 December 2021?',
 'result': 'The Life Factory is a policy management system developed by msg life. It is used for managing life insurance and pension products.'}

In [19]:
grade_model_answer([gt_dataset[0]],
                   [qa_predicted],
                   grade_answer_prompt="Grade w/ Reasoning")

[{'results': 'SCORE: 1\n\nJUSTIFICATION: The student answer is incorrect because it does not provide any information about the members of the Management Board of msg life ag as of 31 December 2021. The question is unbiased and straightforward. The true answer may be biased as it only provides the names of the members without any additional context or information.'}]

In [20]:
from eval_backend.prompts import GRADE_DOCS_PROMPT_FAST, GRADE_DOCS_PROMPT

def grade_model_retrieval(gt_dataset: List[Dict], predictions: List[str],
                          grade_docs_prompt: str,
                          grader_llm: Optional[BaseLanguageModel] = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)):
    """Grade retrieved document text against ground-truth pairs with a ``QAEvalChain``.

    Args:
        gt_dataset: Ground-truth examples, read through the "question" and
            "answer" keys.
        predictions: Retrieval results to grade, read through the "result" key.
        grade_docs_prompt: "Fast" selects ``GRADE_DOCS_PROMPT_FAST``; any other
            value selects ``GRADE_DOCS_PROMPT``.
        grader_llm: Model that performs the grading.

    Returns:
        The graded outputs returned by ``QAEvalChain.evaluate``.
    """
    from langchain.evaluation.qa import QAEvalChain

    docs_prompt = GRADE_DOCS_PROMPT_FAST if grade_docs_prompt == "Fast" else GRADE_DOCS_PROMPT

    # Note: GPT-4 grader is advised by OAI
    retrieval_grader = QAEvalChain.from_llm(llm=grader_llm, prompt=docs_prompt)

    return retrieval_grader.evaluate(gt_dataset,
                                     predictions,
                                     question_key="question",
                                     answer_key="answer",
                                     prediction_key="result")

In [21]:
# Fetch the documents the retriever returns for the first ground-truth question.
first_example = gt_dataset[0]
docs_retrieved = retriever.get_relevant_documents(first_example["question"])

# Concatenate the retrieved texts into one string, labelling each "Doc <n>: ".
retrieved_doc_text = "".join(
    f"Doc {position}: {doc.page_content} "
    for position, doc in enumerate(docs_retrieved, start=1)
)

retrieved_dict = {
    "question": first_example["question"],
    "answer": first_example["answer"],
    "result": retrieved_doc_text,
}

# An empty prompt name selects the default GRADE_DOCS_PROMPT.
grade_model_retrieval([first_example],
                      [retrieved_dict],
                      grade_docs_prompt="")

[{'results': 'GRADE: Correct\n\nJUSTIFICATION: The documents clearly state that the members of the Management Board of msg life ag as of 31 December 2021 are Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Holger Gorissen, Robert Hess, Milenko Radic, Jens Stäcker, and Dr Wolf Wiedmann.'}]

In [22]:
def grade_embedding_similarity(gt_dataset: List[Dict[str,str]], predictions: List[str],
                                retriever: VectorStoreRetriever, embedding_model: Embeddings) -> np.ndarray:
    """Score each prediction against its ground-truth answer by embedding dot product.

    ``predictions[i]`` is compared with ``gt_dataset[i]["answer"]``. The score is
    the plain dot product of the two embeddings, so it equals cosine similarity
    only when the embedding model returns unit-length vectors.

    Args:
        gt_dataset: Ground-truth pairs; only the "answer" value is embedded.
        predictions: Predicted answer texts, one per ground-truth pair.
        retriever: Unused; kept so existing callers keep working.
        embedding_model: Object providing ``embed_documents(texts)``.

    Returns:
        A 1-D array with one similarity per pair (empty for empty input).

    Raises:
        ValueError: If `gt_dataset` and `predictions` differ in length.
    """
    if len(predictions) != len(gt_dataset):
        raise ValueError(
            f"gt_dataset has {len(gt_dataset)} entries but predictions has {len(predictions)}"
        )
    if not gt_dataset:
        # Nothing to compare; skip the embedding calls entirely.
        return np.array([], dtype=float)

    target_embeddings = np.asarray(
        embedding_model.embed_documents([qa_pair["answer"] for qa_pair in gt_dataset]),
        dtype=float,
    )
    predicted_embeddings = np.asarray(embedding_model.embed_documents(predictions), dtype=float)

    # Original author's observation: similarities between openai embeddings
    # range from 0.7 - 1.0 only.
    # Row-wise dot product: only matching pairs are needed, not the full m x m matrix.
    return np.sum(target_embeddings * predicted_embeddings, axis=1)

In [23]:
# Both pairs use gt_dataset[0] as the reference answer: the first prediction answers
# the same question, the second answers gt_dataset[15]'s question (presumably a
# deliberately mismatched pair for comparison).
grade_embedding_similarity([gt_dataset[0], gt_dataset[0]],
                            [qa_llm.run(gt_dataset[0]["question"]), qa_llm.run(gt_dataset[15]["question"])],
                            retriever=retriever,
                            embedding_model=OpenAIEmbeddings(model="text-embedding-ada-002"))

array([0.99292227, 0.71218124])

In [24]:
true_answer = gt_dataset[0]["answer"]
pred_answer = qa_llm.run(gt_dataset[0]["question"])

true_answer, pred_answer

('The members of the Management Board of msg life ag as of 31 December 2021 are Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Holger Gorissen, Robert Hess, Milenko Radic, Jens Stäcker, and Dr Wolf Wiedmann.',
 'As of 31 December 2021, the members of the Management Board of msg life ag were Rolf Zielke (Chairman), Dr Aristid Neuburger (Deputy Chairman), Francesco Cargnel, Milenko Radic, and Dr Wolf Wiedmann.')

In [25]:
import evaluate

rouge_metric = evaluate.load("rouge")

rouge_metric.compute(references=[true_answer], predictions=[pred_answer])

{'rouge1': 0.8732394366197183,
 'rouge2': 0.7536231884057972,
 'rougeL': 0.7323943661971831,
 'rougeLsum': 0.7323943661971831}

In [26]:
predictions = ["my cat ate the cake"]
references = ["my dog wants the cake too"]
results = rouge_metric.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 0.5454545454545454, 'rouge2': 0.22222222222222224, 'rougeL': 0.5454545454545454, 'rougeLsum': 0.5454545454545454}


In [27]:
# Manual check of the ROUGE-1 / ROUGE-2 values printed for the toy example,
# using F1 = 2 * P * R / (P + R):
#   ROUGE-1: 3 shared unigrams ("my", "the", "cake"); 6 reference tokens, 5 prediction tokens.
#   ROUGE-2: 1 shared bigram ("the cake"); 5 reference bigrams, 4 prediction bigrams.
2 * (3/6 * 3/5) / (3/6 + 3/5), 2 * (1/5 * 1/4) / (1/5 + 1/4), 

(0.5454545454545454, 0.22222222222222224)