In [32]:
import warnings
warnings.filterwarnings('ignore')

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import numpy as np
from langchain.docstore.document import Document

from typing import Any, Dict, List, Optional
from langchain.schema.language_model import BaseLanguageModel

# Load and split data into chunks

In [50]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, TXT or DOCX file as a list of LangChain Documents.

    The loader is chosen from the file extension (matched case-insensitively):
    ``.pdf`` -> UnstructuredPDFLoader, ``.txt`` -> TextLoader (UTF-8),
    ``.docx`` -> Docx2txtLoader.

    Args:
        file: path of the file to load.

    Returns:
        Whatever ``loader.load()`` returns for the selected loader.

    Raises:
        ValueError: if the extension is not one of .pdf, .txt or .docx.
    """
    import os

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension == '.pdf':
        # Loader notes kept from the original author:
        #   PyPDFLoader            - already splits data during loading
        #   UnstructuredPDFLoader  - returns only 1 Document
        #   PyMuPDFLoader          - returns 1 Document per page
        from langchain.document_loaders import UnstructuredPDFLoader
        print(f'Loading {file}')
        loader = UnstructuredPDFLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file, encoding="utf-8")
    elif extension == ".docx":
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `loader`.
        raise ValueError(
            f"Unsupported file extension {extension!r} for {file!r}; "
            "expected one of: .pdf, .txt, .docx"
        )

    return loader.load()

def split_data(data: List[Document], llm: BaseLanguageModel, chunk_size: Optional[int] = 4096):
    """Split documents into chunks whose size is measured in tokens of ``llm``.

    Args:
        data: documents to split.
        llm: language model whose ``get_num_tokens`` is used as the length
            function of the splitter.
        chunk_size: maximum chunk length passed to the splitter. ``None``
            falls back to the default of 4096.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    if chunk_size is None:
        chunk_size = 4096

    # Measure chunk length in model tokens rather than characters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   length_function=llm.get_num_tokens)
    return text_splitter.split_documents(data)

from langchain.chat_models import ChatOpenAI

# chat model with temperature 0; also passed to split_data, which uses it for token counting
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# load the PDF and split it with chunk_size=3000 (length measured in tokens by split_data)
data = load_document("resources/msg_life-gb-2021-EN_final.pdf")
chunks = split_data(data, llm=llm, chunk_size=3000)

Loading resources/msg_life-gb-2021-EN_final.pdf


In [51]:
# Tokenise every chunk once and reuse the counts for both the total and the average.
chunk_token_counts = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]
print(f"number of tokens in document: {sum(chunk_token_counts)}\n"
      f"number of elements in data: {len(data)}\n"
      f"number of chunks: {len(chunks)}\n"
      f"average number of tokens per chunk: {np.average(chunk_token_counts)}")

number of tokens in document: 47819
number of elements in data: 1
number of chunks: 17
average number of tokens per chunk: 2812.8823529411766


# Generate Q/A pairs

In [52]:
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.prompts.prompt import PromptTemplate

# System prompt: describes the question/answer generation role of the model.
SYSTEM_MESSAGE = """Suppose you are QuestionAnswerGPT, a large language model trained in detecting the most important
parts of a document and you are able to come up with a corresponding question and answer pair for these highly important 
sections in the document, that would help new readers understand the document.

Your task is to generate questions and corresponding answers from the given text.
"""

# Human prompt: {text} is the template placeholder for the text to generate Q/A pairs from.
# NOTE(review): "in of the following text" looks like a typo, and "importance scores" are
# not mentioned in SYSTEM_MESSAGE - confirm the intended wording before using this prompt.
HUMAN_MESSAGE = """Please generate questions, answers and importance scores in of the following text: {text}"""

# Chat prompt combining the system and human templates.
# Currently not passed to QAGenerationChain (the prompt argument is commented out in generate_eval_set).
CHAT_PROMPT = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(SYSTEM_MESSAGE),
        HumanMessagePromptTemplate.from_template(HUMAN_MESSAGE),
    ])

# generate Q/A pairs for the document chunks
def generate_eval_set(llm: BaseLanguageModel, chunks: List[Document],
                      min_tokens: int = 1500, max_attempts: int = 1) -> List[Dict[str,str]]:
    """Generate question/answer pairs for each sufficiently long chunk.

    Args:
        llm: language model used both for token counting and by QAGenerationChain.
        chunks: document chunks to generate Q/A pairs from.
        min_tokens: chunks with fewer tokens than this are skipped (a message is printed).
        max_attempts: how many times QA generation is tried per chunk when the
            chain raises JSONDecodeError. The default of 1 means a single
            attempt with no retry; values below 1 are treated as 1.

    Returns:
        A flat list with the Q/A pairs of all chunks, in chunk order. Chunks
        that still fail after ``max_attempts`` tries contribute nothing.
    """
    from langchain.chains import QAGenerationChain
    from json import JSONDecodeError
    import itertools

    # set up QAGenerationChain chain using the given llm
    qa_generator_chain = QAGenerationChain.from_llm(llm)#, prompt=CHAT_PROMPT)
    eval_set = []
    attempts = max(1, max_attempts)

    for i, chunk in enumerate(chunks):

        if llm.get_num_tokens(chunk.page_content) < min_tokens:
            print(f"Not considering {i}/{len(chunks)}")
            continue

        # catch QA generation errors and re-try up to `attempts` times
        for _ in range(attempts):
            try:
                eval_set.append(qa_generator_chain.run(chunk.page_content))
                break
            except JSONDecodeError:
                print(f"Error occurred inside QAChain in chunk {i}/{len(chunks)}")

    # each chain run yields a list of pairs; flatten into one list
    return list(itertools.chain.from_iterable(eval_set))

# generate the Q/A pairs from the chunks produced by split_data above
result = generate_eval_set(llm, chunks)

Not considering 16/17


In [53]:
# print only the Q/A pairs whose answer is longer than 150 characters
for qa in result:
    if len(qa['answer']) <= 150:
        continue
    print(qa["question"], f"-> {qa['answer']}", "-"*50, sep="\n")

What were the financial key performance indicators of the msg life Group in the reporting period?
-> The msg life Group recorded gross Group revenue from its own business under German GAAP of 176.1 million euros and Group earnings before interest, taxes, depreciation and amortisation (EBITDA) under German GAAP of 17.9 million euros.
--------------------------------------------------
What is the purpose of msg life's 'life talent' programme?
-> The purpose of msg life's 'life talent' programme is to identify employees with potential for strategic and leading roles in the company and provide them with individual support, professional development, and networking opportunities.
--------------------------------------------------
What rights do shareholders have at the annual general meeting?
-> The administrative rights include the right to attend the annual general meeting and speak, ask questions, put forward motions and exercise voting rights.
--------------------------------------------

In [54]:
# number of Q/A pairs whose answer is longer than 150 characters
sum(len(qa["answer"]) > 150 for qa in result)

17

In [55]:
# ground-truth set: keep only the Q/A pairs whose answer is longer than 150 characters
gt_dataset = [qa_pair for qa_pair in result if len(qa_pair["answer"]) > 150]

# Set up vectorstore and retriever

In [56]:
# re-split the same loaded data with chunk_size=512 for the vector store
chunks_vs = split_data(data, llm=llm, chunk_size=512)

In [57]:
from langchain.schema.vectorstore import VectorStoreRetriever 

def get_retriever(splits: List[Document], k: int) -> VectorStoreRetriever:
    """Embed the given chunks into a FAISS index and return a retriever over it.

    Args:
        splits: document chunks to embed and index.
        k: number of documents the retriever should return per query.

    Returns:
        A retriever created from the FAISS vector store.
    """
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    vectorstore = FAISS.from_documents(splits, embedding_model)
    # The number of results is a search argument and has to be passed through
    # search_kwargs; a bare `k=` keyword on as_retriever does not configure it.
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    return retriever

# build the retriever over the chunks_vs chunks, requesting k=3
retriever = get_retriever(chunks_vs, 3)

# LLM chain for query answering based on document chunks

In [58]:
from langchain.chains import RetrievalQA
from utils import QA_CHAIN_PROMPT

def get_qa_llm(llm: BaseLanguageModel, retriever: VectorStoreRetriever) -> RetrievalQA:
    """Build a "stuff" RetrievalQA chain that answers questions via ``retriever``.

    The chain uses QA_CHAIN_PROMPT as its prompt and expects the query under
    the input key ``"question"``.

    Args:
        llm: language model that produces the answer.
        retriever: retriever passed to the chain.

    Returns:
        The configured RetrievalQA chain.
    """
    return RetrievalQA.from_chain_type(
        llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
        input_key="question",
    )

# question-answering chain built from the chat model and the FAISS retriever
qa_llm = get_qa_llm(llm, retriever)

In [60]:
# quick manual check of the chain with a single question
qa_llm("What is TRAIL.X?")

{'question': 'What is TRAIL.X?',
 'result': 'TRAIL.X is a project that involves the development of deep neural networks (DNNs) for the actuarial computation module in cooperation with the Ludwig Maximilian University of Munich. These DNNs will enable life insurers to replace old system generations, map their core functions with artificial intelligence, and integrate them into a modern system.'}

# QA_LLM grading functions

In [88]:
from utils import (GRADE_ANSWER_PROMPT_FAST, 
                    GRADE_ANSWER_PROMPT_BIAS_CHECK, 
                    GRADE_ANSWER_PROMPT_OPENAI,
                    GRADE_ANSWER_PROMPT)

def grade_model_answer(gt_datase: List[Dict[str,str]], 
                       predictions: List[str],
                       grade_answer_prompt: str, 
                       qa_judge_model: Optional[str] = "gpt-3.5-turbo"):
    """Grade predicted answers against ground-truth Q/A pairs with QAEvalChain.

    Args:
        gt_datase: ground-truth examples; the question is read from the
            ``"question"`` key.
        predictions: model outputs; the answer is read from the ``"result"``
            key (callers in this notebook pass the dicts returned by the QA chain).
        grade_answer_prompt: name of the grading prompt. ``"Fast"``,
            ``"Descriptive w/ bias check"`` and ``"OpenAI grading prompt"``
            select the matching prompt; any other value selects
            GRADE_ANSWER_PROMPT.
        qa_judge_model: model name for the ChatOpenAI grader (temperature 0).

    Returns:
        The graded outputs returned by ``QAEvalChain.evaluate``.
    """
    from langchain.evaluation.qa import QAEvalChain

    # Map the prompt name to the grading prompt; unknown names use the default.
    named_prompts = {
        "Fast": GRADE_ANSWER_PROMPT_FAST,
        "Descriptive w/ bias check": GRADE_ANSWER_PROMPT_BIAS_CHECK,
        "OpenAI grading prompt": GRADE_ANSWER_PROMPT_OPENAI,
    }
    selected_prompt = named_prompts.get(grade_answer_prompt, GRADE_ANSWER_PROMPT)

    # Note: GPT-4 grader is advised by OAI model_name="gpt-4"
    judge = ChatOpenAI(model_name=qa_judge_model, temperature=0)
    grader = QAEvalChain.from_llm(llm=judge, prompt=selected_prompt, verbose=True)

    return grader.evaluate(gt_datase,
                           predictions,
                           question_key="question",
                           prediction_key="result")

In [77]:
# inspect the first ground-truth Q/A pair
gt_dataset[0]

{'question': 'What were the financial key performance indicators of the msg life Group in the reporting period?',
 'answer': 'The msg life Group recorded gross Group revenue from its own business under German GAAP of 176.1 million euros and Group earnings before interest, taxes, depreciation and amortisation (EBITDA) under German GAAP of 17.9 million euros.'}

In [79]:
# answer the first ground-truth question with the retrieval QA chain and display the result
qa_llm_answer = qa_llm(gt_dataset[0]["question"])
qa_llm_answer

{'question': 'What were the financial key performance indicators of the msg life Group in the reporting period?',
 'result': 'In the reporting period, the msg life Group recorded gross Group revenue of 176.1 million euros and Group earnings before interest, taxes, depreciation and amortisation (EBITDA) of 17.9 million euros.'}

In [89]:
# grade the chain's answer to the first ground-truth question using the "Fast" grading prompt
grade_model_answer([gt_dataset[0]],
                   [qa_llm_answer],
                   grade_answer_prompt="Fast")



[1m> Entering new QAEvalChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher grading a quiz. 
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: Correct or Incorrect here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 

QUESTION: What were the financial key performance indicators of the msg life Group in the reporting period?
STUDENT ANSWER: In the reporting period, the msg life Group recorded 

[{'results': 'Correct'}]

In [82]:
# test for incorrect answer: take the answer to an unrelated question and relabel it
# with the first ground-truth question, so the grader sees a mismatching answer
qa_predicted = qa_llm("What is TRAIL.X?")
qa_predicted["question"] = gt_dataset[0]["question"]

In [90]:
# grade the deliberately mismatching prediction against the first ground-truth pair
grade_model_answer([gt_dataset[0]],
                   [qa_predicted],
                   grade_answer_prompt="Fast")



[1m> Entering new QAEvalChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a teacher grading a quiz. 
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: Correct or Incorrect here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin! 

QUESTION: What were the financial key performance indicators of the msg life Group in the reporting period?
STUDENT ANSWER: TRAIL.X is a project developed by msg life in coopera

[{'results': 'Incorrect'}]