In [13]:
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.evaluation.qa import QAEvalChain
from typing import List
import pandas as pd
import os
from getpass import getpass

# Take the OpenAI key from the environment; prompt for it only when it is not
# already set. A key must never be hardcoded in the notebook, and an existing
# environment value must not be overwritten by a placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Load the PDF policy booklet and split it into documents.
loader = PyPDFLoader("policy-booklet-0923.pdf")
document = loader.load_and_split()

# Create a vector index from the extracted documents using OpenAI embeddings.
embeddings = OpenAIEmbeddings()
index = VectorstoreIndexCreator(embedding=embeddings).from_documents(document)

# Initialize the language model and the retrieval QA chain that answers
# questions against the indexed documents.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=index.vectorstore.as_retriever(),
)




In [2]:

# Read the evaluation spreadsheet into a DataFrame.
df = pd.read_excel("./EvaluationDataset.xlsx")

In [3]:
# Build the evaluation examples: one {"query", "answer"} dict per spreadsheet
# row, taken from the "Query" and "Response" columns. Iterating the columns
# positionally avoids label lookups, so this also works when the DataFrame
# index is not the default 0..n-1 range.
dataset = [
    {"query": question, "answer": answer}
    for question, answer in zip(df["Query"], df["Response"])
]

In [4]:


# Grading prompt text. The placeholders {query}, {result} and {answer} are
# filled with the question, the model's answer and the reference answer.
# The exact text (including whitespace) is sent to the grader model, so it is
# kept verbatim.
template = """You are a teacher evaluating answers. 
You are given a question, my answer, and the true answer, and are asked to score  my answer as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
MY ANSWER: my answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Grade my answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between my answer and true answer. It is OK if my answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 

QUESTION: {query}
MY ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:

"""

# Prompt template object passed to the evaluation chain in grade_model_answer.
GRADE_ANSWER_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template)

def grade_model_answer(predicted_dataset: List, predictions: List) -> List:
    """
    Grades model predictions against ground-truth answers with an LLM grader.

    The grader is a QAEvalChain built from a gpt-3.5-turbo chat model
    (temperature 0) and the module-level GRADE_ANSWER_PROMPT.

    @param predicted_dataset: A list of dictionaries containing the ground truth;
        each is read through its "query" key (and the reference answer the
        prompt refers to as {answer}).
    @param predictions: A list of dictionaries containing model predictions;
        each must have a "result" key. Empty results are graded as
        'No answer provided'. The caller's dictionaries are not modified.
    @return: The list of graded outputs returned by QAEvalChain.evaluate.
    """
    # Create an evaluation chain using the grading prompt.
    eval_chain = QAEvalChain.from_llm(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
        prompt=GRADE_ANSWER_PROMPT
    )

    # Replace empty answers with a placeholder on a shallow copy, so the
    # caller's prediction dictionaries are left untouched.
    gradable_predictions = [
        pred if pred['result'] else {**pred, 'result': 'No answer provided'}
        for pred in predictions
    ]

    # Evaluate the predictions against the ground truth.
    graded_outputs = eval_chain.evaluate(
        predicted_dataset,
        gradable_predictions,
        question_key="query",
        prediction_key="result"
    )

    return graded_outputs

In [5]:
# Run every evaluation question through the retrieval QA chain and collect
# the chain outputs in the same order as the dataset.
prediction_dataset = []
for example in dataset:
    chain_output = qa.invoke({"query": example['query']})
    prediction_dataset.append(chain_output)

In [11]:
# Grade the model answers for the first EVAL_SAMPLE_SIZE examples. Both lists
# are truncated to the same length so each example is paired with its own
# prediction explicitly.
EVAL_SAMPLE_SIZE = 21
graded_outputs = grade_model_answer(
    dataset[:EVAL_SAMPLE_SIZE],
    prediction_dataset[:EVAL_SAMPLE_SIZE],
)


In [12]:
# Count the predictions graded CORRECT and report accuracy as a percentage.
correct_count = 0
for graded in graded_outputs:
    # The verdict is taken from the text after the first colon (as in
    # "GRADE: CORRECT"). When the grader returns no colon at all, fall back to
    # the whole text instead of raising IndexError.
    parts = graded['results'].split(':')
    verdict = (parts[1] if len(parts) > 1 else parts[0]).strip()
    if verdict == 'CORRECT':
        correct_count += 1

if graded_outputs:
    accuracy = round(correct_count / len(graded_outputs) * 100, 2)
    print(f"Accuracy: {accuracy} %")
else:
    # Nothing was graded; avoid dividing by zero.
    print("Accuracy: n/a (no graded outputs)")

Accuracy: 76.19 %
