# Installing the required libraries

In [1]:
!pip install -r requirements.txt



## Imports

In [11]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import os

## Enter API key

In [3]:
OpenAI_api_key = "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"

In [4]:
input_file = "policy-booklet-0923.pdf"

# Function to load the input pdf and create a vector database

In [5]:
def load_data(file):
    """
    Loads the pdf file to be used for the retrieval augmented generation 
    and creates a vector database using it.
    """
    os.environ["OPENAI_API_KEY"] = OpenAI_api_key
    loader = PyPDFLoader(file)
    data = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(data)
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(docs, embeddings)
    return db.as_retriever()


# Retrieval augmented generation function

In [6]:
#RAG 
def rag_answer(context,query):
    template = """Pretend you are the respresentative of a car insurance policy. Answer the question based only on the following context:
    {context}
    Question: {question}
    After getting the question, try to find out the meaning behind it and compare its meaning to the meanings of the data stored in the context. If a question is asking about something that is available in the context, return the answer. The answer should be the same as the one in the original data.
    """
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="gpt-4", temperature=0)
    chain = (
        {"context": context, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )
    answer = chain.invoke(query)

    if "I'm sorry" in answer or "provided context" in answer or "not clear" in answer:
        print("Answer not found")
        print(answer)
        answer = " NULL "
    return answer

# Evaluation metric: Cosine Simlarity

In [8]:
def Cosine_Similarity(sentence1, sentence2):
    """
    Compares the cosine similarity between two sentences to find out if they are similar in meaning.
    """
    model_name = 'distilbert-base-nli-stsb-mean-tokens'
    model = SentenceTransformer(model_name)

    embedding1 = model.encode([sentence1])[0]
    embedding2 = model.encode([sentence2])[0]

    similarity = cosine_similarity([embedding1], [embedding2])[0][0]

    similarity_threshold = 0.9

    if similarity >= similarity_threshold:
        return True  
    else:
        return False  

# Evaluation metric: GPT based similarity

In [9]:
def gpt_based_evaluation(sentence1,sentence2):
    """
    Prompting GPT to find out the meanings of two sentences to find if they are similar.
    """
    os.environ["OPENAI_API_KEY"] = OpenAI_api_key
    template = """
There are two sentences s1 {sentence1} and s2 {sentence2}. Compare the meanings of these two. Return "True" if they have similar meanings. 
Otherwise return "False".
 """
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="gpt-4", temperature=0)
    chain = (
        prompt
        | model
        | StrOutputParser()
    )
    answer = chain.invoke({"sentence1":sentence1,"sentence2":sentence2})
    return answer

# Evaluating performance of the RAG model on GPT-generated data

## We take the gpt generated test set and evaluate the performance of the RAG model compared to its repsonses for the same queries.

In [16]:
gpt_generated_testing_data = pd.read_csv("dataset.csv")
query_list_gpt_data = gpt_generated_testing_data['Query'].tolist()
response_list_gpt_data = gpt_generated_testing_data['Response'].tolist()
rag_response_list_on_gpt_data = []

In [17]:

retriever = load_data(input_file)
for query in query_list_gpt_data:
    rag_response_list_on_gpt_data.append(rag_answer(retriever,query))

In [18]:
def accuracy(sentence1,sentence2):
    print('Ground truth')
    print(sentence1)
    print('RAG pipeline answer')
    print(sentence2)
    cos_sim = Cosine_Similarity(sentence1, sentence2)
    print("Cosine Similarity answer")
    print(cos_sim) 
    gpt_eval = gpt_based_evaluation(sentence1,sentence2)
    print("Cosine Similarity answer")
    print(gpt_eval)
    print("\n\n\n\n")
    return cos_sim,gpt_eval

In [22]:
#Evaluation on gpt-generated data.
test_size = len(rag_response_list_on_gpt_data)
cs_res = []
gpt_res = []
for i in range(test_size):
    sentence1 = response_list_gpt_data[i]
    sentence2 = rag_response_list_on_gpt_data[i]
    a,b = accuracy(sentence1,sentence2)
    cs_res.append(a)
    gpt_res.append(b)

Ground truth
 Your car insurance policy typically covers damages to your vehicle caused by accidents, theft, vandalism, and natural disasters. It may also provide liability coverage for injuries or property damage you cause to others.
RAG pipeline answer
Under your car insurance policy, if your car is damaged, the cost of repairing or replacing your car up to its UK market value is covered. This is the current value of your car at the time of the claim. The policy also covers injury to third parties, or damage caused to their property, when someone covered under the policy is driving other cars. However, it does not cover the car being driven. If your car is left unlocked or with the keys in it, the policy will not pay a claim for theft or attempted theft. The policy does not cover mechanical or electrical failure, wear and tear, damage to tyres caused by braking, punctures, cuts or bursts, or breakdowns. It also does not provide cover if your car is being used by someone who's not ins

In [23]:
cs_res

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [24]:
gpt_res

['True',
 'True',
 'True',
 'False',
 'False',
 'False',
 'False',
 'False',
 'True',
 'True',
 'False',
 'False',
 'True',
 'False',
 'True',
 'False',
 'True',
 'False',
 'False',
 'False',
 'True',
 'False',
 'True',
 'True',
 'True',
 'True',
 'True',
 'False',
 'True',
 'True',
 '"False"',
 '"False"',
 'False',
 'True',
 'True',
 'False',
 '"False"',
 'False',
 'True',
 'False',
 'False',
 'True',
 'True',
 'False',
 'True',
 'False',
 '"False"',
 'True',
 'True',
 'True']

## Cosine similarity accuracy on gpt generated data

In [36]:
#Cosine similarity accuracy on gpt generated data
Cs_gpt_data = []
for x in cs_res:
    if x:
        Cs_gpt_data.append(1)
    else:
        Cs_gpt_data.append(0)

In [37]:
accuracy_cs_gpt = sum(Cs_gpt_data)/len(Cs_gpt_data)

In [56]:
accuracy_cs_gpt
print("Accuracy on Cosine Similarity-based evaluation of GPT generated data")
print(accuracy_cs_gpt)

Accuracy on Cosine Similarity-based evaluation of GPT generated data
0.0


## GPT based evaluation accuracy on gpt generated data

In [39]:
#GPT based evaluation accuracy on gpt generated data
gpt_eval_gpt_data = []
for x in gpt_res:
    if x:
        gpt_eval_gpt_data.append(1)
    else:
        gpt_eval_gpt_data.append(0)

In [40]:
gpt_eval_gpt_data

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [41]:
accuracy_gpt_eval_gpt_data = sum(gpt_eval_gpt_data)/len(gpt_eval_gpt_data)

In [57]:
accuracy_gpt_eval_gpt_data
print("Accuracy on GPT-based evaluation of GPT generated data")
print(accuracy_gpt_eval_gpt_data)

Accuracy on GPT-based evaluation of GPT generated data
1.0


# Evaluating performance of the RAG model on manually generated data

## We take the manually generated test set and evaluate the performance of the RAG model compared to its repsonses for the same queries.

In [43]:
testing_data = pd.read_csv("manual_data.csv")
query_list = testing_data['Query'].tolist()
response_list = testing_data['Response'].tolist()
rag_response_list = []

In [44]:
query_list

['How much will you pay if my car is damaged?',
 'Who is covered to drive other cars?',
 'Am I covered if I leave my car unlocked or the keys in the car?',
 'What’s not included in my cover?',
 'Does Churchill have approved repairers?',
 'What is DriveSure?',
 'What’s the difference between commuting and business use?',
 'Can I use my car abroad?',
 'Are my electric car’s charging cables covered?',
 'Is my electric car battery covered?']

In [45]:
response_list

['Where damage to your car is covered under your policy, we’ll pay the cost of repairing or replacing your car up to its UK market value. This is the current value of your car at the time of the claim. It may be different to the amount you paid or any amount you provided when you insured your car with us.',
 'Your certificate of motor insurance will show who has cover to drive other cars. We’ll only cover injury to third parties, or damage caused to their property, not to the car being driven.',
 'We won’t pay a claim for theft or attempted theft if your car is left: Unlocked; With keys or key fobs in, on, or attached to the car; With the engine running or With a window or roof open.',
 '\n            We don’t cover things like:\n> Mechanical or electrical failure.\n> Wear and tear.\n> Damage to tyres caused by braking,\npunctures, cuts or bursts.\n> Breakdowns (if your policy includes\nGreen Flag breakdown, this may cover getting your car going again).\nWe won’t provide cover if your 

In [46]:
retriever = load_data(input_file)
for query in query_list:
    rag_response_list.append(rag_answer(retriever,query))

## Evaluation on manually-generated data.

In [47]:
#Evaluation on manually-generated data.
test_size = len(rag_response_list)
cs_res_manual_data = []
gpt_res_manual_data = []
for i in range(test_size):
    sentence1 = response_list[i]
    sentence2 = rag_response_list[i]
    a,b = accuracy(sentence1,sentence2)
    cs_res_manual_data.append(a)
    gpt_res_manual_data.append(b)

Ground truth
Where damage to your car is covered under your policy, we’ll pay the cost of repairing or replacing your car up to its UK market value. This is the current value of your car at the time of the claim. It may be different to the amount you paid or any amount you provided when you insured your car with us.
RAG pipeline answer
If your car is accidentally damaged, the insurance policy will cover up to the market value of your car.
Cosine Similarity answer
False
Cosine Similarity answer
True





Ground truth
Your certificate of motor insurance will show who has cover to drive other cars. We’ll only cover injury to third parties, or damage caused to their property, not to the car being driven.
RAG pipeline answer
The main driver may be covered to drive other cars under certain conditions. These conditions include that they aren't covered by another insurance policy, they don't own the other car and it isn't hired to them under a hire-purchase or leasing agreement, the car is not

## Cosine similarity accuracy on manually generated data

In [54]:
#Cosine similarity accuracy on manually generated data
Cs_manual_data = []
for x in cs_res_manual_data:
    if x:
        Cs_manual_data.append(1)
    else:
        Cs_manual_data.append(0)

accuracy_cs_manual_data = sum(Cs_manual_data)/len(Cs_manual_data)
print("Accuracy on Cosine Similarity -based evaluation of manually generated data")
print(accuracy_cs_manual_data)

Accuracy on Cosine Similarity -based evaluation of manually generated data
0.4


## GPT based evaluation accuracy on manually generated data

In [49]:
#GPT based evaluation accuracy on manually generated data
gpt_eval_manual_data = []
for x in gpt_res_manual_data:
    if x:
        gpt_eval_manual_data.append(1)
    else:
        gpt_eval_manual_data.append(0)

In [53]:
accuracy_gpt_eval_manual_data = sum(gpt_eval_manual_data)/len(gpt_eval_manual_data)
print("Accuracy on GPT-based evaluation of manually generated data")
print(accuracy_gpt_eval_manual_data)

Accuracy on GPT-based evaluation of manually generated data
1.0
