In [371]:
GOOGLE_API_KEY = "</GOOGLE Console API>"  #insert API here

In [372]:
import os
import getpass

os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

### Initialize Model

In [373]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-pro")

### Load

In [374]:
from langchain_community.document_loaders import PyPDFLoader 

loader = PyPDFLoader('policy-booklet-0923.pdf')
docs = loader.load_and_split()

In [578]:
import pandas as pd

df = pd.DataFrame([d.page_content for d in docs], columns=["text"])
df.head(10)

Unnamed: 0,text
0,Your car insurance \npolicy booklet
1,Page 2FAQs 3\nGlossary 4\nMaking a claim 6\...
2,Page 3FAQs\nHow much will you pay if my car is...
3,Page 4About the glossary\nWhen we use these wo...
4,Glossary\nLoss of any limb A limb severed at o...
5,Page 6Making a claim\nIf you need to claim\nTh...
6,Page 7Making a claim\nHow repairs and replacem...
7,Page 8We’ll provide cover up to the amounts sh...
8,"Page 9What your cover includes\nThird Party, \..."
9,"Third Party, \nFire and Theft Essentials Compr..."


In [376]:
for i in range(5): 
    print(docs[i].metadata)

{'source': 'policy-booklet-0923.pdf', 'page': 0}
{'source': 'policy-booklet-0923.pdf', 'page': 1}
{'source': 'policy-booklet-0923.pdf', 'page': 2}
{'source': 'policy-booklet-0923.pdf', 'page': 3}
{'source': 'policy-booklet-0923.pdf', 'page': 4}


In [377]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Split

In [378]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=20,add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

In [379]:
print (f'{len(docs)} vs {len(all_splits)}')

44 vs 131


### Storing

In [380]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits,
                                    embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"))

### Retrieval

In [381]:
#retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})

In [382]:
answer = retriever.invoke("What is DriveSure??")
answer

[Document(page_content='DriveSure is our telematics insurance product. \nIt’s designed to capture how, when and where your car is driven, based on driver-monitoring technology. We use it to understand your driving style and give you feedback. This means \nwe can base your premium on your\ndriving record.', metadata={'page': 2, 'source': 'policy-booklet-0923.pdf', 'start_index': 1768}),
 Document(page_content='we don’t cover’ on page 33.\nDoes Churchill have approved repairers? \nChurchill customers have access to a national \nnetwork of approved repairers. If \nyour car  is \nrepaired by one of these, they’ll deal with all aspects of your repair. What is DriveSure?', metadata={'page': 2, 'source': 'policy-booklet-0923.pdf', 'start_index': 1512}),
 Document(page_content='>Your certificate (or certificates) of motor \ninsurance.\n>The Green Flag breakdown cover policy \nbooklet – where this applies. \n>The DriveSure terms and conditions – where they apply. \nPolicyholder The person named

### RAG: Generate

In [383]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")

In [384]:
template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question at the end. If you don't know the answer, just say that you don't know. Keep the answer as concise as possible. Use minimum 4 sentance. No maximum limit. Always say "Thanks for using Athina AI!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
prompt2 = PromptTemplate.from_template(template)
#prompt2

PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know. Keep the answer as concise as possible. Use minimum 4 sentance. No maximum limit. Always say "Thanks for using Athina AI!" at the end of the answer.\n{context}\nQuestion: {question}\nHelpful Answer:')

In [385]:
template3 = """You are an assistant for collecting data from the pdf. Make questions from the data and answer them, the main purpose is to create a dataset from the pdf. Create a dataframe as output and store them with a name. Always say "Thanks for using Athina AI!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
prompt3 = PromptTemplate.from_template(template3)
#prompt3

PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for collecting data from the pdf. Make questions from the data and answer them, the main purpose is to create a dataset from the pdf. Create a dataframe as output and store them with a name. Always say "Thanks for using Athina AI!" at the end of the answer.\n{context}\nQuestion: {question}\nHelpful Answer:')

In [386]:
#prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [387]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt2
    | llm
    | StrOutputParser()
)

In [581]:
ques = "What is DriveSure?"
for chunk in rag_chain.stream(ques):
    print(chunk, end="", flush=True)

DriveSure is a telematics insurance product offered by Churchill. It is designed to capture how, when, and where your car is driven, based on driver-monitoring technology. This information is used to understand your driving style and give you feedback. As a result, Churchill can base your premium on your driving record. Thanks for using Athina AI!

## Creating Dataset

In [404]:
template4 = """You are an assistant for collecting data from the pdf. Make questions from the data and answer them, the main purpose is to create a dataset from the pdf. Do not include questions which does not have answer in the pdf file. Make sure your query-response pairs are diverse and are not concentrated on a specific query type, document section or page. Make questions such that 1 question per page. Create a dataframe as output and store them with a name. Always say "Thanks for using Athina AI!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
prompt4 = PromptTemplate.from_template(template4)
#prompt4

PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for collecting data from the pdf. Make questions from the data and answer them, the main purpose is to create a dataset from the pdf. Do not include questions which does not have answer in the pdf file. Make sure your query-response pairs are diverse and are not concentrated on a specific query type, document section or page. Make questions such that 1 question per page. Create a dataframe as output and store them with a name. Always say "Thanks for using Athina AI!" at the end of the answer.\n{context}\nQuestion: {question}\nHelpful Answer:')

In [405]:
rag_data = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt4
    | llm
    | StrOutputParser()
)

In [516]:
question = "Make a unique questions on 'Making a claim' from the PDF and name it as Questions to create a column, and answer them and store it in new column called Answers, now generate the dataframe named output_data. Make sure your query-response pairs are diverse and are not concentrated on a specific query type, document section or page. Make the questions starting with what,where, how, and who. I want to store it and use it later. write a code to make the dataframe."
for chunk in rag_data.stream(question):
    print(chunk, end="", flush=True)

```python
import pandas as pd

# Create a dataframe to store the questions and answers
output_data = pd.DataFrame(columns=['Questions', 'Answers'])

# Add the questions and answers to the dataframe
output_data = output_data.append({'Questions': 'What should you do if someone offers to settle your claim?', 'Answers': 'You must tell your insurance company or appointed representative'}, ignore_index=True)
output_data = output_data.append({'Questions': 'Where can you find information about your car insurance policy?', 'Answers': 'In your car insurance policy booklet'}, ignore_index=True)
output_data = output_data.append({'Questions': 'How do you end your claim?', 'Answers': 'By settling it'}, ignore_index=True)
output_data = output_data.append({'Questions': 'Who can refuse to cover further costs if you refuse to accept an offer or payment?', 'Answers': 'Your insurance company or appointed representative'}, ignore_index=True)

# Print the dataframe
print(output_data)

# Thanks for using Ath

In [517]:
import pandas as pd


output_data = pd.DataFrame(columns=['Questions', 'Answers'])

output_data["Questions"] = ["What is the policy about automated cars?",
                      "Where does the policy apply?",
                      "What is the definition of Automated car in the policy?", 
                      "Where can an automated car be driven?",
                      "What is the phone number to call for complaints in the UK?",
                      "What is the phone number to call for complaints outside the UK?",
                      "What is the main purpose of the document?",
                      "What is the company's understanding of customer complaints?",
                      "What is the company's approach to resolving complaints?",
                      "Who is responsible for determining the amount of personal benefits you receive?",
                      "What is the company's commitment to customer satisfaction?",
                      "What is the company's policy on compensation for complaints?",
                      "What is the company's policy on investigating complaints?",
                      "What is the policy coverage for an automated car while it is driving itself on public roads in Great Britain?",
                      "What is the policy coverage for an automated car while it is being driven by a human on public roads in Great Britain?",
                      "What are the exclusions to the policy coverage for an automated car?",
                      "What is the procedure for filing a claim under the policy for an automated car?",
                      "What is the excess for windscreen damage?",
                      "What should I do if I have windscreen damage?",
                      "What are the circumstances for New car replacement cover to apply?",
                      "What is the maximum period of cover for New car replacement?",
                      "What does the Secretary of State’s list identify the vehicles by?",
                      "How many claims are allowed within a 3-year period?",
                      "How much will I be reimbursed for travel costs if I cannot get a hire car?",
                      "Where can I find information about Guaranteed Hire Car Plus?",
                      "Who is eligible for Protected No Claim Discount?",
                      "What should you do if you have to go to court?",
                      "How does 'Third Party, Fire and Theft' insurance differ from comprehensive insurance?",
                      "How can I make a claim under Motor Legal Cover?",
                      "What are the maximum amounts paid for my property?",
                      "What should you do if you're involved in an accident?",
                      "Who must identify vehicles for self-driving purposes?",
                      "How should you respond to any communication you receive regarding your car abroad?",
                      "What are the grounds for canceling the policy?",
                      "What is the coverage limit for bodily injury liability per person?",
                      "Where do I need to send the cancellation notice?",
                      "Who is responsible for paying the cancellation fee?",
                      "What are the exclusions to fire and theft coverage?",
                      "What is the excess on your policy?",
                      "What is the email address for general queries?"
                     ]

output_data["Answers"] = ["Automated car Your car where it is lawfully driving itself on roads or other public places in Great Britain. Please note that your car", 
                    "Great Britain",
                    "Your car where it is lawfully driving itself on roads or other public places in Great Britain.", 
                    "Roads or other public places in Great Britain",
                    "0800 051 0198",
                    "+44 (0) 141 349 0193",
                    "To provide information about the company's complaint handling process",
                    "Things don't always go to plan",
                    "To resolve complaints fairly and quickly",
                    "The insurance company is responsible for determining the amount of personal benefits you receive.",
                    "To ensure customer satisfaction",
                    "The company will consider compensation if the complaint is upheld",
                    "The company will investigate all complaints thoroughly",
                    "The policy coverage for an automated car while it is driving itself on public roads in Great Britain is the same as the coverage for a human-driven car.",
                    "The policy coverage for an automated car while it is being driven by a human on public roads in Great Britain is the same as the coverage for a human-driven car.",
                    "The exclusions to the policy coverage for an automated car are the same as the exclusions to the coverage for a human-driven car.",
                    "The procedure for filing a claim under the policy for an automated car is the same as the procedure for filing a claim for a human-driven car.",
                    "£75",
                    "Contact your insurer",
                    "If your car is a total loss or stolen and it is less than 12 months old or has less than 12,000 miles on the clock.",
                    "12 months",
                    "The Secretary of State’s list identifies the vehicles by type, information recorded in a registration document or in some other way.",
                    "2 claims",
                    "You will be reimbursed for travel costs up to £50 per day, up to a total of £500 per claim.",
                    "You can find information about Guaranteed Hire Car Plus in your car insurance policy booklet.",
                    "You are eligible for Protected No Claim Discount if you have been claim-free for at least 2 years.",
                    "You must tell us if anyone offers to settle your claim or makes a payment into court. If you refuse to accept an offer or payment that we or your appointed representative think you should accept, we can refuse to cover any further costs.",
                    "Third Party, Fire and Theft' insurance provides less coverage than comprehensive insurance, which covers all types of damage to your vehicle, regardless of fault.",
                    "To make a claim under Motor Legal Cover, you should contact your insurance provider as soon as possible after the accident.",
                    "Up to £15,000 for your building and £7,500 for your contents.",
                    "If you're involved in an accident, you should contact the police and your insurance company as soon as possible.",
                    "The Secretary of State must identify vehicles for self-driving purposes.",
                    "You should contact your insurer immediately to inform them of the situation.",
                    "The policy can be canceled if the vehicle is sold, stolen, or destroyed.",
                    "$25,000",
                    "The cancellation notice must be sent to the insurance company in writing.",
                    "The policyholder is responsible for paying the cancellation fee.",
                    "Fire and theft coverage typically excludes damage caused by war, nuclear war, or civil unrest. It may also exclude damage caused by earthquakes or floods.",
                    "The excess on your policy is £150 for accidental damage and fire, and £250 for theft and third party claims.",
                    "Support@churchill.com"
                   ]


output_data


Unnamed: 0,Questions,Answers
0,What is the policy about automated cars?,Automated car Your car where it is lawfully dr...
1,Where does the policy apply?,Great Britain
2,What is the definition of Automated car in the...,Your car where it is lawfully driving itself o...
3,Where can an automated car be driven?,Roads or other public places in Great Britain
4,What is the phone number to call for complaint...,0800 051 0198
5,What is the phone number to call for complaint...,+44 (0) 141 349 0193
6,What is the main purpose of the document?,To provide information about the company's com...
7,What is the company's understanding of custome...,Things don't always go to plan
8,What is the company's approach to resolving co...,To resolve complaints fairly and quickly
9,Who is responsible for determining the amount ...,The insurance company is responsible for deter...


In [514]:
output_data.to_csv('output.csv')

## Evaluation

In [568]:
def answer_fn(question, history=None):
    for chunk in rag_data.stream(question):
        a= chunk
    return a

In [579]:
class KnowledgeBase:
    def __init__(self, data):
        self.data = data

    def get_entry(self, entry_id):
        return self.data.loc[entry_id]


knowledge_base = KnowledgeBase(df)

In [531]:
from giskard.rag import evaluate

report = evaluate(answer_fn, testset=output_data, knowledge_base=knowledge_base)

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [576]:
def evaluate_custom(answer_fn, testset, knowledge_base):   
    results = []
    for question in testset:
        answer = answer_fn(question, knowledge_base)
        results.append(answer)
    
    return results


report = evaluate_custom(answer_fn, testset=output_data, knowledge_base=knowledge_base)


In [575]:
display(report)

[' call +44 (0) 141 349 0193. Thanks for using Athina AI!',
 ' We’ll do our best to send this at the earliest opportunity.\nThanks for using Athina AI!']

In [582]:
def answer_relevancy(question, answer):
    return compute_cosine_similarity(question, answer)

In [590]:
for index, row in output_data.iterrows():
    relevancy_score = answer_relevancy(question, answer)

NameError: name 'compute_cosine_similarity' is not defined

In [591]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from transformers import pipeline

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates