## Import the Libraries 

In [37]:
import pandas as pd
from tqdm import tqdm
import hashlib
from constants import EMBEDDINGS, index, CHAT_LLM

## Read the data and see the first 5 rows

In [2]:
data = pd.read_csv("bestdata.csv")

In [3]:
data.head()

Unnamed: 0,Query,Response
0,What is the maximum number of passengers my ca...,Your car isn’t made or adapted to carry more t...
1,Can I make a profit from passengers' payments ...,"No, you’re not allowed to make a profit from t..."
2,Is it okay to use my car for business purposes...,"No, you’re not allowed to carry passengers as ..."
3,Can I rent out my car under this policy?,"No, you’re not allowed to rent your car out or..."
4,What should I do if I change my car before the...,You must inform the insurer if anything about ...


## Below I will chunk the data in such a way that each chunk represents a query-response pair

In [88]:
# Create a list to store the chunks
chunks = []

In [89]:
# Iterate over the rows of the DataFrame
for _, row in tqdm(data.iterrows(), total=len(data)):
    query = row['Query']
    response = row['Response']
    
    # Treat each query-response pair as a separate chunk
    chunk = [f"question: {query}\n answer: {response}"]
    chunks.append(chunk)
    

100%|██████████| 41/41 [00:00<00:00, 3706.33it/s]


In [90]:
chunks

[['question: What is the maximum number of passengers my car can carry under the car sharing policy?\n answer: Your car isn’t made or adapted to carry more than 8 passengers and a driver.'],
 ["question: Can I make a profit from passengers' payments under the car sharing policy?\n answer: No, you’re not allowed to make a profit from the passengers’ payments or from allowing someone to drive."],
 ['question: Is it okay to use my car for business purposes while car sharing?\n answer: No, you’re not allowed to carry passengers as part of a business.'],
 ['question: Can I rent out my car under this policy?\n answer: No, you’re not allowed to rent your car out or use a peer-to-peer hire scheme to do so.'],
 ['question: What should I do if I change my car before the cover starts?\n answer: You must inform the insurer if anything about your car changes before the cover starts.'],
 ['question: Do I need to inform the insurer if I want to add another driver to the policy?\n answer: Yes, you mus

## Below I am embedding the chunked dataset using OpenAI's text-embedding-ada-00 model

In [91]:
from langchain.embeddings.openai import OpenAIEmbeddings

def generate_embeddings(documents: list[any]) -> list[list[float]]:
    """
    Generate embeddings for a list of documents.

    Args:
        documents (list[any]): A list of document objects, each containing a 'page_content' attribute.

    Returns:
        list[list[float]]: A list containig a list of embeddings corresponding to the documents.
    """
    embedded = [EMBEDDINGS.embed_documents(doc) for doc in documents]
    return embedded


In [92]:
# Run the function
chunked_document_embeddings = generate_embeddings(documents=chunks)

In [93]:
chunked_document_embeddings

[[[0.010261973217643675,
   0.00044234070170392657,
   0.03752571662602408,
   -0.04390800308228261,
   -0.01861720505238533,
   0.007891788033190416,
   -0.025992584969972556,
   -0.013903317771827347,
   -0.024589011089822015,
   -0.016657499355431354,
   -0.004680783885783589,
   0.028839453637506198,
   0.00021661865735354849,
   -0.017160666858182835,
   -0.012042921793535267,
   -0.0017859142713722617,
   0.010871070279821624,
   -0.022232068380865586,
   0.02096090807981261,
   -0.006494835856449572,
   0.018921753117813023,
   -0.008050682837991398,
   -0.01938519785068682,
   -0.0010626107533363658,
   -0.01275132857437511,
   0.0123938152635729,
   0.03469208950266471,
   0.008328749677715677,
   0.04650328986108564,
   -0.01798162397053628,
   0.03855853471987558,
   -0.0017759833460722003,
   -0.017968384289007135,
   0.0008639918980891786,
   -0.022827924830191828,
   -0.01894823620616156,
   0.004498716378677637,
   -0.010857829666969918,
   0.03834667373837753,
   -0.018

## Below I am creating a dictionary that will be a combination of the chunked text, the embeddings and a unique id for each chunk which I will upsert to my pinecone database

In [94]:

def generate_short_id(content: str) -> str:
    """
    Generate a short ID based on the content using SHA-256 hash.

    Args:
    - content (str): The content for which the ID is generated.

    Returns:
    - short_id (str): The generated short ID.
    """
    hash_obj = hashlib.sha256()
    hash_obj.update(content.encode("utf-8"))
    return hash_obj.hexdigest()


def combine_vector_and_text(
    documents: list[any], doc_embeddings: list[list[float]]
) -> list[dict[str, any]]:
    """
    Process a list of documents along with their embeddings.

    Args:
    - documents (List[Any]): A list of documents (strings or other types).
    - doc_embeddings (List[List[float]]): A list of embeddings corresponding to the documents.

    Returns:
    - data_with_metadata (List[Dict[str, Any]]): A list of dictionaries, each containing an ID, embedding values, and metadata.
    """
    data_with_metadata = []

    for doc_text, embedding in zip(documents, doc_embeddings):
        # Convert doc_text to string if it's not already a string
        if not isinstance(doc_text, str):
            doc_text = str(doc_text)

        # Generate a unique ID based on the text content
        doc_id = generate_short_id(doc_text)

        # Create a data item dictionary
        data_item = {
            "id": doc_id,
            "values": embedding[0],
            "metadata": {"text": doc_text},  # Include the text as metadata
        }

        # Append the data item to the list
        data_with_metadata.append(data_item)

    return data_with_metadata


In [95]:
# Call the function
data_with_meta_data = combine_vector_and_text(documents=chunks, doc_embeddings=chunked_document_embeddings)

In [96]:
data_with_meta_data

[{'id': 'd41c69acddc82b178f752fc59157debcccdd9c84d609c879795dadbfc23ff647',
  'values': [0.010261973217643675,
   0.00044234070170392657,
   0.03752571662602408,
   -0.04390800308228261,
   -0.01861720505238533,
   0.007891788033190416,
   -0.025992584969972556,
   -0.013903317771827347,
   -0.024589011089822015,
   -0.016657499355431354,
   -0.004680783885783589,
   0.028839453637506198,
   0.00021661865735354849,
   -0.017160666858182835,
   -0.012042921793535267,
   -0.0017859142713722617,
   0.010871070279821624,
   -0.022232068380865586,
   0.02096090807981261,
   -0.006494835856449572,
   0.018921753117813023,
   -0.008050682837991398,
   -0.01938519785068682,
   -0.0010626107533363658,
   -0.01275132857437511,
   0.0123938152635729,
   0.03469208950266471,
   0.008328749677715677,
   0.04650328986108564,
   -0.01798162397053628,
   0.03855853471987558,
   -0.0017759833460722003,
   -0.017968384289007135,
   0.0008639918980891786,
   -0.022827924830191828,
   -0.01894823620616156

## Now I am sending the data to my pinecone index

In [15]:
def upsert_data_to_pinecone(data_with_metadata: list[dict[str, any]]) -> None:
    """
    Upsert data with metadata into a Pinecone index.

    Args:
    - data_with_metadata (List[Dict[str, Any]]): A list of dictionaries, each containing data with metadata.

    Returns:
    - None
    """
    index.upsert(vectors=data_with_metadata)


In [16]:
# Call the function
upsert_data_to_pinecone(data_with_metadata= data_with_meta_data)

## Testing the pipeline by querying the index

## Embed the query

In [13]:
def get_query_embeddings(query: str) -> list[float]:
    """This function returns a list of the embeddings for a given query

    Args:
        query (str): The actual query/question

    Returns:
        list[float]: The embeddings for the given query
    """
    query_embeddings = EMBEDDINGS.embed_query(query)
    return query_embeddings


## Now I will test it out on a single question by embedding, the question

In [60]:
# Call the function
user_question = "What should I inform the insurer about before renewing my policy?"
query_embeddings = get_query_embeddings(query=user_question)

## Query the vector database to retrieve the answer using the embedded query

In [61]:
def query_pinecone_index(
    query_embeddings: list, top_k: int = 2, include_metadata: bool = True
) -> dict[str, any]:
    """Query a Pinecone index."""
    query_response = index.query(
        vector=query_embeddings, top_k=top_k, include_metadata=include_metadata
    )
    return query_response

In [63]:
# Call the function
answers = query_pinecone_index(query_embeddings=query_embeddings)

In [64]:
answers

{'matches': [{'id': '8d8a0fd121abf6e463eb59fafec42d92dd821cf043631bd556af050f731ae6db',
              'metadata': {'text': "['question: What should I do before "
                                   'renewing my policy?\\n answer: You must '
                                   'tell the insurer about any incidents or '
                                   'motoring offenses that have occurred since '
                                   "your cover started.']"},
              'score': 0.909445584,
              'values': []},
             {'id': '892fb09f31a96f8a19e0bff8f7a9c6d559134a719947363931f69a93f488b68f',
              'metadata': {'text': "['question: What should I do if I change "
                                   'my car before the cover starts?\\n answer: '
                                   'You must inform the insurer if anything '
                                   'about your car changes before the cover '
                                   "starts.']"},
              'score':

### The answer contains a dictionary of the id, the score, the value and the text data which is what I actualy need, so I will be extract only the text in the code below so I can pass it to my LLM

In [65]:
# Extract only the text from the dictionary before passing it to the LLM
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])


### In order to help my LLM perform well, I will write my system prompt in such a way that my LLM will get a glimps of the answer and the question that returned the answer so that by seeing both the answer and the question, it will be able to give me a better answer

In [87]:
SYSTEM_PROMPT = """You are an experienced insurance professional with deep knowledge of car insurance policies. Your task is to provide accurate and concise responses to queries based on a given car insurance policy document.
You will receive two inputs:
1. The user's question related to the car insurance policy.
2. The answer gotten from the database.
Your role is to summarize the retrieved information and craft a clear, well-structured response that directly answers the user's question. 
Keep your responses straightforward and easy to understand 
"""

In [67]:
LLM_prompt = f"{SYSTEM_PROMPT}\n\This is the question: {user_question}\nThis is the answer from the database: {text_answer}"

In [68]:
LLM_prompt

### Below I defined a fucntion that takes in the whole prompts and returns a refined answer from gpt 3.5

In [69]:
def better_query_response(prompt: str) -> str:
    """This function returns a better response using LLM
    Args:
        prompt (str): The prompt template

    Returns:
        str: The actual response returned by the LLM
    """
    better_answer = CHAT_LLM(prompt)
    return better_answer

In [70]:
# Call the function
final_answer = better_query_response(prompt=LLM_prompt)

In [71]:
print(final_answer)


Before renewing your policy, you must inform your insurer of any incidents or motoring offenses that have occurred since your cover started. Additionally, if you change your car before the cover starts, you must also notify your insurer of any changes made to your vehicle. This will ensure that your policy is accurate and up-to-date. 


----------

------------

# **Time to evaluate the RAG using Giskard**

### Below I am creating a knowledge base which is basically my data, this knowledge base will be used to create test samples which will be used to test the accuracy of my RAG agent

In [8]:
from giskard.rag import KnowledgeBase

knowledge_base = KnowledgeBase(data)

### Below, I am using Giskard to generate 10 random questions and their correct answers from the data. This questions and answers will be used to test the accuracy of my RAG agent

In [9]:
from giskard.rag import generate_testset

testset = generate_testset(
    knowledge_base,
    num_questions=10, # This is the number of questions I want to use and test the results of my RAG 
    agent_description="A chatbot answering questions about car insurance policy",
)

2024-06-12 13:21:54,344 pid:1107 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2024-06-12 13:22:13,482 pid:1107 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


Generating questions:  80%|████████  | 8/10 [00:58<00:17,  8.60s/it]

2024-06-12 13:23:17,033 pid:1107 MainThread giskard.rag  ERROR    Encountered error in question generation: list index out of range. Skipping.
2024-06-12 13:23:17,040 pid:1107 MainThread giskard.rag  ERROR    list index out of range
Traceback (most recent call last):
  File "/Users/appleplay/miniconda3/envs/caserader/lib/python3.11/site-packages/giskard/rag/question_generators/base.py", line 57, in generate_questions
    yield self.generate_single_question(knowledge_base, *args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/appleplay/miniconda3/envs/caserader/lib/python3.11/site-packages/giskard/rag/question_generators/double_questions.py", line 127, in generate_single_question
    "question_2": linked_questions[1]["question"],
                  ~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


Generating questions:  90%|█████████ | 9/10 [01:17<00:08,  8.58s/it]


### Below, I just want to print out the first 3 random questions generated by giskard, including the answers and the documents from where the answers were gotten 

In [10]:
test_set_df = testset.to_pandas()

for index, row in enumerate(test_set_df.head(3).iterrows()):
    print(f"Question {index + 1}: {row[1]['question']}")
    print(f"Reference answer: {row[1]['reference_answer']}")
    print("Reference context:")
    print(row[1]['reference_context'])
    print("******************", end="\n\n")

Question 1: What should I inform the insurer about before renewing my policy?
Reference answer: You must tell the insurer about any incidents or motoring offenses that have occurred since your cover started.
Reference context:
Document 37: Query: What should I do before renewing my policy?
Response: You must tell the insurer about any incidents or motoring offenses that have occurred since your cover started.

Document 4: Query: What should I do if I change my car before the cover starts?
Response: You must inform the insurer if anything about your car changes before the cover starts.
******************

Question 2: Will I receive proof of my No Claim Discount if I decide to cancel my policy?
Reference answer: Yes, the insurer will give you proof of any No Claim Discount if you cancel the policy.
Reference context:
Document 34: Query: Will the insurer give me proof of my No Claim Discount if I cancel the policy?
Response: Yes, the insurer will give you proof of any No Claim Discount if

### Below I will store it in a jsonl file incase I want to use it later

In [11]:
testset.save("test-set.jsonl")

### Below are the 9 questions generated by giskard
- I will first of all pass all these questions into my rag pipeline and generate their answers
- Then I will compare the answers generated by my rag agent with the answers generated from the giskard
- If the two answers match or are pointing towards the same thing, I will get a mark, if not I won't get a mark

In [None]:
questions = [
    "What should I inform the insurer about before renewing my policy?",
    "Will I receive proof of my No Claim Discount if I decide to cancel my policy?",
    "Can you explain the impact on the No Claim Discount (NCD) on my car insurance policy in the event that I decide to file a claim?",
    "Within what period is the insurer obliged to update my details on the Motor Insurance Database (MID) according to the agreements with the Motor Insurers' Bureau (MIB)?",
    "What are the consequences if I begin utilizing my vehicle for commercial activities without informing the insurance company, given that I require my documents in Braille, large print, or audio format?",
    "If I change my vehicle before the cover starts, how many passengers can the new car carry at most under the car sharing policy?",
    "I just bought a new car insurance policy and after understanding the stipulations, I'm contemplating cancelling it. Can I do that within the 14-day cooling-off period and will I be charged for it?",
    "As a new small business owner, I'm planning on using my personal vehicle for business purposes. Could you tell me what would happen if I don't inform my insurer about this change?",
    "Should I proceed with this?"
]

## Now let me use my rag agent to answer the 9 questions

In [83]:
final_answers = []

for user_question in questions:
    query_embeddings = get_query_embeddings(query=user_question)
    answers = query_pinecone_index(query_embeddings=query_embeddings)
    text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])
    final_answer = better_query_response(prompt=LLM_prompt)
    final_answers.append(final_answer)


In [84]:
final_answers

['\n\nBefore renewing your policy, make sure to inform your insurer about any incidents or motoring offenses that have occurred since your cover started. Additionally, if you change your car before the cover starts, remember to inform the insurer about any changes to your car.',
 '\n\nIf you cancel your car insurance policy, the insurer will provide proof of any No Claim Discount you have earned. However, if you have made a claim, you will not receive a refund for the premium paid.',
 '\n\nIf you make a claim, your No Claim Discount may be reduced by the insurer. If you cancel your policy, the insurer will provide you with proof of any No Claim Discount you have earned.',
 "\n\nThe insurer is required to update your details on the Motor Insurance Database (MID) within seven days, as agreed upon with the Motor Insurers' Bureau (MIB). It is important to inform the insurer promptly if your contact details change.",
 '\n\nUsing your car for business purposes without notifying your insurer 

## I can now check the accuracy of my rag agent by comparing the answers from my rag agent with the answers from giskard 

In [85]:
from giskard.rag import evaluate

report = evaluate(final_answers, testset=testset, knowledge_base=knowledge_base)

CorrectnessMetric evaluation: 100%|██████████| 9/9 [00:17<00:00,  1.93s/it]




In [86]:
display(report)

# From the above, my rag agent is performing very well and is giving the right answer for the questions