## Import the Libraries 

In [1]:
import pandas as pd
from tqdm import tqdm
import hashlib
from constants import EMBEDDINGS, index, CHAT_LLM

  from tqdm.autonotebook import tqdm
  warn_deprecated(


## Read the data and see the first 5 rows

In [2]:
data = pd.read_csv("bestdata.csv")

In [44]:
# data.head()

## Below I will chunk the data in such a way that each chunk represents a query-response pair

In [4]:
# Create a list to store the chunks
chunks = []

In [5]:
# Iterate over the rows of the DataFrame
for _, row in tqdm(data.iterrows(), total=len(data)):
    query = row['Query']
    response = row['Response']
    
    # Treat each query-response pair as a separate chunk
    chunk = [f"question: {query}\n answer: {response}"]
    chunks.append(chunk)
    

100%|██████████| 41/41 [00:00<00:00, 5024.59it/s]


In [43]:
# chunks

## Below I am embedding the chunked dataset using OpenAI's text-embedding-ada-00 model

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

def generate_embeddings(documents: list[any]) -> list[list[float]]:
    """
    Generate embeddings for a list of documents.

    Args:
        documents (list[any]): A list of document objects, each containing a 'page_content' attribute.

    Returns:
        list[list[float]]: A list containig a list of embeddings corresponding to the documents.
    """
    embedded = [EMBEDDINGS.embed_documents(doc) for doc in documents]
    return embedded


In [8]:
# Run the function
chunked_document_embeddings = generate_embeddings(documents=chunks)

In [42]:
# chunked_document_embeddings

## Below I am creating a dictionary that will be a combination of the chunked text, the embeddings and a unique id for each chunk

In [10]:

def generate_short_id(content: str) -> str:
    """
    Generate a short ID based on the content using SHA-256 hash.

    Args:
    - content (str): The content for which the ID is generated.

    Returns:
    - short_id (str): The generated short ID.
    """
    hash_obj = hashlib.sha256()
    hash_obj.update(content.encode("utf-8"))
    return hash_obj.hexdigest()


def combine_vector_and_text(
    documents: list[any], doc_embeddings: list[list[float]]
) -> list[dict[str, any]]:
    """
    Process a list of documents along with their embeddings.

    Args:
    - documents (List[Any]): A list of documents (strings or other types).
    - doc_embeddings (List[List[float]]): A list of embeddings corresponding to the documents.

    Returns:
    - data_with_metadata (List[Dict[str, Any]]): A list of dictionaries, each containing an ID, embedding values, and metadata.
    """
    data_with_metadata = []

    for doc_text, embedding in zip(documents, doc_embeddings):
        # Convert doc_text to string if it's not already a string
        if not isinstance(doc_text, str):
            doc_text = str(doc_text)

        # Generate a unique ID based on the text content
        doc_id = generate_short_id(doc_text)

        # Create a data item dictionary
        data_item = {
            "id": doc_id,
            "values": embedding[0],
            "metadata": {"text": doc_text},  # Include the text as metadata
        }

        # Append the data item to the list
        data_with_metadata.append(data_item)

    return data_with_metadata


In [11]:
# Call the function
data_with_meta_data = combine_vector_and_text(documents=chunks, doc_embeddings=chunked_document_embeddings)

In [41]:
# data_with_meta_data

In [13]:
len(chunked_document_embeddings)

41

In [14]:
print(index)

<pinecone.data.index.Index object at 0x115039e10>


## Now I am sending the data to my pinecone index

In [15]:
def upsert_data_to_pinecone(data_with_metadata: list[dict[str, any]]) -> None:
    """
    Upsert data with metadata into a Pinecone index.

    Args:
    - data_with_metadata (List[Dict[str, Any]]): A list of dictionaries, each containing data with metadata.

    Returns:
    - None
    """
    index.upsert(vectors=data_with_metadata)


In [16]:
# Call the function
upsert_data_to_pinecone(data_with_metadata= data_with_meta_data)

In [17]:
def get_query_embeddings(query: str) -> list[float]:
    """This function returns a list of the embeddings for a given query

    Args:
        query (str): The actual query/question

    Returns:
        list[float]: The embeddings for the given query
    """
    query_embeddings = EMBEDDINGS.embed_query(query)
    return query_embeddings


## Testing the pipeline by querying the index

## Embed the query

In [29]:
# Call the function
user_question = "Will I get a refund if I cancel my policy after making a claim?"
query_embeddings = get_query_embeddings(query=user_question)

## Query the vector database to retrieve the answer

In [30]:
def query_pinecone_index(
    query_embeddings: list, top_k: int = 2, include_metadata: bool = True
) -> dict[str, any]:
    """
    Query a Pinecone index.

    Args:
    - index (Any): The Pinecone index object to query.
    - vectors (List[List[float]]): List of query vectors.
    - top_k (int): Number of nearest neighbors to retrieve (default: 2).
    - include_metadata (bool): Whether to include metadata in the query response (default: True).

    Returns:
    - query_response (Dict[str, Any]): Query response containing nearest neighbors.
    """
    query_response = index.query(
        vector=query_embeddings, top_k=top_k, include_metadata=include_metadata
    )
    return query_response

In [31]:
# Call the function
answers = query_pinecone_index(query_embeddings=query_embeddings)

In [32]:
answers

{'matches': [{'id': 'abee985c5e9c2b6136d1ffd966e2d3451b8bbe8557c327ceb4bbce0407364a20',
              'metadata': {'text': "['question: Will I get a refund if I "
                                   'cancel my policy after making a claim?\\n '
                                   'answer: No, if you’ve made a claim, the '
                                   'insurer will not refund any car insurance '
                                   "premium.']"},
              'score': 0.911601782,
              'values': []},
             {'id': 'f29470e3938049bca53eb4a1ef84102802f6cce7631859be1106194abeed0de0',
              'metadata': {'text': "['question: Will the insurer give me proof "
                                   'of my No Claim Discount if I cancel the '
                                   'policy?\\n answer: Yes, the insurer will '
                                   'give you proof of any No Claim Discount if '
                                   "you cancel the policy.']"},
             

In [33]:
# Extract only the text from the dictionary before passing it to the LLM
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])

# prompt = f"{text_answer} Using the provided information, give me a better and summarized answer"

In [34]:
SYSTEM_PROMPT = """You are an experienced insurance professional with deep knowledge of car insurance policies. Your task is to provide accurate and concise responses to queries based on a given car insurance policy document.
You will receive two inputs:
1. The user's question related to the car insurance policy.
2. The answer gotten from the database.
Your role is to summarize the retrieved information and craft a clear, well-structured response that directly answers the user's question. 
# Keep your responses straightforward and easy to understand forvfvvvvfxbv 

In [35]:
LLM_prompt = f"{SYSTEM_PROMPT}\n\This is the question: {user_question}\nThis is the answer from the database: {text_answer}"

In [36]:
LLM_prompt

"You are an experienced insurance professional with deep knowledge of car insurance policies. Your task is to provide accurate and concise responses to queries based on a given car insurance policy document.\nYou will receive two inputs:\n1. The original query related to the car insurance policy.\n2. The relevant excerpt retrieved from the policy document based on the query.\nYour role is to analyze the retrieved information and craft a clear, well-structured response that directly answers the query. Draw upon your expertise in insurance policies to provide additional context or clarification if needed.\nIf the retrieved information is insufficient to fully answer the query, summarize what you can based on the excerpt, and indicate that the available information is limited. \nKeep your responses straightforward and easy to understand for general audiences. Define any technical terms if necessary, and maintain a professional, informative tone befitting an experienced insurance professio

In [37]:
def better_query_response(prompt: str) -> str:
    """This function returns a better response using LLM
    Args:
        prompt (str): The prompt template

    Returns:
        str: The actual response returned by the LLM
    """
    better_answer = CHAT_LLM(prompt)
    return better_answer

In [38]:
# Call the function
final_answer = better_query_response(prompt=LLM_prompt)

In [39]:
print(final_answer)



Based on the provided policy document, if you cancel your policy after making a claim, you will not receive a refund for any car insurance premiums. This means that the insurer will not reimburse you for any unused portion of your policy. However, if you cancel your policy without making a claim, the insurer will provide you with proof of your No Claim Discount. This is a discount given to drivers who have not made any claims during their policy term. Please note that if you have made a claim, you may not be eligible for this discount. If you have any further questions or concerns regarding your policy, please do not hesitate to contact your insurer for more information.
