# Step 1: Import the necessary libraries

In [1]:
# Installing nessary libraries

!pip install -U -qq openai llama-index llama-index-core llama-index-readers-file llama-index-llms-openai llama-index-embeddings-openai

In [2]:
# Importing nessary libraries
import os
import random
import openai
from pathlib import Path
import pandas as pd
from IPython.display import display, HTML
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage
from llama_index.core.evaluation import CorrectnessEvaluator, FaithfulnessEvaluator, RelevancyEvaluator, DatasetGenerator

# Step 2 : Mount your Google Drive and Set the API key

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Setting OpenAI API key
from google.colab import userdata
openai.api_key = userdata.get('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [5]:
# checking if the OpenAI API is working correctly
messages = [
    ChatMessage(role="system", content="You are an AI assistant to the user."),
    ChatMessage(role="user", content="What is the revenue of Microsoft in 2023?")
]
resp = OpenAI().chat(messages)
print(resp)

assistant: I'm sorry, but I cannot provide real-time or future financial data as I do not have access to current or future information. I recommend checking Microsoft's official financial reports or news sources for the most up-to-date information on their revenue in 2023.


# Step 3 - Data Loading (Ingestion)

In [6]:
# using PDFReader method to loader PDFs documents
loader = PDFReader()
document1 = loader.load_data(file = '/content/drive/MyDrive/Upgrad/GenAI_HelpMate_AI_Project/Principal-Sample-Life-Insurance-Policy.pdf')

In [7]:
print('Loaded docs:', len(document1))
type(document1)

Loaded docs: 64


list

In [8]:
document1[0]

Document(id_='efe72f91-0058-489f-9730-b805f004c990', embedding=None, metadata={'page_label': '1', 'file_name': 'Principal-Sample-Life-Insurance-Policy.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text=' \n \n \n \n \nGROUP POLICY FOR: \nRHODE ISLAND JOHN DOE \n \nALL MEMBERS \nGroup Member Life Insurance \n \nPrint Date: 07/16/2014 \n \nDOROTHEA GLAUSE S655 \nRHODE ISLAND JOHN DOE 01/01/2014 \n711 HIGH STREET  \nGEORGE RI 02903  \n \n \n \n \n                                       ', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}')

# Step 4 - Building the query engine (Splitter, Embedding, VectorStore)

In [57]:
## Initialize the node_parser with the custom node settings
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)

##Initialize the embedding model
Settings.embed_model = OpenAIEmbedding(model_name='text-embedding-ada-002')

##Initialize the OpenAI model
llm = OpenAI(model="gpt-3.5-turbo", temperature=0, max_tokens=256)
Settings.llm = llm

## Initialize the num_output and the context window
Settings.num_output = 512
Settings.context_window = 3900

# Create a VectorStoreIndex from a list of documents using the service context
index = VectorStoreIndex.from_documents(document1)

# Initialize a query engine for the index with a specified similarity top-k value
query_engine = index.as_query_engine(similarity_top_k=3)
chat_engine = index.as_chat_engine(chat_mode="condense_plus_context", verbose=True)

In [10]:
response = query_engine.query("What is the insurance company name ?")

In [11]:
# Fetching the response value which contains the actual response inside the response object
response.response

'The insurance company name is Principal Life Insurance Company.'

# Step 5 - Creating a response Pipeline

In [12]:
## Query response function
def query_response(user_input):
  response = query_engine.query(user_input)
  file_name = response.source_nodes[0].node.metadata['file_name']
  page_numbers = [response.source_nodes[i].node.metadata['page_label'] for i in range(len(response.source_nodes))]
  final_response = {"response": response.response, "file_name": file_name, "page_number": page_numbers}
  return final_response

In [53]:
def initialize_conv():
  print('Feel free to ask Questions regarding insurance document. Press exit once you are done')
  while True:
    user_input = input()
    # Type 'exit' to exit conversation
    if user_input.lower() == 'exit':
      print('Exiting the program... bye')
      break
    else:
      # Pass only the user input string to the chat_engine.chat method
      response = chat_engine.chat(user_input)
      response_text = response.response
      display(HTML(f'<p style="font-size:20px"><b>User:</b> {user_input}</p>'))
      display(HTML(f'<p style="font-size:20px"><b>Assistant:</b> {response_text}</p>'))

# Step 6 - Building a Testing Pipeline

In [15]:
def testing_pipeline(questions):
  test_feedback  = []
  for i in questions:
    res = query_response(i)
    response_text = res['response']
    page_info = ', '.join(res['page_number'])
    test_feedback.append((i, response_text, page_info))

  feedback_df = pd.DataFrame(test_feedback, columns =['Question', 'Response', 'Page'])
  return feedback_df

In [16]:
questions = [
    "What is the insurance company name ?",
    "What is the insurance company address ?",
    "What is the name of the employer ?"
    "What is the city and zip code for RHODE ISLAND JOHN DOE's address?",
    "What is the page label for the document?",
    "What is the file name of the document?"
]

In [17]:
testing_pipeline(questions)

Unnamed: 0,Question,Response,Page
0,What is the insurance company name ?,The insurance company name is Principal Life I...,"64, 5, 3"
1,What is the insurance company address ?,"DES MOINES, IOWA 50392-0001","64, 5, 3"
2,What is the name of the employer ?What is the ...,The name of the employer is RHODE ISLAND JOHN ...,"1, 3, 5"
3,What is the page label for the document?,The page label for the document is the number ...,"2, 4, 63"
4,What is the file name of the document?,Principal-Sample-Life-Insurance-Policy.pdf,"2, 4, 15"


# Step 7 - Evaluating response

In [18]:
# generating questions
data_generator = DatasetGenerator.from_documents(document1)
eval_questions = data_generator.generate_questions_from_nodes()

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


## 1. Evaluating response using Relevancy Evaluator
> The Relevancy Evaluator assesses how relevant a generated response is to the user's query. It helps determine if the answer directly addresses the question asked.

In [19]:
# Create RelevancyEvaluator using GPT-4 LLM
relevancy_evaluator = RelevancyEvaluator()

In [20]:
question_number = random.randint(0, len(eval_questions))

# Generate response
response_vector = query_engine.query(eval_questions[question_number])

# Evaluation
relevancy_eval_result = relevancy_evaluator.evaluate_response(
    query=eval_questions[question_number], response=response_vector
)



In [21]:
relevancy_eval_result.query

'What must a Dependent do in order to have their individual policy issued and in force after their coverage under the Group Policy terminates?'

In [22]:
relevancy_eval_result.response

'A Dependent must apply for individual purchase within 31 days after their coverage under the Group Policy terminates. The first premium for the individual policy must be paid to The Principal within this timeframe. The individual policy will then be in force on the 32nd day after the termination date of the coverage under the Group Policy.'

In [23]:
relevancy_eval_result.passing

True

In [24]:
relevancy_eval_result.feedback

'YES'

In [25]:
relevancy_eval_result.score

1.0

## 2. Evaluating response using Correctness Evaluator
> Evaluates the relevance and correctness of a generated answer against a reference answer.

In [26]:
correctness_evaluator = CorrectnessEvaluator()

In [27]:
query = eval_questions[question_number]
response_text = response_vector.response  # Extract the response text
reference = document1[9].text

correctness_eval_result = correctness_evaluator.evaluate(
    query=query,
    response=response_text,  # Pass the response text string
    reference=reference,
)

In [28]:
correctness_eval_result.score

4.0

In [29]:
correctness_eval_result.passing

True

In [30]:
correctness_eval_result.feedback

'The generated answer provides a relevant and correct response to the user query. It explains that a Dependent must apply for an individual policy within 31 days after their coverage under the Group Policy terminates, pay the first premium to The Principal within this timeframe, and the individual policy will be in force on the 32nd day after termination. The information aligns with the requirements outlined in the reference answer.'

## 3. Evaluating response using Faithfulness Evaluator
>  Measures if the response from a query engine matches any source nodes. This is useful for measuring if the response was hallucinated.

In [31]:
faithfulness_evaluator = FaithfulnessEvaluator()

In [32]:
faithfulness_eval_result = faithfulness_evaluator.evaluate_response(response=response_vector)

In [33]:
faithfulness_eval_result.passing

True

In [34]:
faithfulness_eval_result.feedback

'YES'

In [35]:
faithfulness_eval_result.score

1.0

In [36]:
faithfulness_eval_result

EvaluationResult(query=None, contexts=["This policy has been updated effective  January 1, 2014 \n \nPART III - INDIVIDUAL REQUIREMENTS AND RIGHTS \nGC 6011  Section F - Individual Purchase Rights, Page 3  \n \n(4) Premium will be based on the Dependent's age and the standard rate of The Principal \nfor the policy form to be issued. \n \nb. Purchase Qualification \n \nA Dependent will qualify for individual purchase if: \n \n(1) Dependent Life Insurance, or any portion of it, terminates because he or she ceases to \nbe a Dependent as defined in PART I; or because the Member dies, ends Active \nWork, or ceases to be in a class eligible for such insurance; or \n(2) the Dependent spouse's Dependent Life Insurance terminates as described in PART \nIII, Section C; or \n(3) the Dependent spouse's or Civil Union Partner's Dependent Life Insurance terminates \nbecause of divorce or separation or termination of a Civil Union partnership from the \nMember; or \n(4) after the Dependent has been c

# Step 8 - Presentation

In [58]:
initialize_conv()

Feel free to ask Questions regarding insurance document. Press exit once you are done
What is the page label for the document
Condensed question: What is the page label for the document


What is the file name of the document?
Condensed question: What is the file name of the document?


What must a Dependent do in order to have their individual policy issued and in force after their coverage under the Group Policy terminates?
Condensed question: What must a Dependent do in order to have their individual policy issued and in force after their coverage under the Group Policy terminates?


exit
Exiting the program... bye
