In [1]:
# Import libraries

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import os
import re
import time
import base64
from tqdm import tqdm
from glob import glob
from uuid import uuid4
from typing import List
from env_config.config import EnvConfig

try:
    from google import genai
    from google.genai import types
    from langchain_chroma import Chroma
    from langchain.chains import RetrievalQA
    from langchain_core.documents import Document
    from langchain_core.prompts import PromptTemplate
    from langchain_upstage import UpstageDocumentParseLoader
    from langchain_openai import OpenAIEmbeddings, ChatOpenAI
    from langchain.tools.retriever import create_retriever_tool
except:
    !pip install -U langchain-upstage langchain-core langchain-chroma langchain-openai langchain  "google" "google.genai"
    from google import genai
    from google.genai import types
    from langchain_chroma import Chroma
    from langchain.chains import RetrievalQA
    from langchain_openai import OpenAIEmbeddings
    from langchain_core.documents import Document
    from langchain_upstage import UpstageDocumentParseLoader
    from langchain_openai import OpenAIEmbeddings, ChatOpenAI
    from langchain.tools.retriever import create_retriever_tool

project_configs = EnvConfig()
os.environ["OPENAI_API_KEY"] = project_configs.OPENAI_API_KEY

In [2]:
# Get pdf paths

pdfs_path = glob(os.getcwd()+"\\pdfs\\*.pdf")

pdfs_path

['C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\FlexiCare - Accessing care at the hospital.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\FlexiCare - Benefits.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\FlexiCare - Exclusions.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\FlexiCare - How it works.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\FlexiCare - Need to know.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\PrimeCare - Accessing care at the hospital.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\PrimeCare - Benefits.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\PrimeCare - Exclusions.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\PrimeCare - How it works.pdf',
 'C:\\Users\\HP\\Documents\\Projects\\MyGeniusAI_Assessment\\pdfs\\PrimeCare - Need to know.pd

In [3]:
def pdf_to_base64(pdf_path: str) -> str:
    """Convert a local PDF file to base64 string."""
    with open(pdf_path, "rb") as pdf_file:
        return base64.b64encode(pdf_file.read()).decode('utf-8')

def generate_with_base64(base64_data: str, input_prompt: str = "Extract All the content found in the pdf."):
    """Function to generate text from base64"""
    client = genai.Client(
        api_key=project_configs.GEMINI_API_KEY,
    )
    model = "gemini-2.5-pro"
    
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_bytes(
                    mime_type="application/pdf",
                    data=base64.b64decode(base64_data),
                ),
                types.Part.from_text(text=input_prompt),
            ],
        ),
    ]
    
    tools = [
        types.Tool(googleSearch=types.GoogleSearch()),
    ]
    
    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        thinking_config=types.ThinkingConfig(
            thinking_budget=-1,
        ),
        tools=tools,
        response_mime_type="text/plain",
    )

    extracted_text = ""
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        extracted_text+=chunk.text
        
    return extracted_text

def CreateDocuments(pdf_paths:List[str])->List[Document]:
    """
    Function to loop through list of pdf paths, extract content and store in a list of Document.
    """
    list_of_documents = []
    for path in tqdm(pdf_paths, desc="Extracting content from pdf's...", total=len(pdf_paths)):
        pdf_2_base64 = pdf_to_base64(path)
        extracted_content = generate_with_base64(pdf_2_base64)
        list_of_documents.append(Document(page_content=extracted_content,
                                          metadata={"pdf_name":path.split("\\")[-1]}))
    return list_of_documents

In [4]:
# Extract content from pdf

doc_list_per_pdfs = CreateDocuments(pdfs_path)

Extracting content from pdf's...: 100%|████████████████████████████████████████████████| 15/15 [04:26<00:00, 17.79s/it]


In [5]:
doc_list_per_pdfs

[Document(metadata={'pdf_name': 'FlexiCare - Accessing care at the hospital.pdf'}, page_content='Here are things to note when accessing care at the hospital: \n\n **Prioritise your visit** \n\n We highly advise utilizing the \'Visit Hospital\' button as your first step. This helps us track your progress and prepares for your arrival, even in delays or emergencies. \n\n **Present your HMO I.D** \n\n When you arrive at the hospital, make sure to present this as it contains essential information for your healthcare coverage. \n\n **Check HMO I.D Details** \n\n Find your personal details on the HMO I.D. Below, you\'ll see your HMO provider\'s name (e.g., "Hygeia"). Tell them you are from Hygeia. \n\n **Contact Support** \n\n If you encounter challenges or have questions at the hospital, refer to the support line listed on your HMO I.D. \n\n * 08088188002 (WhatsApp Only) \n * 09070008899 (WhatsApp and Call) \n * 09039810004 (Call Only)'),
 Document(metadata={'pdf_name': 'FlexiCare - Benefit

In [6]:
# Initialize Embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [7]:
# Initialize vector store
vector_store = Chroma(
    collection_name="rag_knowledge_base",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db", 
)

In [8]:
# Create unique id's for each document
uuids = [str(uuid4()) for _ in range(len(doc_list_per_pdfs))]

# Store each document in the vector store
vector_store.add_documents(documents=doc_list_per_pdfs, ids=uuids)

['543d0c22-b781-4cee-95cd-332e93945cf8',
 'f71b6ac3-ee89-4b28-bd32-0d284a7775c3',
 '646cc7d7-5e28-47a2-8738-5d5670efd5c3',
 '25dd0931-202a-4068-acf4-fa95ffa7a0fb',
 'd406f1d2-846e-475a-bfc1-6962f1b3c949',
 'bc47ef63-9445-4d34-b560-9128fc0c6240',
 'a3a85763-f42b-45bc-b3bd-0d65554218b3',
 '05dbbe68-be75-4907-8320-068746ad71bc',
 '68dc61ee-9c8b-43db-b8aa-2a503b4ca756',
 '6dd520c6-82e8-4b79-a6aa-8eb570e55f4a',
 'df7b16bf-fab3-4f59-be36-07e41e25d4c6',
 '98f97e2d-6a02-4937-8008-9436f9b513ff',
 'ddb51c7d-b12c-425d-a028-98a1a04037bb',
 '414d4d18-4b5b-4925-bc9b-6140a7073103',
 '0fa77dc7-f531-4e66-b7fc-2bf737fb0b56']

In [9]:
# Create Retriever
retriever = vector_store.as_retriever(search_type="similarity", 
                                      search_kwargs={"k": 1})

In [10]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x0000017893626750>, search_kwargs={'k': 1})

In [11]:
# Initialize llm
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [12]:
# Define llm's template/instructions
template = """You are a helpful assistant that provides accurate information about healthcare insurance plans based on the provided documentation.

Use the following context from healthcare plan documents to answer the question:

{context}

Question: {question}

Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(template)
)

In [13]:
# Create retriever tool
retriever_tool = create_retriever_tool(retriever=retriever,
                                       name="rag_tool", 
                                       description="Tool for rag implementation")
# Create Retriever QA
doc_retriever_chain = RetrievalQA.from_chain_type(llm=llm,
                                                  retriever=retriever,
                                                  chain_type="stuff",
                                                  chain_type_kwargs={"prompt": prompt},
                                                  return_source_documents=False)

In [14]:
# Test bot
print(doc_retriever_chain.invoke({"query":"list all the flexi care benefit available after 11 months"})["result"])

The benefits available under the FlexiCare plan after 11 months are as follows:

1. **Obstetrics Care** - Covered up to Maternity Limit of ₦100,000
   - Antenatal Care - ✔
   - Delivery (Normal) - ✔
   - Delivery (Multiple) - ✔
   - Assisted Delivery - ✔
   - Manual Vacuum Aspiration - ✔
   - Neonatal Care - Covered up to Maternity Limit of ₦100,000
   - Male Circumcision - ✔
   - Ear Piercing - ✔
2. **Renal Care** - Covered Up to ₦40,000 Limit
   - Dialysis - ✔
