In [14]:
# Importing libraries
import os
import hashlib
from dotenv import load_dotenv, dotenv_values
# from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec, Pinecone
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain_pinecone import PineconeVectorStore as PConeStore
from langchain.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [2]:
# Read the key/value pairs of the local .env file into a dict; the keys that
# libraries need are exported to os.environ explicitly in later cells.
env = dotenv_values()

In [3]:
# Load a PDF by file name; the file is looked up in the current working directory
def load_from_dir(file_name):
    """Load a PDF that sits in the current working directory.

    Args:
        file_name: Name of the PDF file; it is joined onto ``os.getcwd()``.

    Returns:
        The result of ``PyPDFLoader(<full path>).load()``.
    """
    pdf_path = os.path.join(os.getcwd(), file_name)
    return PyPDFLoader(pdf_path).load()

In [4]:
# Splitting the documents with recursive character text splitting
def split_docs(data, chunk_size=2000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to the
            2000 that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to the 200
            that was previously hard-coded.

    Returns:
        The chunks returned by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(data)

In [5]:
# Setting up custom class for vector embeddings and retrieval
class PineconeVectorStore:
    """
    Send LangChain-compatible docs to a Pinecone index and get a retriever/query
    function for a RAG application.

    example usage:
    ```
    data = langchain_loader.load()
    vector_db = PineconeVectorStore(api_key="", index_name="", embedding_model=langchain_embed_model).populate_db_with_docs(data)
    retriever = vector_db.as_retriever() # accepts argument k, which is 3 by default
    ```

    The retriever is a plain function that can be used as:
    context = retriever("Sample query goes here")
    Note: api_key can be passed in directly or by creating "PINECONE_APIKEY" in a .env file, and internet connection is required
    """
    def __init__(self, index_name, embedding_model, api_key=None):
        """Store the configuration and connect to (or create) the index.

        Args:
            index_name: Name of the Pinecone index to use.
            embedding_model: Object exposing ``embed_query`` and
                ``embed_documents`` (the LangChain embeddings interface).
            api_key: Pinecone API key; when omitted, "PINECONE_APIKEY" is read
                from the .env file.
        """
        env = dotenv_values()
        self.api_key = api_key or env.get("PINECONE_APIKEY")
        self.index_name = index_name
        self.embedding_model = embedding_model
        self.index = self._set_up_store()

    def _set_up_store(self):
        """Return a handle to the index, creating the index first if it is missing."""
        pc = Pinecone(api_key=self.api_key)
        if self.index_name not in pc.list_indexes().names():
            # The probe embedding is only needed to learn the vector dimension for
            # a new index, so skip that API call when the index already exists.
            dim = len(self.embedding_model.embed_query("engineerLambda"))
            pc.create_index(name=self.index_name, dimension=dim, spec=ServerlessSpec(cloud="aws", region="us-east-1"))

        return pc.Index(self.index_name)

    def _format_result(self, query_result):
        """Join the "text" metadata of every match into one context string."""
        return "\n\n".join(match["metadata"]["text"] for match in query_result["matches"])

    def _generate_id(self, content):
        """Return the SHA-256 hex digest of ``content``, used as the record id.

        Identical content always maps to the same id, so re-upserting the same
        chunk reuses its id instead of creating a new one.
        """
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    def populate_db_with_docs(self, docs, batch_size=100):
        """Embed ``docs`` and upsert them into the index.

        Args:
            docs: Iterable of objects with a ``page_content`` string.
            batch_size: Maximum number of vectors sent per upsert call.

        Returns:
            ``self``, so ``as_retriever`` can be chained onto the call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        docs = list(docs)
        contents = [doc.page_content for doc in docs]

        if contents:
            # embed_documents is the embeddings-interface method meant for corpus
            # texts (embed_query is meant for search queries) and embeds the whole
            # list in one call instead of one call per chunk.
            embeddings = self.embedding_model.embed_documents(contents)
            vectors = [
                {
                    "id": self._generate_id(content),
                    "values": values,
                    "metadata": {"text": content},
                }
                for content, values in zip(contents, embeddings)
            ]

            # Reuse the index handle created in __init__ (no second set-up round
            # trip) and send bounded batches rather than one unbounded request.
            for start in range(0, len(vectors), batch_size):
                self.index.upsert(vectors=vectors[start:start + batch_size])

        print(f"Index {self.index_name} has been populated with {len(docs)} documents")

        # returning the class object so that it can be used to run the as_retriever method
        return self

    # to set as retrieval chain, like the way langchain handles it, returns a retriever engine
    def as_retriever(self, k=3):
        """Return a function mapping a query string to formatted context text.

        Args:
            k: Number of top matches requested from the index per query.
        """
        # The parameter must stay named `query`: callers pass it by keyword.
        def query(query):
            query_vector = self.embedding_model.embed_query(query)

            # similarity search limited to the top k matches
            search_results = self.index.query(vector=query_vector, top_k=k, include_metadata=True)
            return self._format_result(search_results)

        return query

In [6]:
class RAGPipeline:
    """
    Custom RAG pipeline, with support for langchain llms and the retriever
    function returned by the PineconeVectorStore class.

    example usage:
    ```
    chain = RAGPipeline(llm=llm, retriever=pinecone_retriever, prompt_template=langchain_prompt_template)
    response = chain.generate_prompt("Sample question goes here")
    ```
    """
    def __init__(self, llm, retriever, prompt_template):
        """
        Args:
            llm: Object exposing ``invoke`` and ``ainvoke`` (LangChain LLM interface).
            retriever: Callable accepting ``query=<str>`` and returning context text.
            prompt_template: Object with ``format(question=..., context=...)``;
                a plain string with ``{question}`` and ``{context}`` works.
        """
        self.llm = llm
        self.retriever = retriever
        self.prompt_template = prompt_template

    def _build_prompt(self, prompts):
        """Retrieve context for the question and fill in the prompt template."""
        context = self.retriever(query=prompts)
        return self.prompt_template.format(question=prompts, context=context)

    def generate_prompt(self, prompts, temperature=0):
        """Answer ``prompts`` synchronously and return the llm response."""
        return self.llm.invoke(self._build_prompt(prompts), temperature=temperature)

    async def agenerate_prompt(self, prompts, temperature=0):
        """Answer ``prompts`` asynchronously and return the llm response.

        Uses ``ainvoke``: ``invoke`` is synchronous and its result is not
        awaitable, so awaiting it fails. The retrieval step itself is still a
        blocking call.
        """
        return await self.llm.ainvoke(self._build_prompt(prompts), temperature=temperature)

In [7]:
# Load the policy booklet PDF (looked up in the current working directory by
# load_from_dir) and split its pages into chunks for embedding.
data = load_from_dir("policy-booklet-0923.pdf")
docs = split_docs(data)

In [8]:
# Export the Google API key: the Google clients in the following cells are
# constructed without an explicit key. Prefer the .env value and fall back to a
# key that is already exported in the environment.
g_api_key = dotenv_values().get("GOOGLE_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not g_api_key:
    # Assigning None to os.environ raises an opaque TypeError; fail with a clear message instead.
    raise RuntimeError("GOOGLE_API_KEY is not set: add it to the .env file or export it in the environment")
os.environ["GOOGLE_API_KEY"] = g_api_key

In [9]:
# Embedding model shared by the custom vector store here and by the LangChain store further down.
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Connect to (or create) the "athina" index and upsert every chunk; populate_db_with_docs
# returns the store itself, so vector_db is the PineconeVectorStore instance.
vector_db = PineconeVectorStore(index_name="athina", embedding_model=embed_model).populate_db_with_docs(docs)

Index athina has been populated with 77 documents


In [11]:
# LLM used to generate the answers.
llm = GoogleGenerativeAI(model="models/gemini-pro")
# Query function from the custom store (top 3 matches by default).
# NOTE(review): `retriever` is rebound to a LangChain retriever in a later cell.
retriever = vector_db.as_retriever()
# Prompt with {question} and {context} placeholders; filled via str.format by
# RAGPipeline and reused by PromptTemplate.from_template in a later cell.
prompt_template = """
You are a question and answer agent, you are provided with this question
question: {question}
and you are to use the provided context below to provide a context-aware answer, if the provided information does not
contain the answer, tell the user you dont know.
context: {context}
"""
# Wire the custom pipeline together and try a sample question.
chain = RAGPipeline(llm, retriever, prompt_template)
chain.generate_prompt("How much will you pay if my car is damaged?")

'I am sorry I do not know the answer to that question. The provided context does not mention the amount that will be paid out if your car is damaged.'

To generate the dataset automatically with Ragas, I will first recreate the pipeline with LangChain-supported classes instead of the custom classes, to see whether the chain gets any better.

In [15]:
# The "athina" index already exists: it was created by the custom PineconeVectorStore class above.
# The .env file stores the key as PINECONE_APIKEY; it is exported here as PINECONE_API_KEY,
# presumably the variable name the LangChain Pinecone integration reads — confirm.
# NOTE(review): env.get returns None when the key is missing, which makes this assignment raise TypeError.
os.environ["PINECONE_API_KEY"] = env.get("PINECONE_APIKEY")
# NOTE(review): from_documents embeds and inserts `docs` into the already populated index again,
# presumably adding duplicate records on every re-run — verify; PConeStore.from_existing_index
# may be the better fit once the index is populated.
vector_store = PConeStore.from_documents(docs, embedding=embed_model, index_name="athina")
# Rebinds `retriever`: from here on it is a LangChain retriever, not the custom query function.
retriever = vector_store.as_retriever()

In [16]:
# Wrap the raw template string (defined in an earlier cell) so it can be piped inside the chain below.
prompt = PromptTemplate.from_template(prompt_template)

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# The input question is sent to the retriever (whose documents are joined by
# format_docs) and is also passed through unchanged; both fill the prompt, which is
# piped to the LLM, and the output is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Same question as asked of the custom pipeline above, for a side-by-side comparison.
rag_chain.invoke("How much will you pay if my car is damaged?")

'Where damage to your car is covered under your policy, we’ll pay the cost of repairing or replacing your car up to its UK market value. This is the current value of your car at the time of the claim. It may be different to the amount you paid or any amount you provided when you insured your car with us.'

This simpler, more direct form turns out to be an even better RAG pipeline. Nice.

In [17]:
# Ragas test
# import ragas
# from ragas.testset.generator import TestsetGenerator
# from ragas.testset.evolutions import simple, reasoning, multi_context
# generator = GoogleGenerativeAI(model="gemini-pro", api_key=env.get("GOOGLE_API_KEY"))
# critic = GoogleGenerativeAI(model="gemini-1.5-flash", api_key=env.get("GOOGLE_API_KEY2"))
# generator = TestsetGenerator.from_langchain(critic_llm=critic, generator_llm=generator, embeddings=embed_model)

# distributions = {
#     simple: 0.6,
#     multi_context: 0.2,
#     reasoning: 0.2
# }
# # instruction says at least 40 testsets
# testset = generator.generate_with_langchain_docs(docs, 40, distributions) 
# testset.to_pandas()

embedding nodes:   0%|          | 0/154 [00:00<?, ?it/s]

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._co

Generating:   0%|          | 0/40 [00:00<?, ?it/s]

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._co

KeyboardInterrupt: 

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._co

Since I am unable to use the free-tier models to generate the dataset automatically, I will generate it manually and prepare it in a Ragas-supported format. I am using Ragas because it is more popular and is said to be the best.

In [19]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)


In [None]:
# Manually collect evaluation samples: type a question and its ground truth; the
# RAG chain's answer and the retrieved contexts are recorded alongside them.
# Type "end" as the question to stop.
# NOTE(review): this cell depends on interactive input, so its result is not
# reproducible from code alone.
questions = []
answers = []
ground_truths = []
contexts = []

# The counter lives outside the loop so it keeps increasing across samples.
count = 0
while True:
    question = input("Question goes here: ")
    # Check the sentinel first so "end" never triggers a ground-truth prompt,
    # an LLM call or a retriever call.
    if question == "end":
        break

    ground_truth = input("Ground Truth here: ")
    answer = rag_chain.invoke(question)
    # retriever.invoke replaces the deprecated get_relevant_documents.
    context = [doc.page_content for doc in retriever.invoke(question)]

    questions.append(question)
    answers.append(answer)
    ground_truths.append(ground_truth)
    contexts.append(context)

    count += 1
    print(f"{count} added")

# One row per collected sample; each "context" entry is the list of retrieved passages.
data = {
    "question" : questions,
    "answer" : answers,
    "ground_truth" : ground_truths,
    "context" : contexts
}
data = pd.DataFrame(data)

In [None]:
# Rename "context" to the plural "contexts" column name. rename() ignores a
# missing source column, so re-running this cell is a no-op instead of raising
# KeyError the way the copy-then-drop version did.
data = data.rename(columns={"context": "contexts"})

In [33]:
# Inspect the collected evaluation rows (question, answer, ground truth, retrieved contexts).
data

Unnamed: 0,question,answer,ground_truth,contexts
0,How much will you pay if my car is damaged?,Where damage to your car is covered under your...,Where damage to your car is covered under your...,[and replacement are shown on your car \ninsur...
1,What is DriveSure?,DriveSure is Churchill's telematics insurance ...,DriveSure is our telematics insurance product....,[Page 3FAQs\nHow much will you pay if my car i...
2,Who is covered to drive other cars?,The main driver may be covered for liability t...,Your certificate of motor insurance will show ...,[Page 12If the main driver is driving \nanothe...
3,What’s the difference between commuting and bu...,Business use provides cover for driving in con...,Business use provides cover for driving in con...,"[It’s designed to capture how, when and where ..."
4,Am I covered if I leave my car unlocked or the...,"No, you are not covered if you leave your car ...",We won’t pay a claim for theft or attempted th...,[or removable in-car entertainment inside \na ...
5,What’s not included in my cover?,"I apologize, but the provided document does no...",We don’t cover things like: > Mechanical or el...,[Page 15Section 2: Fire and theft\nIn-car ente...
6,Can I use my car abroad?,"I am sorry, I cannot find the answer to your q...","If you want to use your car abroad, your cover...","[Countries included\nAndorra, Austria, Belgium..."
7,Are my electric car’s charging cables covered?,"Yes, your electric car's charging cables are c...",Your home charger and charging cables are cons...,"[It’s designed to capture how, when and where ..."
8,Does Churchill have approved repairers?,"Yes, Churchill customers have access to a nati...",Churchill customers have access to a national ...,[Need to claim?\n0345 878 6261\nWindscreen cla...
9,Is my electric car battery covered?,"Yes, your car's battery is covered if it's dam...",Your car’s battery is covered if it’s damaged ...,"[It’s designed to capture how, when and where ..."


In [35]:
# Convert the pandas frame into a `datasets.Dataset`, the object handed to ragas `evaluate` below.
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['question', 'answer', 'ground_truth', 'contexts', '__index_level_0__'],
    num_rows: 10
})

In [None]:
# `critic` was only ever defined in the commented-out test-set generation cell, so
# on a fresh kernel this cell failed with NameError. Define the judge LLM here,
# with the model and key name used in that commented-out cell; fall back to the
# primary Google key when the secondary one is not configured.
critic = GoogleGenerativeAI(
    model="gemini-1.5-flash",
    api_key=env.get("GOOGLE_API_KEY2") or os.environ.get("GOOGLE_API_KEY"),
)

# Score the manually collected samples on the four imported ragas metrics.
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
    llm=critic, # gemini-flash llm
    embeddings=embed_model
)

In [None]:
# Convert the evaluation result to a DataFrame for inspection.
df = result.to_pandas()
df