In [1]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(".\data\RAG.pdf")

  loader = PyPDFLoader(".\data\RAG.pdf")
  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [2]:
docs = loader.load()

In [3]:
# CHunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter  = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
)

chunks = text_splitter.split_documents(docs)

In [31]:
chunks[2]

Document(metadata={'source': '.\\data\\RAG.pdf', 'page': 0}, page_content='eration, natural language processing, information retrieval\nI. I NTRODUCTION\nLARGE language models (LLMs) have achieved remark-\nable success, though they still face significant limitations,\nespecially in domain-specific or knowledge-intensive tasks [1],\nnotably producing “hallucinations” [2] when handling queries\nbeyond their training data or requiring current information. To\novercome challenges, Retrieval-Augmented Generation (RAG)\nenhances LLMs by retrieving relevant document chunks from\nexternal knowledge base through semantic similarity calcu-\nlation. By referencing external knowledge, RAG effectively\nreduces the problem of generating factually incorrect content.\nIts integration into LLMs has resulted in widespread adoption,\nestablishing RAG as a key technology in advancing chatbots\nand enhancing the suitability of LLMs for real-world applica-\ntions.\nRAG technology has rapidly developed in re

In [5]:
# Setting up the vector store
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(
    documents = chunks,
    collection_name = 'evaluation',
    embedding = OpenAIEmbeddings()
)





In [6]:
# Checking on our retriever

retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
retriever.invoke("What is Naive RAG?")

[Document(metadata={'page': 1, 'source': '.\\data\\RAG.pdf'}, page_content='RAG, and Modular RAG, as showed in Figure 3. Despite\nRAG method are cost-effective and surpass the performance\nof the native LLM, they also exhibit several limitations.\nThe development of Advanced RAG and Modular RAG is\na response to these specific shortcomings in Naive RAG.\nA. Naive RAG\nThe Naive RAG research paradigm represents the earli-\nest methodology, which gained prominence shortly after the'),
 Document(metadata={'page': 15, 'source': '.\\data\\RAG.pdf'}, page_content='external knowledge bases. The survey showcases the evolution\nof RAG technologies and their application on many different\ntasks. The analysis outlines three developmental paradigms\nwithin the RAG framework: Naive, Advanced, and Modu-\nlar RAG, each representing a progressive enhancement over\nits predecessors. RAG’s technical integration with other AI\nmethodologies, such as fine-tuning and reinforcement learning,\nhas further ex

In [7]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough


# Prompt
prompt_template = """You are an assistant for question-answering tasks who answers questions based 
only on the context that are provided to you.
If you don't know the answer, just say that you don't know.
Follow these instructions strictly:

- Use three sentences maximum and keep the answer concise.
- Do not make up anything from your end, only refer to the context provided for answer generation
- If the context doesn't have required information to answer the question, respond with "I do not know"

question: {question}
search_results: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

# llm 

llm = ChatOpenAI(model = 'gpt-4o')

# combining the retrieved docs
def format_docs(docs):
    if not docs:
        return ""
    return "\n\n".join(doc.page_content for doc in docs if doc.page_content)


# Chain
rag_chain = (
    {'context': retriever | format_docs , 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [8]:
rag_chain.invoke(input = "What is Naive RAG?")


"Naive RAG represents the earliest methodology within the RAG framework, which gained prominence shortly after its inception. It relies directly on the user's original query for retrieval, which can lead to subpar effectiveness if the query is not precise or clear. Challenges include difficulty in handling complex or ambiguous language and specialized vocabulary."

Working with Self Generated Reference Data

In [9]:
from datasets import Dataset
import pandas as pd

# Reading the Human Generated Test Set
df = pd.read_csv("./data/testset.csv", delimiter = '|')
questions = df["question"].tolist()
ground_truth = df["ground_truth"].tolist()

# Setting up the schema for eval dataset
data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

# Creating food for eval dataset
for query in questions:
    # The question
    data["question"].append(query)
    # The Rag generated answer
    data["answer"].append(rag_chain.invoke(input = query))
    # The retrieved contexts
    data["contexts"].append(
        [doc.page_content for doc in retriever.invoke(query)]
    )

dataset = Dataset.from_dict(data)

Let's see this dataset in a df format 

In [10]:
df1 = pd.DataFrame(dataset)
df1.head()

Unnamed: 0,question,answer,contexts,ground_truth
0,What is Retrieval-Augmented Generation (RAG)?,Retrieval-Augmented Generation (RAG) is a tech...,[1\nRetrieval-Augmented Generation for Large\n...,RAG enhances LLMs by incorporating knowledge f...
1,What are the three main paradigms of RAG?,"The three main paradigms of RAG are Naive RAG,...",[external knowledge bases. The survey showcase...,"The three main paradigms of RAG are Naive RAG,..."
2,What is the primary purpose of RAG in large la...,The primary purpose of Retrieval-Augmented Gen...,[2\nFig. 1. Technology tree of RAG research. T...,The primary purpose is to reduce hallucination...
3,What are the three core components of RAG?,"The three core components of RAG are ""Retrieva...",[ponents intricately collaborate to form a coh...,"The core components of RAG are Retrieval, Gene..."
4,What is the drawback of Naive RAG in the retri...,The drawback of Naive RAG in the retrieval pha...,[4\nFig. 3. Comparison between the three parad...,"Naive RAG struggles with precision and recall,..."


Let's see how RAGAS treats a Single Turn Sample for evaluation and then we can scale it up for the entire test set.

In [11]:
# Getting contexts for the question
contexts = []
for i in retriever.invoke("What is Retrieval-Augmented Generation (RAG)?"):
    contexts.append(i.page_content)

In [12]:
# Creating a single turn sample

from ragas import SingleTurnSample

# Creating a single turn sample object
sample = SingleTurnSample(
    user_input="What is Retrieval-Augmented Generation (RAG)?",
    reference="RAG enhances LLMs by incorporating knowledge from external databases, improving accuracy and credibility for knowledge-intensive tasks.",
    retrieved_contexts=contexts,
    response = rag_chain.invoke("What is Retrieval-Augmented Generation (RAG)?"))

print(sample.to_dict())

{'user_input': 'What is Retrieval-Augmented Generation (RAG)?', 'retrieved_contexts': ['1\nRetrieval-Augmented Generation for Large\nLanguage Models: A Survey\nYunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, Yuxi Bic, Yi Daia, Jiawei Suna, Meng\nWangc, and Haofen Wanga,c\naShanghai Research Institute for Intelligent Autonomous Systems, Tongji University\nbShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\ncCollege of Design and Innovation, Tongji University\nAbstract —Large Language Models (LLMs) showcase impres-\nsive capabilities but encounter challenges like hallucination,\noutdated knowledge, and non-transparent, untraceable reasoning\nprocesses. Retrieval-Augmented Generation (RAG) has emerged\nas a promising solution by incorporating knowledge from external\ndatabases. This enhances the accuracy and credibility of the\ngeneration, particularly for knowledge-intensive tasks, and allows\nfor continuous knowledge updates and inte

In [23]:
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ContextEntityRecall,
    NoiseSensitivity,
    ResponseRelevancy,
    Faithfulness,
)


embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

# Defining each metric that we wanna see
metrics = {
        "Context Precision": LLMContextPrecisionWithReference(llm=llm),
        "Context Recall": LLMContextRecall(llm=llm),
        "Context Entities Recall": ContextEntityRecall(llm=llm),
        "Noise Sensitivity": NoiseSensitivity(llm=llm),
        "Response Relevancy": ResponseRelevancy(llm=llm,embeddings=OpenAIEmbeddings()),
        "Faithfulness": Faithfulness(llm=llm),
    }

# Define a function to evaluate all metrics for a sample
def evaluate_metrics(sample: SingleTurnSample, metrics:dict):
    # Results dictionary to store the metric values
    results = {}
    # Iterating through the metrics dictionary
    for metric_name, metric in metrics.items():

        try:
            results[metric_name] = metric.single_turn_score(sample)

        except Exception as e:
            results[metric_name] = f"Error: {e}"

    return results



In [14]:
results = evaluate_metrics(sample = sample, metrics = metrics)
results


{'Context Precision': 0.9166666666361111,
 'Context Recall': 1.0,
 'Context Entities Recall': 0.3333333322222222,
 'Noise Sensitivity': 0.5555555555555556,
 'Response Relevancy': 0.9463667521553832,
 'Faithfulness': 1.0}

Let's scale this up to get each of our test case evaluated

In [15]:
# Function to evaluate each row

def evaluate_row(row, metrics):
    """
    Creates a single turn sample for the row
    Evaluates all metrics for it
    Returns a dictionary containing all metrics

    """

    # Create a SingleTurnSample for every row
    sample = SingleTurnSample(
        user_input=row['question'],
        reference=row['ground_truth'],
        retrieved_contexts=row['contexts'],
        response = row['answer']
    )
    
    # Evaluate metrics for the sample
    results = {}
    for metric_name, metric in metrics.items():
        try:
            results[metric_name] = metric.single_turn_score(sample)
        except Exception as e:
            results[metric_name] = f"Error: {e}"
    
    return results

# Function to evaluate the entire dataframe of testset

def evaluate_dataframe(df, metrics):
    """
    Iterates through the df test set
    For every row uses evaluate_row function to get result dictionary
    Append each dictionary to a list
    Uses list to create the result dataframe

    """
    results = []
    for _, row in df.iterrows():
        row_results = evaluate_row(row, metrics)
        results.append(row_results)
    return pd.DataFrame(results)

In [16]:
df_evaluation = evaluate_dataframe(df1[:3],metrics = metrics) 

In [19]:
df_evaluation

Unnamed: 0,Context Precision,Context Recall,Context Entities Recall,Noise Sensitivity,Response Relevancy,Faithfulness
0,0.916667,1.0,0.333333,0.583333,0.946367,1.0
1,1.0,1.0,1.0,0.0,1.0,1.0
2,0.5,0.0,0.0,1.0,0.953538,1.0


In [22]:
df1.iloc[:3]

Unnamed: 0,question,answer,contexts,ground_truth
0,What is Retrieval-Augmented Generation (RAG)?,Retrieval-Augmented Generation (RAG) is a tech...,[1\nRetrieval-Augmented Generation for Large\n...,RAG enhances LLMs by incorporating knowledge f...
1,What are the three main paradigms of RAG?,"The three main paradigms of RAG are Naive RAG,...",[external knowledge bases. The survey showcase...,"The three main paradigms of RAG are Naive RAG,..."
2,What is the primary purpose of RAG in large la...,The primary purpose of Retrieval-Augmented Gen...,[2\nFig. 1. Technology tree of RAG research. T...,The primary purpose is to reduce hallucination...


In [24]:
from ragas.testset import TestsetGenerator
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))
generator = TestsetGenerator(llm=llm, embedding_model=embeddings)

In [None]:
testset_size = 10  # Number of samples to generate

testset = generator.generate_with_langchain_docs(chunks, testset_size)