In [1]:
# Importing necessary libraries
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceEmbeddings

import os
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
RAG_PROMPT_TEMPLATE = """
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not give an answer.</s>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
"""
prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [5]:
"""Answer a question using RAG with the given knowledge index."""
def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[LangchainDocument]]:
    
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text


    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = ""
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    # Redact an answer
    answer = llm.invoke({"question":question, "context":context})

    return answer, relevant_docs

In [6]:
"""Runs RAG tests on the given dataset and saves the results to the given output file."""
def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm: LLM,
    knowledge_index: VectorStore,
    output_file: str,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except:
        outputs = []

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in [output["question"] for output in outputs]:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            # "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": [doc for doc in relevant_docs],
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [7]:
EVALUATION_PROMPT = """### Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the appropriateness and relevance  of the response strictly based on the given score rubric, not evaluating in general.
2. Also comment on the hallucination of the response to evaluate based on reference answer.
3. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
4. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}} [RESULT] {{Comment regarding hallucination of the response}}\"
5. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, appropriate and relevant based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [8]:
"""Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
def evaluate_answers(
    answer_path: str,
    eval_chat_model: LLM,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        answers = json.load(open(answer_path, "r"))

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue
        eval_result = eval_chat_model.invoke({"instruction":experiment["question"],
            "response":experiment["generated_answer"],
            "reference_answer":experiment["true_answer"]})
        feedback, score, hallucination_comment = [item.strip() for item in eval_result.split("[RESULT]")]
        experiment[f"eval_score_{evaluator_name}"] = score
        experiment[f"eval_feedback_{evaluator_name}"] = feedback
        experiment[f"eval_hallucination_feedback_{evaluator_name}"] = hallucination_comment

        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [10]:
if not os.path.exists("./output1"):
    os.mkdir("./output1")

#RAG model initialization
MODEL = "phi3"
model = Ollama(model=MODEL)
chain = prompt | model

#PDF loader
loader = PyPDFLoader("tn.pdf")
docs = loader.load()
print(docs)
special = u"\uf076"

for doc in docs:
    doc.page_content = doc.page_content.replace('\n', '')
    doc.page_content = doc.page_content.replace(special,'')
    doc.page_content = doc.page_content.replace('  ',' ')
# Transform data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

# Embedding the input chunk
embeddings = OllamaEmbeddings(model=MODEL)
docsearch = FAISS.from_documents(texts, embeddings)
retriever = docsearch.as_retriever()

# Load the evaluation dataset from csv file
generated_questions = pd.read_csv("evaluation.csv")
eval_dataset = datasets.Dataset.from_pandas(generated_questions, preserve_index=False)

# Initialize the evaluator model 
evaluator_name = "llama3"
evalmodel = Ollama(model=evaluator_name)
eval_prompt = PromptTemplate.from_template(EVALUATION_PROMPT)

from operator import itemgetter

evalchain = eval_prompt | evalmodel

settings_name = "outputmistral"
output_file_name = f"./output1/rag3_{settings_name}.json"


print("Running RAG...")
run_rag_tests(
    eval_dataset=eval_dataset,
    llm=chain,
    knowledge_index=docsearch,
    output_file=output_file_name,
    verbose=False,
)

print("Running evaluation...")
evaluate_answers(
    output_file_name,
    evalchain,
    evaluator_name,
    evaluation_prompt_template,
)           

[Document(page_content=' \nHIGH LIGHTS FOR BUDGET ESTIMATES 202 4-25 \nTamil Development  \n\uf076 The twin epics of Tamil literature, Silappathikaram and \nManimegalai, will be translated into 25 Indian and \nforeign languages at a cost of Rs. 2 crore.  \n\uf076 In order to translate literary works and  spread the \neuphonious  notes  of Tamil language across the world, \nan allocation of Rs.2 crore will be made in the coming \nyear. \n\uf076 In order to ensure that Tamil flourishes in the rapidly \nadvancing technological landscape , an allocation of \nRs.5 crore will be made to enable startups  \nto develop  Natural Language Processing and Large \nLanguage Models  based on machine learning and \nartificia l intelligence.  \n\uf076 In order to enable future generations to appreciate \nthe richness of Tamil language and the glorious history \nof Tamil people, a project to digitize rare books and \ndocuments will be undertaken at a cost of Rs.2 crore.  \n \n ', metadata={'source': 'tn.

100%|██████████| 10/10 [04:09<00:00, 24.95s/it]


Running evaluation...


 10%|█         | 1/10 [01:13<10:59, 73.25s/it]

Feedback: The response does not provide any estimated cost for constructing bait arches, fish landing centers, or other coastal protection works in various districts, which makes it completely irrelevant to the question. [RESULT] 1 [RESULT] This response hallucinates by not providing any relevant information and instead provides a statement that is factually incorrect (the reference answer is Rs.450 crore).


 20%|██        | 2/10 [02:03<07:57, 59.71s/it]

Feedback: The response accurately captures the purpose of allocating Rs.20 crore in the Budget Estimates for animal husbandry, which is to strengthen animal sterilization projects and ensure effective operation of animal birth control centres across Tamil Nadu. [RESULT] 5 [RESULT] The response does not exhibit any hallucination as it directly addresses the purpose of allocation mentioned in Document 1 regarding Animal Husbandry.


 30%|███       | 3/10 [03:03<07:00, 60.04s/it]

Feedback: The response does not provide an estimated cost for modernizing dairy plants with automated machines in cities like Tiruchirapalli, Madurai, and Salem, which indicates that the answer cannot be deduced from the given context. This shows a good understanding of the limitations of the provided information [RESULT] 2 [RESULT] The response does not hallucinate as it clearly states that an answer cannot be deduced from the given documents.


 40%|████      | 4/10 [03:54<05:38, 56.48s/it]

Feedback: The response accurately reflects the lack of information provided in the context regarding the establishment of mini textile parks or their distribution across various districts such as Karur, Erode, and Virudhunagar. [RESULT] 1 [RESULT] This response does not contain any hallucination as it correctly acknowledges that an answer cannot be deduced from the given documents.


 50%|█████     | 5/10 [04:51<04:42, 56.56s/it]

Feedback: The response correctly acknowledges the absence of relevant information in the provided context to deduce details about the 'Research and Business Development Fund for Technical Textiles and Man Made Fibre'. This shows an understanding that the fund's corpus cannot be determined without further information. [RESULT] 4 [RESULT] There is no hallucination of the response, as it accurately reflects the limitations imposed by the lack of relevant context.


 60%|██████    | 6/10 [06:37<04:53, 73.36s/it]

Feedback: The response does not attempt to provide an answer to the question, instead, it correctly acknowledges that the provided context does not contain information regarding the increased capital subsidy percentage for major investments under the existing policy for technical textiles. This shows understanding of the limitations of the given information. [RESULT] 5 [RESULT] There is no hallucination in this response as it accurately reflects the lack of relevant information in the provided context, which aligns with the reference answer that there is no increased capital subsidy percentage for major investments under the existing policy for technical textiles.


 70%|███████   | 7/10 [08:07<03:56, 78.78s/it]

Feedback: The response correctly acknowledges that the provided context does not contain information about a specific duration for disbursing the capital subsidy for technical textiles investments. It accurately concludes that an answer cannot be deduced from it. [RESULT] 4 [RESULT] The response does not hallucinate as it only states what can be inferred from the given context, which is the absence of information about the duration.


 80%|████████  | 8/10 [09:28<02:39, 79.57s/it]

Feedback: The response does not provide any information about the districts where artificial reefs will be constructed as part of coastal protection works, which makes it difficult to assess its correctness or relevance to the reference answer. [RESULT] 2 [RESULT] The response seems to have a hallucination that there is no relevant information in the provided context, but it does not attempt to provide any actual information about the districts.


 90%|█████████ | 9/10 [10:52<01:20, 80.73s/it]

Feedback: The response correctly acknowledges that the estimated cost for establishing mini textile parks in various districts cannot be deduced from the provided context. This shows an understanding of the limitations of the given information. [RESULT] 2 [RESULT] The response does not provide any estimation, but it is accurate in stating that the provided context is insufficient to make such a calculation. There is no hallucination of the response as it accurately reflects the given instruction.


100%|██████████| 10/10 [12:19<00:00, 73.97s/it]

Feedback: The response fails to provide a meaningful explanation of the 'Research and Business Development Fund for Technical Textiles and Man Made Fibre' due to the lack of relevant context. It correctly acknowledges that the provided information does not mention the specific fund. [RESULT] 2 [RESULT] This response shows some awareness that additional context is needed, but ultimately provides no insight into the purpose of the fund.





In [15]:
import glob

outputs = []
for file in glob.glob("./output1/*.json"):
    output = pd.DataFrame(json.load(open(file, "r")))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

In [16]:
result["eval_score_llama3"] = result["eval_score_llama3"].apply(lambda x: int(x) if isinstance(x, str) else 1)
result["eval_score_llama3"] = (result["eval_score_llama3"] - 1) / 4

In [17]:
# Calculating the score of the RAG model
average_scores = result.groupby("settings")["eval_score_llama3"].mean()
print("RAG model score: ",average_scores.sort_values())

settings
./output1\rag3_outputmistral.json    0.45
Name: eval_score_llama3, dtype: float64

In [18]:
# Storing the evaluated result in csv file
result.to_csv("eval_result1.csv")