# Notebook to evaluate the Retrieval part of the system

## Setup

In [1]:
# First we change the directory to the root directory of the project.
import os

# Strip the "notebooks/retrieve" part from the current working directory. The pattern is
# built from os.sep so that it matches on Windows ("\\") as well as on Linux/macOS ("/");
# a hard-coded backslash pattern never matches on POSIX and leaves the cwd unchanged.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))

# Alternatively, move two directories up using the following two commands instead of running the above two lines.
#%cd ..
#%cd ..

In [None]:
import pandas as pd
from tqdm import tqdm
from model.models import Agent

In [3]:
# vector search (embed_name: bge, openai, google, mistral)
# Possible parameters are below, just three were wrapped.
""" 
as_retriever function:
        Converts the Chroma object to a retriever object.
Args:
            search_type (Optional[str]): Defines the type of search that
                the Retriever should perform.
                Can be "similarity" (default), "mmr", or
                "similarity_score_threshold".
            search_kwargs (Optional[Dict]): Keyword arguments to pass to the
                search function. Can include things like:
                    k: Amount of documents to return (Default: 4)
                    score_threshold: Minimum relevance threshold
                        for similarity_score_threshold
                    fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
                    lambda_mult: Diversity of results returned by MMR;
                        1 for minimum diversity and 0 for maximum. (Default: 0.5)
                    filter: Filter by document metadata

        Returns:
            VectorStoreRetriever: Retriever class for VectorStore.
"""

# Retriever configuration: the embedding store to query and the search settings to use.
param_vs = dict(
    embed_name="openai_parser",
    param=dict(
        search_type="similarity",
        search_kwargs=dict(
            k=10,
            # score_threshold=None
        ),
    ),
)

# Agent configuration; only the vector-search stage is configured here.
model_in = dict(vector_search=param_vs)

In [4]:
agent = Agent(model_in)

## Example usage

In [6]:
#query = "I have a question regarding the skill class grade. The skill class grade only affects the final grade for the project correct? Also, from what I understand from the grading and assessment, it's worth a total of 10% of the final grade so if you missed one skill class, you would only miss out on 5% of the final grade, correct?"

In [7]:
#agent.retrieve_content(query)

## Using provided QA pairs

In [None]:
# Alternative input: pd.read_csv("data/QA/qa_pairs.csv", sep=";")
# Load the cleaned QA pairs, drop rows lacking a question or an answer,
# then drop every column that still contains a missing value.
df = (
    pd.read_csv("data/QA/qa_pairs_cleaned_V2.csv", sep=";")
    .dropna(subset=["Question", "Answer"])
    .dropna(axis=1)
)
df

In [None]:
# Run the retriever once per question; the list stays aligned with the rows of df.
df_retrieved_documents = [
    agent.retrieve_content(question) for question in tqdm(df["Question"])
]

df["Retrieved Documents"] = df_retrieved_documents

In [10]:
# Save the results to a pickle file to save computational resources (time and costs) and preserve the data structure.
df.to_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl')

In [None]:
df

## CHECKPOINT 1
This notebook is divided into checkpoints so that work can be resumed later by loading the previously saved results.

Let's look at the first question and its retrieved documents. The saved retrievals can be loaded by running the cell below.

In [1]:
##CHECKPOINT cell
import os
import pandas as pd

# Move to the project root. The pattern is built from os.sep so it matches on Windows
# and on Linux/macOS alike; a hard-coded backslash pattern is a no-op on POSIX.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))

# Reload the retrieval results saved earlier in this notebook.
# NOTE: unpickling executes code from the file, so only load pickles you created yourself.
df = pd.read_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl')

In [4]:
df.rename(columns={'DocumentList': 'idd'}, inplace=True)

In [5]:
# Entries that are exactly "." become "0" (presumably "." marks a question without a
# reference document - TODO confirm). Further down, an idd of [0] is scored as NaN
# in the retrieval metric.
df["idd"]=df["idd"].replace(".", "0")

In [None]:
from model.raw_data import documents

page_content_to_iid = {doc.page_content: doc.metadata['iid'] for doc in documents}

def extract_ids_from_docs(retrieved_docs):
    """Map retrieved documents to their ``iid`` values via ``page_content_to_iid``.

    Documents whose ``page_content`` is not a key of ``page_content_to_iid`` are
    skipped, so the result can be shorter than ``retrieved_docs``.
    """
    return [
        page_content_to_iid[doc.page_content]
        for doc in retrieved_docs
        if doc.page_content in page_content_to_iid
    ]

df["idd_retrieved"] = df["Retrieved Documents"].apply(extract_ids_from_docs)

df

In [7]:
# Convert each comma-separated 'idd' string into a list of integers
df['idd'] = df['idd'].apply(lambda raw_ids: [int(part) for part in raw_ids.split(',')])

In [None]:
import numpy as np
# Per question: share of the ground-truth document ids (idd) that also appear among the
# retrieved ids. Rows whose idd is [0] get NaN instead of a score.
metrics = [
    np.nan
    if idd == [0]
    else len(set(idd).intersection(idd_retrieved)) / len(idd)
    for idd, idd_retrieved in zip(df["idd"], df["idd_retrieved"])
]

df["retrieval_metric"] = metrics

df

In [None]:
print("Mean retrieval metric: ", df["retrieval_metric"].mean())

In [None]:
df["idd"]

In [11]:
from model.raw_data import documents

def return_docs_from_ids(docs, ids):
    """Return the documents in ``docs`` whose ``metadata['iid']`` is contained in ``ids``.

    The relative order of ``docs`` is preserved in the returned list.
    """
    return [doc for doc in docs if doc.metadata['iid'] in ids]


#df["idd"] = df["idd"].apply(lambda x: [int(i) for i in str(x).split(',') if i.strip().isdigit()]) # not needed when rerunning from the checkpoint

df["Truth"] = [return_docs_from_ids(documents, ids) for ids in df["idd"]]

In [None]:
df

In [13]:
def display_data(df: pd.DataFrame, index: int):
    """Print the question, answer, retrieved documents and true documents of one row.

    Args:
        df: Frame with "Question", "Answer", "Retrieved Documents" and "Truth" columns;
            the last two hold iterables of objects exposing ``page_content``.
        index: Positional (``iloc``) index of the row to print.
    """
    row = df.iloc[index]
    separator = "-" * 10

    print("Q:", row["Question"])
    print("A:", row["Answer"])

    sections = (
        ("Retrieved Documents:", "Retrieved Documents"),
        ("True Documents:", "Truth"),
    )
    for heading, column in sections:
        print(separator)
        print(heading)
        for doc in row[column]:
            print(doc.page_content)

In [None]:
i=15
display_data(df, i)

### Generating responses

In [None]:
from model.models import Agent
from tqdm import tqdm

def generate_answer(queries, multi_query=True, retrieve_type="vector_search", rerank_type=None):
    """Generate one response per query with a freshly configured ``Agent``.

    Args:
        queries: Iterable of queries; each item is passed to ``Agent.generate_response``.
        multi_query: Forwarded unchanged to ``Agent.generate_response``.
        retrieve_type: Forwarded unchanged to ``Agent.generate_response``.
        rerank_type: Forwarded unchanged to ``Agent.generate_response``.

    Returns:
        A list holding the value returned by ``Agent.generate_response`` for every
        query, in input order.
    """
    model_in = {
        # vector search
        "vector_search": {
            "embed_name": "openai_parser",
            "param": {
                "search_type": "similarity",
                "search_kwargs": {
                    "k": 10,
                    # "score_threshold": None
                },
            },
        },
        # multi query
        "multi_query": {
            "param": {
                "llm_model": "openai",
                "temperature": 0,
                "top_p": None,
                "top_k": None,
                "frequency_penalty": None,
                "max_tokens": None,
            },
        },
        # reranker
        "google_reranker": {
            "param": {
                "k": 5,
            },
        },
        # Response LLM
        "response_llm": {
            "param": {
                "llm": {
                    "llm_model": "google",
                    "temperature": 0.2,
                    "top_p": None,
                    "top_k": None,
                    "frequency_penalty": None,
                    "max_tokens": None,
                },
                "shots": {
                    "embed_name": "openai",
                    "k": 0,  # 0 means no shot
                },
            },
        },
    }

    agent = Agent(model_in, DEBUG=False)

    # self_reflection is always switched off in this evaluation.
    return [
        agent.generate_response(
            query,
            multi_query=multi_query,
            retrieve_type=retrieve_type,
            rerank_type=rerank_type,
            self_reflection=False,
        )
        for query in tqdm(queries)
    ]

In [None]:
queries = df["Question"].to_list()
results = generate_answer(queries, multi_query=True, retrieve_type="vector_search", rerank_type="google_reranker")

In [17]:
answer_after_generation = [result["response"] for result in results]

In [None]:
df["Answer after generation"] = answer_after_generation
df

In [19]:
# Save the results to a pkl file to save computational resources (time and costs)
df.to_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl') # Save to pickle to preserve datastructure

## CHECKPOINT 2 - Evaluation using RAGAS
Let's look at the evaluation of the entire pipeline using RAGAS. One can load the saved retrievals and generations by running the cell below

In [1]:
##CHECKPOINT cell

import os
import pandas as pd

# Move to the project root. The pattern is built from os.sep so it matches on Windows
# and on Linux/macOS alike; a hard-coded backslash pattern is a no-op on POSIX.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))

# Reload the retrievals and generated answers saved earlier in this notebook.
# NOTE: unpickling executes code from the file, so only load pickles you created yourself.
df = pd.read_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl')

In [None]:
df

In [3]:
questions = df["Question"].to_list()
answers_by_Martijn = df["Answer"].to_list()
answer_after_generation = df["Answer after generation"].to_list()

# One entry per question. Pay attention: each entry is a List[str] with a single element,
# namely the page contents of all retrieved documents joined by spaces.
contexts = [
    [" ".join(doc.page_content for doc in retrieved_docs)]
    for retrieved_docs in df["Retrieved Documents"]
]

In [4]:
test_example = False  # set to True to use just one example of the dataset

if test_example:
    # Single-row dataset built from the entry at position 6.
    exmp = dict(
        question=[questions[6]],
        answer=[answer_after_generation[6]],
        contexts=[contexts[6]],
        ground_truth=[answers_by_Martijn[6]],
    )
else:
    # Full dataset.
    exmp = dict(
        question=questions,
        answer=answer_after_generation,
        contexts=contexts,
        ground_truth=answers_by_Martijn,
    )

In [5]:
from datasets import Dataset
custom_dataset = Dataset.from_dict(exmp)

In [None]:
print(custom_dataset)

In [None]:
# Load local environment variables
from dotenv import load_dotenv
print("Environment variables are loaded = ", load_dotenv())

In [8]:
import os
openai_api_key = os.getenv("OPENAI_API_KEY")
huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")

In [None]:
from model.utils.utils import get_llm_model, get_embed_model
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from ragas import evaluate
from tqdm import tqdm
import time

# LLM and embedding model handed to RAGAS for computing the metrics.
langchain_llm = get_llm_model("openai", temperature=0.7)
langchain_embeddings = get_embed_model("openai")

# One single-row result frame per question. The frames are gathered in a list and
# concatenated once after the loop: calling pd.concat inside the loop re-copies all
# previously collected rows on every iteration.
evaluation_frames = []

for i in tqdm(range(len(df))):

    # Evaluate one question at a time.
    exmp = {
        'question': [questions[i]],
        'answer': [answer_after_generation[i]],
        'contexts': [contexts[i]],
        'ground_truth': [answers_by_Martijn[i]]
    }

    custom_dataset = Dataset.from_dict(exmp)

    results = evaluate(custom_dataset, metrics=[context_precision, context_recall, answer_relevancy, faithfulness], llm=langchain_llm, embeddings=langchain_embeddings, is_async=True)

    evaluation_frames.append(results.to_pandas())

    time.sleep(3) # To avoid the API rate limit

# With an empty df nothing was evaluated; fall back to an empty frame so that
# df_results is always defined for the cells below.
df_results = pd.concat(evaluation_frames) if evaluation_frames else pd.DataFrame()

In [None]:
df_results = df_results.reset_index().drop(columns=["index"])
df_results

In [12]:
# Save the results to a pkl file to save computational resources (time and costs)
df_results.to_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google_evaluated.pkl') # Save to pickle to preserve datastructure

## CHECKPOINT 3 - Evaluation manual inspection
Let's manually inspect the RAGAS evaluation results question by question. The saved evaluation results can be loaded by running the cell below.

In [1]:
##CHECKPOINT cell

import os
import pandas as pd

# Move to the project root. The pattern is built from os.sep so it matches on Windows
# and on Linux/macOS alike; a hard-coded backslash pattern is a no-op on POSIX.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))

# Reload the RAGAS evaluation results saved earlier in this notebook.
# NOTE: unpickling executes code from the file, so only load pickles you created yourself.
df_results = pd.read_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google_evaluated.pkl')

In [None]:
df_results

In [None]:
import numpy as np
# Mean of every RAGAS metric over all evaluated questions.
for label, column in (
    ("Context Precision:", "context_precision"),
    ("Context Recall:", "context_recall"),
    ("Answer Relevancy:", "answer_relevancy"),
    ("Faithfulness:", "faithfulness"),
):
    print(label, np.mean(df_results[column]))

In [4]:
def display_data(df: pd.DataFrame, index: int):
    """Print one evaluated example for manual inspection.

    Args:
        df: Frame with "question", "answer", "contexts" and "ground_truth" columns.
        index: Positional (``iloc``) index of the row to print.
    """
    row = df.iloc[index]
    # Report the row actually requested; the previous code printed the global `i`,
    # which is undefined on a fresh kernel and stale otherwise.
    print('Index', index)
    print("Q:", row["question"])
    print("A:", row["answer"])
    print(10*"-")
    print("Retrieved Documents:")
    print(row["contexts"])
    print(10*"-")
    print("True Answer:")
    print(row["ground_truth"])
    # Two blank lines separate consecutive examples.
    print()
    print()