# Notebook to evaluate the Retrieval part of the system

## Setup

### Windows

In [1]:
import os
# First we change the directory to the root directory of the project, so that relative
# paths such as "data/QA/..." resolve. The notebook lives in <root>/notebooks/retrieve;
# the suffix is built with os.sep so it matches on Windows ("\\") as well as on
# macOS/Linux ("/"). If the suffix is absent the working directory stays unchanged.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))

### MacOS

In [2]:
# Alternatively, for MacBooks: move two directories up using the following two commands instead of running the above cell.
#%cd ..
#%cd ..

### Continuation of setup

In [3]:
import pandas as pd
from tqdm import tqdm
from model.models import Agent

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martijn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# vector search (embed_name: bge, openai, google, mistral)
# Possible parameters are below, I wrapped just three of them, but if you need others, you can ask Dima.
""" 
as_retriever function:
        Converts the Chroma object to a retriever object.
Args:
            search_type (Optional[str]): Defines the type of search that
                the Retriever should perform.
                Can be "similarity" (default), "mmr", or
                "similarity_score_threshold".
            search_kwargs (Optional[Dict]): Keyword arguments to pass to the
                search function. Can include things like:
                    k: Amount of documents to return (Default: 4)
                    score_threshold: Minimum relevance threshold
                        for similarity_score_threshold
                    fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
                    lambda_mult: Diversity of results returned by MMR;
                        1 for minimum diversity and 0 for maximum. (Default: 0.5)
                    filter: Filter by document metadata

        Returns:
            VectorStoreRetriever: Retriever class for VectorStore.
"""

# Vector-search settings: the embedding index to query plus the arguments that are
# forwarded to `as_retriever` (search type and its keyword arguments).
param_vs = dict(
    embed_name="openai_parser",
    param=dict(
        search_type="similarity",
        # "score_threshold" is left unset on purpose.
        search_kwargs=dict(k=10),
    ),
)

# Configuration handed to the Agent: only the vector-search component is set up here.
model_in = dict(vector_search=param_vs)

In [5]:
# Build the agent from the `model_in` configuration dict (vector search only).
agent = Agent(model_in)

## Example usage

In [6]:
#query = "I have a question regarding the skill class grade. The skill class grade only affects the final grade for the project correct? Also, from what I understand from the grading and assessment, it's worth a total of 10% of the final grade so if you missed one skill class, you would only miss out on 5% of the final grade, correct?"

In [7]:
#agent.retrieve_content(query)

## Using provided QA pairs

In [8]:
# Alternative input file (disabled):
#df = pd.read_csv("data/QA/qa_pairs.csv", sep=";")

# Read the semicolon-separated QA pairs, drop rows that lack a question or an answer,
# then drop every column that still contains a missing value.
df = (
    pd.read_csv("data/QA/qa_pairs_cleaned_V2.csv", sep=";")
    .dropna(subset=["Question", "Answer"])
    .dropna(axis=1)
)
df

Unnamed: 0,Question,Answer,DocumentList
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,.
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,.
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",.
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,.
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","43, 44"
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","41, 42, 43, 44, 45"
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"35, 36, 48, 49"
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,44
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,44
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"6, 11, 31, 33"


In [9]:
# Query the retriever once per question and keep the returned documents in row order.
# (A disabled earlier variant iterated over df.to_dict(orient="records") so that the
# answer of each QA pair was available inside the loop as well.)
df_retrieved_documents = [
    agent.retrieve_content(question) for question in tqdm(df["Question"])
]

df["Retrieved Documents"] = df_retrieved_documents

100%|██████████| 57/57 [00:26<00:00,  2.12it/s]


In [10]:
# Save the results to a pickle file so the retrieval does not have to be re-run (saves time and API costs)
df.to_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl') # Pickle is used (not CSV) to preserve the Python objects stored in the frame

In [11]:
# Show the frame, now including the "Retrieved Documents" column.
df

Unnamed: 0,Question,Answer,DocumentList,Retrieved Documents
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,.,[page_content='# Rules and Regulations\n\n## S...
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,.,[page_content='# ARTICLE 1.8 COMMUNICATION\n\n...
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",.,[page_content='# Rules and Regulations\n\n## S...
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,.,[page_content='# Article 5.10 Determination an...
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","43, 44",[page_content='\n| 10-points Descriptor | UK |...
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","41, 42, 43, 44, 45",[page_content='# Section 3 Procedures\n\n## Ru...
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"35, 36, 48, 49",[page_content='# Rules and Regulations\n\n## S...
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,44,[page_content='# Section 3 Procedures\n\n## Ru...
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,44,[page_content='# Rules and Regulations\n\n## S...
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"6, 11, 31, 33",[page_content='# Rules and Regulations\n\n## S...


## Inspection - CHECKPOINT
Let's look at the first question and its retrieved documents. The saved retrievals can be loaded by running the cell below.

In [1]:
##CHECKPOINT cell
# Restores the frame saved earlier so that the cells above do not have to be re-run.

import os
import pandas as pd

# Move to the project root (the notebook lives in <root>/notebooks/retrieve). The suffix
# is built with os.sep so it matches on Windows as well as on macOS/Linux; if it is
# absent the working directory stays unchanged.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))
# NOTE: unpickling can execute arbitrary code - only load pickle files written by this notebook.
df = pd.read_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl')

In [2]:
#Add the idd column of the cleaned qa_pairs.csv to the dataframe. This was later added, so we check if it's already present
#if "idd" not in df.columns:
#    temp_df = pd.read_csv("data/QA/qa_pairs_cleaned_V2.csv", sep=";")
#    df["idd"] = temp_df["DocumentList"].to_list()

In [3]:
# Convert empty ones to idd 0
#temp_df["DocumentList"] = temp_df["DocumentList"].replace(".", "0")

# Convert str to list of indices (integers)
#temp_df["DocumentList"] = temp_df["DocumentList"].apply(lambda x: [int(idx) for idx in x.split(",")])

In [4]:
#df["idd"] = df["idd"].replace("0", ".")
# Rename the reference-document column to 'idd'; a no-op when no 'DocumentList' column exists.
df = df.rename(columns={'DocumentList': 'idd'})

In [5]:
# Entries equal to "." (no reference documents listed) become "0".
df["idd"] = df["idd"].replace(to_replace=".", value="0")

In [6]:
from model.raw_data import documents

# Lookup table from a raw document's text to its 'iid'. If several documents share the
# same text, the last one in `documents` wins.
page_content_to_iid = dict(
    (document.page_content, document.metadata['iid']) for document in documents
)

def extract_ids_from_docs(retrieved_docs, content_to_iid=None):
    """Map retrieved documents to the ids of the raw documents with identical text.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` attribute.
        content_to_iid: Optional mapping from page content to document id. When
            omitted, the notebook-level ``page_content_to_iid`` table is used,
            which is the original behaviour.

    Returns:
        List of ids in retrieval order, one for each document whose
        ``page_content`` is a key of the lookup table. Documents whose content is
        not in the table are skipped, so the result may be shorter than the input
        or empty.
    """
    # `is None` (not truthiness) so that an explicitly passed empty mapping is honoured.
    lookup = page_content_to_iid if content_to_iid is None else content_to_iid
    return [lookup[doc.page_content] for doc in retrieved_docs if doc.page_content in lookup]

# Translate each row's retrieved documents into raw-document ids.
# NOTE(review): in the saved output every resulting list is empty, i.e. none of the
# retrieved page contents matched a raw document text exactly - worth checking before
# trusting the retrieval metric computed from this column.
df["idd_retrieved"] = [extract_ids_from_docs(docs) for docs in df["Retrieved Documents"]]

df

Unnamed: 0,Question,Answer,idd,Retrieved Documents,idd_retrieved
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,0,[page_content='# Rules and Regulations\n\n## S...,[]
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,0,[page_content='# ARTICLE 1.8 COMMUNICATION\n\n...,[]
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",0,[page_content='# Rules and Regulations\n\n## S...,[]
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,0,[page_content='# Article 5.10 Determination an...,[]
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","43, 44",[page_content='\n| 10-points Descriptor | UK |...,[]
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","41, 42, 43, 44, 45",[page_content='# Section 3 Procedures\n\n## Ru...,[]
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"35, 36, 48, 49",[page_content='# Rules and Regulations\n\n## S...,[]
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,44,[page_content='# Section 3 Procedures\n\n## Ru...,[]
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,44,[page_content='# Rules and Regulations\n\n## S...,[]
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"6, 11, 31, 33",[page_content='# Rules and Regulations\n\n## S...,[]


In [7]:
# Convert 'idd' from a comma-separated string (e.g. "43, 44") into a list of integers.
# Values that are already lists/tuples are passed through, so the cell can be re-run, or
# run on a checkpoint that was saved after the conversion, without raising AttributeError.
df['idd'] = df['idd'].apply(
    lambda x: list(x) if isinstance(x, (list, tuple)) else [int(part) for part in str(x).split(',')]
)

In [8]:
import numpy as np

# Per-question score: the fraction of reference ids ('idd') that also appear among the
# retrieved ids. Rows whose reference list is [0] (no reference documents) get NaN
# instead of a score.
metrics = [
    np.nan
    if expected_ids == [0]
    else len(set(expected_ids).intersection(retrieved_ids)) / len(expected_ids)
    for expected_ids, retrieved_ids in zip(df["idd"], df["idd_retrieved"])
]

df["retrieval_metric"] = metrics

df

Unnamed: 0,Question,Answer,idd,Retrieved Documents,idd_retrieved,retrieval_metric
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,[0],[page_content='# Rules and Regulations\n\n## S...,[],
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,[0],[page_content='# ARTICLE 1.8 COMMUNICATION\n\n...,[],
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",[0],[page_content='# Rules and Regulations\n\n## S...,[],
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,[0],[page_content='# Article 5.10 Determination an...,[],
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","[43, 44]",[page_content='\n| 10-points Descriptor | UK |...,[],0.0
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","[41, 42, 43, 44, 45]",[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"[35, 36, 48, 49]",[page_content='# Rules and Regulations\n\n## S...,[],0.0
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,[44],[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,[44],[page_content='# Rules and Regulations\n\n## S...,[],0.0
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"[6, 11, 31, 33]",[page_content='# Rules and Regulations\n\n## S...,[],0.0


In [9]:
# Average score over all questions; NaN rows (no reference documents) are skipped by pandas' mean().
print("Mean retrieval metric: ", df["retrieval_metric"].mean())

Mean retrieval metric:  0.0


In [10]:
# Inspect the reference ids after conversion to integer lists.
df["idd"]

0                          [0]
1                          [0]
2                          [0]
3                          [0]
4                     [43, 44]
5         [41, 42, 43, 44, 45]
6             [35, 36, 48, 49]
7                         [44]
8                         [44]
9              [6, 11, 31, 33]
10    [35, 36, 37, 38, 39, 40]
11                    [36, 38]
12         [30, 43, 44, 6, 11]
13                    [48, 49]
14                    [48, 49]
15                    [30, 31]
16         [30, 31, 6, 11, 33]
17                        [48]
18                    [30, 31]
19         [30, 31, 33, 6, 11]
20         [30, 31, 33, 6, 11]
21                         [0]
22                [76, 77, 78]
23                        [52]
24                    [30, 31]
25                [30, 31, 33]
26                [30, 31, 33]
27                [30, 31, 33]
28                    [39, 40]
29                         [0]
30                        [40]
31             [6, 11, 31, 33]
32      

In [11]:
from model.raw_data import documents

def return_docs_from_ids(docs, ids):
    """Select the documents whose ``metadata['iid']`` is contained in ``ids``.

    Args:
        docs: Iterable of documents exposing a ``metadata`` mapping with an 'iid' key.
        ids: Container of ids to keep.

    Returns:
        List of matching documents in the order they occur in ``docs``. Selection is
        by metadata id, not by position in ``docs``.
    """
    return [candidate for candidate in docs if candidate.metadata['iid'] in ids]


#df["idd"] = df["idd"].apply(lambda x: [int(i) for i in str(x).split(',') if i.strip().isdigit()]) # not needed when rerunning from the checkpoint

# Attach, per question, the raw documents whose 'iid' is listed in that row's 'idd'.
df["Truth"] = list(map(lambda ids: return_docs_from_ids(documents, ids), df["idd"]))

In [12]:
# Show the frame, now including the "Truth" column.
df

Unnamed: 0,Question,Answer,idd,Retrieved Documents,idd_retrieved,retrieval_metric,Truth
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,[0],[page_content='# Rules and Regulations\n\n## S...,[],,[]
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,[0],[page_content='# ARTICLE 1.8 COMMUNICATION\n\n...,[],,[]
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",[0],[page_content='# Rules and Regulations\n\n## S...,[],,[]
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,[0],[page_content='# Article 5.10 Determination an...,[],,[]
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","[43, 44]",[page_content='\n| 10-points Descriptor | UK |...,[],0.0,[page_content='Failure to participate in the f...
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","[41, 42, 43, 44, 45]",[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0,"[page_content='As indicated in paragraph 1, fo..."
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"[35, 36, 48, 49]",[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='The project is graded on three ...
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,[44],[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0,[page_content='If a student does not participa...
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,[44],[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='If a student does not participa...
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"[6, 11, 31, 33]",[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='Force majeure is events and ass...


In [13]:
def display_data(df: pd.DataFrame, index: int):
    """Print one QA pair followed by its retrieved and its ground-truth documents.

    Args:
        df: Frame with "Question", "Answer", "Retrieved Documents" and "Truth" columns;
            the two document columns hold iterables of objects with ``page_content``.
        index: Positional row number (looked up with ``iloc``).
    """
    row = df.iloc[index]
    separator = 10*"-"

    print("Q:", row["Question"])
    print("A:", row["Answer"])

    # Both document sections share the same layout: separator, heading, one text per line.
    sections = (
        ("Retrieved Documents:", "Retrieved Documents"),
        ("True Documents:", "Truth"),
    )
    for heading, column in sections:
        print(separator)
        print(heading)
        for doc in row[column]:
            print(doc.page_content)

In [14]:
# Inspect a single example: positional row 15 of the frame.
i=15
display_data(df, i)

Q: I had to be about 35 minutes late for today's project meeting due to religious reasons. I'm a Muslim, and so I had to attend the mandatory Friday prayers at the mosque, which is on the other side of Maastricht from our campus, which made me late for the project meeting. I've been noted down as absent from the meeting and told to email you. I also emailed you about the same problem last week (since the last project meeting was also on Friday), and I had received no response, so please let me know if I won't be penalized for both project meetings. Thank you for your time.
A: Project Meetings are mandatory on site sessions. If you’re not present when the tutors checks the attendance, then you do not get the attendance. You now have 2 missed project meetings in Phase 3, which means that you receive an automatically lowered individual grade, which will be determined by the examiners.
----------
Retrieved Documents:
# Rules and Regulations

## Section 5 Semester Project regulations

### R

## Generating responses

In [15]:
from model.models import Agent
from tqdm import tqdm

def generate_answer(queries, multi_query=True, retrieve_type="vector_search", rerank_type=None):
    """Generate a response for every query with a freshly configured Agent.

    Args:
        queries: Iterable of queries, passed one at a time to ``Agent.generate_response``.
        multi_query: Forwarded unchanged to ``generate_response``.
        retrieve_type: Forwarded unchanged to ``generate_response``.
        rerank_type: Forwarded unchanged to ``generate_response``.

    Returns:
        List with one ``generate_response`` result per query, in input order.
    """
    # Sampling options that both LLM configurations below leave at None.
    unset_llm_options = {
        "top_p": None,
        "top_k": None,
        "frequency_penalty": None,
        "max_tokens": None,
    }

    model_in = {
        # vector search ("score_threshold" is left unset on purpose)
        "vector_search": {
            "embed_name": "openai_parser",
            "param": {
                "search_type": "similarity",
                "search_kwargs": {"k": 10},
            },
        },
        # multi query
        "multi_query": {
            "param": {"llm_model": "openai", "temperature": 0, **unset_llm_options},
        },
        "google_reranker": {
            "param": {"k": 5},
        },
        # Response LLM
        "response_llm": {
            "param": {
                "llm": {"llm_model": "google", "temperature": 0.2, **unset_llm_options},
                "shots": {
                    "embed_name": "openai",
                    "k": 0,  # 0 means no shot
                },
            },
        },
    }

    responder = Agent(model_in, DEBUG=False)

    return [
        responder.generate_response(
            query,
            multi_query=multi_query,
            retrieve_type=retrieve_type,
            rerank_type=rerank_type,
            self_reflection=False,
        )
        for query in tqdm(queries)
    ]

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martijn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Generate an answer for every question, with multi-query enabled, vector-search
# retrieval and the "google_reranker" rerank type.
queries = df["Question"].tolist()
results = generate_answer(
    queries,
    multi_query=True,
    retrieve_type="vector_search",
    rerank_type="google_reranker",
)

  warn_deprecated(
  warn_deprecated(
 11%|█         | 6/57 [00:49<06:39,  7.84s/it]Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Quota exceeded for aiplatform.googleapis.com/generate_content_requests_per_minute_per_project_per_base_model with base model: gemini-pro. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai..
Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Quota exceeded for aiplatform.googleapis.com/generate_content_requests_per_minute_per_project_per_base_model with base model: gemini-pro. Please submit a quota increase request. https://cloud.google.com/vertex-ai/docs/generative-ai/quotas-genai..
 23%|██▎       | 13/57 [01:50<05:40,  7.74s/it]Retrying langchain_google_vertexai.chat_models._completio

In [17]:
# Keep only the "response" entry of each generation result.
answer_after_generation = [result["response"] for result in results]

In [18]:
# Store the generated answers next to the reference answers and show the frame.
df["Answer after generation"] = answer_after_generation
df

Unnamed: 0,Question,Answer,idd,Retrieved Documents,idd_retrieved,retrieval_metric,Truth,Answer after generation
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,[0],[page_content='# Rules and Regulations\n\n## S...,[],,[],I understand that you are concerned about the ...
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,[0],[page_content='# ARTICLE 1.8 COMMUNICATION\n\n...,[],,[],## Update on Academic Credit Process\n\nBased ...
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",[0],[page_content='# Rules and Regulations\n\n## S...,[],,[],"I'm sorry, but I can't find any information ab..."
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,[0],[page_content='# Article 5.10 Determination an...,[],,[],"I am sorry, but I cannot answer your question ..."
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","[43, 44]",[page_content='\n| 10-points Descriptor | UK |...,[],0.0,[page_content='Failure to participate in the f...,## Possible reasons for receiving an NG in Pro...
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","[41, 42, 43, 44, 45]",[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0,"[page_content='As indicated in paragraph 1, fo...",I understand that you were unable to attend th...
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"[35, 36, 48, 49]",[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='The project is graded on three ...,## Project Grade Calculation\n\nThe provided c...
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,[44],[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0,[page_content='If a student does not participa...,I understand that you are experiencing a medic...
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,[44],[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='If a student does not participa...,I'm sorry to hear that you missed the final pr...
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"[6, 11, 31, 33]",[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='Force majeure is events and ass...,## Response to Missed Project Meetings\n\nBase...


In [19]:
# Save the results to a pkl file to save computational resources (time and costs)
# NOTE(review): this is the same path the retrieval-only checkpoint was written to earlier
# in this notebook, so that file is overwritten. The frame saved here already has 'idd'
# converted to lists and carries the metric, "Truth" and generated-answer columns.
df.to_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl') # Save to pickle to preserve datastructure

## Inspection - CHECKPOINT - Evaluation using RAGAS
Let's look at the evaluation of the entire pipeline using RAGAS. One can load the saved retrievals and generations by running the cell below

In [1]:
##CHECKPOINT cell
# Restores the frame with retrievals and generated answers so that the cells above do
# not have to be re-run.

import os
import pandas as pd

# Move to the project root (the notebook lives in <root>/notebooks/retrieve). The suffix
# is built with os.sep so it matches on Windows as well as on macOS/Linux; if it is
# absent the working directory stays unchanged.
os.chdir(os.getcwd().replace(os.sep + os.path.join("notebooks", "retrieve"), ""))
# NOTE: unpickling can execute arbitrary code - only load pickle files written by this notebook.
df = pd.read_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google.pkl')

In [2]:
# Show the frame loaded from the checkpoint.
df

Unnamed: 0,Question,Answer,idd,Retrieved Documents,idd_retrieved,retrieval_metric,Truth,Answer after generation
0,"I retook the project report, and I noticed tha...",I’m waiting for all examiners to submit the re...,[0],[page_content='# Rules and Regulations\n\n## S...,[],,[],I understand that you are concerned about the ...
1,I am reaching out to inquire about the process...,I’m sorry but I cannot help you with this. I r...,[0],[page_content='# ARTICLE 1.8 COMMUNICATION\n\n...,[],,[],## Update on Academic Credit Process\n\nBased ...
2,I think there was a misunderstanding. I mailed...,"Once the exam office has changed the grade, th...",[0],[page_content='# Rules and Regulations\n\n## S...,[],,[],"I'm sorry, but I can't find any information ab..."
3,when do I get the results for Project1-1 makeu...,I submitted the grades this morning. I expect ...,[0],[page_content='# Article 5.10 Determination an...,[],,[],"I am sorry, but I cannot answer your question ..."
4,The project 1-1 grades have been given and I g...,"According to my records, you were not present ...","[43, 44]",[page_content='\n| 10-points Descriptor | UK |...,[],0.0,[page_content='Failure to participate in the f...,## Possible reasons for receiving an NG in Pro...
5,I understand but I had a verry good reason for...,"As coordinator, I can only take note of attend...","[41, 42, 43, 44, 45]",[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0,"[page_content='As indicated in paragraph 1, fo...",I understand that you were unable to attend th...
6,In the documentation about project 1-1s gradin...,The default computation is 0.9*ProjectGrade + ...,"[35, 36, 48, 49]",[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='The project is graded on three ...,## Project Grade Calculation\n\nThe provided c...
7,"Since this morning 3:30AM, I had a terrible pa...",The rules and regulations are quite clear on ...,[44],[page_content='# Section 3 Procedures\n\n## Ru...,[],0.0,[page_content='If a student does not participa...,I understand that you are experiencing a medic...
8,"Tragically, I missed the final product and rep...",The rules and regulations are quite clear on t...,[44],[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='If a student does not participa...,I'm sorry to hear that you missed the final pr...
9,I have missed the first and second project mee...,The Board of Examiners has reviewed your situa...,"[6, 11, 31, 33]",[page_content='# Rules and Regulations\n\n## S...,[],0.0,[page_content='Force majeure is events and ass...,## Response to Missed Project Meetings\n\nBase...


In [3]:
# Column values as plain Python lists.
questions = df["Question"].to_list()
answers_by_Martijn = df["Answer"].to_list()
answer_after_generation = df["Answer after generation"].to_list()

# One entry per question: a single-element list holding all retrieved page contents
# joined by spaces - pay attention, each entry is a List[str] of length 1.
# (A disabled variant built ground truths the same way from the "Truth" documents, as a
# plain str per question.)
contexts = [
    [" ".join(doc.page_content for doc in retrieved_docs)]
    for retrieved_docs in df["Retrieved Documents"]
]

In [4]:
test_example = False #just use one example of the dataset

if test_example:
    # Single-example run: only the entry at position 6, each value wrapped in a list.
    exmp = dict(
        question=[questions[6]],
        answer=[answer_after_generation[6]],
        contexts=[contexts[6]],
        ground_truth=[answers_by_Martijn[6]],
    )
else:
    # Full run: every question with its generated answer, contexts and reference answer.
    exmp = dict(
        question=questions,
        answer=answer_after_generation,
        contexts=contexts,
        ground_truth=answers_by_Martijn,
    )

In [5]:
from datasets import Dataset
# Wrap the column-oriented dict in a Dataset object.
custom_dataset = Dataset.from_dict(exmp)

In [6]:
# Print the dataset summary (feature names and number of rows).
print(custom_dataset)

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 57
})


In [7]:
# Load local environment variables
from dotenv import load_dotenv
print("Environment variables are loaded = ", load_dotenv())

Environment variables are loaded =  True


In [8]:
import os
# Read the API keys from the process environment; a variable that is not set yields None.
openai_api_key = os.environ.get("OPENAI_API_KEY")
huggingface_api_key = os.environ.get("HUGGINGFACE_API_KEY")

In [9]:
# By default, `evaluate` in its current version uses gpt-3.5-turbo-16k, which is old.
#result = evaluate(custom_dataset, metrics=[context_precision, context_recall])
#result

In [10]:
from model.utils.utils import get_llm_model, get_embed_model
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from ragas import evaluate
from tqdm import tqdm
import time

# Judge LLM and embedding model passed explicitly to RAGAS' evaluate().
# NOTE(review): temperature=0.7 likely makes the LLM-based scores vary between runs;
# consider a lower value if reproducible metrics are needed.
langchain_llm = get_llm_model("openai", temperature=0.7)
langchain_embeddings = get_embed_model("openai")

# Each question is scored in its own evaluate() call; the per-question result frames are
# collected here and concatenated once after the loop.
per_question_frames = []

for i in tqdm(range(len(df))):

    # Single-row dataset for question i (every value wrapped in a one-element list).
    exmp = {
        'question': [questions[i]],
        'answer': [answer_after_generation[i]],
        'contexts': [contexts[i]],
        'ground_truth': [answers_by_Martijn[i]]
    }

    custom_dataset = Dataset.from_dict(exmp)

    # NOTE: `results` is rebound here and no longer refers to the generation output.
    results = evaluate(custom_dataset, metrics=[context_precision, context_recall, answer_relevancy, faithfulness], llm=langchain_llm, embeddings=langchain_embeddings, is_async=True)
    per_question_frames.append(results.to_pandas())

    time.sleep(3) # To avoid the API rate limit

# Concatenate once instead of growing df_results inside the loop, which re-copied all
# earlier rows on every iteration. Row labels are kept as produced by to_pandas(), as
# before. An empty input yields an empty frame instead of leaving df_results undefined.
df_results = pd.concat(per_question_frames) if per_question_frames else pd.DataFrame()

  0%|          | 0/57 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  2%|▏         | 1/57 [00:26<24:44, 26.50s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  4%|▎         | 2/57 [01:00<28:21, 30.94s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  5%|▌         | 3/57 [01:17<21:58, 24.42s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  7%|▋         | 4/57 [01:24<15:42, 17.78s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  9%|▉         | 5/57 [01:51<18:02, 20.83s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 11%|█         | 6/57 [02:05<15:58, 18.79s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 12%|█▏        | 7/57 [02:28<16:50, 20.21s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 14%|█▍        | 8/57 [02:52<17:22, 21.28s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 16%|█▌        | 9/57 [03:12<16:37, 20.78s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 18%|█▊        | 10/57 [03:49<20:12, 25.80s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 19%|█▉        | 11/57 [04:08<18:09, 23.69s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 21%|██        | 12/57 [04:24<16:00, 21.35s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 23%|██▎       | 13/57 [04:45<15:42, 21.42s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 25%|██▍       | 14/57 [05:07<15:19, 21.38s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 26%|██▋       | 15/57 [05:37<16:53, 24.14s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 28%|██▊       | 16/57 [06:06<17:30, 25.62s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 30%|██▉       | 17/57 [06:25<15:46, 23.66s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 32%|███▏      | 18/57 [06:33<12:14, 18.84s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 33%|███▎      | 19/57 [06:55<12:32, 19.81s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 35%|███▌      | 20/57 [07:13<11:51, 19.22s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 37%|███▋      | 21/57 [07:41<13:04, 21.80s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 39%|███▊      | 22/57 [07:48<10:12, 17.51s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 40%|████      | 23/57 [08:06<10:02, 17.71s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 42%|████▏     | 24/57 [08:42<12:43, 23.15s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 44%|████▍     | 25/57 [09:01<11:39, 21.84s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 46%|████▌     | 26/57 [09:31<12:34, 24.35s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 47%|████▋     | 27/57 [09:41<09:59, 19.97s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 49%|████▉     | 28/57 [10:09<10:52, 22.51s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 51%|█████     | 29/57 [10:32<10:30, 22.51s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 53%|█████▎    | 30/57 [10:45<08:53, 19.77s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 54%|█████▍    | 31/57 [11:12<09:30, 21.96s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 56%|█████▌    | 32/57 [11:51<11:12, 26.89s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 58%|█████▊    | 33/57 [12:32<12:26, 31.12s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 60%|█████▉    | 34/57 [12:58<11:23, 29.70s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 61%|██████▏   | 35/57 [13:13<09:14, 25.21s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 63%|██████▎   | 36/57 [13:39<08:58, 25.63s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 65%|██████▍   | 37/57 [14:07<08:43, 26.18s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 67%|██████▋   | 38/57 [14:31<08:04, 25.53s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 68%|██████▊   | 39/57 [14:51<07:12, 24.01s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 70%|███████   | 40/57 [15:09<06:16, 22.14s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 72%|███████▏  | 41/57 [15:45<06:59, 26.22s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 74%|███████▎  | 42/57 [16:01<05:48, 23.23s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 75%|███████▌  | 43/57 [16:09<04:19, 18.57s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 77%|███████▋  | 44/57 [16:31<04:15, 19.67s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 79%|███████▉  | 45/57 [16:44<03:31, 17.59s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 81%|████████  | 46/57 [16:51<02:40, 14.55s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 82%|████████▏ | 47/57 [17:13<02:48, 16.83s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 84%|████████▍ | 48/57 [17:32<02:35, 17.23s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 86%|████████▌ | 49/57 [17:59<02:41, 20.22s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 88%|████████▊ | 50/57 [18:31<02:47, 23.88s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 89%|████████▉ | 51/57 [18:55<02:24, 24.01s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 91%|█████████ | 52/57 [19:06<01:39, 19.90s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 93%|█████████▎| 53/57 [19:23<01:16, 19.00s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 95%|█████████▍| 54/57 [19:31<00:47, 15.94s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 96%|█████████▋| 55/57 [19:42<00:28, 14.40s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

 98%|█████████▊| 56/57 [19:54<00:13, 13.58s/it]

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 57/57 [20:07<00:00, 21.19s/it]


In [11]:
# Renumber the rows 0..n-1 and discard the old row labels in one step.
# drop=True avoids materialising the old index as a column, so this no longer
# depends on that column being called "index" (it is not when the index is
# named, or when an "index" column already exists).
df_results = df_results.reset_index(drop=True)
df_results

Unnamed: 0,question,answer,contexts,ground_truth,context_precision,context_recall,answer_relevancy,faithfulness
0,"I retook the project report, and I noticed tha...",I understand that you are concerned about the ...,[# Rules and Regulations\n\n## Section 5 Semes...,I’m waiting for all examiners to submit the re...,0.0,0.0,0.643951,0.333333
1,I am reaching out to inquire about the process...,## Update on Academic Credit Process\n\nBased ...,[# ARTICLE 1.8 COMMUNICATION\n\n1. The Student...,I’m sorry but I cannot help you with this. I r...,1.0,0.0,0.0,0.315789
2,I think there was a misunderstanding. I mailed...,"I'm sorry, but I can't find any information ab...",[# Rules and Regulations\n\n## Section 5 Semes...,"Once the exam office has changed the grade, th...",0.0,0.0,0.0,0.8
3,when do I get the results for Project1-1 makeu...,"I am sorry, but I cannot answer your question ...",[# Article 5.10 Determination and announcement...,I submitted the grades this morning. I expect ...,0.0,0.0,0.0,0.333333
4,The project 1-1 grades have been given and I g...,## Possible reasons for receiving an NG in Pro...,[\n| 10-points Descriptor | UK | US |\n|-|-|-|...,"According to my records, you were not present ...",1.0,0.666667,0.696987,0.846154
5,I understand but I had a verry good reason for...,I understand that you were unable to attend th...,[# Section 3 Procedures\n\n## Rules and Regula...,"As coordinator, I can only take note of attend...",1.0,0.0,0.470478,0.0
6,In the documentation about project 1-1s gradin...,## Project Grade Calculation\n\nThe provided c...,[# Rules and Regulations\n\n## Section 5 Semes...,The default computation is 0.9*ProjectGrade + ...,1.0,1.0,0.0,0.25
7,"Since this morning 3:30AM, I had a terrible pa...",I understand that you are experiencing a medic...,[# Section 3 Procedures\n\n## Rules and Regula...,The rules and regulations are quite clear on ...,1.0,0.0,0.343084,0.263158
8,"Tragically, I missed the final product and rep...",I'm sorry to hear that you missed the final pr...,[# Rules and Regulations\n\n## Section 5 Semes...,The rules and regulations are quite clear on t...,1.0,0.333333,0.593052,0.2
9,I have missed the first and second project mee...,## Response to Missed Project Meetings\n\nBase...,[# Rules and Regulations\n\n## Section 5 Semes...,The Board of Examiners has reviewed your situa...,1.0,0.4,0.49678,0.0


In [12]:
# Persist the evaluated results so the costly evaluation (time and API costs)
# does not have to be repeated. Pickle is used to preserve the data structure
# of the DataFrame.
df_results.to_pickle(
    path='data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google_evaluated.pkl'
)

## Inspection - CHECKPOINT - Evaluation manual inspection
Let's look at the evaluation of the entire pipeline using RAGAS. The saved retrievals, generations, and evaluation scores can be loaded by running the cell below.

In [1]:
##CHECKPOINT cell
# Self-contained entry point: run this cell in a fresh kernel to continue from
# the saved evaluation results without recomputing them.

import os
import pandas as pd

# Switch from the notebook folder to the project root so the relative data
# path below resolves. The suffix is built with os.sep so it matches on
# Windows ("\\notebooks\\retrieve") as well as on macOS/Linux
# ("/notebooks/retrieve"); if the suffix is absent the directory is unchanged.
notebook_suffix = os.sep + os.path.join("notebooks", "retrieve")
os.chdir(os.getcwd().replace(notebook_suffix, ""))

# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this project.
df_results = pd.read_pickle('data/QA/P3 experiments/qa_pairs_cleaned_retrieved_openaiparser_google_evaluated.pkl')

In [2]:
# Display the evaluation results loaded in the checkpoint cell.
df_results

Unnamed: 0,question,answer,contexts,ground_truth,context_precision,context_recall,answer_relevancy,faithfulness
0,"I retook the project report, and I noticed tha...",I understand that you are concerned about the ...,[# Rules and Regulations\n\n## Section 5 Semes...,I’m waiting for all examiners to submit the re...,0.0,0.0,0.643951,0.333333
1,I am reaching out to inquire about the process...,## Update on Academic Credit Process\n\nBased ...,[# ARTICLE 1.8 COMMUNICATION\n\n1. The Student...,I’m sorry but I cannot help you with this. I r...,1.0,0.0,0.0,0.315789
2,I think there was a misunderstanding. I mailed...,"I'm sorry, but I can't find any information ab...",[# Rules and Regulations\n\n## Section 5 Semes...,"Once the exam office has changed the grade, th...",0.0,0.0,0.0,0.8
3,when do I get the results for Project1-1 makeu...,"I am sorry, but I cannot answer your question ...",[# Article 5.10 Determination and announcement...,I submitted the grades this morning. I expect ...,0.0,0.0,0.0,0.333333
4,The project 1-1 grades have been given and I g...,## Possible reasons for receiving an NG in Pro...,[\n| 10-points Descriptor | UK | US |\n|-|-|-|...,"According to my records, you were not present ...",1.0,0.666667,0.696987,0.846154
5,I understand but I had a verry good reason for...,I understand that you were unable to attend th...,[# Section 3 Procedures\n\n## Rules and Regula...,"As coordinator, I can only take note of attend...",1.0,0.0,0.470478,0.0
6,In the documentation about project 1-1s gradin...,## Project Grade Calculation\n\nThe provided c...,[# Rules and Regulations\n\n## Section 5 Semes...,The default computation is 0.9*ProjectGrade + ...,1.0,1.0,0.0,0.25
7,"Since this morning 3:30AM, I had a terrible pa...",I understand that you are experiencing a medic...,[# Section 3 Procedures\n\n## Rules and Regula...,The rules and regulations are quite clear on ...,1.0,0.0,0.343084,0.263158
8,"Tragically, I missed the final product and rep...",I'm sorry to hear that you missed the final pr...,[# Rules and Regulations\n\n## Section 5 Semes...,The rules and regulations are quite clear on t...,1.0,0.333333,0.593052,0.2
9,I have missed the first and second project mee...,## Response to Missed Project Meetings\n\nBase...,[# Rules and Regulations\n\n## Section 5 Semes...,The Board of Examiners has reviewed your situa...,1.0,0.4,0.49678,0.0


In [3]:
import numpy as np

# Print the mean of every metric column in df_results, one line per metric.
for metric_label, metric_column in (
    ("Context Precision", "context_precision"),
    ("Context Recall", "context_recall"),
    ("Answer Relevancy", "answer_relevancy"),
    ("Faithfulness", "faithfulness"),
):
    print(metric_label + ":", np.mean(df_results[metric_column]))

Context Precision: 0.8947368420157895
Context Recall: 0.41250474671527304
Answer Relevancy: 0.3536387046931806
Faithfulness: 0.3724307470783762


In [4]:
def display_data(df: pd.DataFrame, index: int) -> None:
    """Print one evaluated sample for manual inspection.

    Args:
        df: DataFrame with at least the columns "question", "answer",
            "contexts" and "ground_truth".
        index: Positional (``iloc``) row number of the sample to print.

    Returns:
        None. The sample is written to stdout.
    """
    # Look the row up once instead of once per printed field.
    row = df.iloc[index]

    # Print the requested position; the function must not depend on a
    # global loop variable being defined by the caller.
    print('Index', index)
    print("Q:", row["question"])
    print("A:", row["answer"])
    print(10*"-")
    print("Retrieved Documents:")
    print(row["contexts"])
    print(10*"-")
    print("True Answer:")
    print(row["ground_truth"])
    # Two blank lines separate consecutive samples when called in a loop.
    print()
    print()

In [5]:
# Uncomment to print the first 29 samples for manual inspection.
#for i in range(29):
#    display_data(df_results, i)