Evaluation of Synthetic Dataset
===

Now that we have generated a synthetic dataset and built a RAG pipeline, let's first evaluate how good the dataset is. We will then filter it down to a gold dataset and evaluate the RAG pipeline on that gold dataset.

In [3]:
import os
import dspy
import json

In [4]:
# Move up to the project root so the relative paths used in this notebook
# (e.g. './data/processed/dataset.json') resolve. The guard makes the cell
# idempotent: re-running it without a kernel restart no longer climbs one
# directory higher each time.
# NOTE(review): assumes './data/processed' exists only at the project root — confirm.
if not os.path.isdir('./data/processed'):
    os.chdir('../')

In [5]:
# Location of the synthetic dataset JSON; the relative path is resolved
# against the current working directory.
DATASET_FPATH = './data/processed/dataset.json'

In [6]:
# Read the dataset. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding (the corpus
# contains non-ASCII characters).
with open(DATASET_FPATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [7]:
# List the top-level sections stored in the dataset.
dataset.keys()

dict_keys(['queries', 'answers', 'corpus', 'relevant_docs'])

In [8]:
# Show the first entry of every section so the shape of the data is visible.
for section in dataset:
    print(f"{section}:")
    first_entry = next(iter(dataset[section].items()), None)
    if first_entry is not None:
        entry_id, entry_value = first_entry
        print(f"\t{entry_id}: {entry_value}")
    print()



queries:
	fbbb1c0f-6c75-475f-9cf2-880363c3a70e: Who directed the play "How to Curse" in 2007?

answers:
	fbbb1c0f-6c75-475f-9cf2-880363c3a70e: Josie Rourke

corpus:
	bddd122a-bd2e-4e2d-9f03-88484aa0f1f8: = Robert Boulter = 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whi

RAGAS
---

In [9]:
from ragas import evaluate

In [10]:
import pandas as pd
# Flatten the dataset into one record per (query, relevant document) pair.
# The synthetic answer is stored both as the answer and as its own ground truth.
data = []
for query_id, query_text in dataset['queries'].items():
    answer_text = dataset['answers'].get(query_id)
    doc_ids = dataset['relevant_docs'].get(query_id, [])
    data.extend(
        {
            "question": query_text,
            "ground_truths": [answer_text],
            "answer": answer_text,
            "contexts": [dataset['corpus'].get(doc_id)],
        }
        for doc_id in doc_ids
    )

df = pd.DataFrame(data)
df.head()

Unnamed: 0,question,ground_truths,answer,contexts
0,"Who directed the play ""How to Curse"" in 2007?",[Josie Rourke],Josie Rourke,[= Robert Boulter = \n Robert Boulter is an En...
1,"Who directed the film ""Donkey Punch""?",[Olly Blackburn.],Olly Blackburn.,[= = = 2006 – present = = = \n In 2006 Boulter...
2,Who was Du Fu's paternal grandfather?,[Du Shenyan.],Du Shenyan.,[Since many of Du Fu 's poems feature morality...
3,How many children did Du Fu have by 757?,[Five.],Five.,"[He never again attempted the examinations , i..."
4,Where did Du Fu spend most of the next five ye...,[Sichuan province.],Sichuan province.,[He next spent around six weeks in Qinzhou ( n...


In [11]:
# df.to_csv('./data/processed/synthetic_dataset.csv', index=False)

In [28]:
from datasets import Dataset
# Wrap the DataFrame in a `datasets.Dataset`; this is the object handed to
# `evaluate` further down.
ds = Dataset.from_pandas(df)

In [31]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
    context_relevancy
)

# Score the dataset with each of the RAGAS metrics imported above.
selected_metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity,
    context_relevancy,
]
result = evaluate(ds, metrics=selected_metrics)

evaluating with [context_precision]


100%|██████████| 29/29 [02:34<00:00,  5.32s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [faithfulness]


100%|██████████| 29/29 [05:00<00:00, 10.37s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_relevancy]


100%|██████████| 29/29 [05:06<00:00, 10.56s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_recall]


100%|██████████| 29/29 [03:54<00:00,  8.08s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_similarity]


100%|██████████| 29/29 [00:20<00:00,  1.44it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_relevancy]


100%|██████████| 29/29 [02:37<00:00,  5.45s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [32]:
# Display the aggregate score for each metric.
result

{'context_precision': 0.4426, 'faithfulness': 0.8716, 'answer_relevancy': 0.9267, 'context_recall': 0.9500, 'answer_similarity': 1.0000, 'context_relevancy': 0.0729}

In [33]:
# Per-row view: the input columns alongside one score column per metric.
result.to_pandas().head()

Unnamed: 0,question,ground_truths,answer,contexts,context_precision,faithfulness,answer_relevancy,context_recall,answer_similarity,context_relevancy
0,"Who directed the play ""How to Curse"" in 2007?",[Josie Rourke],Josie Rourke,[= Robert Boulter = \n Robert Boulter is an En...,1.0,0.0,0.981145,1.0,0.999998,0.03125
1,"Who directed the film ""Donkey Punch""?",[Olly Blackburn.],Olly Blackburn.,[= = = 2006 – present = = = \n In 2006 Boulter...,1.0,1.0,0.996486,1.0,0.999998,0.029412
2,Who was Du Fu's paternal grandfather?,[Du Shenyan.],Du Shenyan.,[Since many of Du Fu 's poems feature morality...,0.0,1.0,1.0,1.0,1.0,0.027027
3,How many children did Du Fu have by 757?,[Five.],Five.,"[He never again attempted the examinations , i...",1.0,1.0,0.943972,1.0,0.999999,0.025641
4,Where did Du Fu spend most of the next five ye...,[Sichuan province.],Sichuan province.,[He next spent around six weeks in Qinzhou ( n...,1.0,1.0,0.955049,1.0,0.999976,0.057143


In [42]:
# Use the save_result function to save the result to a csv file.
import time

def save_result(result):
    """Write an evaluation result to a timestamped CSV under ``results/``.

    Args:
        result: Object exposing ``to_pandas()`` that returns a DataFrame
            (here, the value returned by ``evaluate``).

    Returns:
        str: Path of the CSV file that was written, relative to the current
        working directory.
    """
    # A second-resolution timestamp in the name distinguishes successive runs.
    exp_name = f"results/eval_synthetic_data_{time.strftime('%Y%m%d-%H%M%S')}"
    csv_path = f"{exp_name}.csv"
    print(f"Saving results to {csv_path}")

    # exist_ok=True replaces the check-then-create pair: it is a no-op when
    # the directory already exists and cannot race with another creator.
    os.makedirs('results', exist_ok=True)

    # Write to file
    result.to_pandas().to_csv(csv_path)
    return csv_path

In [43]:
# Save the result to a timestamped CSV under results/ (comment this line out to skip saving).
save_result(result)

Saving results to results/eval_synthetic_data_20240129-145748.csv


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [36]:
# Logging to wandb

import wandb

# Start a new wandb run to track this evaluation. Using the run as a context
# manager ensures it is closed even if logging raises, instead of leaving a
# dangling run behind.
with wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-synthetic-eval",

    # track hyperparameters and run metadata
    # NOTE(review): chunk size and overlap are hard-coded here — confirm they
    # match the settings used when the dataset was generated.
    config={
        "chunk_size": 1024,
        "sentence_chunk_overlap": 200,
        "number_of_questions": len(ds),
        "comments": "Synthetic dataset where ground truth and the answer are the same.",
    },
) as run:
    # Log the aggregate metric scores to the run.
    run.log(result)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mprasadshreyas[0m. Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_relevancy,▁
answer_similarity,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.92671
answer_similarity,0.99999
context_precision,0.44262
context_recall,0.94999
context_relevancy,0.07294
faithfulness,0.8716


-----