<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/low_level/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Building Evaluation from Scratch

We show how you can build evaluation modules from scratch. This includes both evaluation of the final generated response (where the output is plain text), as well as the evaluation of retrievers (where the output is a ranked list of items).

We have in-house modules in our [Evaluation](https://gpt-index.readthedocs.io/en/latest/core_modules/supporting_modules/evaluation/root.html) section.

## Setup

We load some data and define a very simple RAG query engine that we'll evaluate (uses top-k retrieval).

In [None]:
# %pip install llama-index-readers-file pymupdf
# %pip install llama-index-llms-openai

In [None]:
import logging
import sys
import pandas as pd
import numpy as np
import json
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

In [None]:
from pathlib import Path
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter, SimpleNodeParser
from llama_index.llms.openai import OpenAI

In [None]:
# Load prompt templates and other settings from the YAML config file.
import yaml
from easydict import EasyDict

# Open inside a context manager so the file handle is closed deterministically,
# and read it as UTF-8 regardless of the platform's default encoding.
with open("defaults.yaml", encoding="utf-8") as config_file:
    config = EasyDict(yaml.safe_load(config_file))
config

In [None]:
from datasets import load_dataset
import os

# Single source of truth for the cache location: the original checked
# 'sources/...' but wrote to 'source/...', so the cache was never reused.
ARABIC_MMLU_CSV = 'sources/ArabicMMLU.csv'

if os.path.exists(ARABIC_MMLU_CSV):
    # Reuse the locally cached copy of the test split.
    arabicmmlu_df = pd.read_csv(ARABIC_MMLU_CSV)
else:
    # Download the dataset, keep its test split, and cache it for later runs.
    arabicMMLU = load_dataset('MBZUAI/ArabicMMLU')
    arabicmmlu_df = arabicMMLU['test'].to_pandas()
    os.makedirs(os.path.dirname(ARABIC_MMLU_CSV), exist_ok=True)
    arabicmmlu_df.to_csv(ARABIC_MMLU_CSV, index=False)
    

arabicmmlu_df

In [None]:
# Node parser configured with a very large chunk size (1e6, passed as a float).
# NOTE(review): presumably this keeps every MCQ document in a single node — confirm.
node_parser = SimpleNodeParser.from_defaults(chunk_size=1e6)



def convert_qa_to_string(row) -> str:
    """Render one ArabicMMLU row as text using ``config.MCQ_PROMPT``.

    Args:
        row: Mapping-like row with the keys ``"Question"``, ``"Answer Key"``
            and ``"Option 1"`` .. ``"Option 5"``.

    Returns:
        ``config.MCQ_PROMPT`` formatted with the question, the numbered
        options (one per line) and the text of the correct option.

    Raises:
        KeyError: If the answer key is not one of ``A``-``E``.
        IndexError: If the answer key points past the available options.
    """
    options = []
    for i in range(1, 6):
        value = row[f"Option {i}"]
        # Detect missing values *before* str(): str(None) is 'None', which the
        # previous string-based filter let through as a bogus option.
        if pd.isna(value) or str(value) == 'nan':
            continue
        options.append(str(value).strip())

    options_str = "\n".join(
        f"{i+1}. {x}" for i, x in enumerate(options)
    )
    answer_key_map = {
        "A": 0,
        "B": 1,
        "C": 2,
        "D": 3,
        "E": 4,
    }
    correct_answer_str = options[answer_key_map[row["Answer Key"]]]

    return config.MCQ_PROMPT.format(
        question=row['Question'],
        options=options_str,
        correct_answer=correct_answer_str
    )


# One Document per ArabicMMLU row whose Subject is "Law": the text is the
# rendered MCQ prompt and the metadata is the complete original row.
documents = [
    Document(text=convert_qa_to_string(law_row), metadata=law_row.to_dict())
    for _label, law_row in arabicmmlu_df.query('Subject=="Law"').iterrows()
]

# Run the documents through the current node parser to obtain nodes.
arabicmmlu_nodes = node_parser.get_nodes_from_documents(
    documents, show_progress=True
)

In [None]:
# Load the MOJ regulations CSV, then show its (rows, columns) shape and first rows.
moj_df = pd.read_csv('sources/MOJ_Regulations.csv')
print(moj_df.shape)
moj_df.head()

In [None]:
# Load the tameem CSV, then show its (rows, columns) shape and first rows.
tameem_df = pd.read_csv('sources/tameem.csv')
print(tameem_df.shape)
tameem_df.head()

In [None]:
# One Document per MOJ row: the 'Description' column is the text and all
# remaining columns become the metadata.
moj_docs = [
    Document(
        text=record['Description'],
        metadata=record.drop('Description').to_dict(),
    )
    for _label, record in moj_df.iterrows()
]

In [None]:
# One Document per tameem row: the 'نص التعميم' column is the text and all
# remaining columns become the metadata.
tameem_docs = [
    Document(
        text=record['نص التعميم'],
        metadata=record.drop('نص التعميم').to_dict(),
    )
    for _label, record in tameem_df.iterrows()
]

In [None]:
# Concatenate both document lists into one corpus and display its size.
source_docs = moj_docs + tameem_docs
len(source_docs)

In [None]:
# llama index format
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.llms.anthropic import Anthropic
# Credentials and endpoint are read from environment variables
# (os.getenv yields None for any variable that is not set).
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY")
api_version = "2024-02-15-preview"

# GPT-4 client served through Azure OpenAI.
# NOTE(review): `engine` is presumably the Azure deployment name — confirm.
llm_gpt4 = AzureOpenAI(
    engine="gpt-4",
    api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=api_version,
)

# Claude 3 Opus client served through the Anthropic API.
llm_claude3 = Anthropic(
    'claude-3-opus-20240229',
    api_key=CLAUDE_API_KEY
)

In [None]:
# Rebind node_parser to a sentence splitter with a chunk size of 1024.
# NOTE(review): node_parser is assigned again further down before the source
# documents are parsed, so this instance appears to be unused — confirm.
node_parser = SentenceSplitter(chunk_size=1024)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding

# OpenAI key for the embedding model, read from the environment (None if unset).
OPENAI_API_KEY= os.getenv("OPENAI_API_KEY")

from llama_index.core import Settings
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import PromptTemplate, ServiceContext, StorageContext, VectorStoreIndex, load_index_from_storage

# Embedding model, also passed explicitly when the index and retriever are created.
embed_model = OpenAIEmbedding(model='text-embedding-3-large', api_key=OPENAI_API_KEY)

# Register it as the global default embedding model as well.
Settings.embed_model = embed_model

In [None]:
# Use Claude 3 as the global default LLM.
Settings.llm = llm_claude3

chroma_client = chromadb.PersistentClient(path='./chroma_db')
# Traditional VDB
# get_or_create_collection replaces the former try/except around
# get_collection, which treated *any* exception as "collection missing".
chroma_collection = chroma_client.get_or_create_collection('ArabicMMLU_legal')

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

if chroma_collection.count() == 0:
    # Empty (new) collection: embed the few-shot example nodes and store them.
    print("Creating new collection")
    arabicmmlu_index = VectorStoreIndex(
        arabicmmlu_nodes,
        storage_context=storage_context,
        embed_model=embed_model,
        use_async=False,
        show_progress=True,
    )
else:
    # The collection is persisted on disk. Re-inserting the nodes on every run
    # would re-embed them and store duplicates, so retrieval would return the
    # same example several times. Attach to the stored vectors instead.
    arabicmmlu_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
# arabicmmlu_retriever = arabicmmlu_index.as_retriever(
#     similarity_top_k=3,
#     embed_model=embed_model,
# )
# # Sentence window retrieval
# query_engine_sentence_window = index_sentence_window.as_query_engine(
#     text_qa_template=text_qa_template, similarity_top_k=3, embed_model=embed_model, llm=llm
# )

In [None]:
# Sentence splitter with a very large chunk size (1e6, passed as a float) and no overlap.
# NOTE(review): presumably intended to keep each source document as one node — confirm.
node_parser = SentenceSplitter(chunk_size=1e6, chunk_overlap=0)

# Parse the combined MOJ + tameem documents into nodes.
nodes = node_parser.get_nodes_from_documents(
    source_docs,
    show_progress=True
)

## Dataset Generation

We first go through an exercise of generating a synthetic evaluation dataset. We do this by synthetically generating a set of questions from existing context. We then run each question with existing context through a powerful LLM (e.g. GPT-4) to generate a "ground-truth" response.

### Define Functions

We define the functions that we will use for dataset generation:

In [None]:
from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re
from tqdm.auto import tqdm
from multiprocessing.pool import ThreadPool

We define `generate_answers_for_questions` to generate answers from questions given context.

In [None]:
# Single user-message chat template built from config.QA_PROMPT; it is
# formatted with `context_str` and `query_str` when answers are generated.
question_answer_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.USER, content=config.QA_PROMPT),
    ]
)

def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> List[str]:
    """Generate one answer per question, given a shared context.

    Each question is formatted into ``question_answer_template`` together with
    ``context`` and sent to ``llm.chat``; the requests run concurrently in a
    thread pool.

    Args:
        questions: Questions to answer.
        context: Text substituted into the prompt as ``context_str``.
        llm: Chat model; only its ``chat`` method is used.

    Returns:
        The answer texts, in the same order as ``questions``.
    """

    def generate_answer(question: str) -> str:
        fmt_qa_prompt = question_answer_template.format_messages(
            context_str=context,
            query_str=question,
        )
        response_obj = llm.chat(fmt_qa_prompt)
        return response_obj.message.content

    if not questions:
        return []

    # The context manager shuts the worker threads down once all results have
    # been collected; previously every call leaked a pool.
    with ThreadPool() as pool:
        answers = list(
            tqdm(
                pool.imap(generate_answer, questions),
                desc="generate_answers_for_questions()",
                total=len(questions),
            )
        )

    return answers

We define `generate_qa_pairs` to generate qa pairs over an entire list of Nodes.

In [None]:
# System + user chat template for question generation, built from the config;
# it is formatted with `num_questions_per_chunk` and `context_str`.
question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=config.QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=config.QUESTION_GEN_USER_TMPL),
    ]
)


def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 2,
    delimiter: str = "\n",
    question_gen_template=question_gen_template,
) -> List[Tuple[str, str]]:
    """Generate (question, answer) pairs for every node.

    For each node the LLM is asked for questions about the node content
    (including metadata); the reply is split on ``delimiter``, leading list
    numbering is removed, and each question is then answered against the same
    content with ``generate_answers_for_questions``.

    Args:
        nodes: Nodes to generate questions from.
        llm: Chat model used for both question and answer generation.
        num_questions_per_chunk: Number of questions requested per node.
        delimiter: Separator between questions in the LLM reply.
        question_gen_template: Chat template formatted with
            ``num_questions_per_chunk`` and ``context_str``.

    Returns:
        A flat list of ``(question, answer)`` tuples, in node order.
    """
    numbering_prefix = re.compile(r"^\d+[\).\s]")

    def process_node(node: BaseNode) -> List[Tuple[str, str]]:
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content

        cleaned_questions = []
        for line in str(raw_output).strip().split(delimiter):
            # Strip first so the numbering is removed even on indented lines.
            question = numbering_prefix.sub("", line.strip()).strip()
            # Blank lines in the reply are not questions; skipping them avoids
            # paying for (and storing) answers to empty prompts.
            if question:
                cleaned_questions.append(question)

        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        return list(zip(cleaned_questions, answers))

    if not nodes:
        return []

    # Close the pool once all nodes are processed instead of leaking it.
    with ThreadPool() as pool:
        per_node_pairs = list(
            tqdm(
                pool.imap(process_node, nodes),
                desc="Generating QA pairs",
                total=len(nodes),
            )
        )

    # flatten
    return [pair for node_pairs in per_node_pairs for pair in node_pairs]


In [None]:
# Generate question-answer pairs from the source nodes with GPT-4,
# requesting one question per node.
qa_pairs = generate_qa_pairs(
    nodes,
    llm=llm_gpt4,
    num_questions_per_chunk=1,
)

Converting question-answer pairs into MCQs

In [None]:
# Single user-message template (config.QA_TO_MCQ_PROMPT) used to turn a
# question-answer pair into an MCQ.
qa_to_mcq_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.USER, content=config.QA_TO_MCQ_PROMPT),
    ]
)
# Same conversion, built from the chain-of-thought prompt (config.QA_TO_MCQ_COT_PROMPT).
qa_to_mcq_cot_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.USER, content=config.QA_TO_MCQ_COT_PROMPT),
    ]
)

In [None]:
def convert_quesitons_to_mcqs(
    qa_pairs: List[tuple], mcq_prompt_template: "ChatPromptTemplate", llm: OpenAI
) -> List[str]:
    """Convert question-answer pairs into MCQs.

    Each pair is formatted into ``mcq_prompt_template`` (as ``question`` and
    ``answer``) and sent to ``llm.chat``; the requests run concurrently in a
    thread pool.

    Args:
        qa_pairs: ``(question, answer)`` tuples.
        mcq_prompt_template: Chat template exposing ``format_messages``.
        llm: Chat model; only its ``chat`` method is used.

    Returns:
        The raw LLM reply for each pair, in the same order as ``qa_pairs``.
    """

    def question_to_mcq(qa_pair: tuple) -> str:
        question, answer = qa_pair
        prompt_messages = mcq_prompt_template.format_messages(
            question=question,
            answer=answer,
        )
        response_obj = llm.chat(prompt_messages)
        return response_obj.message.content

    if not qa_pairs:
        return []

    # Close the pool once all pairs are converted instead of leaking it.
    with ThreadPool() as pool:
        mcqs = list(
            tqdm(
                pool.imap(question_to_mcq, qa_pairs),
                desc="convert_quesitons_to_mcqs()",
                total=len(qa_pairs),
            )
        )

    return mcqs

In [None]:
# Convert the generated QA pairs into MCQs with GPT-4 using the plain
# (non chain-of-thought) prompt.
mcqs = convert_quesitons_to_mcqs(
    qa_pairs,
    qa_to_mcq_template,
    llm_gpt4
)

In [None]:
def format_mcqs(mcqs):
    """Split raw MCQ strings into ``[question, options]`` entries.

    Each string is expected to hold the question on its first line, one
    separator line, and then one option per line.

    Args:
        mcqs: Iterable of raw MCQ strings.

    Returns:
        A list of ``[question, options]`` lists, where ``options`` is the list
        of lines from the third line onwards.
    """
    formatted_mcqs = []
    for example in mcqs:
        # Strip surrounding whitespace first (as format_mcqs_cot does) so a
        # leading newline does not become an empty question and a trailing
        # newline does not become an empty option.
        lines = example.strip().split('\n')
        formatted_mcqs.append([lines[0], lines[2:]])
    return formatted_mcqs

# Parse the raw LLM replies into [question, options] entries and display them.
formated_mcqs = format_mcqs(mcqs)
formated_mcqs

In [None]:
# Index every parsed MCQ by position. The first option is recorded as the
# answer, and the context is the text of the node at the same position.
qa_to_mcqs_dict = {}
for i, parsed_mcq in enumerate(formated_mcqs):
    mcq_question, mcq_options = parsed_mcq[0], parsed_mcq[1]
    qa_to_mcqs_dict[i] = {
        'question': mcq_question,
        'options': mcq_options,
        'answer': mcq_options[0],
        'context': nodes[i].text
    }

With Chain of Thought

In [None]:
# Convert the same QA pairs into MCQs with GPT-4, this time using the
# chain-of-thought prompt.
mcqs_cot = convert_quesitons_to_mcqs(
    qa_pairs,
    qa_to_mcq_cot_template,
    llm_gpt4
)

In [None]:
def format_mcqs_cot(mcqs_cot):
    """Split chain-of-thought MCQ strings into their parts.

    Each string is expected to look like
    ``<start_thought>reasoning<end_thought>`` followed by the question on one
    line, one separator line, and then one option per line.

    Args:
        mcqs_cot: Iterable of raw chain-of-thought MCQ strings.

    Returns:
        A list of ``[question, reasoning, options, answer]`` lists, where
        ``answer`` is the first option. When ``<end_thought>`` is absent the
        reasoning is ``''`` and the whole text is parsed as the MCQ.

    Raises:
        IndexError: If an entry has no option lines.
    """
    start_tag = '<start_thought>'
    end_tag = '<end_thought>'

    formated_mcqs_cot = []
    for example in mcqs_cot:
        text = example.strip()
        # str.strip('<start_thought>') removes any of those *characters* from
        # both ends, which ate letters from the reasoning and from the last
        # option. Remove the tag as a prefix instead.
        if text.startswith(start_tag):
            text = text[len(start_tag):]

        reasoning, found_end, mcq_text = text.partition(end_tag)
        if not found_end:
            reasoning, mcq_text = '', text

        lines = mcq_text.strip().split('\n')
        question = lines[0]
        options = lines[2:]

        answer = options[0]
        formated_mcqs_cot.append([question, reasoning, options, answer])

    return formated_mcqs_cot

# Parse the chain-of-thought replies into [question, reasoning, options, answer]
# entries and display them.
formated_mcqs_cot = format_mcqs_cot(mcqs_cot)
formated_mcqs_cot

In [None]:
# Index every parsed chain-of-thought MCQ by position. The first option is
# recorded as the answer, and the context is the text of the node at the same
# position.
qa_to_mcqs_cot_dict = {}
for i, parsed_cot_mcq in enumerate(formated_mcqs_cot):
    cot_question = parsed_cot_mcq[0]
    cot_reasoning = parsed_cot_mcq[1]
    cot_options = parsed_cot_mcq[2]
    qa_to_mcqs_cot_dict[i] = {
        'question': cot_question,
        'reasoning': cot_reasoning,
        'options': cot_options,
        'answer': cot_options[0],
        'context': nodes[i].text
    }

#### For MCQ Generation Using In-context Learning

In [None]:
import time

# System + user chat template for direct MCQ generation, built from the config;
# it is formatted with `num_questions_per_chunk`, `context_str` and
# `few_shot_examples`.
mcq_question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=config.MCQ_QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=config.MCQ_QUESTION_GEN_USER_TMPL),
    ]
)


def _chat_with_retry(llm, messages, retries=1, delay=2):
    """Return the reply text of ``llm.chat``; retry after ``delay`` seconds, ``''`` on final failure."""
    for attempt in range(retries + 1):
        try:
            return llm.chat(messages).message.content
        except Exception:
            # Best effort: one failed node must not abort the whole run, but
            # the failure is logged instead of being swallowed silently.
            logging.warning(
                "LLM call failed (attempt %d of %d)",
                attempt + 1, retries + 1, exc_info=True,
            )
            if attempt < retries:
                time.sleep(delay)
    return ''


def _generate_mcqs_for_engine(
    engine,
    nodes,
    num_questions_per_chunk,
    top_k,
    delimiter,
    mcq_question_gen_template,
    batch_size=10,
):
    """Generate MCQ strings for ``nodes`` with the LLM selected by ``engine``."""
    llm = llm_gpt4 if engine == 'gpt-4' else llm_claude3

    # Whether the template has a few-shot slot does not depend on the node,
    # so it is decided once instead of once per node.
    template_text = '\n'.join(
        [x.content for x in mcq_question_gen_template.message_templates]
    )
    use_few_shot = "{few_shot_examples}" in template_text and top_k > 0

    def process_node(node):
        cur_context_str = node.get_content(metadata_mode="none")
        if use_few_shot:
            arabicmmlu_retriever = arabicmmlu_index.as_retriever(
                similarity_top_k=top_k,
                embed_model=embed_model,
            )
            few_shot_examples_str = "\n\n".join([
                x.text for x in arabicmmlu_retriever.retrieve(node.text)
            ])
        else:
            few_shot_examples_str = ""

        fmt_messages = mcq_question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=cur_context_str,
            few_shot_examples=few_shot_examples_str,
        )
        raw_output = _chat_with_retry(llm, fmt_messages)

        header = f'Engine: {engine}' + '\n\n' + f'Context: {node.text}\n\n'
        return [
            header + x.strip()
            for x in str(raw_output).strip().split(delimiter)
            if x.strip()
        ]

    per_node_mcqs = []
    for start in range(0, len(nodes), batch_size):
        batch = nodes[start:start + batch_size]
        # Close the pool after every batch instead of leaking one per batch.
        with ThreadPool() as pool:
            per_node_mcqs += list(
                tqdm(
                    pool.imap(process_node, batch),
                    desc=f"Generating {engine} QA pairs",
                    # The final batch may be shorter than batch_size.
                    total=len(batch),
                )
            )
        # Checkpoint after every batch (one list of MCQ strings per node).
        # ensure_ascii=False emits raw non-ASCII text, so the file must be
        # opened as UTF-8 rather than with the platform default encoding.
        with open(f"{engine}_MCQs.json", "w", encoding="utf-8") as f:
            json.dump(per_node_mcqs, f, indent=4, ensure_ascii=False)

    # flatten
    return [item for sublist in per_node_mcqs for item in sublist]


def generate_mcq_pairs(
    nodes_dict: dict,
    num_questions_per_chunk: int = 2,
    top_k: int = 3,
    delimiter: str = "####",
    mcq_question_gen_template=mcq_question_gen_template,
) -> List[str]:
    """Generate MCQs for every ``{engine: nodes}`` entry of ``nodes_dict``.

    For each node the selected LLM is prompted with the node content and,
    when the template contains ``{few_shot_examples}`` and ``top_k > 0``,
    with the ``top_k`` entries retrieved from ``arabicmmlu_index`` for the
    node text. The reply is split on ``delimiter`` and every non-empty piece
    is prefixed with the engine name and the node text. Progress is
    checkpointed to ``"{engine}_MCQs.json"`` after every batch of 10 nodes.

    Args:
        nodes_dict: Mapping from engine name to the nodes it should process.
            ``'gpt-4'`` selects ``llm_gpt4``; any other name selects
            ``llm_claude3``. All entries are processed (previously only the
            first one was).
        num_questions_per_chunk: Number of questions requested per node.
        top_k: Number of few-shot examples to retrieve; ``0`` disables them.
        delimiter: Separator between MCQs in the LLM reply.
        mcq_question_gen_template: Chat template formatted with
            ``num_questions_per_chunk``, ``context_str`` and
            ``few_shot_examples``.

    Returns:
        A flat list of MCQ strings, in engine and node order. Nodes whose LLM
        call fails twice contribute no entries.
    """
    mcq_pairs = []
    for engine, nodes in nodes_dict.items():
        mcq_pairs += _generate_mcqs_for_engine(
            engine,
            nodes,
            num_questions_per_chunk,
            top_k,
            delimiter,
            mcq_question_gen_template,
        )
    return mcq_pairs

# mcq_pairs = generate_mcq_pairs(
#     random_nodes,
#     # nodes,
#     llm,
#     mcq_question_gen_template=mcq_question_gen_template,
#     num_questions_per_chunk=4,
#     delimiter="####"
# )
# for q, a in mcq_pairs:
#     print(f"Q: {q}\nA: {a}\n")

In [None]:
# shuffling the nodes with a fixed seed
import random
# Seed the module-level RNG so the in-place shuffle below is reproducible.
random.seed(42)
random.shuffle(nodes)

In [None]:
# Split the shuffled nodes at the midpoint: first half for GPT-4, the rest for Claude 3.
i = int(len(nodes)*0.5)
gpt4_nodes = nodes[:i]
claude3_nodes = nodes[i:]
# One single-entry {engine name: nodes} dict per engine.
all_nodes = [{'gpt-4': gpt4_nodes}, {'claude-3-opus': claude3_nodes}]

In [None]:
# Display how many nodes each engine was assigned.
len(gpt4_nodes), len(claude3_nodes)

In [None]:
# generate_mcq_pairs expects an {engine: nodes} mapping; passing the raw
# `nodes` list failed on `.items()`. Run it once per engine split and
# concatenate the results.
three_shot_mcq = []
for engine_nodes in all_nodes:
    three_shot_mcq += generate_mcq_pairs(
        engine_nodes,
        mcq_question_gen_template=mcq_question_gen_template,
        num_questions_per_chunk=2,
        top_k=3,
        delimiter="####",
    )