# Install pre-requisites

In [1]:
!pip install -q torch transformers transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets ragatouille ratelimit retry

In [2]:
%reload_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

# Model preparations

To go through the evaluation process, we need the following models:

1. Document model: an embedding model that generates the document embeddings, which are persisted in the vector index.
2. Reader model: a text completion model that answers the final question using the augmented context.
3. Evaluator model: a chat completion model that gives the final verdict on the RAG output. As this model affects scoring considerably, a stronger model should be used.

Because the choice of models is not the subject of this article and does not affect the comparison between RAG frameworks, we use a fully local setup for this experiment, for better speed and lower cost.

To be more precise, the following models served by Ollama are used:

* [Gemma 2B](https://huggingface.co/google/gemma-2b) as the `Reader model`.
* [all-minilm](https://ollama.com/library/all-minilm) as the `Document model`.
* [Mixtral-8x7B](https://ollama.com/library/mixtral) as the `Evaluator model`.
   

In [3]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama, MiniMaxChat
import os


# Ollama model tags for the reader LLM and the embedding model.
# Both names are also embedded in the settings label / output file name of each test run.
READER_MODEL_NAME = "gemma:2b"
EMBEDDING_NAME = "all-minilm"

# Base URL of the Ollama server shared by all three model clients below.
# OLLAMA_BASE_URL = "http://192.168.0.29:11434"
OLLAMA_BASE_URL = "http://localhost:11434"
EMBEDDING_MODEL = OllamaEmbeddings(model=EMBEDDING_NAME, base_url = OLLAMA_BASE_URL)
READER_LLM = Ollama(model=READER_MODEL_NAME, base_url=OLLAMA_BASE_URL)

# Chat model used to grade generated answers; EVALUATOR_NAME also becomes part of
# the `eval_score_*` / `eval_feedback_*` keys written into the result files.
EVALUATOR_NAME = "mixtral:latest"
EVAL_MODEL = ChatOllama(model=EVALUATOR_NAME, base_url=OLLAMA_BASE_URL)


# Alternative hosted evaluator (disabled); credentials are read from the environment.
# EVALUATOR_NAME = "abab6-chat"
# EVAL_MODEL = MiniMaxChat(
#     model_name=EVALUATOR_NAME, 
#     minimax_api_key=os.getenv("MINIMAX_API_KEY"),
#     minimax_group_id=os.getenv("MINIMAX_GROUP_ID")
# )


# Folder under which the langchain FAISS indexes are cached (see `load_embeddings`).
LANGCHAIN_DATA_ROOT = "./data/langchain"
# NOTE(review): not referenced by any other cell visible in this notebook.
INSTINCT_DOC_AGENT_DATA_ROOT = "./data/doc_agent"



# Build RAG pipeline using `langchain` 

1. Transform the training data in `m-ric/huggingface_doc` into `langchain` document objects.
2. Build the FAISS index from these documents, or load it from disk if the index files already exist.
3. Prompt `READER_LLM` with the questions from the evaluation dataset `m-ric/huggingface_doc_qa_eval`.

## Knowledge base preparations

In [4]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
pd.set_option("display.max_colwidth", None)

In [5]:
# Prompt template asking a model to write one factoid question/answer pair for a
# given `{context}`. The expected reply format is spelled out inside the prompt.
# NOTE(review): not referenced by any other cell visible in this notebook.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [6]:
# Critique prompt: rates (1-5) how unambiguously `{question}` can be answered from `{context}`.
# NOTE(review): not referenced by any other cell visible in this notebook.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Critique prompt: rates (1-5) how useful `{question}` is to ML developers working
# with the Hugging Face ecosystem.
# NOTE(review): not referenced by any other cell visible in this notebook.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

# Critique prompt: rates (1-5) whether `{question}` can be understood on its own,
# without access to the passage it was generated from.
# NOTE(review): not referenced by any other cell visible in this notebook.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [7]:
# Evaluation set; the test runners below read the "question", "answer" and
# "source_doc" fields of each example.
EVAL_DATASET = datasets.load_dataset("m-ric/huggingface_doc_qa_eval", split="train")


In [8]:
from langchain.docstore.document import Document as LangchainDocument

# Wrap every row of the knowledge-base dataset in a langchain document:
# the "text" column becomes the page content, "source" is kept as metadata.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]}) for doc in tqdm(datasets.load_dataset("m-ric/huggingface_doc", split="train"))
]

  0%|          | 0/2647 [00:00<?, ?it/s]

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument]
) -> List[LangchainDocument]:
    """
    Split documents into chunks and return the chunks with duplicate texts removed.

    The splitter is created with `from_tiktoken_encoder`, so `chunk_size` (and the
    10% overlap derived from it) is measured in tokens of the "gpt-4" tiktoken
    encoding, not in characters.

    Args:
        chunk_size: target chunk length, in tokens
        knowledge_base: documents to split

    Returns:
        The chunks in their original order; when several chunks share the same
        `page_content`, only the first one is kept.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name="gpt-4",
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
        disallowed_special=[],
        allowed_special="all"
    )

    # Split document by document and drop repeated texts in the same pass.
    seen_texts = set()
    unique_chunks = []
    for doc in knowledge_base:
        for chunk in text_splitter.split_documents([doc]):
            if chunk.page_content in seen_texts:
                continue
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks

In [10]:
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
import os
from langchain_core.embeddings import Embeddings


def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model: Embeddings,
    embedding_model_name: str
) -> FAISS:
    """
    Return a FAISS index for the given documents, using the on-disk copy when there is one.

    The index lives in a folder under `LANGCHAIN_DATA_ROOT` whose name encodes
    `chunk_size` and `embedding_model_name`. If that folder exists the index is
    loaded from it; otherwise the documents are split, embedded, saved to that
    folder and the fresh index is returned.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model: the embedding model used to build or load the index
        embedding_model_name: name of the embedding model, used only in the folder name

    Returns:
        FAISS index

    """
    index_dir = os.path.join(
        LANGCHAIN_DATA_ROOT,
        f"index_chunk:{chunk_size}_embeddings:{embedding_model_name}",
    )

    # Cache miss: build the index from scratch and persist it for the next run.
    if not os.path.isdir(index_dir):
        chunks = split_documents(chunk_size, langchain_docs)
        print(f"Index not found, generating it... {len(chunks)} docs in total")
        fresh_index = FAISS.from_documents(
            chunks,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
        )
        fresh_index.save_local(index_dir)
        return fresh_index

    # Cache hit: reuse the stored index.
    return FAISS.load_local(
        index_dir,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )

## QA chain

In [11]:
# Prompt sent to the reader LLM by `answer_with_rag`; `{context}` receives the
# concatenated retrieved documents and `{question}` the user question.
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [12]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[str]]:
    """
    Answer a question using RAG with the given knowledge index.

    Args:
        question: the question to answer
        llm: reader model that receives the final prompt
        knowledge_index: vector store queried for similar documents
        reranker: optional reranker applied to the retrieved texts
        num_retrieved_docs: number of documents fetched from the vector store
        num_docs_final: number of documents kept for the prompt

    Returns:
        A tuple `(answer, relevant_docs)` where `relevant_docs` is the list of
        document texts (plain strings) that were placed in the prompt.
    """
    # Gather documents with retriever and keep only their text
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]

    # Optionally rerank results
    if reranker:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt.
    # NOTE(review): documents are joined without any separator, so one document's
    # text runs straight into the next "Document N:::" header. Kept as-is so the
    # prompt stays identical to earlier runs.
    context = "\nExtracted documents:\n"
    context += "".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer. `invoke` replaces the deprecated direct call `llm(prompt)`.
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

# Generating answers

## Test function with langchain

In [13]:
from langchain_core.language_models import BaseChatModel 

def run_langchain_rag_tests(
    eval_dataset: datasets.Dataset,
    llm: LLM,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """
    Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in `output_file` are reused: questions found there
    are skipped, and the file is rewritten after every new answer so that an
    interrupted run can be resumed.

    Args:
        eval_dataset: examples providing "question", "answer" and "source_doc"
        llm: reader model passed to `answer_with_rag`
        knowledge_index: vector store passed to `answer_with_rag`
        output_file: path of the JSON file holding the list of results
        reranker: optional reranker passed to `answer_with_rag`
        verbose: print question, generated answer and true answer for each example
        test_settings: optional label stored with each result
    """
    # Load previous generations if they exist.
    try:
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except FileNotFoundError:
        outputs = []
    except (OSError, ValueError) as err:
        # Unreadable or malformed checkpoint: start over, but say so, because
        # the file is overwritten below.
        print(f"Could not load previous results from {output_file} ({err}); starting from scratch")
        outputs = []

    # Set of questions already answered, kept in sync with `outputs`.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [14]:
def run_langchain_test_all(chunk_size: int = 200, rerank: bool = False) -> str:
    """
    Build (or load) the index and run the langchain RAG test with the configured models.

    Args:
        chunk_size: chunk size used to split the knowledge base and to name the index
        rerank: when True, rerank retrieved documents with "colbert-ir/colbertv2.0"

    Returns:
        Path of the JSON file the results were written to.
    """
    # Safe to call whether or not the folder already exists.
    os.makedirs("./output", exist_ok=True)

    settings_name = f"langchain_chunk:{chunk_size}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}_embedding-model:{EMBEDDING_NAME}"
    output_file_name = f"./output/rag_{settings_name}.json"

    print(f"Running evaluation for {settings_name}:")

    print("Loading knowledge base embeddings...")
    knowledge_index = load_embeddings(
        RAW_KNOWLEDGE_BASE,
        chunk_size=chunk_size,
        embedding_model=EMBEDDING_MODEL,
        embedding_model_name=EMBEDDING_NAME
    )

    print("Running RAG...")
    # The reranker model is only loaded when reranking is requested.
    reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if rerank else None
    run_langchain_rag_tests(
        eval_dataset=EVAL_DATASET,
        llm=READER_LLM,
        knowledge_index=knowledge_index,
        output_file=output_file_name,
        reranker=reranker,
        verbose=True,
        test_settings=settings_name,
    )

    return output_file_name

In [15]:
# execute test for langchain
LANGCHAIN_TEST_OUTPUT = run_langchain_test_all()

Running evaluation for langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm:
Loading knowledge base embeddings...
Index not found, generating it... 33666 docs in total
Running RAG...


  0%|          | 0/67 [00:00<?, ?it/s]

  warn_deprecated(


Question: What architecture is the `tokenizers-linux-x64-musl` binary designed for?

Answer: The `tokenizers-linux-x64-musl` binary is designed for the x86 architecture.

The context provides a list of binary names for different architectures, including `x86_64-unknown-linux-musl`, `aarch64-unknown-linux-musl`, `x86_64-unknown-linux-gnu`, `aarch64-unknown-linux-gnu`, `x86_64-pc-windows-msvc` and `aarch64-pc-windows-msvc`.
True answer: x86_64-unknown-linux-musl
Question: What is the purpose of the BLIP-Diffusion model?

Answer: The context does not provide any information about the purpose of the BLIP-Diffusion model, so I cannot answer this question from the provided context.
True answer: The BLIP-Diffusion model is designed for controllable text-to-image generation and editing.
Question: How can a user claim authorship of a paper on the Hugging Face Hub?

Answer: The user can claim authorship of a paper on the Hugging Face Hub by clicking in their name in the corresponding Paper page 

## Test function with doc-agent in instinct.cpp

You have to manually start `doc-agent` locally.

To build the knowledge index from the same Hugging Face knowledge base data:

```shell
$DOC_AGENT_BIN --verbose \
  --parent_child_retriever \
  --child_chunk_size=200 \
  --chat_model_model_name=gemma:2b \
  --embedding_model_model_name=all-minilm:latest \
  --db_path=./data/instinct/index.db \
  --vector_table_dimension=384 \
  build \
  --force \
  --file=https://huggingface.co/api/datasets/m-ric/huggingface_doc/parquet/default/train/0.parquet \
  --type=PARQUET \
  --parquet_mapping=0:txt,1:metadata:source:varchar \
  serve \
  --port=9090 
```

To start the HTTP server for queries:

```shell
$DOC_AGENT_BIN --verbose \
  --parent_child_retriever \
  --child_chunk_size=200 \
  --chat_model_model_name=gemma:2b \
  --embedding_model_model_name=all-minilm:latest \
  --db_path=/tmp/rag_eval.db \
  --vector_table_dimension=384 \
  serve \
  --port=9090 
```

Next, we will begin QA tests.

In [16]:
def answer_with_doc_agent(
    question: str,
    base_url: str = "http://localhost:9090",
    timeout: float = 600.0,
):
    """
    Send a question to a running doc-agent server and return its answer.

    Args:
        question: the question, sent as a single "human" message
        base_url: root URL of the doc-agent HTTP server
        timeout: seconds to wait for the server before giving up, so that a
            stalled server cannot block the test run forever

    Returns:
        The `content` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    import requests

    res = requests.post(
        f"{base_url}/v1/chat/completions",
        json={"messages": [{"content": question, "role": "human"}], "stream": False},
        timeout=timeout,
    )
    # Explicit check instead of `assert`, which is skipped when Python runs with -O.
    res.raise_for_status()
    body = res.json()
    return body["choices"][0]["message"]["content"]
    

def run_doc_agent_rag_tests(
    eval_dataset: datasets.Dataset,
    output_file: str,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """
    Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in `output_file` are reused: questions found there
    are skipped, and the file is rewritten after every new answer so that an
    interrupted run can be resumed.

    Args:
        eval_dataset: examples providing "question", "answer" and "source_doc"
        output_file: path of the JSON file holding the list of results
        verbose: print question, generated answer and true answer for each example
        test_settings: optional label stored with each result
    """
    # Load previous generations if they exist.
    try:
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except FileNotFoundError:
        outputs = []
    except (OSError, ValueError) as err:
        # Unreadable or malformed checkpoint: start over, but say so, because
        # the file is overwritten below.
        print(f"Could not load previous results from {output_file} ({err}); starting from scratch")
        outputs = []

    # Set of questions already answered, kept in sync with `outputs`.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer = answer_with_doc_agent(question)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example.
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [17]:
def run_doc_agent_test_all():
    """
    Run the RAG test against the doc-agent server and return the results file path.

    The doc-agent server must already be running; this function only sends the
    evaluation questions to it and records the answers.
    """
    # Safe to call whether or not the folder already exists.
    os.makedirs("./output", exist_ok=True)

    # Used here only to build the settings label and file name.
    chunk_size = 200
    rerank = False

    settings_name = f"doc_agent_chunk:{chunk_size}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}_embedding-model:{EMBEDDING_NAME}"
    output_file_name = f"./output/rag_{settings_name}.json"

    print(f"Running evaluation for {settings_name}:")
    run_doc_agent_rag_tests(
        eval_dataset=EVAL_DATASET,
        output_file=output_file_name,
        test_settings=settings_name
    )

    return output_file_name

In [18]:
DOC_AGENT_TEST_OUTPUT = run_doc_agent_test_all()

Running evaluation for doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm:


  0%|          | 0/67 [00:00<?, ?it/s]

# Evaluation Runner

In [19]:
from ratelimit import limits,sleep_and_retry
from retry import retry
from langchain_core.prompts import ChatPromptTemplate

@sleep_and_retry
@limits(calls=6, period=60)
def throttled_invoke(eval_chat_model, eval_prompt):
    """
    Invoke the evaluator model under a rate limit.

    `limits(calls=6, period=60)` caps this function at 6 calls per 60 seconds;
    `sleep_and_retry` waits and retries instead of raising when the cap is hit.
    """
    return eval_chat_model.invoke(eval_prompt)



@retry(exceptions=Exception, tries=6)
def evaluate_single_answer(
        evaluation_prompt_template: ChatPromptTemplate,
        experiment: dict,
        throttled:bool,
        eval_chat_model: BaseChatModel
):
    """
    Ask the evaluator model to grade one generated answer.

    The whole call is retried (up to 6 attempts) whenever it raises, which
    includes the case where the model's reply does not have the expected
    "<feedback> [RESULT] <score>" shape.

    Args:
        evaluation_prompt_template: template taking `instruction`, `response`
            and `reference_answer`
        experiment: result record with "question", "generated_answer" and "true_answer"
        throttled: route the call through the rate-limited `throttled_invoke`
        eval_chat_model: the evaluator chat model

    Returns:
        A two-item list `[feedback, score]`, both as stripped strings; `score`
        parses as an integer between 1 and 5.

    Raises:
        ValueError: if, after all attempts, the reply still has no single
            "[RESULT]" marker or the score is not an integer in 1..5.
    """
    eval_prompt = evaluation_prompt_template.format_messages(
        instruction=experiment["question"],
        response=experiment["generated_answer"],
        reference_answer=experiment["true_answer"],
    )
    if throttled:
        eval_result = throttled_invoke(eval_chat_model, eval_prompt)
    else:
        eval_result = eval_chat_model.invoke(eval_prompt)

    splits = [item.strip() for item in eval_result.content.split("[RESULT]")]

    # Validate with explicit raises rather than `assert`: asserts are removed
    # when Python runs with -O, which would silently disable the retry.
    if len(splits) != 2:
        raise ValueError(
            f"Expected exactly one '[RESULT]' marker in evaluator output, found {len(splits) - 1}"
        )
    if not 1 <= int(splits[1]) <= 5:  # int() itself raises ValueError on non-numeric text
        raise ValueError(f"Evaluator score out of range 1-5: {splits[1]!r}")
    return splits


def evaluate_answers(
    answer_path: str,
    eval_chat_model: BaseChatModel,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
    throttled:bool = True
) -> None:
    """
    Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Records that already carry a non-empty score for this evaluator are skipped,
    and the file is rewritten after each newly graded record.

    Args:
        answer_path: JSON file holding the list of result records
        eval_chat_model: the evaluator chat model
        evaluator_name: suffix of the `eval_score_*` / `eval_feedback_*` keys
        evaluation_prompt_template: prompt template used for grading
        throttled: rate-limit calls to the evaluator
    """
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r") as f:
            answers = json.load(f)

    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    for experiment in tqdm(answers):
        if experiment.get(score_key):
            continue

        # `evaluate_single_answer` either returns exactly [feedback, score] or raises.
        feedback, score = evaluate_single_answer(
            evaluation_prompt_template, experiment, throttled, eval_chat_model
        )
        experiment[score_key] = score
        experiment[feedback_key] = feedback

        # Checkpoint after every graded record.
        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [20]:
# Grading prompt for the evaluator model. Placeholders: `{instruction}`,
# `{response}`, `{reference_answer}`. The doubled braces are literal braces.
# The "[RESULT]" marker requested here is what `evaluate_single_answer` splits on.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


# Chat prompt for grading: a fixed system message followed by the grading
# instructions, which are filled in per answer by `evaluate_single_answer`.
EVALUATION_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

## Run evaluations

In [21]:
def generate_eval_results():
    """Grade every JSON answer file found in ./output with the evaluator model."""
    from glob import glob

    for answer_file in glob("./output/*.json"):
        print(f"Evaluating {answer_file}")
        evaluate_answers(
            answer_path=answer_file,
            eval_chat_model=EVAL_MODEL,
            evaluator_name=EVALUATOR_NAME,
            evaluation_prompt_template=EVALUATION_PROMPT_TEMPLATE,
            throttled=False,  # throttling is not needed for local model
        )

generate_eval_results()

Evaluating ./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json


  0%|          | 0/67 [00:00<?, ?it/s]

Evaluating ./output/rag_doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json


  0%|          | 0/67 [00:00<?, ?it/s]

In [22]:
def load_eval_results():
    """Load every ``./output/*.json`` file into one concatenated DataFrame.

    Each file is parsed as JSON and turned into a DataFrame; a ``settings``
    column holding the path of the originating file is added so rows can be
    grouped per run afterwards.

    Returns:
        pd.DataFrame: the per-file frames stacked with ``pd.concat`` (the
        original per-file row index is kept, so index values repeat).

    Raises:
        ValueError: if no file matches ``./output/*.json``.
    """
    import glob

    outputs = []
    for file in glob.glob("./output/*.json"):
        # Context manager guarantees the handle is closed even if parsing fails.
        with open(file, "r") as handle:
            output = pd.DataFrame(json.load(handle))
        output["settings"] = file
        outputs.append(output)

    if not outputs:
        # pd.concat([]) would raise a ValueError anyway; keep the exception
        # type but say what is actually wrong.
        raise ValueError("No evaluation result files found matching ./output/*.json")
    return pd.concat(outputs)

# Load all evaluated output files into a single frame and show it.
EVAL_RESULTS = load_eval_results()
display(EVAL_RESULTS)

Unnamed: 0,question,true_answer,source_doc,generated_answer,retrieved_docs,test_settings,eval_score_mixtral:latest,eval_feedback_mixtral:latest,settings
0,What architecture is the `tokenizers-linux-x64-musl` binary designed for?\n,x86_64-unknown-linux-musl,huggingface/tokenizers/blob/main/bindings/node/npm/linux-x64-musl/README.md,"The `tokenizers-linux-x64-musl` binary is designed for the x86 architecture.\n\nThe context provides a list of binary names for different architectures, including `x86_64-unknown-linux-musl`, `aarch64-unknown-linux-musl`, `x86_64-unknown-linux-gnu`, `aarch64-unknown-linux-gnu`, `x86_64-pc-windows-msvc` and `aarch64-pc-windows-msvc`.","[`tokenizers-linux-x64-musl`\n\nThis is the **x86_64-unknown-linux-musl** binary for `tokenizers`, `tokenizers-linux-arm64-musl`\n\nThis is the **aarch64-unknown-linux-musl** binary for `tokenizers`, `tokenizers-linux-x64-gnu`\n\nThis is the **x86_64-unknown-linux-gnu** binary for `tokenizers`, `tokenizers-linux-arm64-gnu`\n\nThis is the **aarch64-unknown-linux-gnu** binary for `tokenizers`, p align=""center"">\n <br>\n <img src=""https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png"" width=""600""/>\n <br>\n<p>\n<p align=""center"">\n <img alt=""Build"" src=""https://github.com/huggingface/tokenizers/workflows/Rust/badge.svg"">\n <a href=""https://github.com/huggingface/tokenizers/blob/master/LICENSE"">\n <img alt=""GitHub"" src=""https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue"">\n </a>\n <a href=""https://docs.rs/tokenizers/"">\n <img alt=""Doc"" src=""https://docs.rs/tokenizers/badge.svg"">\n </a>\n</p>\n<br>\n\n\nThe core of `tokenizers`, written in Rust.\nProvides an implementation of today's most used tokenizers, with a focus on performance and\nversatility., `tokenizers-win32-x64-msvc`\n\nThis is the **x86_64-pc-windows-msvc** binary for `tokenizers`, `tokenizers-win32-arm64-msvc`\n\nThis is the **aarch64-pc-windows-msvc** binary for `tokenizers`]",langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,3,"While the response correctly identifies that the 
`tokenizers-linux-x64-musl` binary is designed for the x86 architecture, it fails to provide the complete and exact architecture name as specified in the reference answer. The reference answer includes additional details (`x86_64-unknown-linux-musl`) that are crucial for a fully accurate response.",./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
1,What is the purpose of the BLIP-Diffusion model?\n,The BLIP-Diffusion model is designed for controllable text-to-image generation and editing.,huggingface/diffusers/blob/main/docs/source/en/api/pipelines/blip_diffusion.md,"The context does not provide any information about the purpose of the BLIP-Diffusion model, so I cannot answer this question from the provided context.","[Stable Diffusion\n\n## Overview\n\nStable Diffusion was proposed in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team.\n\nThe summary of the model is the following:, The literature on Diffusion-based models is developing at a rapid pace which is why we partnered with [Jonathan Whitaker](https://github.com/johnowhitaker) to develop a course on it. The course is free, and you can check it out [here](https://github.com/huggingface/diffusion-models-class).\n\n## Support for third-party libraries, ## How does Stable Diffusion work?\n\nHaving seen the high-quality images that stable diffusion can produce, let's try to understand \na bit better how the model functions.\n\nStable Diffusion is based on a particular type of diffusion model called **Latent Diffusion**, proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752).\n\nGenerally speaking, diffusion models are machine learning systems that are trained to *denoise* random Gaussian noise step by step, to get to a sample of interest, such as an *image*. 
For a more detailed overview of how they work, check [this colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb)., --\ntitle: ""Finetune Stable Diffusion Models with DDPO via TRL"" \nthumbnail: /blog/assets/166_trl_ddpo/thumbnail.png\nauthors:\n- user: metric-space\n guest: true\n- user: sayakpaul\n- user: kashif\n- user: lvwerra\n---\n\n# Finetune Stable Diffusion Models with DDPO via TRL\n\n\n## Introduction, In this blog post, we discuss how DDPO came to be, a brief description of how it works, and how DDPO can be incorporated into an RLHF workflow to achieve model outputs more aligned with the human aesthetics. We then quickly switch gears to talk about how you can apply DDPO to your models with the newly integrated `DDPOTrainer` from the `trl` library and discuss our findings from running DDPO on Stable Diffusion. \n\n## The Advantages of DDPO\n\nDDPO is not the only working answer to the question of how to attempt to fine-tune diffusion models with RL. \n\nBefore diving in, there are two key points to remember when it comes to understanding the advantages of one RL solution over the other\n\n1. Computational efficiency is key. The more complicated your data distribution gets, the higher your computational costs get.\n2. Approximations are nice, but because approximations are not the real thing, associated errors stack up., We've gone from the basic use of Stable Diffusion using 🤗 Hugging Face Diffusers to more advanced uses of the library, and we tried to introduce all the pieces in a modern diffusion system. 
If you liked this topic and want to learn more, we recommend the following resources:\n- Our [Colab notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb).\n- The [Getting Started with Diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb) notebook, that gives a broader overview on Diffusion systems.\n- The [Annotated Diffusion Model](https://huggingface.co/blog/annotated-diffusion) blog post., # Stable Diffusion 2\n\nStable Diffusion 2 is a text-to-image _latent diffusion_ model built upon the work of the original [Stable Diffusion](https://stability.ai/blog/stable-diffusion-public-release), and it was led by Robin Rombach and Katherine Crowson from [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/).]",langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,3,"The response correctly acknowledges that the context does not provide sufficient information to answer the question, which is a valid observation. However, it fails to provide any additional information about the purpose of the BLIP-Diffusion model as outlined in the reference answer. Therefore, the response lacks the necessary factual information to be considered correct.",./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
2,How can a user claim authorship of a paper on the Hugging Face Hub?\n,"By clicking their name on the corresponding Paper page and clicking ""claim authorship"", then confirming the request in paper settings for admin team validation.",huggingface/hub-docs/blob/main/docs/hub/paper-pages.md,"The user can claim authorship of a paper on the Hugging Face Hub by clicking in their name in the corresponding Paper page and clicking ""claim authorship"". This will automatically re-direct to their paper settings where they can confirm the request.","[* Visit the Paper page.\n* Filter for other models or datasets on the Hub that cite the same paper.\n\n<div class=""flex justify-center"">\n<img class=""block dark:hidden"" width=""300"" src=""https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-arxiv.png""/>\n<img class=""hidden dark:block"" width=""300"" src=""https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-arxiv-dark.png""/>\n</div>\n\n## Claiming authorship to a Paper\n\nThe Hub will attempt to automatically match paper to users based on their email., The Hub will attempt to automatically match paper to users based on their email. \n\n<div class=""flex justify-center"">\n<img class=""block dark:hidden"" width=""300"" src=""https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/papers-authors.png""/>\n<img class=""hidden dark:block"" width=""300"" src=""https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/papers-authors-dark.png""/>\n</div>\n\nIf your paper is not linked to your account, you can click in your name in the corresponding Paper page and click ""claim authorship"". This will automatically re-direct to your paper settings where you can confirm the request. The admin team will validate your request soon. 
Once confirmed, the Paper page will show as verified., It also helps us if you spread the word: reference the library from blog posts\non the awesome projects it made possible, shout out on Twitter every time it has\nhelped you, or simply star the repo to say ""thank you"".\n\nWhichever way you choose to contribute, please be mindful to respect our\n[code of conduct](https://github.com/huggingface/huggingface_hub/blob/main/CODE_OF_CONDUCT.md).\n\n> Looking for a good first issue to work on?\n> Please check out our contributing guide below and then select an issue from our [curated list](https://github.com/huggingface/huggingface_hub/contribute).\n> Pick one and get started with it!\n\n### The client library, `huggingface_hub`, # Model `license:other` challenge\n\nRelated to https://github.com/huggingface/hub-docs/issues/985.\n\n## Context, ### Would you like to integrate your library to the Hub?\n\nThis integration is made possible by the [`huggingface_hub`](https://github.com/huggingface/huggingface_hub) library. If you want to add your library to the Hub, we have a [guide](https://huggingface.co/docs/hub/models-adding-libraries) for you! Or simply tag someone from the Hugging Face team.\n\nA shout out to the Hugging Face team for all the work on this integration, in particular [@osanseviero](https://twitter.com/osanseviero) 🦙.\n\nThank you fastlearners and hugging learners 🤗., ### MODEL CARDS ON THE HUGGING FACE HUB\nSince 2018, new platforms and mediums for hosting and sharing model cards have also emerged. For example, particularly relevant to this project, Hugging Face hosts model cards on the Hugging Face Hub as README files in the repositories associated with ML models. As a result, model cards figure as a prominent form of documentation for users of models on the Hugging Face Hub. 
As part of our analysis of model cards, we developed and proposed model cards for several dozen ML models on the Hugging Face Hub, using the Hub’s Pull Request (PR) and Discussion features to gather feedback on model cards, verify information included in model cards, and publish model cards for models on the Hugging Face Hub. At the time of writing of this guide book, all of Hugging Face’s models on the Hugging Face Hub have an associated model card on the Hub[^8]., --\ntitle: Hugging Face Collaborates with Microsoft to launch Hugging Face Model Catalog on Azure\nthumbnail: /blog/assets/75_hugging_face_endpoints_on_azure/01.jpg\nauthors:\n- user: jeffboudier\n- user: philschmid\n- user: juliensimon\n---\n\n# Hugging Face Collaborates with Microsoft to launch Hugging Face Model Catalog on Azure\n\n\n![Hugging Face Endpoints on Azure](assets/75_hugging_face_endpoints_on_azure/01.jpg ""Hugging Face Endpoints on Azure"")]",langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,4,"The response is largely correct and accurately describes the process of claiming authorship on the Hugging Face Hub. However, it omits the crucial step of waiting for admin team validation, which is an essential part of the process as mentioned in the reference answer.",./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
3,What is the purpose of the /healthcheck endpoint in the Datasets server API?\n,Ensure the app is running,huggingface/datasets-server/blob/main/services/api/README.md,"The purpose of the `/healthcheck` endpoint in the Datasets server API is to ensure the app is running properly. It sends a JSON response containing information about the cache, queue, and any pending jobs for the dataset.","[Datasets server API - rows endpoint\n\n> /rows endpoint\n\n## Configuration\n\nThe service can be configured using environment variables. They are grouped by scope.\n\n### API service\n\nSee [../../libs/libapi/README.md](../../libs/libapi/README.md) for more information about the API configuration.\n\n### Common\n\nSee [../../libs/libcommon/README.md](../../libs/libcommon/README.md) for more information about the common configuration.\n\n## Endpoints\n\nSee https://huggingface.co/docs/datasets-server\n\n- /healthcheck: ensure the app is running\n- /metrics: return a list of metrics in the Prometheus format\n- /rows: get a slice of rows of a dataset split, ## Endpoints\n\nSee https://huggingface.co/docs/datasets-server\n\n- /healthcheck: ensure the app is running\n- /metrics: return a list of metrics in the Prometheus format\n- /search: get a slice of a search result over a dataset split\n- /filter: filter rows of a dataset split, ## Endpoints\n\nSee https://huggingface.co/docs/datasets-server\n\n- /healthcheck: Ensure the app is running\n- /metrics: Return a list of metrics in the Prometheus format\n- /hub-cache: Return a dataset information as a Server-Sent Event (SSE) when a dataset is updated. If `?all=true` is passed in the parameters, and if the cache already has some entries, one SSE per cached dataset is sent to the client. Then, a SSE is sent when a dataset is inserted, modified or deleted. The event data is a JSON with the following structure. The `hub_cache` field is null for deleted entries, or when the response is an error. 
The `num_rows` value is `0` if it could not be determined., The endpoint response is a JSON with the `dataset_info` key. Its structure and content correspond to [DatasetInfo](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.DatasetInfo) object of the `datasets` library., The endpoint response is a JSON containing a list of the dataset's splits and configurations. For example, the [duorc](https://huggingface.co/datasets/duorc) dataset has six splits and two configurations:, - `/healthcheck`\n- `/metrics`: give info about the cache and the queue\n- `/cache-reports{processing_step}`: give detailed reports on the content of the cache for a processing step\n- `/cache-reports-with-content{processing_step}`: give detailed reports on the content of the cache for a processing step, including the content itself, which can be heavy\n- `/pending-jobs`: give the pending jobs, classed by queue and status (waiting or started)\n- `/force-refresh{processing_step}`: force refresh cache entries for the processing step. It's a POST endpoint. 
Pass the requested parameters, depending on the processing step's input type:\n - `dataset`: `?dataset={dataset}`\n - `config`: `?dataset={dataset}&config={config}`\n - `split`: `?dataset={dataset}&config={config}&split={split}`, ## Endpoints\n\nSee https://huggingface.co/docs/datasets-server\n\n- /healthcheck: Ensure the app is running\n- /metrics: Return a list of metrics in the Prometheus format\n- /webhook: Add, update or remove a dataset\n- /is-valid: Tell if a dataset is [valid](https://huggingface.co/docs/datasets-server/valid)\n- /splits: List the [splits](https://huggingface.co/docs/datasets-server/splits) names for a dataset\n- /first-rows: Extract the [first rows](https://huggingface.co/docs/datasets-server/first_rows) for a dataset split\n- /parquet: List the [parquet files](https://huggingface.co/docs/datasets-server/parquet) auto-converted for a dataset]",langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,5,"The response accurately explains the purpose of the `/healthcheck` endpoint in the Datasets server API, stating that it ensures the app is running properly and sends a JSON response with information about the cache, queue, and any pending jobs for the dataset. This aligns perfectly with the reference answer, making the response correct, accurate, and factual.",./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
4,What is the default context window size for Local Attention in the LongT5 model?\n,127 tokens,huggingface/transformers/blob/main/docs/source/en/model_doc/longt5.md,"The context window size for Local Attention in the LongT5 model is not specified in the context, so the question cannot be answered from the context.","[## Longformer Self Attention\n\nLongformer self attention employs self attention on both a ""local"" context and a ""global"" context. Most tokens only\nattend ""locally"" to each other meaning that each token attends to its \\(\frac{1}{2} w\\) previous tokens and\n\\(\frac{1}{2} w\\) succeeding tokens with \\(w\\) being the window length as defined in\n`config.attention_window`. Note that `config.attention_window` can be of type `List` to define a\ndifferent \\(w\\) for each layer. A selected few tokens attend ""globally"" to all other tokens, as it is\nconventionally done for all tokens in `BertSelfAttention`., ## Local attention\n\n[Longformer](#longformer) uses local attention: often, the local context (e.g., what are the two tokens to the\nleft and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small\nwindow, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a\nrepresentation of the whole sentence.\n\nSome preselected input tokens are also given global attention: for those few tokens, the attention matrix can access\nall tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in\ntheir local window). This is shown in Figure 2d of the paper, see below for a sample attention mask:, - [`LongT5ForConditionalGeneration`] is an extension of [`T5ForConditionalGeneration`] exchanging the traditional\nencoder *self-attention* layer with efficient either *local* attention or *transient-global* (*tglobal*) attention.\n- Unlike the T5 model, LongT5 does not use a task prefix. 
Furthermore, it uses a different pre-training objective\ninspired by the pre-training of [`PegasusForConditionalGeneration`].\n- LongT5 model is designed to work efficiently and very well on long-range *sequence-to-sequence* tasks where the\ninput sequence exceeds commonly used 512 tokens. It is capable of handling input sequences of a length up to 16,384 tokens.\n- For *Local Attention*, the sparse sliding-window local attention operation allows a given token to attend only `r`, Note that ""locally"" and ""globally"" attending tokens are projected by different query, key and value matrices. Also note\nthat every ""locally"" attending token not only attends to tokens within its window \\(w\\), but also to all ""globally""\nattending tokens so that global attention is *symmetric*.\n\nThe user can define which tokens attend ""locally"" and which tokens attend ""globally"" by setting the tensor\n`global_attention_mask` at run-time appropriately. All Longformer models employ the following logic for\n`global_attention_mask`:\n\n- 0: the token attends ""locally"",\n- 1: the token attends ""globally"".\n\nFor more information please also refer to [`~LongformerModel.forward`] method., are constructed dynamically within each attention operation). As a consequence, *TGlobal* attention introduces\na few new parameters -- global relative position biases and a layer normalization for global token's embedding.\nThe complexity of this mechanism is `O(l(r + l/k))`.\n- An example showing how to evaluate a fine-tuned LongT5 model on the [pubmed dataset](https://huggingface.co/datasets/scientific_papers) is below., -->\n\n# LongT5\n\n## Overview\n\nThe LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916)\nby Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung and Yinfei Yang. It's an\nencoder-decoder transformer pre-trained in a text-to-text denoising generative setting. 
LongT5 model is an extension of\nT5 model, and it enables using one of the two different efficient attention mechanisms - (1) Local attention, or (2)\nTransient-Global attention.\n\n\nThe abstract from the paper is the following:, However, most Transformer models continued to trend towards more parameters, leading to new models focused on improving training efficiency. [ALBERT](model_doc/albert) reduces memory consumption by lowering the number of parameters in two ways: separating the larger vocabulary embedding into two smaller matrices and allowing layers to share parameters. [DeBERTa](model_doc/deberta) added a disentangled attention mechanism where the word and its position are separately encoded in two vectors. The attention is computed from these separate vectors instead of a single vector containing the word and position embeddings. [Longformer](model_doc/longformer) also focused on making attention more efficient, especially for processing documents with longer sequence lengths. It uses a combination of local windowed attention (attention only calculated from fixed window size around each token) and global attention (only for specific task tokens like `[CLS]` for classification) to create a sparse attention matrix instead of a full attention matrix.\n\n### Decoder[[nlp-decoder]]]",langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,3,"The response correctly indicates that the context does not provide enough information to answer the question, which aligns with the instruction's requirement for a factual answer based on the context. 
However, since the reference answer provides a specific value for the default context window size, the response lacks the necessary detail to be evaluated as correct or fully accurate.\nFeedback: The response accurately states that the information is not available in the provided context; however, it does not offer a factual answer based on the reference material.",./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
...,...,...,...,...,...,...,...,...,...
62,What is the maximum size of a model checkpoint before it is automatically sharded in Transformers version 4.18.0?\n,10GB,huggingface/transformers/blob/main/docs/source/en/big_models.md,The maximum size of a model checkpoint before it is automatically sharded in the Transformers version 4.18.0 is not specified in the documentation.,,doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,3,"The response correctly states that the maximum size of a model checkpoint before it is automatically sharded in Transformers version 4.18.0 is not specified in the documentation. However, the reference answer provides a specific value (10GB) which is the actual maximum size. Therefore, the response is not directly addressing the given instruction and lacks accuracy.\n\nFeedback: [The response correctly indicates that the maximum size is not specified in the documentation but fails to provide the expected factual value as stated in the reference answer.]",./output/rag_doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
63,What is the purpose of Weights and Biases (W&B) for data scientists and machine learning scientists?\n,"To track their machine learning experiments at every stage, from training to production.",gradio-app/gradio/blob/main/guides/06_integrating-other-frameworks/Gradio-and-Wandb-Integration.md,"Sure, here are the specific roles and responsibilities of Weights and Biases (W&B) in data science and machine learning:\n\n> W&B play a crucial role in data science and machine learning by helping to model and analyze complex relationships between data inputs and outputs. They are used in various data science tasks such as image processing, natural language processing, and predictive modeling.\n\n> W&B are responsible for learning the patterns and relationships from the data by performing various operations such as training, backpropagation, and optimization. They help to extract these patterns and relationships to make predictions on new data inputs.\n\n> W&B are also responsible for setting the initial weights and biases of neural networks to determine their initial state. They are used in the training process to adjust the weights and biases to achieve better performance on the data.",,doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,1,"The response does not correctly address the purpose of Weights and Biases (W&B) for data scientists and machine learning scientists. While some parts of the response touch on relevant topics such as training, backpropagation, and optimization, they are not presented in a way that accurately describes W&B's role. The response does not mention experiment tracking at all, which is a key purpose of W&B according to the reference answer.",./output/rag_doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
64,What is the name of the open-source library created by Hugging Face to simplify Transformer acceleration?\n,Optimum,huggingface/blog/blob/main/intel.md,"The name of the open-source library created by Hugging Face to simplify Transformer acceleration is not mentioned in the conversation, so I cannot answer this question from the provided context.",,doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,3,"The response correctly states that the name of the library cannot be answered from the provided context, which aligns with the score rubric's criteria for accuracy and factuality. However, it does not provide an actual answer or reference to the Optimum library as in the reference answer, leading to a lower score.",./output/rag_doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json
65,What parameter is used to ensure that elements in a row have the same height in Gradio?\n,equal_height,gradio-app/gradio/blob/main/guides/cn/03_building-with-blocks/02_controlling-layout.md,"Sure, here is the standalone question in its original language:\n\n> What is the parameter used to ensure that elements in a row have the same height in Gradio?",,doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm,5,"The response is correct and aligns with the reference answer. The parameter used to ensure that elements in a row have the same height in Gradio is indeed ""equal_height"".",./output/rag_doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json


## Scoring evaluation results

In [23]:
import pandas as pd

def scoring_output(eval_result: pd.DataFrame, evaluator_name: str) -> pd.Series:
    """Average the evaluator's scores per settings file, normalized to [0, 1].

    Args:
        eval_result: Frame containing an ``eval_score_<evaluator_name>`` column
            and a ``settings`` column.
        evaluator_name: Suffix selecting which ``eval_score_*`` column to use.

    Returns:
        Series indexed by ``settings`` holding the mean normalized score,
        sorted in ascending order.
    """
    score_field = f"eval_score_{evaluator_name}"
    result = eval_result.loc[:, [score_field, "settings"]].copy()

    # Scores may be stored as strings ("4"), as real numbers, or be missing.
    # Numeric values are kept as-is (previously any non-string was forced to 1);
    # missing or unparseable entries fall back to the lowest score, 1.
    numeric_scores = pd.to_numeric(result[score_field], errors="coerce").fillna(1)

    # Map the 1..5 rubric onto 0..1.
    result[score_field] = (numeric_scores - 1) / 4
    average_scores = result.groupby("settings")[score_field].mean()

    # sort_values returns a new Series; return it instead of discarding it.
    return average_scores.sort_values()

# Mean score per settings file, rescaled from the 1-5 rubric to 0-1.
scores = scoring_output(EVAL_RESULTS, EVALUATOR_NAME)
display(scores)

settings
./output/rag_doc_agent_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json    0.492537
./output/rag_langchain_chunk:200_rerank:False_reader-model:gemma:2b_embedding-model:all-minilm.json    0.641791
Name: eval_score_mixtral:latest, dtype: float64