In [None]:
!pip install datasets
!pip -q install langchain
!pip install langchain-core
!pip -q install --upgrade huggingface_hub
!pip -q install langchain_community
!pip -q install langchain_huggingface
!pip install huggingface_hub
!pip install faiss-cpu
!pip install faiss-gpu
!pip install transformers
!pip install torch
!pip install openai
!pip install gradio



Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
import numpy as np
import json
import re
import os
import sys
import faiss # Faiss library for indexing
import torch
import openai
import configparser
import pandas as pd
from sklearn.preprocessing import normalize
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics import roc_auc_score
from langchain_community.llms import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.runnables import RunnableLambda
from langchain_core.prompts import PromptTemplate     #
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from tqdm import tqdm # Progress bar library
from openai import OpenAI
from google.colab import drive
from transformers import pipeline, AutoTokenizer
import gradio as gr

In [None]:
# Sentinel stored in every predicted ("_p") score until a real prediction is recorded.
Invalid_float_val = 9


class EvaluationMetrics:
    """
    Per-question record pairing the original ("_o") scores with predicted ("_p") scores.

    Attributes:
        question (str): The question being evaluated.
        response (str): The response associated with the question.
        relevance_score_o (float): Original relevance score.
        utilization_score_o (float): Original utilization score.
        adherence_score_o (float): Original adherence score.
        completeness_score_o (float): Original completeness score.
        relevance_score_p (float): Predicted relevance score; Invalid_float_val until updated.
        utilization_score_p (float): Predicted utilization score; Invalid_float_val until updated.
        adherence_score_p (float): Predicted adherence score; Invalid_float_val until updated.
        completeness_score_p (float): Predicted completeness score; Invalid_float_val until updated.
    """

    def __init__(self, question, response, relevance_score_o, utilization_score_o, adherence_score_o, completeness_score_o):
        """
        Stores the question, its response and the four original scores.

        Args:
            question (str): The question being evaluated.
            response (str): The response associated with the question.
            relevance_score_o (float): The original relevance score.
            utilization_score_o (float): The original utilization score.
            adherence_score_o (float): The original adherence score.
            completeness_score_o (float): The original completeness score.
        """
        self.question, self.response = question, response
        (
            self.relevance_score_o,
            self.utilization_score_o,
            self.adherence_score_o,
            self.completeness_score_o,
        ) = (relevance_score_o, utilization_score_o, adherence_score_o, completeness_score_o)

        # No prediction exists yet, so every predicted score starts at the sentinel.
        (
            self.relevance_score_p,
            self.utilization_score_p,
            self.adherence_score_p,
            self.completeness_score_p,
        ) = (Invalid_float_val,) * 4

    def update_scores(self, relevance_score_p=Invalid_float_val, utilization_score_p=Invalid_float_val, adherence_score_p=Invalid_float_val, completeness_score_p=Invalid_float_val):
        """
        Overwrites all four predicted scores.

        Any score that is not supplied is reset to Invalid_float_val, so calling
        this method with no arguments clears the predictions.

        Args:
            relevance_score_p (float, optional): The predicted relevance score.
            utilization_score_p (float, optional): The predicted utilization score.
            adherence_score_p (float, optional): The predicted adherence score.
            completeness_score_p (float, optional): The predicted completeness score.
        """
        predicted = (
            ("relevance_score_p", relevance_score_p),
            ("utilization_score_p", utilization_score_p),
            ("adherence_score_p", adherence_score_p),
            ("completeness_score_p", completeness_score_p),
        )
        for attribute_name, value in predicted:
            setattr(self, attribute_name, value)

In [None]:

def read_config(config_file_path):
    """
    Reads configuration from a .ini file and returns it as a ConfigParser object.

    Args:
        config_file_path (str): The path to the .ini configuration file.

    Returns:
        ConfigParser: A ConfigParser object to access the configuration.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        ValueError: If the configuration file cannot be opened or parsed.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Configuration file not found: {config_file_path}")

    config = configparser.ConfigParser()

    try:
        files_read = config.read(config_file_path)
    except Exception as e:
        # Chain the original error so the parser's own diagnostics stay visible.
        raise ValueError(f"Failed to read configuration file: {config_file_path}. Error: {e}") from e

    # ConfigParser.read() silently skips paths it cannot open (for example a
    # directory or an unreadable file) and reports only the files it parsed.
    # Without this check the caller would receive an empty configuration.
    if not files_read:
        raise ValueError(
            f"Failed to read configuration file: {config_file_path}. Error: the file could not be opened"
        )

    return config

In [None]:
# Mount Google Drive at /content/drive/; force_remount=True requests a remount
# even when the drive is already mounted.
drive.mount("/content/drive/", force_remount=True)

# Drive folder that holds config.ini. Later cells also build their output and
# embedding-cache file paths from this prefix, so it must end with "/".
RAG_Config_folder_path = "/content/drive/My Drive/RAG_Config/"

# Parsed settings object (a ConfigParser) that the following cells read from.
Config = read_config(RAG_Config_folder_path+'config.ini')

Mounted at /content/drive/


In [None]:
# Hugging Face backend: local sentence embeddings plus a hosted text-generation endpoint
class HuggingFaceLLM:
    """
    Bundles a SentenceTransformer embedding model with a HuggingFaceEndpoint LLM.

    The Hugging Face API key is read from the 'hfapi_key' option of the
    'Settings' section of the module-level Config object.
    """

    def __init__(self, Embedding_Model, QA_Model, max_tokens):
        """
        Args:
            Embedding_Model (str): Model name passed to SentenceTransformer.
            QA_Model (str): Repository id passed to HuggingFaceEndpoint.
            max_tokens (int): Value used for the endpoint's max_new_tokens.
        """
        api_token = Config.get('Settings', 'hfapi_key')

        # Export the key under both environment variable names for authentication.
        for variable_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
            os.environ[variable_name] = api_token

        self.Embedding_Model = SentenceTransformer(Embedding_Model)

        # Generation settings for the hosted endpoint.
        endpoint_settings = {
            "repo_id": QA_Model,
            "task": "text-generation",
            "max_new_tokens": max_tokens,
            "temperature": 0.1,
            "top_k": 30,
            "repetition_penalty": 1.03,
        }
        self.LLM1 = HuggingFaceEndpoint(**endpoint_settings)

    def get_hf_embedding(self, text):
        """Returns the embedding that the SentenceTransformer model produces for text."""
        embedding = self.Embedding_Model.encode(text)
        return embedding

    def get_hf_response(self, prompt):
        """Sends prompt to the endpoint LLM and returns whatever invoke() returns."""
        answer = self.LLM1.invoke(prompt)
        return answer

# OpenAI LLM configuration
class OpenAILLM:
  """
  Thin wrapper around an OpenAI client for embeddings and chat completions.

  The API key is read from the 'openai_api_key' option of the 'Settings'
  section of the module-level Config object.
  """

  def __init__(self, Embedding_Model, QA_Model, max_tokens):
    """
    Args:
        Embedding_Model (str): Model name used for embedding requests.
        QA_Model (str): Model name used for chat completion requests.
        max_tokens (int): max_tokens value sent with every chat completion.
    """
    openai_api_key = Config.get('Settings', 'openai_api_key')
    openai.api_key = openai_api_key
    self.Open_AI_LLM_inst = OpenAI(api_key=openai_api_key)  # Creates an OpenAI LLM instance.
    self.Embedding_Model = Embedding_Model
    self.QA_Model = QA_Model
    self.max_tokens = max_tokens

  # Function to get embedding using OpenAI LLM
  def get_openai_embedding(self, text):
      """
      Gets an embedding for a text using the OpenAI embeddings API.

      Args:
          text (str): The text to get embedding for.

      Returns:
          list: The embedding vector from the OpenAI response.
      """

      return self.Open_AI_LLM_inst.embeddings.create(input=[text], model=self.Embedding_Model).data[0].embedding

  # Function to call OpenAI LLM for question answering
  def get_openai_response(self, prompt):
      """
      Sends a prompt to the OpenAI chat completions API and returns the generated text.

      Args:
          prompt (str): The prompt string to send to the LLM.

      Returns:
          str: The generated response from the LLM (whitespace-stripped, empty if the
          API returned no text), or a string starting with "Error: " if an exception occurs.
      """

      try:
          # Send the caller's prompt as the user message. Previously the prompt was
          # the system message and every request carried the hard-coded placeholder
          # user message "test question?", which the model could try to answer.
          messages = [
              {"role": "user", "content": prompt},
          ]

          # Plain-text completion: the stable chat.completions.create endpoint is
          # sufficient because no structured response_format is requested.
          response = self.Open_AI_LLM_inst.chat.completions.create(
              model=self.QA_Model, messages=messages, max_tokens=self.max_tokens
          )

          # message.content can be None (e.g. for a refusal); treat that as empty text.
          content = response.choices[0].message.content
          return (content or "").strip()

      except Exception as e:
          # Callers expect a string in every case, so report the failure as text.
          return f"Error: {e}"


In [None]:
# ---------------------------------------------------------------------------
# Global settings, all read from the [Settings] section of config.ini
# ---------------------------------------------------------------------------

# Chunking and retrieval parameters
Chunk_Size = Config.getint('Settings', 'Chunk_Size')
Chunk_Overlap = Config.getint('Settings', 'Chunk_Overlap')
Top_k = Config.getint('Settings', 'Top_k')

# Comma-separated dataset domain keys, and the dataset split used for documents
Domain_Keys = [domain_name.strip() for domain_name in Config.get('Settings', 'Domain_Keys').split(',')]
Domain_Split = Config.get('Settings', 'Domain_Split')

# How many times support info is requested again when JSON decoding fails
No_of_retries_support_info = 3

# Collected evaluation results, used for benchmarking and analysis
evaluation_list = []

# Output artifacts, all stored next to config.ini
metrics_file_name = f"{RAG_Config_folder_path}metrics.csv"
rmse_auc_file_name = f"{RAG_Config_folder_path}rmse_auc.txt"
dump_file_name = f"{RAG_Config_folder_path}dump.txt"
debug_txt_file_name = f"{RAG_Config_folder_path}debug.txt"
debug_csv_file_name = f"{RAG_Config_folder_path}debug.csv"

# Debug dump file; opening in "w" mode truncates any previous dump.
dump_file_id = open(dump_file_name, "w")

# Backend selectors: 0 = Hugging Face, 1 = OpenAI
Embedding_LLM = Config.getint('Settings', 'Embedding_LLM')
QA_LLM = Config.getint('Settings', 'QA_LLM')

# Hugging Face models (suffix _0)
Embedding_Model_0 = Config.get('Settings', 'Embedding_Model_0')
QA_Model_0 = Config.get('Settings', 'QA_Model_0')
max_tokens_0 = Config.getint('Settings', 'max_tokens_0')

# OpenAI models (suffix _1)
Embedding_Model_1 = Config.get('Settings', 'Embedding_Model_1')
QA_Model_1 = Config.get('Settings', 'QA_Model_1')
max_tokens_1 = Config.getint('Settings', 'max_tokens_1')

# Both backends are created up front, whichever one the selectors pick.
obj_HuggingFaceLLM = HuggingFaceLLM(Embedding_Model_0, QA_Model_0, max_tokens_0)
obj_OpenAILLM = OpenAILLM(Embedding_Model_1, QA_Model_1, max_tokens_1)

# Embedding cache file name:
# <domain keys>_<split>_<chunk size>_<overlap>_<backend id>_<model name>.npy
Embedding_Model = Embedding_Model_0 if Embedding_LLM == 0 else Embedding_Model_1
Embeddings_file_path = RAG_Config_folder_path
for Domain_Key in Domain_Keys:
    Embeddings_file_path = Embeddings_file_path + str(Domain_Key) + "_"
Embeddings_file_path += "_".join([
    str(Domain_Split),
    str(Chunk_Size),
    str(Chunk_Overlap),
    str(Embedding_LLM),
    str(Embedding_Model.split('/')[-1]),
]) + ".npy"


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.19k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  self.LLM1 = HuggingFaceEndpoint(
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
def load_ragbench_dataset():
    """
    Loads one RagBench configuration per entry in Domain_Keys.

    Returns:
        dict: Maps each domain key to the object returned by
        load_dataset("rungalileo/ragbench", domain_key).
    """
    return {
        domain_key: load_dataset("rungalileo/ragbench", domain_key)
        for domain_key in Domain_Keys
    }


def get_documents(ragbench, split=None):
    """
    Extracts documents from the RagBench dataset for a specific split (train, dev, test)

    Args:
        ragbench (dict): The RagBench dataset dictionary.
        split (str, optional): The split of the dataset to use. When omitted
            (None), the configured Domain_Split is used, which matches the
            previous behaviour of this function.

    Returns:
        list: A list of all documents in the specified split, in Domain_Keys order.
    """
    # Fall back to the configured split so existing one-argument calls are unchanged.
    if split is None:
        split = Domain_Split

    documents = []
    for domain_key in Domain_Keys:
        for row in ragbench[domain_key][split]:
            # Each row carries a list of documents; flatten them into one list.
            documents.extend(row['documents'])
    return documents


def get_questions(ragbench, split="test"):
    """
    Collects the 'question' field of every row in the given split.

    Args:
        ragbench (dict): The RagBench dataset dictionary, keyed by domain.
        split (str, optional): The split of the dataset to use (default: "test").

    Returns:
        list: All questions of the split, domain by domain in Domain_Keys order.
    """
    return [
        row['question']
        for domain_key in Domain_Keys
        for row in ragbench[domain_key][split]
    ]

def get_evaluation_list(ragbench, split="test"):
    """
    Builds one EvaluationMetrics record per row of the given split.

    Args:
        ragbench (dict): The RagBench dataset dictionary, keyed by domain.
        split (str, optional): The split of the dataset to use (default: "test").

    Returns:
        list: EvaluationMetrics objects holding each row's question, response
        and its relevance, utilization, adherence and completeness scores.
    """
    # Row fields, in the positional order expected by EvaluationMetrics.
    field_order = (
        'question',
        'response',
        'relevance_score',
        'utilization_score',
        'adherence_score',
        'completeness_score',
    )

    records = []
    for domain_key in Domain_Keys:
        for row in ragbench[domain_key][split]:
            records.append(EvaluationMetrics(*[row[field] for field in field_order]))
    return records


def update_evaluation_list(metrics, evaluation_list, i):
    """
    Writes predicted scores into one entry of the evaluation list.

    Args:
        metrics (dict or None): Predicted scores under the keys "Context Relevance",
            "Context Utilization", "Adherence" and "Completeness". When None, the
            entry's update_scores() is called without arguments.
        evaluation_list (list): The list of EvaluationMetrics objects.
        i (int): The index of the entry in the evaluation_list to update.
    """
    entry = evaluation_list[i]

    # No predictions available: let update_scores() apply its own defaults.
    if metrics is None:
        entry.update_scores()
        return

    entry.update_scores(
        metrics["Context Relevance"],
        metrics["Context Utilization"],
        metrics["Adherence"],
        metrics["Completeness"],
    )

def get_chunks(documents):
    """
    Splits every document into chunks for retrieval.

    Whitespace runs in a document are collapsed to single spaces before the
    document is split with a RecursiveCharacterTextSplitter configured from
    Chunk_Size and Chunk_Overlap and the sentence-ending separators below.

    Args:
        documents (list): A list of documents (strings).

    Returns:
        list: All chunks of all documents, in document order.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=[". ", "! ", "? "],
        chunk_size=Chunk_Size,
        chunk_overlap=Chunk_Overlap,
    )
    return [
        chunk
        for document in documents
        for chunk in splitter.split_text(' '.join(document.split()))
    ]

def get_document_embeddings(document_chunks):
    """
    Computes an embedding for each document chunk and saves them to Embeddings_file_path.

    Nothing is computed when the embeddings file already exists.

    Args:
        document_chunks (list): A list of document chunks (strings).

    Returns:
        None.

    Raises:
        SystemExit: If Embedding_LLM is neither 0 (Hugging Face) nor 1 (OpenAI).
    """

    # Return if embeddings file exists
    if os.path.exists(Embeddings_file_path):
        print("Embeddings file already exists.")
        return

    # Choose the embedding backend once, before any work is done. Previously the
    # check ran inside the per-chunk loop, so an invalid setting combined with an
    # empty chunk list went unnoticed and an empty embeddings file was saved.
    if Embedding_LLM == 0:
        embed_chunk = obj_HuggingFaceLLM.get_hf_embedding
    elif Embedding_LLM == 1:
        embed_chunk = obj_OpenAILLM.get_openai_embedding
    else:
        sys.exit("Invalid Embedding_LLM value in config file")

    # Generate embeddings (one backend call per chunk) and persist them
    embeddings = [embed_chunk(chunk) for chunk in tqdm(document_chunks)]
    save_embeddings(np.array(embeddings), Embeddings_file_path)

# Top-k retrieval of document chunks for a query
def query_retrieval(all_document_chunks, index, query, top_k, dump_file_id=None):
    """
    Returns the chunks whose embeddings the index reports as nearest to the query.

    Args:
        all_document_chunks (list): All document chunks, positionally aligned with the index.
        index (faiss.Index): The Faiss index to search.
        query (str): The query string.
        top_k (int): The number of neighbours to request from the index.
        dump_file_id (file object, optional): When given, the selected indices and
            their distances are written to it for debugging.

    Returns:
        list: The retrieved document chunks, in the order returned by the index.
    """
    # Embed the query with the configured backend.
    if Embedding_LLM == 0:
        raw_query_vector = obj_HuggingFaceLLM.get_hf_embedding(query)
    elif Embedding_LLM == 1:
        raw_query_vector = np.array(obj_OpenAILLM.get_openai_embedding(query))
    else:
        sys.exit("Invalid Embedding_LLM value in config file")

    # Faiss expects a float32 matrix with one row per query.
    query_row = raw_query_vector.reshape(1, -1).astype('float32')

    # L2-normalise the query row before searching.
    unit_query_row = normalize(query_row, norm='l2', axis=1)

    distances, neighbour_ids = index.search(unit_query_row, top_k)

    # An id of -1 marks a result slot the search left unassigned; drop those.
    assigned = neighbour_ids != -1
    valid_ids = neighbour_ids[assigned]
    valid_distances = distances[assigned]

    if dump_file_id is not None:
        dump_file_id.write("VI = " + str(valid_ids) + "\n")
        dump_file_id.write("VD = " + str(valid_distances) + "\n")

    return [all_document_chunks[chunk_id] for chunk_id in valid_ids]


# Function to save document embeddings to a file
def save_embeddings(embeddings, file_path):
    """
    Writes the embeddings array to file_path with numpy.save.

    Args:
        embeddings (np.array): The document embeddings to save.
        file_path (str): Destination path of the NumPy file.
    """
    np.save(file=file_path, arr=embeddings)


# Function to load document embeddings from a file
def load_embeddings(file_path):
    """
    Reads document embeddings back from a NumPy file with numpy.load.

    Args:
        file_path (str): The path to the file containing the embeddings.

    Returns:
        np.array: The loaded document embeddings.
    """
    stored_embeddings = np.load(file_path)
    return stored_embeddings

def create_normalized_embedding_index(Chunks):
    """
    Builds a Faiss inner-product index over L2-normalized document embeddings.

    The embeddings are first written to Embeddings_file_path by
    get_document_embeddings (which does nothing if the file already exists)
    and then loaded back from that file.

    Args:
        Chunks (list): A list of document chunks.

    Returns:
        faiss.Index: An IndexFlatIP holding the normalized embeddings.
    """
    # Make sure the embeddings file exists, then read it back.
    get_document_embeddings(Chunks)
    stored_vectors = load_embeddings(Embeddings_file_path)

    # Faiss works on float32. With unit-length rows, the inner product used by
    # IndexFlatIP equals cosine similarity.
    unit_vectors = normalize(stored_vectors.astype('float32'), norm='l2', axis=1)

    dimension = unit_vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(unit_vectors)
    return index

# Lazily created reranker pipeline, shared by every rerank_documents() call so
# the model is loaded once instead of on every query.
_RERANKER_PIPELINE = None


def rerank_documents(query, documents):
    """
    Reranks documents by their relevance to the query using a cross-encoder.

    Args:
        query (str): The query the documents are scored against.
        documents (list): Candidate document strings.

    Returns:
        tuple: (ranked_documents, sorted_scores), two lists ordered by descending
        score. Both lists are empty when documents is empty.
    """
    global _RERANKER_PIPELINE

    # Nothing to rank; without this guard the zip(*...) unpacking below raises
    # ValueError on an empty input.
    if not documents:
        return [], []

    if _RERANKER_PIPELINE is None:
        # Device index for the pipeline: 0 = first GPU, -1 = CPU. The device was
        # previously computed but never handed to the pipeline.
        device = 0 if torch.cuda.is_available() else -1
        _RERANKER_PIPELINE = pipeline(
            "text-classification",
            model="cross-encoder/ms-marco-MiniLM-L-12-v2",
            device=device,
        )
    reranker = _RERANKER_PIPELINE

    # Score each document against the query.
    # NOTE(review): the pair is passed as one string joined with " [SEP] ";
    # passing text/text_pair separately may be the intended cross-encoder input
    # format. Confirm before changing, since it would alter the scores.
    scores = [reranker(f"{query} [SEP] {doc}")[0]["score"] for doc in documents]

    # Sort (score, document) pairs by descending score, then unzip them.
    sorted_pairs = sorted(zip(scores, documents), reverse=True)
    sorted_scores, ranked_documents = zip(*sorted_pairs)

    return list(ranked_documents), list(sorted_scores)

# Function to generate a prompt template for question answering tasks
def get_QA_PROMPT():
    """
    Builds the question-answering prompt template.

    Returns:
        PromptTemplate: A template with the input variables "context" and "question".
    """
    prompt_lines = (
        "Use the following pieces of context to answer the question at the end.",
        "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
        'Always say "thanks for asking!" at the end of the answer.',
        "{context}",
        "Question: {question}",
    )
    # Every line, including the last one, ends with a newline.
    template_text = "".join(line + "\n" for line in prompt_lines)

    return PromptTemplate(input_variables=["context", "question"], template=template_text)


# Function to get answer from context
def get_answer_from_context(question, relevant_documents, dump_file_id=None, rerank=True):
    """
    Gets an answer to a question from the provided context using the configured QA LLM.

    Args:
        question (str): The question to answer.
        relevant_documents (list): A list of relevant document strings.
        dump_file_id (file object, optional): A file object to write debug
            information (the prompt) to. Nothing is written when None.
        rerank (bool, optional): When True (default, the previous fixed behaviour),
            the documents are reordered with rerank_documents before being joined
            into the context.

    Returns:
        str: The generated answer, or an error message if the LLM call raises.

    Raises:
        SystemExit: If QA_LLM is neither 0 (Hugging Face) nor 1 (OpenAI).
    """

    # Get the question answering prompt template
    qa_prompt = get_QA_PROMPT()

    # Combine relevant documents into a single string with newlines as separators.
    # Reranking is skipped for an empty document list: there is nothing to order.
    if rerank and relevant_documents:
        ranked_docs, _relevance_scores = rerank_documents(question, relevant_documents)
        context = "\n".join(ranked_docs)  # If we need to repack modify this code
    else:
        context = "\n".join(relevant_documents)

    # Format the prompt using the context and question
    prompt = qa_prompt.format(context=context, question=question)

    # Write the prompt to the debug file if provided
    if dump_file_id is not None:
        dump_file_id.write("prompt = " + prompt + "\n")

    # Generate the response with the configured backend and return it
    try:
        if QA_LLM == 0:
            response = obj_HuggingFaceLLM.get_hf_response(prompt)
        elif QA_LLM == 1:
            response = obj_OpenAILLM.get_openai_response(prompt)
        else:
            # SystemExit is not an Exception subclass, so it is not swallowed below.
            sys.exit("Invalid QA_LLM value in config file")

        return response
    except Exception as e:
        # Callers expect a string, so report the failure as text
        return f"An error occurred while processing: {e}"

def _response_sentence_key(idx):
    """Returns the alphabetic key for a 0-based sentence index: a..z, then aa, ab, ..."""
    key = ""
    remaining = idx + 1
    while remaining > 0:
        remaining, letter = divmod(remaining - 1, 26)
        key = chr(97 + letter) + key  # chr(97) is 'a'
    return key


# Function to build the support-analysis prompt (it does not call any LLM itself)
def generate_support_info_prompt(relevant_doc_chunks, question, answer, dump_file_id=None):
    """
    Generates a prompt for evaluating the support of an answer given a question and relevant document chunks.

    Args:
        relevant_doc_chunks (list): A list of relevant document chunks (strings).
        question (str): The question that was asked.
        answer (str): The answer that was given.
        dump_file_id (file object, optional): When given, the generated prompt is
            written to it for debugging. Nothing is written when None.

    Returns:
        str: The formatted prompt string.
    """

    # Define the prompt template for support analysis. This template instructs
    # a language model to assess the support of an answer based on provided documents.
    template_for_support_analysis = """I asked someone to answer a question based on one or more documents.
Your task is to review their response and assess whether or not each sentence
in that response is supported by text in the documents. And if so, which
sentences in the documents provide that support. You will also tell me which
of the documents contain useful information for answering the question, and
which of the documents the answer was sourced from.
Here are the documents, each of which is split into sentences. Alongside each
sentence is associated key, such as ’0a.’ or ’0b.’ that you can use to refer
to it:
‘‘‘
{documents}
‘‘‘
The question was:
‘‘‘
{question}
‘‘‘
Here is their response, split into sentences. Alongside each sentence is
associated key, such as ’a.’ or ’b.’ that you can use to refer to it. Note
that these keys are unique to the response, and are not related to the keys
in the documents:
‘‘‘
{answer}
‘‘‘
You must respond with a JSON object matching this schema:
‘‘‘
{{
"relevance_explanation": string,
"all_relevant_sentence_keys": [string],
"overall_supported_explanation": string,
"overall_supported": boolean,
"sentence_support_information": [
{{
"response_sentence_key": string,
"explanation": string,
"supporting_sentence_keys": [string],
"fully_supported": boolean
}},
],
"all_utilized_sentence_keys": [string]
}}
‘‘‘
The relevance_explanation field is a string explaining which documents
contain useful information for answering the question. Provide a step-by-step
breakdown of information provided in the documents and how it is useful for
answering the question.
The all_relevant_sentence_keys field is a list of all document sentences keys
(e.g. ’0a’) that are revant to the question. Include every sentence that is
useful and relevant to the question, even if it was not used in the response,
or if only parts of the sentence are useful. Ignore the provided response when
making this judgement and base your judgement solely on the provided documents
and question. Omit sentences that, if removed from the document, would not
impact someone’s ability to answer the question.
The overall_supported_explanation field is a string explaining why the response
*as a whole* is or is not supported by the documents. In this field, provide a
step-by-step breakdown of the claims made in the response and the support (or
lack thereof) for those claims in the documents. Begin by assessing each claim
separately, one by one; don’t make any remarks about the response as a whole
until you have assessed all the claims in isolation.
The overall_supported field is a boolean indicating whether the response as a
whole is supported by the documents. This value should reflect the conclusion
you drew at the end of your step-by-step breakdown in overall_supported_explanation.
In the sentence_support_information field, provide information about the support
*for each sentence* in the response.
The sentence_support_information field is a list of objects, one for each sentence
in the response. Each object MUST have the following fields:
- response_sentence_key: a string identifying the sentence in the response.
This key is the same as the one used in the response above.
- explanation: a string explaining why the sentence is or is not supported by the
documents.
- supporting_sentence_keys: keys (e.g. ’0a’) of sentences from the documents that
support the response sentence. If the sentence is not supported, this list MUST
be empty. If the sentence is supported, this list MUST contain one or more keys.
In special cases where the sentence is supported, but not by any specific sentence,
you can use the string "supported_without_sentence" to indicate that the sentence
is generally supported by the documents. Consider cases where the sentence is
expressing inability to answer the question due to lack of relevant information in
the provided contex as "supported_without_sentence". In cases where the sentence
is making a general statement (e.g. outlining the steps to produce an answer, or
summarizing previously stated sentences, or a transition sentence), use the
sting "general".In cases where the sentence is correctly stating a well-known fact,
like a mathematical formula, use the string "well_known_fact". In cases where the
sentence is performing numerical reasoning (e.g. addition, multiplication), use
the string "numerical_reasoning".
- fully_supported: a boolean indicating whether the sentence is fully supported by
the documents.
- This value should reflect the conclusion you drew at the end of your step-by-step
breakdown in explanation.
- If supporting_sentence_keys is an empty list, then fully_supported must be false.
- Otherwise, use fully_supported to clarify whether everything in the response
sentence is fully supported by the document text indicated in supporting_sentence_keys
(fully_supported = true), or whether the sentence is only partially or incompletely
supported by that document text (fully_supported = false).
The all_utilized_sentence_keys field is a list of all sentences keys (e.g. ’0a’) that
were used to construct the answer. Include every sentence that either directly supported
the answer, or was implicitly used to construct the answer, even if it was not used
in its entirety. Omit sentences that were not used, and could have been removed from
the documents without affecting the answer.
Your response MUST strictly adhere to the provided JSON schema. Do NOT include any text outside the JSON object. Use escapes for quotes, e.g. ‘\\"‘, and
newlines, e.g. ‘\\n‘. Do not write anything before or after the JSON string. Do not
wrap the JSON string in backticks like ‘‘‘ or ‘‘‘json.
As a reminder: your task is to review the response and assess which documents contain
useful information pertaining to the question, and how each sentence in the response
is supported by the text in the documents.Your response MUST strictly adhere to the provided JSON schema. Do NOT include any text outside the JSON object.
"""

    # Format document chunks with numeric keys, one chunk per line (e.g., "0: <chunk text>").
    # NOTE(review): the template announces sentence-level document keys such as '0a',
    # but chunks are keyed as a whole here. Confirm whether per-sentence keys are intended.
    doc_sentences = "\n".join([f"{idx}: {doc}" for idx, doc in enumerate(relevant_doc_chunks)])

    # Split the answer into sentences and format them with keys (e.g., "a: This is answer sentence one.")
    answer_sentences = answer.split(". ")  # Splitting by ". " might not be perfect, but it's a reasonable heuristic
    # Keys run a..z and continue with aa, ab, ... so answers with more than 26
    # sentences still get alphabetic keys (chr(97 + idx) produced '{', '|', ... there).
    answer_sentences = [
        f"{_response_sentence_key(idx)}: {sentence.strip()}"
        for idx, sentence in enumerate(answer_sentences)
    ]

    # Format the complete prompt using the template and the formatted documents, question, and answer
    prompt = template_for_support_analysis.format(
        documents=doc_sentences,
        question=question,
        answer=" ".join(answer_sentences)
    )

    # Log the generated prompt when a dump file is available
    if dump_file_id is not None:
        dump_file_id.write("prompt = " + prompt + "\n")
    return prompt


def _strip_json_fences(raw_text):
    """Return ``raw_text`` without a surrounding code fence or leading ``json`` tag.

    Only the very start and very end of the text are touched, so the word
    "json" (or fence characters) appearing inside the payload is preserved.
    Both real backtick fences (```) and the typographic ‘‘‘ variant are removed.
    """
    text = raw_text.strip()
    # Leading fence, optionally followed by a "json" language tag.
    text = re.sub(r"^(?:```|‘‘‘)?\s*(?:json\b)?\s*", "", text, count=1)
    # Trailing fence.
    text = re.sub(r"\s*(?:```|‘‘‘)$", "", text, count=1)
    return text.strip()


def compute_metrics(prompt, relevant_documents, answer, dump_file_id):
    """Ask the LLM for support information and derive the four evaluation metrics.

    The request is retried up to ``No_of_retries_support_info`` times; an attempt
    fails when the reply is not valid JSON or any other exception is raised while
    processing it. Every attempt and every failure is written to ``dump_file_id``.

    Args:
        prompt: Prompt sent to ``obj_OpenAILLM.get_openai_response``.
        relevant_documents: Iterable of document strings; each one is split on
            ". " to estimate the total number of document sentences.
        answer: Unused here; kept so existing callers keep working.
        dump_file_id: Writable text handle used for logging.

    Returns:
        A dict with the keys "Context Relevance", "Context Utilization",
        "Completeness" and "Adherence", or ``None`` when every attempt failed.

        * Context Relevance  = len(all_relevant_sentence_keys) / total document sentences
        * Context Utilization = len(all_utilized_sentence_keys) / len(all_relevant_sentence_keys)
        * Completeness = |relevant ∩ utilized| / |relevant| (computed on sets)
        * Adherence = 1 if ``overall_supported`` is truthy, else 0

        Each ratio falls back to 0 when its denominator is 0.
    """
    for attempt in range(No_of_retries_support_info):
        dump_file_id.write("Support Info 2, attempt Number = " + str(attempt + 1) + "\n")
        if attempt:
            print("Support Info 2, re-attempt Number = " + str(attempt) + "\n")
        try:
            support_info = obj_OpenAILLM.get_openai_response(prompt)

            # Drop any code fence / "json" tag wrapped around the reply before parsing.
            support_info = _strip_json_fences(support_info)
            dump_file_id.write("support_info = " + support_info + "\n")

            # Parse JSON string into a Python dictionary
            support_info = json.loads(support_info)

            relevant_keys = support_info["all_relevant_sentence_keys"]
            utilized_keys = support_info["all_utilized_sentence_keys"]

            # Context Relevance: share of document sentences flagged as relevant.
            relevant_sentence_count = len(relevant_keys)
            total_document_sentences = sum(len(doc.split(". ")) for doc in relevant_documents)
            context_relevance = (
                relevant_sentence_count / total_document_sentences
                if total_document_sentences > 0 else 0
            )

            # Context Utilization: utilized sentences relative to relevant ones.
            utilized_sentence_count = len(utilized_keys)
            context_utilization = (
                utilized_sentence_count / relevant_sentence_count
                if relevant_sentence_count > 0 else 0
            )

            # Completeness: fraction of the relevant sentences that were also utilized.
            relevant_set = set(relevant_keys)
            overlap = relevant_set.intersection(utilized_keys)
            completeness = len(overlap) / len(relevant_set) if len(relevant_set) > 0 else 0

            # Adherence: binary, taken from the overall verdict of the judge.
            adherence = 1 if support_info["overall_supported"] else 0

            return {
                "Context Relevance": context_relevance,
                "Context Utilization": context_utilization,
                "Completeness": completeness,
                "Adherence": adherence
            }

        except json.JSONDecodeError as e:
            # support_info still holds the cleaned raw string at this point.
            print(f"Error decoding JSON: {e}")
            print(f"Raw support_info: {support_info}")
            dump_file_id.write(f"Error decoding JSON: {e}\n")
            dump_file_id.write(f"Raw support_info: {support_info}\n")

        except Exception as e:  # Best effort: log the failure and move on to the next attempt
            print(f"An error occurred while processing: {e}")
            dump_file_id.write(f"An error occurred while processing: {e}\n")

    return None

# Function to create a log file for evaluation results
def Create_Log_file(log_file_name, evaluation_list, N=10000000):
    """Write predicted (P) and observed (O) scores of evaluation items to a CSV file.

    Args:
        log_file_name: Path of the CSV file to create (overwritten if present).
        evaluation_list: Iterable of items exposing ``question`` (str) and the
            ``relevance``/``utilization``/``completeness``/``adherence``
            ``_score_p`` and ``_score_o`` attributes.
        N: Maximum number of leading items of ``evaluation_list`` to consider.
            Skipped items still count towards this limit.

    The question is always written as a quoted CSV field; embedded double
    quotes are doubled so that a question containing ``"`` stays a single field.
    """
    score_fields = (
        "relevance_score_p", "relevance_score_o",
        "utilization_score_p", "utilization_score_o",
        "completeness_score_p", "completeness_score_o",
        "adherence_score_p", "adherence_score_o",
    )

    # The context manager closes the file even if a write fails part-way through.
    with open(log_file_name, "w") as log_file_id:
        log_file_id.write("question,Context Relevance(P),Context Relevance(O),Context Utilization(P),Context Utilization(O),Completeness(P),Completeness(O),Adherence(P),Adherence(O)\n")  # Header row

        for position, evaluation_item in enumerate(evaluation_list, start=1):
            if position > N:
                break  # Limit the number of entries processed
            # NOTE(review): items whose predicted relevance is >= 1 are skipped;
            # presumably this filters out unset/sentinel scores — confirm.
            if evaluation_item.relevance_score_p < 1:
                escaped_question = evaluation_item.question.replace("\"", "\"\"")
                scores = ",".join(str(getattr(evaluation_item, field)) for field in score_fields)
                log_file_id.write("\"" + escaped_question + "\"," + scores + "\n")


# Function to calculate RMSE and AUC metrics from the log file
def get_rmse_and_auc_values(log_file_name):
    """Summarise a metrics CSV as two RMSE values and one AUC-ROC value.

    The CSV at ``log_file_name`` is read with pandas, rows containing missing
    values are dropped, and the strings 'True'/'False' are mapped to 1/0.

    Returns:
        ``(rmse_relevance, rmse_utilization, auc_adherence)`` where the RMSE
        values compare the ``(P)`` and ``(O)`` columns of Context Relevance and
        Context Utilization, and the AUC-ROC scores ``Adherence(P)`` against
        ``Adherence(O)`` as the ground truth.
    """
    scores = pd.read_csv(log_file_name).dropna().replace({'True': 1, 'False': 0})

    def _rmse(metric_name):
        # Root of the mean squared difference between predicted and observed columns.
        difference = scores[metric_name + '(P)'] - scores[metric_name + '(O)']
        return np.sqrt(np.mean(difference ** 2))

    rmse_relevance = _rmse('Context Relevance')
    rmse_utilization = _rmse('Context Utilization')

    # Observed adherence is the label, predicted adherence is the score.
    auc_adherence = roc_auc_score(scores['Adherence(O)'], scores['Adherence(P)'])

    return rmse_relevance, rmse_utilization, auc_adherence

In [None]:
# Class to represent individual debug items
class DebugItem:
    """Plain record holding one dataset row plus the domain and split it came from.

    Attributes (all stored exactly as passed in):
        domain_key: Identifier for the domain (e.g., 'msmarco', 'hotpotqa').
        split: Dataset split ('train', 'test', 'validation').
        question: Question string.
        documents: List of documents associated with the question.
        response: Model-generated response for the question.
        documents_sentences: Sentences in the documents.
        response_sentences: Sentences in the response.
        generation_model_name: Name of the generation model.
        annotating_model_name: Name of the annotating model.
        dataset_name: Dataset name.
        sentence_support_information: Support information for sentences.
        response_sentence_key: Key to identify the response sentence.
        supporting_sentence_keys: Keys for sentences that support the response.
        unsupported_response_sentence_keys: Keys for unsupported sentences.
        relevance_explanation: Explanation of relevance.
        all_relevant_sentence_keys: Keys for all relevant sentences.
        all_utilized_sentence_keys: Keys for all utilized sentences.
    """

    def __init__(self, domain_key, split, question, documents, response, documents_sentences, response_sentences,
                 generation_model_name, annotating_model_name, dataset_name,
                 sentence_support_information, response_sentence_key,
                 supporting_sentence_keys, unsupported_response_sentence_keys,
                 relevance_explanation, all_relevant_sentence_keys, all_utilized_sentence_keys):
        # Where the row came from.
        self.domain_key, self.split = domain_key, split

        # Question, source documents and generated response (raw and sentence-split).
        self.question, self.documents, self.response = question, documents, response
        self.documents_sentences, self.response_sentences = documents_sentences, response_sentences

        # Provenance of the generation and of the annotations.
        self.generation_model_name, self.annotating_model_name = generation_model_name, annotating_model_name
        self.dataset_name = dataset_name

        # Sentence-level support annotations.
        self.sentence_support_information = sentence_support_information
        self.response_sentence_key = response_sentence_key
        self.supporting_sentence_keys = supporting_sentence_keys
        self.unsupported_response_sentence_keys = unsupported_response_sentence_keys

        # Relevance / utilization annotations.
        self.relevance_explanation = relevance_explanation
        self.all_relevant_sentence_keys = all_relevant_sentence_keys
        self.all_utilized_sentence_keys = all_utilized_sentence_keys

# Class to manage and analyze debug items
class Debugger:
    """Loads every RAGBench domain/split into ``DebugItem`` objects and dumps them.

    Construction opens two output files (paths taken from the notebook-level
    ``debug_txt_file_name`` and ``debug_csv_file_name``), loads each domain of
    the "rungalileo/ragbench" dataset, and creates one ``DebugItem`` per row of
    every split. The files are closed when the object is garbage-collected.
    """

    # Row fields copied from each dataset row, in DebugItem constructor order.
    _ROW_FIELDS = (
        'question', 'documents', 'response', 'documents_sentences', 'response_sentences',
        'generation_model_name', 'annotating_model_name', 'dataset_name',
        'sentence_support_information', 'response_sentence_key',
        'supporting_sentence_keys', 'unsupported_response_sentence_keys',
        'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys',
    )
    # Every DebugItem attribute, in the order it is printed and logged.
    _ITEM_FIELDS = ('domain_key', 'split') + _ROW_FIELDS

    def __init__(self):
        # Initialize the debugger with debug items, file handles, and configuration
        self.DebugItems = []  # List of all debug items
        self.Debug_File_id = open(debug_txt_file_name, "w")  # File to log detailed debug information
        self.Debug_csv_File_id = open(debug_csv_file_name, "w")  # File to log document statistics in CSV format

        # Define the domain keys and dataset splits to process
        self.Domain_Keys = ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']
        self.Split_Keys = ["train", "test", "validation"]

        # Load datasets for each domain and split
        self.rag_bench = {}
        for domain_key in self.Domain_Keys:
            self.rag_bench[domain_key] = load_dataset("rungalileo/ragbench", domain_key)
            for split in self.Split_Keys:
                for row in self.rag_bench[domain_key][split]:
                    # Missing row fields become None.
                    self.DebugItems.append(DebugItem(
                        domain_key,
                        split,
                        *(row.get(field, None) for field in self._ROW_FIELDS)
                    ))

    def __del__(self):
        # Close whichever file handles exist; __init__ may have failed before
        # opening them, in which case the attributes are absent.
        for handle_name in ("Debug_File_id", "Debug_csv_File_id"):
            handle = getattr(self, handle_name, None)
            if handle is not None:
                handle.close()

    def GetDebugItem(self, index):
        """Return the DebugItem stored at ``index``."""
        return self.DebugItems[index]

    def print_debug_item(self, index):
        """Print every attribute of the DebugItem at ``index``, one per line."""
        item = self.DebugItems[index]
        for field in self._ITEM_FIELDS:
            print(field + " = ", getattr(item, field))

    def print_document_stats_1(self):
        """Write one CSV row per document: domain, split, question, index, count, length.

        The question is written as a quoted field (embedded quotes doubled) so
        that every row has the same six columns as the header.
        """
        self.Debug_csv_File_id.write("domain_key, split, question, doc_number, no of documents, document length\n")
        for item in self.DebugItems:
            quoted_question = "\"" + str(item.question).replace("\"", "\"\"") + "\""
            document_count = len(item.documents)
            for doc_no, document in enumerate(item.documents):
                row = [
                    str(item.domain_key),
                    str(item.split),
                    quoted_question,
                    str(doc_no),
                    str(document_count),
                    str(len(document)),  # document length in characters
                ]
                self.Debug_csv_File_id.write(",".join(row) + "\n")

    def print_document_stats_2(self):
        """Write every attribute of every DebugItem, then each document, to the text log."""
        for item in self.DebugItems:
            for field in self._ITEM_FIELDS:
                self.Debug_File_id.write(field + " = " + str(getattr(item, field)) + "\n")

            for document in item.documents:
                self.Debug_File_id.write("documents[doc_no] = " + document + "\n")
            self.Debug_File_id.write("\n==================================================\n")

    def print_first_n_debug_item(self, n):
        """Print the details of the first ``n`` DebugItems."""
        for i in range(n):
            self.print_debug_item(i)

    def print_stop(self, str1):
        """Print ``str1`` and terminate the program with exit status 1."""
        print(str1)
        sys.exit(1)

    def print_rag_bench_features(self):
        """Print the row count and feature schema of every loaded domain/split."""
        for domain_key in self.Domain_Keys:
            for split in self.Split_Keys:
                dataset_split = self.rag_bench[domain_key][split]
                print(domain_key, split)
                print(dataset_split.num_rows)  # Number of rows in the dataset split
                print(dataset_split.features)  # Features of the dataset split


In [None]:
# Instantiate the Debugger class
#myDebugger = Debugger()

# Explanation of what happens during instantiation:
# 1. The `Debugger`'s `__init__` method is called.
# 2. The `DebugItems` list is initialized to store individual debug items.
# 3. Two file handles are created:
#    - `debug.txt`: A text file for detailed debug logging.
#    - `debug.csv`: A CSV file for summarizing document statistics.
# 4. The list of domain keys (`Domain_Keys`) and dataset splits (`Split_Keys`) is defined.
#    - Domain keys represent different datasets (e.g., 'msmarco', 'hotpotqa').
#    - Splits include 'train', 'test', and 'validation'.
# 5. The `rag_bench` dictionary is populated:
#    - For each domain key, the corresponding dataset is loaded using `load_dataset`.
#    - For each split (e.g., 'train'), all rows are iterated.
#    - For each row, a `DebugItem` object is created with row-specific information.
#    - The `DebugItem` is added to the `DebugItems` list for further processing or analysis.
# 6. The `Debugger` instance is now ready for use, with all relevant data loaded and organized.

# Note:
# - Any further actions, such as logging, printing, or analysis, can now be performed using the methods defined in the `Debugger` class.
# - Debug files will remain open until the `Debugger` object is deleted or goes out of scope.


In [None]:
# Call the print_document_stats_1 method to generate document statistics
#myDebugger.print_document_stats_1()

# Explanation of what happens in print_document_stats_1:
# 1. This method iterates over all `DebugItems` in the `Debugger` instance.
# 2. For each `DebugItem`, it extracts the following details:
#    - `domain_key`: The dataset's domain (e.g., 'msmarco', 'hotpotqa').
#    - `split`: The dataset split (e.g., 'train', 'test', or 'validation').
#    - `question`: The question associated with the debug item.
#    - Document-level statistics:
#      a. `doc_number`: The index of the document in the list of documents.
#      b. `no of documents`: The total number of documents associated with the question.
#      c. `document length`: The length (in characters) of each individual document.
# 3. It writes these details to the `debug.csv` file in CSV format for easy review:
#    - The CSV header includes columns: `domain_key`, `split`, `question`, `doc_number`, `no of documents`, `document length`.
#    - For each document in a `DebugItem`, a new row is written to the file.
# 4. This method helps analyze the distribution and size of documents across datasets and splits.

# Example output in the CSV file:
# domain_key, split, question, doc_number, no of documents, document length
# hotpotqa, train, "What is the capital of France?", 0, 2, 500
# hotpotqa, train, "What is the capital of France?", 1, 2, 450

# Note:
# - Ensure the `debug.csv` file is not open in another program while running this method to avoid file access conflicts.
# - This method does not return any values; it only writes to the `debug.csv` file.
# - If you need a visual summary or statistical analysis, the generated CSV can be loaded into tools like Excel, Google Sheets, or Python (e.g., using pandas).


In [None]:
# Build the retrieval corpus used by the evaluation and Gradio cells below.
# The four helpers are defined outside this section; the descriptions follow
# their names and should be confirmed against their definitions.

# Load the RAGBench dataset.
Rag_Bench = load_ragbench_dataset()

# Collect the documents contained in the loaded dataset.
All_Docs = get_documents(Rag_Bench)

# Split the documents into smaller chunks; All_Chunks is later passed to query_retrieval.
All_Chunks = get_chunks(All_Docs)

# Build the embedding index over the chunks (presumably a FAISS index of
# normalized embeddings — confirm); Faiss_Index is later passed to query_retrieval.
Faiss_Index = create_normalized_embedding_index(All_Chunks)


README.md:   0%|          | 0.00/24.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/56.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/510 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/510 [00:00<?, ? examples/s]

Embeddings file already exists.


In [None]:
def process_evaluation_list(evaluation_list, first_n_lines=10):
    """Run retrieval, answer generation and metric computation for evaluation items.

    For each processed item the question is used to retrieve the top-k chunks,
    an answer is generated from them, a support-analysis prompt is built, and
    ``compute_metrics`` is called. The result (possibly ``None``) is handed to
    ``update_evaluation_list``. Intermediate values are written to the
    notebook-level ``dump_file_id`` handle, which this function neither opens
    nor closes.

    Args:
        evaluation_list: Sequence of items exposing a ``question`` string.
        first_n_lines: Process only this many leading items (default 10, the
            previous hard-coded debugging limit). Pass ``None`` to process
            the whole list.

    Returns:
        The number of processed items for which ``compute_metrics`` returned a
        non-``None`` result.
    """
    evaluation_list_len = len(evaluation_list)
    if first_n_lines is None:
        items_to_process = evaluation_list_len
    else:
        items_to_process = min(max(first_n_lines, 0), evaluation_list_len)

    num_of_metrics = 0  # Counter to track how many metrics were successfully computed

    # The limit is applied through the range itself, so progress is only
    # reported for items that are actually processed.
    for i in range(items_to_process):
        print(i+1, " of ", evaluation_list_len)  # Log progress

        # Extract the query/question from the current evaluation item
        query = evaluation_list[i].question
        dump_file_id.write("query = " + query + "\n")

        # Retrieve the top-k most relevant chunks for the query
        top_k_chunks = query_retrieval(All_Chunks, Faiss_Index, query, Top_k, dump_file_id)
        dump_file_id.write("top_k_chunks = ")
        for chunk in top_k_chunks:
            dump_file_id.write(chunk + "\n")

        # Generate an answer based on the top-k retrieved chunks
        Answer_from_Context = get_answer_from_context(query, top_k_chunks, dump_file_id)
        dump_file_id.write("Answer_from_Context = " + Answer_from_Context + "\n")

        # Build the support-analysis prompt from the retrieved chunks and the answer
        prompt = generate_support_info_prompt(top_k_chunks, query, Answer_from_Context, dump_file_id)

        # Compute metrics (relevance, utilization, completeness, adherence)
        metrics = compute_metrics(prompt, top_k_chunks, Answer_from_Context, dump_file_id)
        dump_file_id.write("metrics = " + str(metrics) + "\n")

        # Update evaluation item scores; called even when metrics is None
        update_evaluation_list(metrics, evaluation_list, i)
        if metrics is not None:
            num_of_metrics += 1

        # Add a separator for clarity in the log file
        dump_file_id.write("\n==================================================\n")

    return num_of_metrics



In [None]:
# Build the list of evaluation items from the loaded RAGBench data.
# Later cells read each item's `question` and its `*_score_p` / `*_score_o` attributes;
# how the items are constructed is decided by get_evaluation_list, defined outside this section.
evaluation_list = get_evaluation_list(Rag_Bench)

In [None]:
# Run retrieval, answer generation and metric computation over the evaluation list.
# - `evaluation_list` contains the evaluation items (each exposes a `question`).
# - `process_evaluation_list` retrieves the top-k chunks, generates an answer,
#   computes the metrics (relevance, utilization, adherence, completeness),
#   and writes intermediate results to the shared `dump_file_id` handle.
# - By default only the first 10 items are processed (debugging limit).
# - Returns the number of processed items for which metrics were successfully computed.

No_of_suc_evals = process_evaluation_list(evaluation_list)  # Number of successful evaluations


1  of  510


config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


2  of  510


Device set to use cpu


3  of  510


Device set to use cpu


4  of  510


Device set to use cpu


5  of  510


Device set to use cpu


6  of  510


Device set to use cpu


7  of  510


Device set to use cpu


8  of  510


Device set to use cpu


9  of  510


Device set to use cpu


10  of  510


Device set to use cpu


11  of  510


In [None]:
# Write the per-question metrics CSV that get_rmse_and_auc_values reads back.
# - `metrics_file_name`: path of the CSV file to create.
# - `evaluation_list`: the evaluation items carrying predicted (P) and observed (O) scores.
# - `No_of_suc_evals` is passed as `N`, which caps how many leading items of
#   `evaluation_list` are considered; it does not select only the successful ones.

# Each row holds the question followed by (P)/(O) pairs for:
# - Context relevance.
# - Context utilization.
# - Completeness and adherence.
# Items whose predicted relevance score is >= 1 are not written.

Create_Log_file(metrics_file_name, evaluation_list, No_of_suc_evals)  # Generate the log file with evaluation metrics.


Compute RMSE and AUCROC
 RMSE (Root Mean Squared Error): an estimate of how far the predicted values deviate, on average, from the actual values in the dataset.
 Formula: $\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(P_i - O_i\right)^2}$
Here, $P_i$ denotes the predicted value, $O_i$ the observed value, and $n$ the total number of observations (data points). The squared differences between predicted and observed values are summed and divided by the number of observations, and the square root of the result gives the RMSE. It measures the difference between the values predicted by a model and the values observed in reality.


AUC-ROC (Area Under the Receiver Operating Characteristic Curve) is a metric used to measure how well a model can distinguish between classes.
Definition: the area under the curve obtained by plotting the True Positive Rate (TPR) against the False Positive Rate (FPR) at various classification thresholds.



In [None]:
# Compute the summary metrics from the per-question metrics CSV.
rmse_relevance, rmse_utilization, auc_adherence = get_rmse_and_auc_values(metrics_file_name)

# (label, value) pairs shared by the console output and the results file.
summary_rows = [
    ("RMSE (Relevance):", rmse_relevance),
    ("RMSE (Utilization):", rmse_utilization),
    ("AUC-ROC (Adherence):", auc_adherence),
]

for label, value in summary_rows:
    print(label, value)

# The context manager closes the file even if a write fails.
with open(rmse_auc_file_name, "w") as rmse_auc_file_id:
    for label, value in summary_rows:
        rmse_auc_file_id.write(label + str(value) + "\n")

RMSE (Relevance): 0.45443938732698946
RMSE (Utilization): 0.8095330978348606
AUC-ROC (Adherence): 0.875


In [None]:
# Your core function
def gradio_function(embedding_LLM,qa_model_name, query):
    """Gradio callback: answer ``query`` from retrieved chunks and score the answer.

    Args:
        embedding_LLM: Embedding model name chosen in the UI; must be
            "msmarco-bert-base-dot-v5" or "text-embedding-ada-002".
        qa_model_name: QA model name chosen in the UI; must be
            "QwQ-32B-Preview" or "gpt-3.5-turbo".
        query: Question text entered by the user.

    Returns:
        ``(Answer_from_Context, metrics)`` where ``metrics`` is the value
        returned by ``compute_metrics`` (a dict, or ``None`` on failure).

    Raises:
        ValueError: If either model name is not one of the supported choices
            (including ``None`` when nothing was selected). Raising instead of
            calling ``sys.exit`` lets Gradio report the failure for this request
            rather than raising ``SystemExit`` inside the serving process.
    """
    supported_embedding_models = ("msmarco-bert-base-dot-v5", "text-embedding-ada-002")
    supported_qa_models = ("QwQ-32B-Preview", "gpt-3.5-turbo")

    if embedding_LLM not in supported_embedding_models:
        raise ValueError("Invalid Embedding Model Name")
    if qa_model_name not in supported_qa_models:
        raise ValueError("Invalid QA Model Name")

    # NOTE(review): the selections are validated but not applied. The previous
    # code mapped them to local 0/1 codes (Embedding_LLM / QA_LLM) that nothing
    # read; retrieval and answering below use the notebook-level index and
    # models. Confirm whether switching models at request time is intended.
    top_k_chunks = query_retrieval(All_Chunks, Faiss_Index, query, Top_k, dump_file_id)
    Answer_from_Context = get_answer_from_context(query, top_k_chunks, dump_file_id)
    # Build the support-analysis prompt from the retrieved chunks and the answer
    prompt = generate_support_info_prompt(top_k_chunks, query, Answer_from_Context, dump_file_id)
    metrics = compute_metrics(prompt, top_k_chunks, Answer_from_Context, dump_file_id)
    return Answer_from_Context,metrics

# Define the Gradio interface: two dropdowns and a text box feed gradio_function,
# whose two return values fill the two output text boxes in order.
interface = gr.Interface(
    fn=gradio_function,                       # Callback invoked on submit
    inputs=[
        gr.Dropdown(choices=["msmarco-bert-base-dot-v5", "text-embedding-ada-002"], label="Embedding Model Name"),
        gr.Dropdown(choices=["QwQ-32B-Preview", "gpt-3.5-turbo"], label="QA Model Name"),
        gr.Textbox(label="Enter Query"),  # Free-text question passed as `query`
    ],
    outputs=[
        gr.Textbox(label="Generated Answer"),  # Output 1: Answer_from_Context
        gr.Textbox(label="Metrics")            # Output 2: metrics returned by compute_metrics
    ]
)

# Launch the app. With debug=True the cell keeps running so errors and logs stay
# visible; interrupt the kernel to stop the server.
interface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://61f2521a8080b68196.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://61f2521a8080b68196.gradio.live


