# Evaluating Vectara With RAGAs

In [1]:
#
# On Mac:
# brew install libmagic
#
# Additional dependencies
# pip install ragas pandas python-magic datasets langchain langchainhub langchain-experimental
# pip install -U ragas (version 0.1.5 or higher)
#

In [2]:
from datasets import Dataset
import os
import re
import shutil
import requests
import magic
import numpy as np
import time

from langchain import hub
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    answer_correctness,
    answer_similarity
)
from datasets import Dataset

import pandas as pd
# Lift pandas' display limits (None = no limit) so that long question / answer /
# context strings are shown in full instead of being truncated when frames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from vectaraClient import VectaraClient

In [3]:
# Vectara credentials. Prefer environment variables so that real keys never have
# to be typed into (and accidentally committed with) the notebook. The original
# placeholders remain as fallbacks, so replacing them in place still works when
# the corresponding environment variable is not set.
customer_id = os.environ.get("VECTARA_CUSTOMER_ID", "<YOUR-VECTARA-CUSTOMER-ID>")
corpus_id = os.environ.get("VECTARA_CORPUS_ID", "<YOUR-VECTARA-CORPUS-ID>")
api_key = os.environ.get("VECTARA_API_KEY", "<YOUR-VECTARA-API-KEY>")

## Part 1: Generate Synthetic Evaluation Dataset from a Vectara Corpus

For RAGAs to generate synthetic data, we will use OpenAI embeddings and chat models, so make sure `OPENAI_API_KEY` is set in your environment. First, we use the `VectaraClient` to retrieve the URL of each document in our corpus (`url` in this case is a metadata field), and then use those URLs to download the actual content to a local `temp` folder:

In [None]:
# Client bound to the corpus identified by the credentials configured above.
# `vc` is also read as a global by get_response() later in the notebook.
vc = VectaraClient(api_key, customer_id, corpus_id)

# One URL per corpus document (taken from the `url` metadata field, per the text above);
# these are downloaded to a local folder in the cells that follow.
urls = vc.get_all_doc_urls()
print(f"Found {len(urls)} documents in corpus")

In [None]:
def add_correct_ext(file_path):
    """Make sure a downloaded file carries the extension matching its content.

    The MIME type is detected from the file's content with libmagic and mapped
    to an extension. If a mapping exists and the file does not already end with
    that extension, the file is renamed on disk with the extension appended.

    Args:
        file_path: Path of an existing file.

    Returns:
        The path of the file after the (possible) rename. The input path is
        returned unchanged when the MIME type has no mapping or when the file
        already has the right extension.
    """
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file(file_path)

    # Dictionary mapping some MIME types to file extensions
    extension_mapping = {
        'application/pdf': '.pdf',
        'text/html': '.html',
        'text/plain': '.txt',
        'text/markdown': '.md',
        'application/vnd.ms-powerpoint': '.ppt',
        'application/msword': '.doc',
    }

    extension = extension_mapping.get(mime_type, '')
    if not extension:
        print(f"Unsupported file type or no extension mapping found: mime_type={mime_type}.")
        return file_path

    # Do not turn "guide.pdf" into "guide.pdf.pdf": a file that already ends
    # with the detected extension (compared case-insensitively) is left alone.
    if file_path.lower().endswith(extension):
        return file_path

    new_file_path = f"{file_path}{extension}"
    os.rename(file_path, new_file_path)
    return new_file_path

In [None]:
data_dir = './temp'
# Upper bound (seconds) for a single download so one unresponsive host cannot hang the cell.
DOWNLOAD_TIMEOUT_SEC = 30

# Start from an empty download directory so files left over from a previous run
# are not picked up when the directory is loaded later.
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir, exist_ok=True)

for doc_idx, url in enumerate(urls):
    if not url:
        # Nothing to download for a document without a URL.
        continue

    # Base name = last path segment, without any query string or fragment.
    file_name = re.split(r'[?#]', url, maxsplit=1)[0].split('/')[-1]
    if len(file_name)==0:
        file_name = 'root'
    # Prefix with the document index: different URLs frequently share the same
    # last segment (or end with '/', which all map to 'root'), and would
    # otherwise silently overwrite each other on disk.
    file_path = os.path.join(data_dir, f"{doc_idx:04d}_{file_name}")

    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SEC)
    except requests.RequestException as exc:
        # Connection errors / timeouts: report and keep going with the other documents.
        print(f"Failed to download the file (url={url}). Error: {exc}")
        continue

    # Check if the download was successful
    if response.status_code == 200:
        # Write the content to a file in the specified directory
        with open(file_path, 'wb') as file:
            file.write(response.content)
        file_path = add_correct_ext(file_path)
    else:
        print(f"Failed to download the file (url={url}). Status code: {response.status_code}")

Now that all files are available locally, we load them as LangChain documents and use the RAGAs `TestsetGenerator` functionality to generate 50 synthetic question/answer pairs:

In [None]:
# Number of synthetic question/answer pairs to request from the generator below.
n_questions = 50

# Load every file found under data_dir as a LangChain document.
# NOTE(review): silent_errors=True presumably makes the loader skip files it fails
# to parse instead of raising — confirm against the DirectoryLoader documentation.
loader = DirectoryLoader(data_dir, use_multithreading=True, silent_errors=True)
documents = loader.load()
for document in documents:
    # Copy the loader-provided 'source' value into a 'file_name' metadata entry.
    # NOTE(review): presumably the RAGAs test-set generator looks for 'file_name' — confirm.
    document.metadata['file_name'] = document.metadata['source']

In [None]:
# Models used for synthetic data generation: one chat model is passed as the
# generator, a second one as the critic, plus an embedding model.
gen_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4-turbo-preview")
emb = OpenAIEmbeddings(model="text-embedding-3-large")

generator = TestsetGenerator.from_langchain(generator_llm=gen_llm, 
                                            critic_llm=critic_llm,
                                            embeddings=emb)
# Request n_questions samples from the loaded documents, with the question types
# weighted 50% simple / 20% reasoning / 30% multi_context.
# raise_exceptions=False is passed so the call is asked not to raise on failures;
# rows without a ground truth are filtered out in the next cell.
testset = generator.generate_with_langchain_docs(documents, test_size=n_questions, 
                                                 raise_exceptions=False, with_debugging_logs=False, is_async=False,
                                                 distributions={simple: 0.5, reasoning: 0.2, multi_context: 0.3})

In [None]:
# Turn the generated test set into a DataFrame and keep only usable rows:
# a ground truth must be present, and none of the three retained columns may be null.
testset_df = testset.to_pandas()
testset_df = (
    testset_df
    .loc[testset_df['ground_truth'].notna(), ['question', 'ground_truth', 'contexts']]
    .dropna()
)

In [5]:
# Preview the first two generated rows (question, ground truth and source contexts).
testset_df.head(2)

Unnamed: 0,question,ground_truth,contexts
0,What are some examples of use cases for the Vectara platform?,"The Vectara platform has a unique ability to understand and process information, using hybrid search to find the most relevant products, support cases, and documents that answer user's questions first. It can power chatbots, Q&A systems, conversational applications, and websites based on relevant information. Vectara also provides result recommendations and enables global collaboration through cross-language search.","['\n\nUse Case Exploration\n\nExplore the Vectara Use Cases\n\nThe AI era has changed interactions between people and information\ndramatically. Users expect relevant answers to questions in a natural\nlanguage, and they expect the best results with the right context.\nUsing Vectara gives you relevant results no matter how you ask. Our\nconversational search platform generates summarized responses that speak your\nlanguage. Better results enable better outcomes that reduce support costs and\nimprove the customer experience.\n\nWhy Vectara? Get Answers and Better Outcomes\u200b\n\nThe Vectara platform has a unique ability to understand and process\ninformation. Our platform uses hybrid search to find the most relevant\nproducts, support cases, and documents that answer your user’s questions first.\nPower chatbots, Q&A systems, conversational applications and websites that base\ntheir information on what you and your users care about – information grounded\nin facts. Vectara also provides result recommendations and enables global\ncollaboration through its cross-language search.\n\nVectara GenAI Use Cases\u200b\n\nThis versatile Vectara GenAI platform caters to a wide range of use\ncases to drive better outcomes and unlock new possibilities in search\napplications. Vectara provides an easy entry point to generative AI\ncapabilities while protecting company IP and customer data. The data\nis secure. 
Vectara does not train on user data and respects data\nsovereignty and provides you with peace of mind.\n\nChoose the Data for Ingestion\u200b\n\nYou might be wondering what kind of data to select for ingestion. Our Vectara Quick Start Tutorial\nprovides an example that gets you set up and searching for answers quickly!\n\nHere are some other ideas to let you see Vectara in action:\n\nEmployee handbook\n\nProduct manuals\n\nLegal contracts\n\nResearch papers\n\nTraining materials\n\nFinancial reports\n\nGovernment regulations.\n\nThese types of documents contain very nuanced information where semantic search\nreally shines! Think about what information takes a long time for a user to\nlocate manually in a large volume. Unless you know exact keywords and section\ntitles, you might struggle to find the exact information you need for understanding\na real-estate contract, complex machine repair, conducting scientific\nresearch, and so on.\n\n💡 Check out example Vectara applications, demos, and tutorials to explore the many capabilities of our platform.\n\nConversational AI\u200b\n\nUse Vectara to leverage the power of intelligent chatbots that provide\nan interactive user experience. Enable your users with self-service as they engage\nin human-like interactions, providing context in queries, and receiving\nintelligent answers because the system understands them. This technology\nsits behind virtual assistants, chatbots, and messaging applications to\nhelp businesses automate customer service and streamline operations.\n\nVectara enables you to empower users with real-time feedback to avoid\nescalations and build a digital chat agent that can deflect support\ncalls. Making conversational AI easier to use increases both customer\nsatisfaction and engagement.\n\nQuestion and Answering\u200b\n\nVectara understands the context of a question and provides accurate, relevant\nresponses. 
The Vectara advantage lets users ask complex questions to get\nprecise answers that save your team valuable time and resources.\n\nEnable your users to ask a question and get the precise answers quickly. Embed\nyour FAQs, customer support interactions, product manuals, inform knowledge\nworkers on data, and enhance your website search. Vectara empowers your\norganization to create a dynamic, responsive, and continuous improving Question\nand Answer system that enhances the user experience and provides context-aware answers.\n\nResearch and Analysis\u200b\n\nVectara sifts through volumes of publications, news articles, financial reports,\nscientific and medical research, corporate documents and more and provides\nsummarized answers to guide decision-making in your domain. Collaborate with\nresearchers to streamline the peer review process by investigating topics and\nquestions in these vast volumes of data to identify key insights.\n\nUsing Vectara is like having a global research assistant that\ncan read and understand large volumes of documents in an instant. Let the\nplatform speed up your research process, find the most relevant information,\nand become a recommendation system for your domain.\n\nVectara can help transform data into insights which help make decision-making\neasier. This platform can provide hidden insights and patterns from your data,\nhelping you make informed decisions. Not only can it answer your questions,\nbut also provides citations grounded in facts from the raw data.\n\nSemantic App Search\u200b\n\nVectara lets you embed powerful hybrid search into your applications without\nbeing an LLM expert. You provide data and queries through']"
1,How can developers customize prompts with metadata using Vectara's Custom Retrieval Augmented Generation (RAG) Prompt Engine?,Vectara empowers developers with a flexible way of customizing prompts with metadata through the Custom Retrieval Augmented Generation (RAG) Prompt Engine. Developers can use available prompt variables and functions to customize prompts based on their needs.,"['\n\nGenerative Prompts\n\nCustom Prompts with Metadata\n\nCustom Prompts with Metadata\n\nVectara handles the system and user prompts automatically, but if you want to\ndo it yourself, Vectara now empowers developers with a flexible way of\ncustomizing prompts with metadata. Our Custom Retrieval Augmented Generation\n(RAG) Prompt Engine provides several available prompt variables and functions\nfor Scale users to customize prompts.\n\nAvailable Prompt Variables\u200b\n\nThe following table shows the available custom prompt variables:\n\n$vectaraOutChars Number of characters See below $vectaraLangCode ISO639 v3 code for the passed language code See below $vectaraQuery The query provided by the user Generate a summary in $vectaraOutChars characters in language \'${vectaraLangCode}\' for the query \\""$esc.java(${vectaraQuery})\\"" solely based on the search results in this chat. Generate a summary in 512 characters in language \'ara\' for the query \\""Give me \\""some\\"" search results.\\"" solely based on the search results in this chat. $vectaraIdxWord A utility array to convert the index to words i.e ""first"", ""second"", ""third"", ""forth"", ""fifth"", ""sixth"", ""seventh"", ""eighth"", ""ninth"", ""tenth"" $vectaraIdxWord[0] first $vectaraLangName Set to the requested language name. The language can either be requested explicitly or detected from the language of the query. You are a helpful assistant. Answer in ${vectaraLangName}. You are a helpful assistant. Answer in Arabic. $vectaraQueryResults An array of query results is found in the response, sorted by relevance score. 
#foreach ($qResult in $vectaraQueryResults) {""role"": ""user"", ""content"": ""Give me the $vectaraIdxWord[$foreach.index] search result.""}, {""role"": ""assistant"", ""content"": ""$esc.java(${qResult.text()})"" },#end {""role"": ""user"", ""content"": ""Give me the second search result.""},{""role"": ""assistant"", ""content"": ""2nd result"" },\n\nAvailable Prompt Functions\u200b\n\nThe following table shows the available custom prompt functions:\n\n$esc.java(...) A utility method to escape special charts, has methods such as ""esc.java"", ""esc.url"", ""esc.xml"", ""esc.html"" See below #foreach ($qResult in $vectaraQueryResults) $qResult.getText() or $qResult.text() Returns text of the query result $qResult.text() Result text $qResult.docMetadata() Returns the metadata of the document this result belongs to $qResult.docMetadata() {""title"": ""documentTitle"", ...} $qResult.docMetadata().present() Returns true/false if there are any values present in the metadata #if ($qResult.docMetadata().present())...#end $qResult.docMetadata().get(""title"") Returns the specified field value from doc metadata, an incorrect key would result in an empty value $qResult.docMetadata().get(""title"") documentTitle $qResult.partMetadata().present() Returns true/false if there are any values present in the metadata #if ($qResult.partMetadata().present())...#end $qResult.partMetadata() Returns the metadata of the part of the document this result belongs to $qResult.partMetadata() {""page"": ""1"", ...} $qResult.partMetadata().get(""page"") Returns the specified field value from part metadata, incorrect key would result in empty value $qResult.docMetadata().get(""page"") ""1""\n\nInclude Metadata in Prompt\u200b\n\nThis snippet shows how to get metadata associated with a single result qResult\nby retrieving metadata docMetadata from the date that information was\nanswered answerDate. 
It then extracts the text content of qResult.\n\n""role""\n\n""assistant""\n\n""content""\n\n""qResult.docMetadata().get(\'answerDate\') $esc.java(${qResult.getText()})""\n\nLet\'s dive into a full custom prompt example that shows more details about a\ncustom prompt with\nmetadata.\n\nExample Custom Prompt for an RFI Answering Bot\u200b\n\nThe following example prompt creates a Request for information (RFI)\nanswering bot that includes metadata. First, we ask the generative LLM to\nanswer an RFI question and tell it how the results will come back from the\nquery.\n']"


## Part 2: Evaluate your Vectara RAG

In [None]:
# Value used for the 'max_summary_result' entry of every query configuration below.
sum_top_k = 5

In [6]:
def get_response(row):
    """Query Vectara with one test question and return the cleaned answer and contexts.

    Reads the notebook globals `vc` (the Vectara client) and `cfg` (the query
    configuration) at call time, so both must be defined before this is called.

    Args:
        row: Mapping-like object with a 'question' entry (a DataFrame row when
            used through `df.apply(get_response, axis=1)`).

    Returns:
        pd.Series indexed by ['answer', 'contexts']: the response text with
        bracketed numeric citation markers such as "[1]" or "[1, 2]" removed,
        and the contexts exactly as returned by `vc.query`.
    """
    query = row['question']
    response, contexts = vc.query(query, cfg)
    # Raw string literal: the pattern is unchanged, but "\[" / "\d" / "\s" inside a
    # regular string are invalid escape sequences and trigger a warning.
    # The trailing replace() closes the gap a removed marker leaves before a period.
    response = re.sub(r"\[\d+(,\s*\d+)*\]", "", str(response)).replace(' .', '.')
    return pd.Series([response, contexts], index=['answer', 'contexts'])

In [None]:
# Here we also add a calculation of HHEM - Vectara's Hallucination Detection Model
# HHEM is a factual consistency score, similar to faithfulness,
# ...but computed directly as a model instead of using LLM-as-a-judge as in RAGAs

from functools import lru_cache

from sentence_transformers import CrossEncoder


@lru_cache(maxsize=1)
def _load_hhem_model():
    """Instantiate the HHEM cross-encoder once and reuse it for all later calls."""
    return CrossEncoder('vectara/hallucination_evaluation_model')


def calc_hhem(contexts, answer):
    """Return the mean HHEM score of `answer` against each of `contexts`.

    Args:
        contexts: Sequence of context strings; each one is paired with the answer.
        answer: The generated answer to score.

    Returns:
        Mean of the model's scores over all (context, answer) pairs, or NaN when
        `contexts` is empty (there is nothing to score against).
    """
    if len(contexts) == 0:
        return np.nan
    # The model is cached: constructing it on every call (i.e. once per evaluated
    # row) repeats the expensive model load for no benefit.
    model = _load_hhem_model()
    scores = model.predict([[context, answer] for context in contexts])
    return np.mean(scores)


In [7]:

def eval_rag(df):
    """Run the Vectara RAG pipeline over a test set and score the answers.

    For every row, `get_response` is called to obtain the generated answer and
    the retrieved contexts (the 'contexts' column of the working copy is replaced
    by the retrieved ones). The result is evaluated with the RAGAs metrics
    faithfulness, answer_relevancy, answer_similarity and answer_correctness, and
    the mean HHEM score over all rows is added under 'hhem_score'.

    Args:
        df: DataFrame with at least a 'question' column (read by `get_response`);
            the notebook passes 'question', 'ground_truth' and 'contexts'.
            The input frame is not modified.

    Returns:
        The object returned by `evaluate`, with an extra 'hhem_score' entry.
    """
    eval_df = df.copy()
    eval_df[['answer', 'contexts']] = eval_df.apply(get_response, axis=1)
    result = evaluate(
        Dataset.from_pandas(eval_df),
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness
        ],
        llm = ChatOpenAI(model_name="gpt-4-turbo-preview", temperature=0),
        raise_exceptions=False
    )

    # Score each row exactly once. np.vectorize (without otypes) would invoke
    # calc_hhem an extra time on the first row just to infer the output type,
    # and raises on an empty frame.
    hhem_scores = [
        calc_hhem(contexts, answer)
        for contexts, answer in zip(eval_df['contexts'], eval_df['answer'])
    ]
    result['hhem_score'] = np.mean(hhem_scores)
    return result

In [8]:
# Query configuration for this run: 'lambda' 0.0, MMR enabled, "small" summary prompt.
# `cfg` is read as a global by get_response(), so it must be assigned before eval_rag() runs.
# NOTE(review): 'lambda' is presumably the hybrid-search interpolation weight — confirm
# against VectaraClient.query.
cfg = {
        'lambda': 0.0,
        'max_summary_result': sum_top_k,
        'mmr': True,
        'prompt_name': 'vectara-experimental-summary-ext-2023-10-23-small'
    }

# NOTE(review): `latency` is assigned here but never read in the visible notebook.
latency = []
res = eval_rag(testset_df)
res

Evaluating:   0%|          | 0/184 [00:00<?, ?it/s]

{'faithfulness': 0.9507, 'answer_relevancy': 0.9453, 'answer_similarity': 0.9456, 'answer_correctness': 0.5154, 'hhem_score': 0.7285}

In [9]:
# Query configuration for this run: 'lambda' 0.0, MMR disabled, "small" summary prompt.
# `cfg` is read as a global by get_response(), so it must be assigned before eval_rag() runs.
cfg = {
        'lambda': 0.0,
        'max_summary_result': sum_top_k,
        'mmr': False,
        'prompt_name': 'vectara-experimental-summary-ext-2023-10-23-small'
    }

# NOTE(review): `latency` is assigned here but never read in the visible notebook.
latency = []
res = eval_rag(testset_df)
res

Evaluating:   0%|          | 0/184 [00:00<?, ?it/s]

{'faithfulness': 0.9561, 'answer_relevancy': 0.9395, 'answer_similarity': 0.9407, 'answer_correctness': 0.5260, 'hhem_score': 0.7426}

In [10]:
# Query configuration for this run: 'lambda' 0.025, MMR disabled, "small" summary prompt.
# `cfg` is read as a global by get_response(), so it must be assigned before eval_rag() runs.
cfg = {
        'lambda': 0.025,
        'max_summary_result': sum_top_k,
        'mmr': False,
        'prompt_name': 'vectara-experimental-summary-ext-2023-10-23-small'
    }

# NOTE(review): `latency` is assigned here but never read in the visible notebook.
latency = []
res = eval_rag(testset_df)
res

Evaluating:   0%|          | 0/184 [00:00<?, ?it/s]

{'faithfulness': 0.9409, 'answer_relevancy': 0.9490, 'answer_similarity': 0.9490, 'answer_correctness': 0.5580, 'hhem_score': 0.7113}

In [11]:
# Query configuration for this run: 'lambda' 0.025, MMR disabled, "large" summary prompt.
# `cfg` is read as a global by get_response(), so it must be assigned before eval_rag() runs.
cfg = {
        'lambda': 0.025,
        'max_summary_result': sum_top_k,
        'mmr': False,
        'prompt_name': 'vectara-experimental-summary-ext-2023-12-11-large'
    }

# NOTE(review): `latency` is assigned here but never read in the visible notebook.
latency = []
res = eval_rag(testset_df)
res

Evaluating:   0%|          | 0/184 [00:00<?, ?it/s]

{'faithfulness': 0.9896, 'answer_relevancy': 0.9391, 'answer_similarity': 0.9530, 'answer_correctness': 0.5953, 'hhem_score': 0.6177}