In [2]:
import os
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme
from langchain_nvidia_ai_endpoints._common import NVEModel

console = Console()
# Green, bold text for the primary notebook printouts
base_style = Style(color="#76B900", bold=True)
# Plain bold text for secondary printouts (question headers, evaluation sets)
norm_style = Style(bold=True)
pprint = partial(console.print, style=base_style)
# `pprint2` is called by the QA-generation, RAG-answer and evaluation cells;
# defining it here keeps those cells from failing with NameError on a fresh kernel.
pprint2 = partial(console.print, style=norm_style)

# Function to retrieve NVIDIA API key
def retrieve_nvidia_api_key():
    """Return the NVIDIA API key, prompting for it when the environment has none.

    The key is read from the ``NVIDIA_API_KEY`` environment variable. If that
    value is missing or does not start with ``nvapi-``, the user is prompted
    repeatedly until a well-formed key is entered.

    Returns:
        str: A key starting with ``nvapi-``. The same value is written back to
        ``os.environ["NVIDIA_API_KEY"]``.

    Raises:
        KeyboardInterrupt, EOFError: Propagated from the prompt so the loop can
        be interrupted instead of spinning forever.
    """
    # Local import so this block is self-contained within the cell.
    import getpass

    def _is_valid(candidate):
        # Same rule for the environment value and for typed input.
        return bool(candidate) and candidate.startswith("nvapi-")

    api_key = os.environ.get("NVIDIA_API_KEY", "").strip()
    while not _is_valid(api_key):
        # getpass hides the typed key, so it is not echoed into the cell output.
        api_key = getpass.getpass("NVIDIA API Key: ").strip()
        if not _is_valid(api_key):
            print("[!] API key assignment failed. Make sure it starts with `nvapi-` as generated from the model pages.")

    os.environ["NVIDIA_API_KEY"] = api_key
    return api_key

# Resolve the key once, then confirm it by showing only its first nine characters
api_key = retrieve_nvidia_api_key()
key_prefix = api_key[:9]
print(f'Retrieved NVIDIA_API_KEY beginning with "{key_prefix}..."')

# Trailing expression: the notebook displays the available-models value as cell output
NVEModel().available_models

NVIDIA API Key: nvapi-<REDACTED>
Retrieved NVIDIA_API_KEY beginning with "nvapi--ws..."


{'playground_nemotron_qa_8b': '0c60f14d-46cb-465e-b994-227e1c3d5047',
 'playground_smaug_72b': '008cff6d-4f4c-4514-b61e-bcfad6ba52a7',
 'ai-gemma-2b': '04174188-f742-4069-9e72-d77c2b77d3cb',
 'ai-embed-qa-4': '09c64e32-2b65-4892-a285-2f585408d118',
 'ai-rerank-qa-mistral-4b': '0bf77f50-5c35-4488-8e7a-f49bb1974af6',
 'playground_llama2_code_70b': '2ae529dc-f728-4a46-9b8d-2697213666d8',
 'ai-arctic-embed-l': '1528a0ad-205a-46ac-a783-94e2372586a9',
 'playground_yi_34b': '347fa3f3-d675-432c-b844-669ef8ee53df',
 'ai-recurrentgemma-2b': '2f495340-a99f-4b4b-89bd-1beb003dd896',
 'playground_nvolveqa_40k': '091a03bb-7364-4087-8090-bd71e9277520',
 'playground_llama2_70b': '0e349b44-440a-44e1-93e9-abe8dcb27158',
 'playground_mistral_7b': '35ec3354-2681-4d0e-a8dd-80325dcf7c63',
 'playground_mamba_chat': '381be320-4721-4664-bd75-58f8783b43c7',
 'ai-phi-3-mini': '4a58c6cb-a9b4-4014-99de-3e704d4ae687',
 'playground_starcoder2_15b': '6acada03-fe2f-4e4d-9e0a-e711b9fd1b59',
 'playground_gemma_2b': '5bde

We will pull in our document index (the one we saved in the RAG agent construction notebook).
Then we will follow the steps:
- Sample the RAG agent document pool to find two document chunks.
- Use those two document chunks to generate a synthetic "baseline" question-answer pair.
- Use the RAG agent to generate its own answer.
- Use a judge LLM to compare the two responses, treating the synthetic answer as the ground truth.

The chain should be a simple but powerful process that tests for the objective: 
Does my RAG chain outperform a narrow chatbot with limited document access?


In [5]:
##   Make sure you have docstore_index.tgz in your working directory
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_community.vectorstores import FAISS

## Embedding model handed to FAISS when the saved index is loaded
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")

## Unpack the archived index into ./docstore_index (IPython shell escape)
!tar xzvf docstore_index.tgz
## NOTE(review): allow_dangerous_deserialization=True opts in to loading the pickled
## index.pkl from the archive; only do this for an archive you created or fully trust.
docstore = FAISS.load_local("docstore_index", embedder,allow_dangerous_deserialization=True)
## Flat list of every stored chunk.
## NOTE(review): `_dict` is a private attribute of the docstore and may change between versions.
docs = list(docstore.docstore._dict.values())

def format_chunk(doc):
    """Render one document chunk as a three-section text block.

    The sections are the paper title and summary (read from ``doc.metadata``,
    each falling back to 'unknown') followed by the chunk's ``page_content``,
    separated by blank lines.
    """
    meta = doc.metadata
    sections = [
        f"Paper: {meta.get('Title', 'unknown')}",
        f"Summary: {meta.get('Summary', 'unknown')}",
        f"Page Body: {doc.page_content}",
    ]
    return "\n\n".join(sections)

## Sanity check: report how many chunks were loaded, then preview the middle one
chunk_count = len(docstore.docstore._dict)
print(f"Constructed aggregate docstore with {chunk_count} chunks")

middle_chunk = docs[len(docs)//2]
pprint(f"\nSample Chunk:\n\n{format_chunk(middle_chunk)}")

Constructed aggregate docstore with 542 chunks


x docstore_index/
x docstore_index/index.faiss
x docstore_index/index.pkl


In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnableBranch
from langchain_core.runnables.passthrough import RunnableAssign
from langchain.document_transformers import LongContextReorder

from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from functools import partial
from operator import itemgetter

import gradio as gr


## Chat model piped into a string output parser; later cells call string methods on its result
llm = ChatNVIDIA(model='mixtral_8x7b') | StrOutputParser()
## NOTE(review): `embedder` is assigned again here with the same model name used when loading the index
embedder = NVIDIAEmbeddings(model='nvolveqa_40k')

def docs2str(docs, title="Document"):
    """Flatten retrieved chunks into one newline-terminated context string.

    Each entry contributes its ``page_content`` (or ``str(entry)`` when it has
    none). When a non-empty title is available -- ``metadata['Title']`` or the
    ``title`` fallback -- the line is prefixed with ``[Quote from <title>] ``.
    """
    lines = []
    for entry in docs:
        source = getattr(entry, 'metadata', {}).get('Title', title)
        prefix = f"[Quote from {source}] " if source else ""
        body = getattr(entry, 'page_content', str(entry))
        lines.append(prefix + body + "\n")
    return "".join(lines)

## Prompt for the answer-generation step. Template variables:
##   {input}   - the user's question (appears in both the system and the user message)
##   {context} - the retrieved document text
chat_prompt = ChatPromptTemplate.from_messages([("system",
    "You are a document chatbot. Help the user as they ask questions about documents."
    " User messaged just asked you a question: {input}\n\n"
    " The following information may be useful for your response: "
    " Document Retrieval:\n{context}\n\n"
    " (Answer only from retrieval. Only cite sources that are used. Make your response conversational)"
), ('user', '{input}')])

def output_puller(inputs):
    """Yield the truthy 'output' value of each streamed dictionary.

    Entries that lack an 'output' key, or whose value is empty, are skipped.
    """
    for chunk in inputs:
        piece = chunk.get('output')
        if piece:
            yield piece

## Wrap the reordering transform so it can be used as a chain step
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

## question -> retriever -> reorder -> single context string
context_getter = itemgetter('input') | docstore.as_retriever() | long_reorder | docs2str

## Lift the raw question into {'input': ...}, then attach the retrieved 'context'
retrieval_chain = {'input' : (lambda x: x)} | RunnableAssign({'context' : context_getter})

## Add the model reply under 'output', then emit only that field
generator_chain = RunnableAssign({"output" : chat_prompt | llm }) | output_puller

rag_chain = retrieval_chain | generator_chain

## Smoke test: stream one answer to confirm the chain is wired up
for chunk in rag_chain.stream("Tell me something interesting!"):
    print(chunk, end="")

Sure, I can share something interesting from the documents you provided!

From the first document, "ReAct: Synergizing Reasoning and Acting in Language Models," there is a description of a system where a language model can both reason about a situation and take actions in that situation. This is demonstrated through an example where the language model is interacting with a simulated kitchen environment. It's interesting to consider the potential implications of such a system, which could be used in a variety of applications where both reasoning and action are important.

From the second document, "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena," there is a discussion of the challenges of evaluating large language models (LLMs) that are capable of complex tasks such as following user instructions and answering open-ended questions. The authors note that existing benchmarks for language models are not sufficient for evaluating these more advanced capabilities, and propose a new b

In [7]:
import random

num_questions = 30
synth_questions = []
synth_answers = []

simple_prompt = ChatPromptTemplate.from_messages([('system', '{system}'), ('user', '{input}')])

## The instruction and the chain are identical for every sample, so build them once
sys_msg = (
    "Use the documents provided by the user to generate an interesting question-answer pair."
    " Try to use both documents if possible, and rely more on the document bodies than the summary."
    " Use the format:\nQuestion: (good question, 1-3 sentences, detailed)\n\nAnswer: (answer derived from the documents)"
)
qa_chain = simple_prompt | llm

for i in range(num_questions):
    ## Two distinct chunks form the source material for one question-answer pair
    doc1, doc2 = random.sample(docs, 2)
    usr_msg = (
        f"Document1: {format_chunk(doc1)}\n\n"
        f"Document2: {format_chunk(doc2)}"
    )

    qa_pair = qa_chain.invoke({'system': sys_msg, 'input': usr_msg})

    ## The requested format is "Question: ...\n\nAnswer: ...". Split on the first blank
    ## line only, so a multi-paragraph answer is kept whole rather than cut to one paragraph.
    question, _, answer = qa_pair.strip().partition('\n\n')
    answer = answer.strip()
    if not answer:
        ## No blank-line separator: there is no usable answer, so skip this sample
        ## instead of crashing the whole loop.
        print(f"[!] Sample {i+1}: response was not in the Question/Answer format; skipping.")
        continue

    synth_questions.append(question)
    synth_answers.append(answer)
    pprint2(f"QA Pair {i+1}", synth_questions[-1], "", sep='\n')
    pprint(synth_answers[-1], "", sep='\n')

In [8]:
## Ask the RAG chain every synthetic question and keep its answers in the same order
rag_answers = []
for pair_number, question in enumerate(synth_questions, start=1):
    rag_answer = rag_chain.invoke(question)
    rag_answers.append(rag_answer)
    pprint2(f"QA Pair {pair_number}", question, "", sep="\n")
    pprint(f"RAG Answer: {rag_answer}", "", sep='\n')

Implement a human preference metric

In [10]:
## Rubric given to the judge model as its system message
eval_instruction = """
Evaluate the following Question-Answer pair for human preference and consistency.
Assume the first answer is a ground truth answer and has to be correct.
Assume the second answer may or may not be true.
[0] The second answer lies, does not answer the question, or is inferior to the first answer.
[1] The second answer is better than the first and does not introduce any inconsistencies.

Output Format:
[Score] Justification
"""

eval_prompt = ChatPromptTemplate.from_messages([
    ('system', eval_instruction), ('user', '{input}')
])
eval_chain = eval_prompt | llm

## One raw judge response per (question, synthetic answer, RAG answer) trio
pref_score = []

for set_number, (q, a_synth, a_rag) in enumerate(
    zip(synth_questions, synth_answers, rag_answers), start=1
):
    pprint2(f"Set {set_number}\n\n{q}\n\n")

    usr_msg = f"Question: {q}\n\nAnswer 1: {a_synth}\n\n Answer 2: {a_rag}"
    verdict = eval_chain.invoke({'input': usr_msg})
    pref_score.append(verdict)
    pprint2(f"Synth Evaluation: {verdict}\n\n")

Note: because we use an LLM as the judge, the final score may vary each time you execute the pipeline. Overall, however, the RAG pipeline should consistently outperform the basic one.

In [14]:
## `pref_score` holds the raw judge responses the first time this cell runs and is
## rebound to a float below. Keep the responses under their own name so that
## re-running the cell does not try to iterate over a float.
if isinstance(pref_score, list):
    judge_verdicts = pref_score

## Fraction of judge responses that contain the "[1]" marker
pref_score = sum(("[1]" in verdict) for verdict in judge_verdicts) / len(judge_verdicts)
print(f"Preference Score: {pref_score}")

Preference Score: 0.8666666666666667
