Add your NVIDIA API key to the `NVIDIA_API_KEY` environment variable first (you will be prompted for it if it is missing).

In [1]:
import os
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Shared rich console and a bold green style; `pprint` prints through both.
base_style = Style(bold=True, color="#76B900")
console = Console()
pprint = partial(console.print, style=base_style)

# Function to retrieve NVIDIA API key
def retrieve_nvidia_api_key(hard_reset=False):
    """Return the NVIDIA API key, prompting for it when no valid one is stored.

    Args:
        hard_reset: When True, ignore any key already present in the
            NVIDIA_API_KEY environment variable and prompt for a new one.

    Returns:
        str: A key that starts with "nvapi-". The key is also written to
        os.environ["NVIDIA_API_KEY"].

    Raises:
        EOFError, KeyboardInterrupt: Propagated from the prompt, so the loop
            can be interrupted and does not spin forever without a terminal.
    """
    # Local import; a hidden prompt keeps the secret out of the saved cell output.
    from getpass import getpass

    def _is_valid(candidate):
        """True for a non-empty string that begins with the "nvapi-" prefix."""
        return bool(candidate) and candidate.startswith("nvapi-")

    api_key = None if hard_reset else os.environ.get("NVIDIA_API_KEY", "").strip()
    while not _is_valid(api_key):
        api_key = getpass("NVIDIA API Key: ").strip()
        if not _is_valid(api_key):
            print("[!] API key assignment failed. Make sure it starts with `nvapi-` as generated from the model pages.")

    os.environ["NVIDIA_API_KEY"] = api_key
    return api_key

# Fetch the key (from the environment or a prompt) and echo only its first 9 characters
api_key = retrieve_nvidia_api_key()
print('Retrieved NVIDIA_API_KEY beginning with "{}..."'.format(api_key[:9]))

NVIDIA API Key: nvapi-<redacted>
Retrieved NVIDIA_API_KEY beginning with "nvapi--ws..."


Check that your API key works by listing the available models

In [2]:
from langchain_nvidia_ai_endpoints._common import NVEModel
## Display the model-name -> id mapping; per the heading above, a successful listing is the API-key check.
## NOTE(review): NVEModel comes from the private `_common` module, so this may break across package versions — confirm.
NVEModel().available_models

{'ai-gemma-2b': '04174188-f742-4069-9e72-d77c2b77d3cb',
 'playground_kosmos_2': '0bcd1a8c-451f-4b12-b7f0-64b4781190d1',
 'playground_smaug_72b': '008cff6d-4f4c-4514-b61e-bcfad6ba52a7',
 'playground_llama2_70b': '0e349b44-440a-44e1-93e9-abe8dcb27158',
 'playground_gemma_7b': '1361fa56-61d7-4a12-af32-69a3825746fa',
 'playground_nvolveqa_40k': '091a03bb-7364-4087-8090-bd71e9277520',
 'ai-embed-qa-4': '09c64e32-2b65-4892-a285-2f585408d118',
 'ai-arctic-embed-l': '1528a0ad-205a-46ac-a783-94e2372586a9',
 'ai-rerank-qa-mistral-4b': '0bf77f50-5c35-4488-8e7a-f49bb1974af6',
 'ai-parakeet-ctc-riva': '22164014-a6cc-4a6f-b048-f3a303e745bb',
 'playground_yi_34b': '347fa3f3-d675-432c-b844-669ef8ee53df',
 'playground_nemotron_steerlm_8b': '1423ff2f-d1c7-4061-82a7-9e8c67afd43a',
 'ai-llama2-70b': '2fddadfb-7e76-4c8a-9b82-f7d3fab94471',
 'playground_mistral_7b': '35ec3354-2681-4d0e-a8dd-80325dcf7c63',
 'playground_llama2_code_70b': '2ae529dc-f728-4a46-9b8d-2697213666d8',
 'ai-phi-3-mini': '4a58c6cb-a9b4

Load the pre-embedded document store (`docstore_index.tgz`)

In [13]:
##   Make sure you have docstore_index.tgz in your working directory
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_community.vectorstores import FAISS

## Embedding client; handed to FAISS.load_local below as the store's embedding function
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")

## Unpack the archive into ./docstore_index/ (shell command; runs again every time this cell executes)
!tar xzvf docstore_index.tgz
## NOTE: allow_dangerous_deserialization=True permits loading the pickled index.pkl shipped in the archive;
## only load an archive that comes from a source you trust.
docstore = FAISS.load_local("docstore_index", embedder,allow_dangerous_deserialization=True)
## All stored chunks as a list (reads the docstore's private `_dict` mapping)
docs = list(docstore.docstore._dict.values())

def format_chunk(doc):
    """Render one chunk as a "Paper / Summary / Page Body" text block.

    `doc` must expose a `metadata` mapping and a `page_content` attribute;
    a missing 'Title' or 'Summary' entry is shown as 'unknown'.
    """
    meta = doc.metadata
    sections = [
        f"Paper: {meta.get('Title', 'unknown')}",
        f"Summary: {meta.get('Summary', 'unknown')}",
        f"Page Body: {doc.page_content}",
    ]
    return "\n\n".join(sections)

## Sanity check that the store loaded: report its size, then show the middle chunk
print(f"Constructed aggregate docstore with {len(docstore.docstore._dict)} chunks")
middle_chunk = docs[len(docs)//2]
pprint(f"\nSample Chunk:\n\n{format_chunk(middle_chunk)}")

Constructed aggregate docstore with 542 chunks


x docstore_index/
x docstore_index/index.faiss
x docstore_index/index.pkl


Construct the backend script (`agent.py`)

In [24]:
%%writefile agent.py
from fastapi import FastAPI
from langserve import add_routes
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain_nvidia_ai_endpoints._common import NVEModel
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import ArxivLoader
from langchain.document_transformers import LongContextReorder
from langchain_core.runnables import RunnableLambda,RunnableBranch
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import PyPDFLoader
from functools import partial
from operator import itemgetter
from functools import partial
from getpass import getpass
from keras.models import load_model
import numpy as np
import gradio as gr
import requests
import asyncio
import uvicorn
import PyPDF2
import os

########################################################################
## load the embedded documents
## NOTE: allow_dangerous_deserialization=True permits loading pickled data from "docstore_index";
## only point this at an index directory you trust.
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")
docstore = FAISS.load_local("docstore_index", embedder,allow_dangerous_deserialization=True)
## All stored chunks as a list (reads the docstore's private `_dict` mapping)
docs = list(docstore.docstore._dict.values())

## Make some custom Chunks to give big-picture details:
##   doc_string   - bulleted list of the distinct paper titles (used in the greeting)
##   doc_metadata - str(metadata) of the first chunk seen for each distinct title
doc_string = ""
doc_metadata = []
seen_titles = set()
for doc in docs:
    metadata = doc.metadata
    title = metadata.get('Title')
    ## De-duplicate on the exact title. A substring test against doc_string would
    ## wrongly drop any title that happens to be contained in an earlier, longer one.
    if title is not None and title not in seen_titles:
        seen_titles.add(title)
        doc_string += "\n - " + title
        doc_metadata += [str(metadata)]


########################################################################
## Utility Runnables/Methods
## Splitter for uploaded documents: 1000-character chunks with 100 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ";", ",", " ", ""],
    chunk_size=1000,
    chunk_overlap=100,
)
## Embed one probe string to learn the vector width needed for a fresh FAISS index
embed_dims = len(embedder.embed_query("test"))
def default_FAISS():
    """Build and return a new, empty FAISS vectorstore.

    Uses the module-level `embedder` and an IndexFlatL2 sized by `embed_dims`,
    backed by a fresh in-memory docstore with no id mappings.
    """
    store_kwargs = dict(
        embedding_function=embedder,
        index=IndexFlatL2(embed_dims),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )
    return FAISS(**store_kwargs)

def RPrint(preface=""):
    """Return a runnable that prints its input (prefixed by `preface`) and passes it on unchanged."""
    def _show_then_forward(payload, preface):
        print(f"{preface}{payload}")
        return payload

    passthrough = partial(_show_then_forward, preface=preface)
    return RunnableLambda(passthrough)

def docs2str(docs, title="Document"):
    """Join chunks into one context string, one line per chunk.

    Each chunk contributes "[Quote from <name>] <text>" plus a newline. The name
    is the chunk's 'Title' metadata, or `title` when that key is absent; the
    prefix is omitted when the name is falsy. Objects without `page_content`
    are rendered with str().
    """
    pieces = []
    for doc in docs:
        source = getattr(doc, 'metadata', {}).get('Title', title)
        prefix = f"[Quote from {source}] " if source else ""
        body = getattr(doc, 'page_content', str(doc))
        pieces.append(prefix + body + "\n")
    return "".join(pieces)

## Reorders longer documents to center of output text
long_reorder = RunnableLambda(LongContextReorder().transform_documents)
########################################################################

## Rebinds the module-level `embedder`, now with model_type="query". `docstore` was loaded
## with the earlier instance; functions that read the global at call time use this one.
embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type="query")
## Chat model piped into StrOutputParser, so `llm` is a chain rather than the bare model
llm = ChatNVIDIA(model="mixtral_8x7b") | StrOutputParser()
## Empty vectorstore that holds the conversation memory
convstore = default_FAISS()

def save_memory_and_get_output(d, vstore):
    """Record one exchange in `vstore` and return the agent's output.

    Reads the 'input' and 'output' entries of `d`, stores one text per side
    via `vstore.add_texts`, and returns the 'output' entry.
    """
    user_turn = d.get('input')
    agent_turn = d.get('output')
    memory_lines = [
        f"User previously responded with {user_turn}",
        f"Agent previously responded with {agent_turn}",
    ]
    vstore.add_texts(memory_lines)
    return agent_turn

## Greeting used as the chatbot's opening message; it lists the known document titles
initial_msg = "".join([
    "Hello! I am a document chat agent here to help the user!",
    f" I have access to the following documents: {doc_string}\n\nHow can I help you?",
])

## Classifier loaded from "filter.h5"; it is applied to query embeddings below
model1 = load_model("filter.h5")
def is_good_response(query):
    """Score a query with the loaded classifier.

    Embeds `query` with the module-level `embedder`, wraps the vector in a
    batch of one, and returns whatever `model1` produces for that batch.
    NOTE(review): the result is the raw model output, not a bool; presumably a
    probability-like score — confirm against the model's output layer.
    """
    query_vector = embedder.embed_query(query)
    batch = np.array([query_vector])
    return model1(batch)

## "Answer this question" system message. It contains {input}, {history} and {context}
## placeholders, which are only filled when this text is itself used as a prompt template.
## NOTE(review): when it is passed as the *value* of chat_prompt's {system} variable,
## those placeholders are not expanded a second time.
good_sys_msg = (
    "You are an NVIDIA chatbot. Please answer their question if it is ethical and relevant while representing NVIDIA."
    " User messaged just asked: {input}\n\n"
    " From this, we have retrieved the following potentially-useful info: "
    " Conversation History Retrieval:\n{history}\n\n"
    " Document Retrieval:\n{context}\n\n"
    " (Only cite sources that are used. Make your response conversational.)"
)
## "Resist talking about this topic" system message
poor_sys_msg = (
    "You are an NVIDIA chatbot. Please answer their question while representing NVIDIA."
    "  Their question has been analyzed and labeled as 'probably not useful to answer as an NVIDIA Chatbot',"
    "  so avoid answering if appropriate and explain your reasoning to them. Make your response as short as possible."
)

## Generic two-message prompt: the system text comes from the `system` key, the user text from `input`
chat_prompt = ChatPromptTemplate.from_messages([("system", "{system}"), ("user", "{input}")])

## Standalone retrieval pipeline: wraps the raw input as {'input': ...}, then adds
## 'history' (from convstore) and 'context' (from docstore) as reordered, stringified chunks,
## and prints the resulting dict.
## NOTE(review): nothing else in the visible script references this chain — confirm whether it is still needed.
retrieval_chain = (
    {'input' : (lambda x: x)}
    | RunnableAssign({'history' : itemgetter('input') | convstore.as_retriever() | long_reorder | docs2str})
    | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever()  | long_reorder | docs2str})
    | RPrint()
)

## Prompt for the good-question branch. good_sys_msg is used directly as the system
## *template* so that its {input}/{history}/{context} placeholders are filled from the
## chain state. Routing it through chat_prompt's {system} variable would insert it as
## literal text, and the retrieved history/context would never reach the model.
good_chat_prompt = ChatPromptTemplate.from_messages([("system", good_sys_msg), ("user", "{input}")])

## Main chain: classify the question, then either deflect briefly (bad question)
## or retrieve conversation history plus document context and answer (good question).
stream_chain = (
    { 'input'  : (lambda x:x), 'is_good' : is_good_response }
    | RPrint()
    | RunnableBranch(
            # bad question: classifier score below 0.5 -> short refusal-style answer, no retrieval
            ((lambda d: d['is_good'] < 0.5), RunnableAssign(dict(system = RunnableLambda(lambda x: poor_sys_msg))) | chat_prompt | llm),
            # good question: retrieve, then answer with the retrieval-aware prompt
            RunnableAssign({'history' : itemgetter('input') | convstore.as_retriever() | long_reorder | docs2str})
                | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever()  | long_reorder | docs2str})
                | RPrint() | good_chat_prompt | llm
    )
)


def chat_gen(message, history=None, return_buffer=True):
    """Handle one multimodal chat turn: ingest an uploaded PDF, then stream a reply.

    Args:
        message: Mapping with a 'text' string and a 'files' list, as indexed
            below. Only the first entry of 'files' is considered.
        history: Accepted so the function fits the chat-callback signature;
            it is not read.
        return_buffer: If True, yield the whole accumulated reply after each
            token. If False, yield individual tokens and emit a newline once
            the current line grows past roughly 84-100 characters.

    Yields:
        str: The upload acknowledgement (when a PDF was attached), followed by
        the streamed answer, or a notice when the text is only whitespace and
        no file was attached.
    """
    buffer = ""
    line_buffer = ""
    files = message['files']
    text = message['text']

    ## load the uploaded pdf into the existing vecstore
    if files and str(files[0]).lower().endswith(".pdf"):
        pdf_path = files[0]
        ## PyPDFLoader reads the path itself, so no separate open() is needed
        pages = PyPDFLoader(pdf_path).load()
        print("Adding new document into vector database...")
        chunks = text_splitter.split_documents(pages)
        if chunks:  ## skip the merge when splitting produced nothing to embed
            docstore.merge_from(FAISS.from_documents(chunks, embedder))

        ## Label the document by its metadata title (metadata is a dict, so use key
        ## access), else by its first line of text, else by its file name.
        if pages and pages[0].metadata.get("Title"):
            doc_label = pages[0].metadata["Title"]
        elif pages:
            doc_label = pages[0].page_content.split('\n')[0]
        else:
            doc_label = os.path.basename(str(pdf_path))
        buffer += f"I have received your document '{doc_label}'. I'm glad to help if you have any question regarding it. "
        yield buffer

    ## response to the user input message
    if len(text.strip()) > 0:

        ## Then, stream the results of the stream_chain
        for token in stream_chain.stream(text):
            buffer += token
            ## keep line from getting too long
            if not return_buffer:
                line_buffer += token
                if "\n" in line_buffer:
                    line_buffer = ""
                if ((len(line_buffer)>84 and token and token[0] == " ") or len(line_buffer)>100):
                    line_buffer = ""
                    yield "\n"
                    token = "  " + token.lstrip()
            yield buffer if return_buffer else token

    elif not files:
        buffer += "Please do not send whitespaces. "
        yield buffer

    ## Lastly, save the chat exchange to the conversation memory buffer
    save_memory_and_get_output({'input': text, 'output': buffer}, convstore)

## Chat widget pre-seeded with the greeting as the first bot message
chatbot = gr.Chatbot(value = [[None, initial_msg]],height=720)
## Multimodal chat UI driven by chat_gen, which indexes message['text'] and message['files']
demo = gr.ChatInterface(chat_gen, chatbot=chatbot,multimodal=True ).queue()

## The launch runs at module level, so executing this script starts the server immediately.
## share=True also requests a public share URL in addition to the local one.
try:
    demo.launch(debug=True, share=True, show_api=False)
    demo.close()
except Exception as e:
    ## Close the server before reporting and re-raising the failure
    demo.close()
    print(e)
    raise e

"""
The script can instead be adapted to expose the chains as a backend API service using the code below (tested on Windows).
We chose to build the frontend directly with Gradio.
"""
# import nest_asyncio
# nest_asyncio.apply()
# async def run_backend():
#     app = FastAPI(
#         title="LangChain Server",
#         version="1.0",
#         description="A simple api server using Langchain's Runnable interfaces",
#     )

#     add_routes(app, llm, path="/basic_chat")
#     add_routes(app, stream_chain, path="/rag_chat")

#     uvicorn_config = uvicorn.Config(app, host="0.0.0.0", port=9012)
#     server = uvicorn.Server(uvicorn_config)
#     await server.serve()

# if __name__ == "__main__":
#     loop = asyncio.get_event_loop()
#     loop.run_until_complete(run_backend())

Overwriting agent.py


In [25]:
%run agent.py



Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://59770e6b63f14d2183.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://59770e6b63f14d2183.gradio.live
Closing server running on port: 7860
