In [None]:
### CODE BLOCK 1
%pip install -q langchain langchain-community langchain-nvidia-ai-endpoints gradio rich
%pip install -q arxiv pymupdf faiss-cpu

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
### CODE BLOCK 2
import os
from dotenv import load_dotenv

# Read the .env file so its entries become environment variables
load_dotenv()

# Look up the NVIDIA API key; .get() returns None when the variable is unset
api_key = os.environ.get("NVIDIA_API_KEY")

# Stop the notebook early when the key is missing or empty, otherwise confirm it loaded
if not api_key:
    raise ValueError("NVIDIA API key not found in .env file. Please set NVIDIA_API_KEY in your environment variables.")
print("NVIDIA API key loaded successfully")


NVIDIA API key loaded successfully


In [None]:
### CODE BLOCK 3
from langchain_nvidia_ai_endpoints import ChatNVIDIA

# Ask the NVIDIA endpoint client for every model it reports
all_models = ChatNVIDIA.get_available_models()

# Print a 1-based numbered preview limited to the first 20 entries
print(f"First 20 of {len(all_models)} available models:")
for position, model in enumerate(all_models[:20], start=1):
    print(f"{position}. {model}")

First 20 of 88 available models:
1. id='institute-of-science-tokyo/llama-3.1-swallow-70b-instruct-v0.1' model_type='chat' client='ChatNVIDIA' endpoint=None aliases=None supports_tools=False supports_structured_output=True base_model=None
2. id='nvidia/llama-3.1-nemotron-51b-instruct' model_type='chat' client='ChatNVIDIA' endpoint=None aliases=None supports_tools=False supports_structured_output=False base_model=None
3. id='qwen/qwen2.5-coder-32b-instruct' model_type='chat' client='ChatNVIDIA' endpoint=None aliases=None supports_tools=False supports_structured_output=False base_model=None
4. id='google/gemma-7b' model_type='chat' client='ChatNVIDIA' endpoint=None aliases=['ai-gemma-7b', 'playground_gemma_7b', 'gemma_7b'] supports_tools=False supports_structured_output=False base_model=None
5. id='yentinglin/llama-3-taiwan-70b-instruct' model_type='chat' client='ChatNVIDIA' endpoint=None aliases=None supports_tools=False supports_structured_output=False base_model=None
6. id='microsoft/k

In [None]:
### CODE BLOCK 4
###################################### Document Summarization  ##################################

from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader, ArxivLoader

from pydantic import BaseModel, Field
from typing import List
from IPython.display import clear_output
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Define the Pydantic model for document summarization
# NOTE(review): this class is handed to PydanticOutputParser in RExtract, and the
# class docstring shows up as "description" in the model output captured below, so the
# docstring and the Field descriptions appear to be part of the prompt text rather than
# pure documentation. Treat edits to them as prompt changes.
class DocumentSummaryBase(BaseModel):
    """Base class for document summarization results."""
    # Accumulated prose summary; defaults to an empty string
    running_summary: str = Field(default="", description="The running summary of the document. Update, don't override.")
    # Key points; the "maximum 3" limit is only stated in the description, not enforced by the model
    main_ideas: List[str] = Field(default_factory=list, description="Maximum 3 important points from the document.")
    # Open questions; likewise limited to 3 by instruction only
    loose_ends: List[str] = Field(default_factory=list, description="Maximum 3 open questions or areas needing clarification.")

# Set up console formatting with NVIDIA brand color
console = Console()
nvidia_green = Style(color="#76B900", bold=True)
# `pprint` here is rich's console.print pre-bound to the green style (not the stdlib pprint module)
pprint = partial(console.print, style=nvidia_green)

# Document Loading 
# For local files (commented out as an option)
# loader = UnstructuredFileLoader("your_document.pdf")
# docs = loader.load()

# Load document from Arxiv (GraphRAG paper)
loader = ArxivLoader(query="2404.16130")  # GraphRAG paper
docs = loader.load()

# Configure text splitter for document chunking
# Separators are listed from coarsest (paragraph break) to finest (empty string)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200, 
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", ";", ",", " ", ""],
)

# Optional preprocessing (commented out)
# def preprocess_text(text):
#     # Remove special characters, normalize whitespace, etc.
#     return text.replace("...", ".")

# Split documents into manageable chunks
# `docs_split` is the list consumed by the summarizer cell further down
docs_split = text_splitter.split_documents(docs)
print(f"Document split into {len(docs_split)} chunks")

# Create summarization prompt
# Template variables:
#   {info_base}           - the knowledge base accumulated so far (the summarizer keeps it
#                           in the chain state under the 'info_base' key)
#   {format_instructions} - output-format instructions injected by RExtract
#   {input}               - the text of the current document chunk
# {info_base} must be rendered here: without it the model never sees the previous
# summary, so every chunk is summarized in isolation and the "running" summary ends up
# reflecting only the last chunk processed.
summary_prompt = ChatPromptTemplate.from_messages([
    ("system", """
    You are a technical document summarizer tasked with generating a running summary of a research document.
    You are given chunks of text from a document and an existing knowledge base containing:
    1. A running summary of the document so far
    2. Main ideas (max 3)
    3. Loose ends or questions (max 3)

    Your goal is to update this knowledge base with new information from the current document chunk.

    IMPORTANT INSTRUCTIONS:
    - DO NOT lose information when updating the knowledge base
    - Integrate new information with existing running_summary
    - Update main_ideas and loose_ends as needed, but keep them limited to 3 items each
    - Follow the format instructions exactly
    - If nothing new is present in the chunk, return the existing knowledge base unchanged

    EXISTING KNOWLEDGE BASE:
    {info_base}

    {format_instructions}
    """),
    ("user", "{input}")
])




Document split into 83 chunks


In [None]:
### CODE BLOCK 5

def RExtract(pydantic_class, llm, prompt):
    '''
    Runnable Extraction module.

    Builds a chain that adds the parser's format instructions to the incoming state,
    renders `prompt`, calls `llm`, cleans up the raw text and parses it into
    `pydantic_class`.

    Args:
        pydantic_class: Pydantic model class the LLM output is parsed into
        llm: Runnable producing a string (the cleanup step calls str methods on its output)
        prompt: Prompt template rendered from the state plus 'format_instructions'

    Returns:
        A runnable chain yielding an instance of `pydantic_class`
    '''
    parser = PydanticOutputParser(pydantic_object=pydantic_class)

    # Textual repairs applied, in this order, to the raw LLM output before parsing
    replacements = (
        ("\\_", "_"),
        ("\n", " "),
        (r"\]", "]"),
        (r"\[", "["),
    )

    def preparse(string):
        # Add a missing opening/closing brace so the parser at least sees an object
        if '{' not in string:
            string = '{' + string
        if '}' not in string:
            string = string + '}'
        for old, new in replacements:
            string = string.replace(old, new)
        return string

    instruct_merge = RunnableAssign({'format_instructions' : lambda x: parser.get_format_instructions()})
    return instruct_merge | prompt | llm | preparse | parser


In [None]:
### CODE BLOCK 6

latest_summary = ""

def RSummarizer(knowledge, llm, prompt, verbose=False):
    """
    Build a runnable that folds a sequence of documents into a single knowledge object.

    Args:
        knowledge: Initial knowledge object; its class is the extraction target
        llm: Runnable used by RExtract to generate the updated knowledge base
        prompt: Prompt template passed through to RExtract
        verbose: When True, print progress and the current knowledge base after each document

    Returns:
        RunnableLambda: takes an iterable of documents (each exposing `page_content`)
        and returns the final value stored under 'info_base'
    """
    def summarize_docs(docs):
        global latest_summary  ## If your loop crashes, you can check out the latest_summary

        parse_chain = RunnableAssign({'info_base' : RExtract(knowledge.__class__, llm, prompt)})
        state = {'info_base' : knowledge}

        for i, doc in enumerate(docs):
            state['input'] = doc.page_content
            state = parse_chain.invoke(state)

            assert 'info_base' in state

            # Checkpoint after every document, independent of `verbose`, so the most recent
            # result survives a crash in a later iteration (previously this only happened
            # when verbose output was enabled).
            latest_summary = state['info_base']

            if verbose:
                print(f"Considered {i+1} documents")
                pprint(state['info_base'])
                # wait=True defers the clear until the next output is produced
                clear_output(wait=True)

        return state['info_base']
    return RunnableLambda(summarize_docs)

# Chat model with max_tokens bound to 4096 for every call
instruct_model = ChatNVIDIA(model="mistralai/mistral-7b-instruct-v0.3").bind(max_tokens=4096)
# Pipe through StrOutputParser so downstream steps (RExtract's text cleanup) receive a plain string
instruct_llm = instruct_model | StrOutputParser()

## Take the first 10 document chunks and accumulate a DocumentSummaryBase
# Starts from an empty DocumentSummaryBase; verbose=True prints progress per chunk
summarizer = RSummarizer(DocumentSummaryBase(), instruct_llm, summary_prompt, verbose=True)
summary = summarizer.invoke(docs_split[:10])

{   "description": "Base class for document summarization results.",   "properties": {     "running_summary": "The document discusses a new method called GraphRAG that creates summaries for nested modular communities in a graph. It also mentions the necessity of adaptive benchmarking for evaluating the performance of this method.",     "main_ideas": ["GraphRAG for creating summaries for nested modular communities", "Adaptive benchmarking for evaluating GraphRAG"],     "loose_ends": ["Details about the LLM used in GraphRAG", "More information about the proposed method for generating questions for evaluation", "Information about other benchmark datasets and their focus on vector RAG performance"]   } }
Considered 10 documents


In [None]:
### CODE BLOCK 7
########################################### Retrieval-Augmented Generation (RAG) ###########################################

# Standard library imports
import json
from operator import itemgetter

# UI components
import gradio as gr

# NVIDIA specific components
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

# Vector storage components
from langchain_community.vectorstores import FAISS
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore

# Document processing components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import ArxivLoader, UnstructuredPDFLoader

# Document transformation
from langchain.document_transformers import LongContextReorder

# LangChain components
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign

# Utility imports
from functools import partial

# Rich console formatting
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Set up console formatting with NVIDIA brand color
# (repeats the setup from the summarization cell so this cell also works on its own)
console = Console()
nvidia_green = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=nvidia_green)

# Configure embedding model
# NVIDIAEmbeddings.get_available_models()  # Get list of available embedding models
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END")

# Set up LLM 
# ChatNVIDIA.get_available_models()  # Get list of available LLM models
# NOTE: this rebinds `instruct_llm`, which the summarization cell defined as
# `model | StrOutputParser()`; from here on it is a bare chat model without a parser.
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x22b-instruct-v0.1")

# Utility function for pretty printing in chains
def RPrint(preface=None):
    """
    Build a pass-through chain step that echoes whatever flows through it.

    The value is printed with the notebook's styled `pprint` and then handed on
    untouched, which makes the step handy for inspecting intermediate chain values.

    Args:
        preface: Optional label; when truthy the output is rendered as "<preface>: <value>"

    Returns:
        RunnableLambda: prints its input and returns it unchanged
    """
    def _echo(value):
        if preface:
            pprint(f"{preface}: {value}")
        else:
            pprint(value)
        return value

    return RunnableLambda(_echo)

# Function to format document chunks as a string
def docs2str(docs, title=None):
    """
    Formats a list of document chunks into a readable string format.

    Each chunk is rendered as a source label line, the chunk text, and a blank line.
    The label is taken from the first of these metadata keys that is present:
    'title', 'Title' (the key ArxivLoader uses for the paper title), then 'source'.
    Chunks without any of those keys get an empty label line.

    Args:
        docs: List of document objects with page_content and (optionally) metadata
        title: Optional title to include as a header in the output

    Returns:
        str: Formatted string representation of the documents, or a fixed
        placeholder message when `docs` is empty or None
    """
    if not docs:
        return "(No relevant documents found)"

    def _source_label(doc):
        # Objects without a metadata attribute, or with empty metadata, get no label
        metadata = getattr(doc, 'metadata', None)
        if not metadata:
            return ""
        for key in ('title', 'Title', 'source'):
            if key in metadata:
                return f"[Quote from {metadata[key]}]"
        return ""

    header = f"--- {title} ---\n\n" if title else ""
    body = "".join(
        f"{_source_label(doc)}\n{doc.page_content}\n\n"
        for doc in docs
    )
    return header + body

# Create document transformer for reordering documents for better context
# A fresh LongContextReorder is constructed on every invocation of the lambda
long_reorder = RunnableLambda(lambda docs: LongContextReorder().transform_documents(docs))

# Configure text splitter for document chunking
# NOTE: rebinds `text_splitter` from the summarization cell; this one uses smaller
# chunks (1000 vs 1200) and omits the empty-string separator.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", ";", ",", " "]
)

# Fetch each paper from arXiv and collect the resulting documents
print("Loading Documents")
documents = []

# (arXiv id, short description) pairs; the description is attached to each document's metadata
arxiv_papers = [
    ("1706.03762", "Attention Is All You Need"),  # Transformer architecture
    ("1810.04805", "BERT"),  # Bidirectional Encoder Representations from Transformers
    ("2005.11401", "RAG"),  # Retrieval-Augmented Generation
    ("2205.00445", "MRKL"),  # Modular Reasoning, Knowledge and Language
    ("2310.06825", "Mistral"),  # Mistral LLM
    ("2306.05685", "LLM-as-a-Judge"),  # Using LLMs for evaluations
    ("2210.03629", "ReAct"),  # Reasoning and Acting in LLMs
    ("2112.10752", "Latent Stable Diffusion"),  # Diffusion Models
    ("2103.00020", "CLIP")   # Contrastive Language-Image Pre-training
]

for paper_id, description in arxiv_papers:
    # Best effort: a failure on one paper is reported and the loop moves on
    try:
        doc = ArxivLoader(query=paper_id).load()
        if not doc:
            continue
        for d in doc:
            # Make sure there is a metadata dict before tagging the document
            if not hasattr(d, 'metadata'):
                d.metadata = {}
            d.metadata['description'] = description
        documents.extend(doc)
        print(f"Loaded: {paper_id} - {description}")
    except Exception as e:
        print(f"Failed to load {paper_id}: {e}")

print(f"Loaded {len(documents)} documents")

# Process the loaded papers
# Cut each paper off at its References heading (if one is found) so retrieval
# concentrates on the main text. Documents are modified in place, and
# `processed_docs` holds the same objects as `documents`.
processed_docs = []
for doc in documents:
    content = doc.page_content

    # Offsets of every recognised heading spelling that actually occurs in the text
    ref_indices = [
        idx
        for idx in (content.find(heading) for heading in ("\nReferences\n", "\nREFERENCES\n"))
        if idx != -1
    ]

    # Keep everything before the earliest heading; leave the text alone when none is found
    doc.page_content = content[:min(ref_indices)] if ref_indices else content
    processed_docs.append(doc)

# Split documents into chunks
# The result is one list of chunks per source document (a list of lists). Chunks shorter
# than 200 characters are dropped as likely section headers or sentence fragments.
print("Chunking Documents")
docs_chunks = [
    [
        chunk
        for chunk in text_splitter.split_documents([doc])
        if len(chunk.page_content) >= 200
    ]
    for doc in processed_docs
]

# Create metadata and summary information
# `doc_string` is a human-readable bullet list of the loaded papers;
# `doc_metadata` is the same information in structured form.
doc_string = "Available Documents:\n"
doc_metadata = []

# Build document summary string and metadata
for i, doc_chunk in enumerate(docs_chunks):
    if not doc_chunk:  # Skip documents whose chunks were all filtered out
        continue

    # All chunks of one document share its metadata, so the first chunk is representative
    metadata = doc_chunk[0].metadata

    # ArxivLoader stores the paper title under the capitalized key 'Title'; the lowercase
    # key is still checked first so documents from other loaders keep working. Looking up
    # only 'title' made every arXiv paper fall back to the generic "Document N" label.
    title = metadata.get('title') or metadata.get('Title') or f"Document {i+1}"
    description = metadata.get('description', '')

    # Add to document string
    doc_string += f"• {title} - {description}\n"

    # Add to metadata
    doc_metadata.append({
        'index': i,
        'title': title,
        'description': description,
        'chunk_count': len(doc_chunk)
    })

# Create extra chunks with document metadata
# These two strings are embedded alongside the paper chunks so questions about
# "which documents are available" can be answered by retrieval.
extra_chunks = [
    doc_string,
    json.dumps(doc_metadata, indent=2)
]

# Show the bullet list of available documents in the styled console
pprint(doc_string)

# Then report, per source document, how many chunks survived filtering and its metadata
for doc_number, chunks in enumerate(docs_chunks, start=1):
    if chunks:  # documents with no remaining chunks are not reported
        metadata = chunks[0].metadata
        print(f"Document {doc_number}:")
        print(f"  Chunks: {len(chunks)}")
        pprint(f"  Metadata: {metadata}")
        print()  # Empty line for better readability

  from .autonotebook import tqdm as notebook_tqdm


Loading Documents
Loaded: 1706.03762 - Attention Is All You Need
Loaded: 1810.04805 - BERT
Loaded: 2005.11401 - RAG
Loaded: 2205.00445 - MRKL
Loaded: 2310.06825 - Mistral
Loaded: 2306.05685 - LLM-as-a-Judge
Loaded: 2210.03629 - ReAct
Loaded: 2112.10752 - Latent Stable Diffusion
Loaded: 2103.00020 - CLIP
Loaded 9 documents
Chunking Documents


Document 1:
  Chunks: 34



Document 2:
  Chunks: 42



Document 3:
  Chunks: 42



Document 4:
  Chunks: 37



Document 5:
  Chunks: 20



Document 6:
  Chunks: 43



Document 7:
  Chunks: 48



Document 8:
  Chunks: 48



Document 9:
  Chunks: 143





In [22]:
%%time
### CODE BLOCK 8

# Constructing Your Document Vector Stores

def default_FAISS(embedder):
    """
    Build an empty FAISS vectorstore sized for the given embedding model.

    The vector dimension is discovered by embedding one throwaway query, so calling
    this function issues a request to the embedding model.

    Args:
        embedder: Embedding model; must provide `embed_query`

    Returns:
        An empty FAISS vectorstore backed by an IndexFlatL2 index and an
        InMemoryDocstore, with L2 normalization disabled
    """
    probe_vector = embedder.embed_query("Sample text to determine dimensions")

    return FAISS(
        embedding_function=embedder,
        index=IndexFlatL2(len(probe_vector)),  # flat index using L2 distance
        docstore=InMemoryDocstore({}),
        index_to_docstore_id={},
        normalize_L2=False,  # vectors are stored as produced by the embedder
    )

def aggregate_vstores(vecstores):
    """
    Merge multiple vector stores into a single new vectorstore.

    The vectors already held by each source index are copied into the merged store
    rather than re-embedding every text, which would repeat one embedding request per
    document. Source stores are left untouched. If a source index cannot return its
    stored vectors, that store falls back to re-embedding its texts.

    Args:
        vecstores: Non-empty list of FAISS vectorstores to merge

    Returns:
        A unified FAISS vectorstore containing all documents from input stores

    Raises:
        ValueError: If `vecstores` is empty (there is no embedding function to reuse)
    """
    if not vecstores:
        raise ValueError("aggregate_vstores requires at least one vectorstore to merge")

    def _stored_vectors(vs, ids):
        """Return the vectors `vs` already holds for `ids` (same order), or None if unavailable."""
        position_of = {doc_id: pos for pos, doc_id in vs.index_to_docstore_id.items()}
        if any(doc_id not in position_of for doc_id in ids):
            return None
        try:
            all_vectors = vs.index.reconstruct_n(0, vs.index.ntotal)
        except RuntimeError:
            # Index type does not support reconstruction; caller re-embeds instead
            return None
        return [all_vectors[position_of[doc_id]].tolist() for doc_id in ids]

    # Create an empty FAISS instance to serve as the aggregation target,
    # using the embedding function from the first vectorstore
    merged_store = default_FAISS(vecstores[0].embedding_function)

    # Track the total number of vectors for progress reporting
    total_docs = 0

    for vs in vecstores:
        # Collect texts, metadatas, and ids for one batch insert per source store
        texts = []
        metadatas = []
        ids = []
        for doc_id, doc in vs.docstore._dict.items():
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
            ids.append(doc_id)
        total_docs += len(texts)

        if not texts:
            continue

        vectors = _stored_vectors(vs, ids)
        if vectors is None:
            # Fallback: let the store embed the texts again
            merged_store.add_texts(texts, metadatas, ids)
        else:
            merged_store.add_embeddings(zip(texts, vectors), metadatas=metadatas, ids=ids)

    print(f"Merged {len(vecstores)} vectorstores with {total_docs} total vectors")
    return merged_store

# Vector store #1: the overview string and JSON metadata, each tagged with source="metadata"
print("Creating vectorstore for metadata and summaries...")
metadata_vectorstore = FAISS.from_texts(
    extra_chunks,
    embedder,
    metadatas=[{"source": "metadata"} for _ in extra_chunks],
)

# `docs_chunks` is a list of per-document chunk lists; FAISS.from_documents needs a flat
# list of documents, so collapse one level of nesting first
print("Flattening document chunks...")
flattened_docs_chunks = [
    chunk
    for doc_chunk_list in docs_chunks
    for chunk in doc_chunk_list
]
print(f"Total flattened document chunks: {len(flattened_docs_chunks)}")

# Vector store #2: every chunk of every paper
print("Creating vectorstore for document chunks...")
docs_vectorstore = FAISS.from_documents(flattened_docs_chunks, embedder)

# Merge both stores into the single `docstore` used for retrieval by the chat cell
vecstores = [metadata_vectorstore, docs_vectorstore]
print("Aggregating vectorstores...")
docstore = aggregate_vstores(vecstores)

# Report how many entries ended up in the merged store's document dictionary
total_chunks = len(docstore.docstore._dict)
print(f"✅ Vector store construction complete with {total_chunks} total document chunks available for retrieval")

# Closing note printed for the reader
print("Note: Large vectorstores have been merged efficiently to minimize memory usage")

Creating vectorstore for metadata and summaries...
Flattening document chunks...
Total flattened document chunks: 457
Creating vectorstore for document chunks...
Aggregating vectorstores...
Merged 2 vectorstores with 459 total vectors
✅ Vector store construction complete with 459 total document chunks available for retrieval
Note: Large vectorstores have been merged efficiently to minimize memory usage
CPU times: total: 953 ms
Wall time: 58.5 s


In [27]:
### CODE BLOCK 9 - RAG Chat Interface

# Set up models
# ChatNVIDIA.get_available_models()  # Uncomment to see available models
# Same embedding model name as the one used when the document store was built
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END")
# NOTE: rebinds `instruct_llm` again, this time to the 8x7b Mixtral model
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x7b-instruct-v0.1")
# Alternative model option: instruct_llm = ChatNVIDIA(model="meta/llama-3.1-8b-instruct")

# Set up conversation memory store
# Starts empty; each completed chat exchange is appended to it by chat_gen
print("Creating conversation memory store...")
convstore = default_FAISS(embedder)

def save_memory_and_get_output(interaction_dict, vector_store):
    """
    Saves both the user question and agent response to memory as embedded texts.

    Both entries of one exchange are stamped with the same timestamp so they can be
    paired up later; taking the time separately for each entry produced two slightly
    different values.

    Args:
        interaction_dict: Dictionary containing 'input' (user message) and 'output' (agent response)
        vector_store: Store exposing `add_texts(texts, metadatas=...)` that holds the conversation history

    Returns:
        str: The agent's response (output value from the dictionary)

    Raises:
        KeyError: If 'input' or 'output' is missing from `interaction_dict`
    """
    from datetime import datetime  # local import keeps the function self-contained

    user_text = f"User previously responded with {interaction_dict['input']}"
    agent_text = f"Agent previously responded with {interaction_dict['output']}"

    # One timestamp shared by both halves of the exchange
    timestamp = str(datetime.now())

    # Add both texts to the vector store with appropriate metadata
    vector_store.add_texts(
        [user_text, agent_text],
        metadatas=[
            {"role": "user", "timestamp": timestamp},
            {"role": "agent", "timestamp": timestamp}
        ]
    )

    return interaction_dict["output"]

# Create welcome message
# Rendered as the assistant's first chat bubble; `doc_string` (built in the document
# loading cell) is interpolated once, when this cell runs.
initial_msg = f"""
# Welcome to Huang's Hype Factory Document Assistant!

I'm here to help you with information about the following documents:

{doc_string}

How can I help you understand these papers today?
"""

# Create chat prompt template
# Template variables: {input} (the user's message, used in both the system and the user
# turn), {history} (retrieved conversation memory) and {context} (retrieved paper chunks).
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", """
    You are a knowledgeable research assistant who helps users understand technical AI papers. 
    
    The user's question is: {input}
    
    CONVERSATION HISTORY:
    {history}
    
    DOCUMENT CONTEXT:
    {context}
    
    INSTRUCTIONS:
    1. Only answer using information from the provided context and your understanding of AI concepts
    2. If you don't know or the answer isn't in the context, say so honestly
    3. Keep responses conversational but technically precise
    4. Do not make up information that isn't in the papers
    5. Include relevant quotes where helpful, citing the paper
    """),
    ("user", "{input}")
])

# Create streaming response chain
# prompt -> debug echo of the rendered prompt -> chat model -> plain-string output
stream_chain = (
    chat_prompt 
    | RPrint("Prompt Sent to LLM") 
    | instruct_llm 
    | StrOutputParser()
)

# Retrieval step: a plain function wrapped as a RunnableLambda
def create_context(message):
    """
    Assemble the prompt variables for one user message.

    Looks up the most similar conversation-memory entries and paper chunks, reorders
    each result list with `long_reorder`, and renders both as text.

    Args:
        message: The user's message, used as the similarity-search query

    Returns:
        dict: {'input': message, 'history': <rendered memory>, 'context': <rendered chunks>}
    """
    def _retrieve_as_text(store, k, title):
        # search -> reorder -> render, identical for both stores
        hits = store.similarity_search(message, k=k)
        return docs2str(long_reorder.invoke(hits), title=title)

    history_str = _retrieve_as_text(convstore, 5, "Conversation History")
    doc_str = _retrieve_as_text(docstore, 8, "Document Context")

    # Debug print
    pprint(f"Retrieved Context for: {message}")

    return {
        "input": message,
        "history": history_str,
        "context": doc_str,
    }

retrieval_chain = RunnableLambda(create_context)

# Chat generator function
def chat_gen(message, history=None, return_buffer=True):
    """
    Generate a streaming chat response using RAG.

    Retrieval happens once up front; the LLM answer is then streamed. After the stream
    has been fully consumed, the exchange is written to the conversation memory store.
    If the caller stops iterating early, the exchange is not saved.

    Args:
        message: User's input message
        history: Chat history supplied by the UI. Unused here because past turns are
            retrieved from the vector store instead. Defaults to None rather than a
            shared mutable list.
        return_buffer: If True, yield the accumulated text so far; if False, yield individual tokens

    Yields:
        str: Either the growing buffer or the latest token, depending on `return_buffer`
    """
    buffer = ""

    # Perform retrieval to get context
    context = retrieval_chain.invoke(message)

    # Stream responses from the LLM
    for token in stream_chain.stream(context):
        buffer += token
        yield buffer if return_buffer else token

    # Save the conversation to memory
    save_memory_and_get_output({
        "input": message,
        "output": buffer
    }, convstore)

# Test the implementation
# Smoke test: runs one question through the full retrieval + generation path.
# Consuming the generator to the end also stores this exchange in `convstore`.
test_question = "Tell me about RAG!"
print("\nTesting chat implementation with query:", test_question)
print("\nResponse:")

# return_buffer=False yields individual tokens, so printing without a newline streams the answer
for token in chat_gen(test_question, return_buffer=False):
    print(token, end='')

Creating conversation memory store...

Testing chat implementation with query: Tell me about RAG!

Response:


RAG, or Retrieval-Augmented Generation, is a method that combines non-parametric and parametric memory to improve sequence-to-sequence tasks. The non-parametric memory is a dense vector index of Wikipedia, and a pre-trained neural retriever is used to access this information. The parametric memory is a pre-trained seq2seq transformer. These two components are combined in a probabilistic model trained end-to-end.

There are two variants of RAG: RAG-Sequence and RAG-Token. RAG-Sequence treats the retrieved document as a single latent variable that is marginalized to get the seq2seq probability p(y|x) via a top-K approximation. RAG-Token, on the other hand, allows the generator to choose content from several documents when producing an answer by drawing a different latent document for each target token and marginalizing accordingly.

The RAG models use a retriever pη(z|x) with parameters η that returns distributions over text passages given a query x and a generator pθ(yi|x, z, y1:i−1) pa

In [28]:
# Seed the chat window with the welcome message as the first assistant turn (no user turn)
chatbot = gr.Chatbot(value = [[None, initial_msg]])
# chat_gen is a generator, so the interface streams its yielded text; queue() is enabled on the app
demo = gr.ChatInterface(chat_gen, chatbot=chatbot).queue()

# Launch the UI and make sure the server is closed on both the normal and the error path
try:
    demo.launch(debug=True, share=True, show_api=False)
    demo.close()
except Exception as e:
    # Close first, then report and re-raise so the failure is still visible in the notebook
    demo.close()
    print(e)
    raise e

  chatbot = gr.Chatbot(value = [[None, initial_msg]])


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://2cb647ab31ff8a9f58.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2cb647ab31ff8a9f58.gradio.live
Closing server running on port: 7860
