# Let's study LangChain

In [270]:
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_ollama import OllamaLLM
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.document_loaders import PyMuPDFLoader,ArxivLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda
from langchain.schema.runnable.passthrough import RunnableAssign
from operator import itemgetter

from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore

import json
import os


from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme


console = Console()
base_style = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=base_style)

If you plan to use Gemini, can get the API [here](https://ai.google.dev/gemini-api/docs/api-key#windows). Click the `Get gemini API key in Google AI Studio`.

Once you have it, uncomment and run the cell below, then paste the API key.

In [271]:
# you can 
# import getpass
# import os

# if "GOOGLE_API_KEY" not in os.environ:
#     os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [272]:
# in the nvidia study exercise, they use 
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# llm = OllamaLLM(model="llama3.1:8b") # uncomment this if you wanna use OllamaLLM locally

# Create vector stores from nutrient papers

In [273]:
# 
docs = []
for fname in os.listdir("./PAPER_DOCS/"):
    loader = PyMuPDFLoader(f"./PAPER_DOCS/{fname}")
    docs.append(loader.load())

for doc in docs:
    content = json.dumps(doc[0].page_content)
    if "References" in content:
        doc[0].page_content = content[:content.index("References")]

# print("Chunking Documents")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100,
    separators=["\n\n", "\n", ".", ";", ",", " "],
)
docs_chunks = [text_splitter.split_documents(doc) for doc in docs]
docs_chunks = [[c for c in dchunks if len(c.page_content) > 200] for dchunks in docs_chunks]

# ## Make some custom Chunks to give big-picture details
doc_string = "Available Documents:"
doc_metadata = []
for chunks in docs_chunks:
    metadata = getattr(chunks[0], 'metadata', {})
    doc_string += "\n - " + metadata['title']
    doc_metadata += [str(metadata)]

extra_chunks = [doc_string] + doc_metadata
print(doc_string)

Available Documents:
 - Within-person comparison of eating behaviors, time of eating, and dietary intake on days with and without breakfast: NHANES 2005–20101–3
 - Nutrient Intakes from Meals and Snacks Differ with Age in Middle-Aged and Older Americans


In [274]:
%%time
print("Constructing Vector Stores")
vecstores = [FAISS.from_texts(extra_chunks, embeddings)]
vecstores += [FAISS.from_documents(doc_chunks, embeddings) for doc_chunks in docs_chunks]

Constructing Vector Stores
CPU times: total: 500 ms
Wall time: 864 ms


In [275]:
embed_dims = len(embeddings.embed_query("test"))
def default_FAISS():
    '''Useful utility for making an empty FAISS vectorstore'''
    return FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(embed_dims),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )

def aggregate_vstores(vectorstores):
    ## Initialize an empty FAISS Index and merge others into it
    ## We'll use default_faiss for simplicity, though it's tied to your embedder by reference
    agg_vstore = default_FAISS()
    for vstore in vectorstores:
        agg_vstore.merge_from(vstore)
    return agg_vstore

## Unintuitive optimization; merge_from seems to optimize constituent vector stores away
docstore = aggregate_vstores(vecstores)

print(f"Constructed aggregate docstore with {len(docstore.docstore._dict)} chunks")
docstore.save_local("docstore_index")

Constructed aggregate docstore with 115 chunks


# Create Price Tags and menu

In [276]:
# Example tabular data
table_data = [
    {"Product Name": "Chicken set", "Price": "$10"},
    {"Product Name": "Pork set", "Price": "$15"},
    {"Product Name": "Instant noodle", "Price": "$25"},
]

# Concatenate fields into a single string representation for each row
row_texts = [
    f"{row['Product Name']} {row['Price']}"
    for row in table_data
]

menus = FAISS.from_texts(row_texts, embedding=embeddings)


def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str

## Optional; Reorders longer documents to center of output text
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

context_prompt = ChatPromptTemplate.from_template(
    "Answer the question using only the context"
    "\n\nRetrieved Context: {context}"
    "\n\nUser Question: {question}"
    "\nAnswer the user conversationally. User is not aware of context."
)

chain = (
    {
        'context': menus.as_retriever() | long_reorder | docs2str,
        'question': (lambda x:x)
    }
    | context_prompt
    # | RPrint()
    | llm
    | StrOutputParser()
)

# Create Historic Behavior

In [277]:
def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str

def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([f"User said {d.get('input')}", f"Agent said {d.get('output')}"])
    return d.get('output')

## Optional; Reorders longer documents to center of output text
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

memorized_conv = ["User not saying anything","Agent said Hi! what could I help?"]
convstore = FAISS.from_texts(memorized_conv, embedding=embeddings)
retriever = convstore.as_retriever()



# Purchase History 

In [278]:
import pandas as pd
from datetime import datetime
import re

csv_file = './your_data/meal_history.csv'

# Function to log a purchase with a timestamp

def extract_meal_details(response):
    # Regular expression to match each meal item and its price
    pattern = r"\[\s*([^,]+)\s*,\s*\$(\d+(\.\d{1,2})?)\s*\]"
    
    meals = []
    
    # Find all matches using the regex pattern
    matches = re.findall(pattern, response)
    
    for match in matches:
        item = match[0].strip()  # Meal name
        price = float(match[1])  # Price as float
        meals.append({"item": item, "price": price})
    
    return meals

def load_purchase_history():
    try:
        df = pd.read_csv(csv_file)
        row_texts = df.apply(lambda row: f"{row['Timestamp']} - {row['Item']} for ${row['Price']}", axis=1).tolist()
        return row_texts
    except FileNotFoundError:
        return []

# Function to log purchase to CSV
def log_purchase(meals):
    """Logs the confirmed purchase to the CSV file."""
    # Get the current timestamp
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Create a DataFrame for the meal details
    meal_entries = []
    
    for meal in meals:
        # For each meal, add a timestamp and log it
        print(meal['item'])
        meal_entries.append({
            "Timestamp": timestamp,
            "Item": meal["item"],
            "Price": meal["price"]
        })
    
    # Create a DataFrame or load existing CSV
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        df = pd.DataFrame(columns=["Timestamp", "Item", "Price"])

    # Append new entries
    df = pd.concat([df, pd.DataFrame(meal_entries)], ignore_index=True)
    
    # Save to CSV
    df.to_csv(csv_file, index=False)
    print(f"Logged {len(meal_entries)} meal(s) to {csv_file}.")

# Inside chat_gen after response
purchase_history = FAISS.from_texts(load_purchase_history(),embedding=embeddings)


In [279]:
# response = "I buy these [chicken set, $10], [instant noodle, $25], [pork set, $15]"

# meal_details = extract_meal_details(response)
# log_purchase(meal_details)
# # meal_details

# Nutrient's related RAG

I want the model to?
1. Answer based on the current meal set provided
2. Able to recommend menu depending on budget

In [280]:
def RPrint(preface=""):
    """Simple passthrough "prints, then returns" chain"""
    def print_and_return(x, preface):
        if preface: print(preface, end="")
        pprint(x)
        return x
    return RunnableLambda(partial(print_and_return, preface=preface))

def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

chat_prompt = ChatPromptTemplate.from_messages([("system",
    "You are a document chatbot. Help the user as they ask questions about documents."
    " User messaged just asked: {input}\n\n"
    " I the user intended to purchase meal, summarize the meal and present in list format: "
    " [item name,price],[item 2 name, price], ... \n\n"
    " Once confirmed, say meal purchased succesfully and continue\n\n"
    " Otherwise,"
    " From this, we have retrieved the following potentially-useful info: "
    " These items is sold by the cafeteria: \n{list_of_items}\n\n"
    " Document Retrieval:\n{context}\n\n"
    " Previous conversation:\n{memory}\n\n"
    " Previous meal purchaed:\n{purchased_meal}\n\n"
    " (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)"
    " (Answer briefly.)"
), ('user', '{input}')])


In [281]:
# method 1, no saved conversation
def chat_gen(message, history=[], return_buffer=True):
    buffer = ""
    ## First perform the retrieval based on the input message
    retrieval = retrieval_chain.invoke(message)
    meal_purchases = []

    ## Then, stream the results of the stream_chain
    for token in stream_chain.stream(retrieval):
        buffer += token
        # Check if we detect the "meal purchased successfully" confirmation
        if "meal purchased successfully" in buffer.lower():
            # Assuming extract_meal_details is defined to extract meal item and price from response
            meal_item, meal_price = extract_meal_details(buffer)
            
            if meal_item and meal_price:
                meal_purchases.append({"item": meal_item, "price": meal_price})
                log_purchase(meal_item, meal_price)  # Log the purchase to CSV
                buffer = ""  # Reset buffer after logging
        ## If you're using standard print, keep line from getting too long
        # yield buffer if return_buffer else token
        yield token


retrieval_chain = (
    {'input' : (lambda x: x)}
    | RunnableAssign({'list_of_items' : itemgetter('input') | menus.as_retriever()  | long_reorder | docs2str})
    | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever()  | long_reorder | docs2str})
    | RunnableAssign({'memory' : itemgetter('input') | convstore.as_retriever()  | long_reorder | docs2str})
    | RunnableAssign({'purchased_meal': itemgetter('input') | purchase_history.as_retriever() | long_reorder | docs2str})  # Add purchase history retrieval
)

stream_chain = (
    chat_prompt | llm | StrOutputParser()
    )
# for response in chat_gen("can give me list of items?"):
#     print(response,end='')




In [282]:
stream_chain

ChatPromptTemplate(input_variables=['context', 'input', 'list_of_items', 'memory', 'purchased_meal'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input', 'list_of_items', 'memory', 'purchased_meal'], input_types={}, partial_variables={}, template='You are a document chatbot. Help the user as they ask questions about documents. User messaged just asked: {input}\n\n I the user intended to purchase meal, summarize the meal and present in list format:  [item name,price],[item 2 name, price], ... \n\n Once confirmed, say meal purchased succesfully and continue\n\n Otherwise, From this, we have retrieved the following potentially-useful info:  These items is sold by the cafeteria: \n{list_of_items}\n\n Document Retrieval:\n{context}\n\n Previous conversation:\n{memory}\n\n Previous meal purchaed:\n{purchased_meal}\n\n (Answer only from retrieval. Only cite sources that are used. Make your response conversation

In [283]:
# # method 2: with saved conversation
# RAGChain = (
#     {
#         'memory': convstore.as_retriever() | long_reorder | docs2str,
#         'list_of_items': menu_retriever | long_reorder | docs2str,
#         'context': docstore.as_retriever() | long_reorder | docs2str,
#         'input': (lambda x:x)
#     }
#     | RunnableAssign({'output' : chat_prompt | llm | StrOutputParser()})
#     | partial(save_memory_and_get_output, vstore=convstore)
# )

# RAGChain.invoke("Hi?")

In [284]:
# # Retrieve all documents in convstore
# doc_ids = convstore.index_to_docstore_id.values()  # Get all document IDs
# for doc_id in doc_ids:
#     doc = convstore.docstore.search(doc_id)  # Fetch document by ID
#     print(doc.page_content)  # Prints "User said..." and "Agent said..." texts


In [285]:
# sentence = ""
# for token in stream_chain.stream(retrieval_chain.invoke("What did I eat in the past?")):
#     sentence += token
# sentence

# Deploy

In [None]:
import gradio as gr
# implement chat-like Gradio
def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([
        f"User previously responded with {d.get('input')}",
        f"Agent previously responded with {d.get('output')}"
    ])
    return d.get('output')

def chat_gen(message, history=[], return_buffer=True):
    buffer = ""
    meal_purchases = []
    ## First perform the retrieval based on the input message
    retrieval = retrieval_chain.invoke(message)
    line_buffer = ""

    ## Then, stream the results of the stream_chain
    for token in stream_chain.stream(retrieval):
        buffer += token
        ## If you're using standard print, keep line from getting too long
        # Check if we detect the "meal purchased successfully" confirmation
        if "meal purchased successfully" in buffer.lower():
            # Assuming extract_meal_details is defined to extract meal item and price from response
            meal_details = extract_meal_details(buffer.lower())
            log_purchase(meal_details)
            
        yield buffer if return_buffer else token

    ## Lastly, save the chat exchange to the conversation memory buffer
    save_memory_and_get_output({'input':  message, 'output': buffer}, convstore)


# chatbot = gr.Chatbot(value = [[None, initial_msg]])
demo = gr.ChatInterface(chat_gen).queue()

try:
    demo.launch(debug=True, share=True, show_api=False)
    demo.close()
except Exception as e:
    demo.close()
    print(e)
    raise e



* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://84732926cc9e89a3c6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


pork set
instant noodle
chicken set
Logged 3 meal(s) to ./your_data/meal_history.csv.
