# Let's study LangChain

In [51]:
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_ollama import OllamaLLM
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.document_loaders import PyMuPDFLoader,ArxivLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda
from langchain.schema.runnable.passthrough import RunnableAssign
from operator import itemgetter

from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore

import json
import os


from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

console = Console()
base_style = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=base_style)

In [52]:
# in the nvidia study exercise, they use 
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
llm = OllamaLLM(model="llama3.1:8b")

# Create vector stores from nutrient papers

In [53]:
# 
docs = []
for fname in os.listdir("./PAPER_DOCS/"):
    loader = PyMuPDFLoader(f"./PAPER_DOCS/{fname}")
    docs.append(loader.load())

for doc in docs:
    content = json.dumps(doc[0].page_content)
    if "References" in content:
        doc[0].page_content = content[:content.index("References")]

# print("Chunking Documents")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100,
    separators=["\n\n", "\n", ".", ";", ",", " "],
)
docs_chunks = [text_splitter.split_documents(doc) for doc in docs]
docs_chunks = [[c for c in dchunks if len(c.page_content) > 200] for dchunks in docs_chunks]

# ## Make some custom Chunks to give big-picture details
doc_string = "Available Documents:"
doc_metadata = []
for chunks in docs_chunks:
    metadata = getattr(chunks[0], 'metadata', {})
    doc_string += "\n - " + metadata['title']
    doc_metadata += [str(metadata)]

extra_chunks = [doc_string] + doc_metadata
print(doc_string)

Available Documents:
 - Within-person comparison of eating behaviors, time of eating, and dietary intake on days with and without breakfast: NHANES 2005–20101–3
 - Nutrient Intakes from Meals and Snacks Differ with Age in Middle-Aged and Older Americans


In [54]:
%%time
print("Constructing Vector Stores")
vecstores = [FAISS.from_texts(extra_chunks, embeddings)]
vecstores += [FAISS.from_documents(doc_chunks, embeddings) for doc_chunks in docs_chunks]

Constructing Vector Stores
CPU times: total: 484 ms
Wall time: 907 ms


In [55]:
embed_dims = len(embeddings.embed_query("test"))
def default_FAISS():
    '''Useful utility for making an empty FAISS vectorstore'''
    return FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(embed_dims),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )

def aggregate_vstores(vectorstores):
    ## Initialize an empty FAISS Index and merge others into it
    ## We'll use default_faiss for simplicity, though it's tied to your embedder by reference
    agg_vstore = default_FAISS()
    for vstore in vectorstores:
        agg_vstore.merge_from(vstore)
    return agg_vstore

## Unintuitive optimization; merge_from seems to optimize constituent vector stores away
docstore = aggregate_vstores(vecstores)

print(f"Constructed aggregate docstore with {len(docstore.docstore._dict)} chunks")
docstore.save_local("docstore_index")

Constructed aggregate docstore with 115 chunks


In [57]:
# Example tabular data
table_data = [
    {"Product Name": "Widget A", "Price": "$10", "Category": "Electronics", "Availability": "In Stock"},
    {"Product Name": "Widget B", "Price": "$15", "Category": "Electronics", "Availability": "Out of Stock"},
    {"Product Name": "Gadget X", "Price": "$25", "Category": "Home Goods", "Availability": "In Stock"},
]

# Concatenate fields into a single string representation for each row
row_texts = [
    f"{row['Product Name']} {row['Category']} {row['Price']} {row['Availability']}"
    for row in table_data
]

menus = FAISS.from_texts(row_texts, embedding=embeddings)
menu_retriever = menus.as_retriever()


def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str

## Optional; Reorders longer documents to center of output text
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

context_prompt = ChatPromptTemplate.from_template(
    "Answer the question using only the context"
    "\n\nRetrieved Context: {context}"
    "\n\nUser Question: {question}"
    "\nAnswer the user conversationally. User is not aware of context."
)

chain = (
    {
        'context': menu_retriever | long_reorder | docs2str,
        'question': (lambda x:x)
    }
    | context_prompt
    # | RPrint()
    | llm
    | StrOutputParser()
)

In [58]:
chain.invoke("Give me the list of items")

"So you're looking for a list, huh? Let me see what I have here...\n\nAlright, I've got a few things listed out. There's a Gadget X that costs $25 and it's in stock. Then there's two different Widgets - Widget B is actually out of stock, but Widget A is still available for $10."

# Nutrient's related RAG

I want the model to?
1. Answer based on the current meal set provided
2. Able to recommend menu depending on budget

In [40]:
def RPrint(preface=""):
    """Simple passthrough "prints, then returns" chain"""
    def print_and_return(x, preface):
        if preface: print(preface, end="")
        pprint(x)
        return x
    return RunnableLambda(partial(print_and_return, preface=preface))

def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str
long_reorder = RunnableLambda(LongContextReorder().transform_documents)

chat_prompt = ChatPromptTemplate.from_messages([("system",
    "You are a document chatbot. Help the user as they ask questions about documents."
    " User messaged just asked: {input}\n\n"
    " From this, we have retrieved the following potentially-useful info: "
    " Document Retrieval:\n{context}\n\n"
    " (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)"
), ('user', '{input}')])

retrieval_chain = (
    {'input' : (lambda x: x)}
    | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever()  | long_reorder | docs2str})
)

stream_chain = chat_prompt | llm | StrOutputParser()


In [41]:
retrieval = retrieval_chain.invoke("How to balance my daily intake?")
# stream_chain.invoke(retrieval)
retrieval

{'input': 'How to balance my daily intake?',
 'context': '[Quote from Document] for aging individuals are focused on meeting nutrients of concern, and modifying dietary intake\npatterns to address their changing health and nutritional status, such as alterations in metabolism,\ndigestion and absorption, and lifestyles. Suﬃcient nutrient intakes are also critically important for\nmiddle-aged and older adults as aging is associated with increased risk of chronic diseases, such as\ncardiovascular disease and diabetes, and sarcopenia, a gradual and progressive decline in muscle\nmass, strength, and endurance [1,2]. Mounting evidence shows that the increasing prevalence of these\nconditions at younger ages is not a normal function of aging, but rather a consequence of unhealthy\nbehaviors [3–5]. Therefore, meeting the nutrient needs through healthy dietary patterns is key to\nmaintaining good health for healthy aging.\nNutrients 2019, 11, 1301; doi:10.3390/nu11061301\nwww.mdpi.com/journal/n

In [43]:
from typing import List
table_data = [
    {"Product Name": "Widget A", "Price": "$10", "Category": "Electronics", "Availability": "In Stock"},
    {"Product Name": "Widget B", "Price": "$15", "Category": "Electronics", "Availability": "Out of Stock"},
    {"Product Name": "Gadget X", "Price": "$25", "Category": "Home Goods", "Availability": "In Stock"},
]
# Custom Retriever for Tabular Data
def table_retriever(query: str, table: List[dict]):
    # Naive search: Match any value in the row with the query
    results = [row for row in table if any(query.lower() in str(value).lower() for value in row.values())]
    return results

# Convert rows into a string format for context
def rows_to_str(rows: List[dict]) -> str:
    return "\n".join([str(row) for row in rows])

# RunnableAssign for Tabular Data Retrieval
tabular_retrieval_chain = RunnableAssign({
    'input': (lambda x: x),  # Pass the input as-is
    'context': (
        itemgetter('input')             # Extract the input query
        | (lambda query: table_retriever(query, table_data))  # Retrieve relevant rows
        | rows_to_str                   # Convert rows into a single string
    )
})


TypeError: unsupported operand type(s) for |: 'operator.itemgetter' and 'function'