## Simple GENAI APP using Langchain

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")  ## Fpr Langsmith tracking
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_PROJECT"]=os.getenv("LANGCHAIN_PROJECT")

In [3]:
## Data Ingestion: Scrape data from Website
from langchain_community.document_loaders import WebBaseLoader


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
loader = WebBaseLoader("https://docs.smith.langchain.com/evaluation/tutorials/evaluation")
loader


<langchain_community.document_loaders.web_base.WebBaseLoader at 0x267a02da0b0>

In [5]:
docs= loader.load()
docs

[Document(metadata={'source': 'https://docs.smith.langchain.com/evaluation/tutorials/evaluation', 'title': 'Evaluate a chatbot | 🦜️🛠️ LangSmith', 'description': 'In this guide we will set up evaluations for a chatbot.', 'language': 'en'}, page_content='\n\n\n\n\nEvaluate a chatbot | 🦜️🛠️ LangSmith\n\n\n\n\n\n\nSkip to main contentJoin us at  Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!API ReferenceRESTPythonJS/TSSearchRegionUSEUGo to AppGet StartedObservabilityEvaluationQuick StartTutorialsEvaluate a chatbotEvaluate a RAG applicationRun backtests on a new version of an agentRunning SWE-bench with LangSmithEvaluate a complex agentTest a ReAct agent with Pytest/Vitest and LangSmithHow-to GuidesAnalyze a single experimentLog user feedbackHow to run an evaluationCreating and Managing Datasets in the UIRenaming an experimentHow to bind an evaluator to a dataset in the UIHow to manage datasets programmaticallyRunning an evaluation from the prompt playgroun

In [6]:
## Data Transformation: Loaded data-->Docs--> Text into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(docs)
documents

[Document(metadata={'source': 'https://docs.smith.langchain.com/evaluation/tutorials/evaluation', 'title': 'Evaluate a chatbot | 🦜️🛠️ LangSmith', 'description': 'In this guide we will set up evaluations for a chatbot.', 'language': 'en'}, page_content='Evaluate a chatbot | 🦜️🛠️ LangSmith'),
 Document(metadata={'source': 'https://docs.smith.langchain.com/evaluation/tutorials/evaluation', 'title': 'Evaluate a chatbot | 🦜️🛠️ LangSmith', 'description': 'In this guide we will set up evaluations for a chatbot.', 'language': 'en'}, page_content="Skip to main contentJoin us at  Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!API ReferenceRESTPythonJS/TSSearchRegionUSEUGo to AppGet StartedObservabilityEvaluationQuick StartTutorialsEvaluate a chatbotEvaluate a RAG applicationRun backtests on a new version of an agentRunning SWE-bench with LangSmithEvaluate a complex agentTest a ReAct agent with Pytest/Vitest and LangSmithHow-to GuidesAnalyze a single experimentLog

In [7]:
## Text to vectors using OPENAI Embeddings 
from langchain_openai import OpenAIEmbeddings
embeddings=OpenAIEmbeddings()


In [8]:
## Storing vectors in vectorstore DB-FAISS
from langchain_community.vectorstores import FAISS
vectordb=FAISS.from_documents(documents=documents,embedding=embeddings)
vectordb

<langchain_community.vectorstores.faiss.FAISS at 0x267a8791720>

In [11]:
query="Each datapoint should consist of, at the very least, the inputs to the application"
result=vectordb.similarity_search(query)
result[0].page_content

"Schema: Each datapoint should consist of, at the very least, the inputs to the application.\nIf you are able, it is also very helpful to define the expected outputs - these represent what you would expect a properly functioning application to output.\nOften times you cannot define the perfect output - that's okay! Evaluation is an iterative process.\nSometimes you may also want to define more information for each example - like the expected documents to fetch in RAG, or the expected steps to take as an agent.\nLangSmith datasets are very flexible and allow you to define arbitrary schemas.\nHow many: There's no hard and fast rule for how many you should gather.\nThe main thing is to make sure you have proper coverage of edge cases you may want to guard against.\nEven 10-50 examples can provide a lot of value!\nDon't worry about getting a large number to start - you can (and should) always add over time!\nHow to get: This is maybe the trickiest part."

In [12]:
from langchain_openai import ChatOpenAI
llm=ChatOpenAI(model="gpt-4o")

In [13]:
## Retrival Chain, Document Chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

prompt= ChatPromptTemplate.from_template(
    """
Answer the following question based on the content:
<context>
{context}
</context>

"""
)

document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nAnswer the following question based on the content:\n<context>\n{context}\n</context>\n\n'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x00000267C48AEF80>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x00000267C48BC100>, root_client=<openai.OpenAI object at 0x00000267C48AF370>, root_async_client=<openai.AsyncOpenAI object at 0x00000267C48AE530>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser(), kwargs={}, config={'run_name': '

In [14]:
from langchain_core.documents import Document
document_chain.invoke({
    "input":"Each datapoint should consist of, at the very least, the inputs to the application",
    "context":[Document(page_content="Each datapoint should consist of, at the very least, the inputs to the application. If you are able, it is also very helpful to define the expected outputs - these represent what you would expect a properly functioning application to output.")]
})

'The question appears to be asking for a summary or extraction of key information from the provided context. Based on the given context, a basic answer could be:\n\nEach datapoint should at minimum include the inputs to the application. Additionally, defining the expected outputs is beneficial as this represents what you anticipate a well-functioning application to produce.'

"However, we want the documents to first come from the retriever we just set up. That way, we can use the retriever to dynamically select the most relevant documents and pass those in for a given question."


In [15]:
vectordb

<langchain_community.vectorstores.faiss.FAISS at 0x267a8791720>

In [17]:
# Input-->Retriever-->VectorStore
retriever=vectordb.as_retriever()
from langchain.chains import create_retrieval_chain
retrieval_chain= create_retrieval_chain(retriever,document_chain)


In [18]:
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000267A8791720>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nAnswer the following question based on the content:\n<context>\n{context}\n</context>\n\n'), additional_kwargs={})])
            | ChatOpenAI(clien

In [19]:
## Get the response from the LLM
response= retrieval_chain.invoke({"input":"Each datapoint should consist of, at the very least, the inputs to the application"})
response['answer']

'1. **What should the schema of each datapoint be?**\n   - Each datapoint should at least include the inputs to the application. It is also helpful to define the expected outputs, which represent what a properly functioning application should produce. Additionally, more information can be added, such as the expected documents to fetch or the expected steps for an agent, depending on the context. LangSmith datasets allow for arbitrary schemas, offering flexibility in what you include.\n\n2. **How many datapoints should I gather?**\n   - Initially, it is recommended to gather around 10-20 examples by hand to start building the dataset. This provides a base value and can cover many edge cases that need guarding against. Over time, more datapoints can be added, especially after observing how real users interact with the application and identifying pain points.\n\n3. **How should I gather those datapoints?**\n   - Start by manually labeling around 10-20 examples. After establishing this bas

In [20]:
response

{'input': 'Each datapoint should consist of, at the very least, the inputs to the application',
 'context': [Document(id='28165ccb-448d-476b-86ba-83fdffa424f2', metadata={'source': 'https://docs.smith.langchain.com/evaluation/tutorials/evaluation', 'title': 'Evaluate a chatbot | 🦜️🛠️ LangSmith', 'description': 'In this guide we will set up evaluations for a chatbot.', 'language': 'en'}, page_content="Schema: Each datapoint should consist of, at the very least, the inputs to the application.\nIf you are able, it is also very helpful to define the expected outputs - these represent what you would expect a properly functioning application to output.\nOften times you cannot define the perfect output - that's okay! Evaluation is an iterative process.\nSometimes you may also want to define more information for each example - like the expected documents to fetch in RAG, or the expected steps to take as an agent.\nLangSmith datasets are very flexible and allow you to define arbitrary schemas.\

In [21]:
response['context']

[Document(id='28165ccb-448d-476b-86ba-83fdffa424f2', metadata={'source': 'https://docs.smith.langchain.com/evaluation/tutorials/evaluation', 'title': 'Evaluate a chatbot | 🦜️🛠️ LangSmith', 'description': 'In this guide we will set up evaluations for a chatbot.', 'language': 'en'}, page_content="Schema: Each datapoint should consist of, at the very least, the inputs to the application.\nIf you are able, it is also very helpful to define the expected outputs - these represent what you would expect a properly functioning application to output.\nOften times you cannot define the perfect output - that's okay! Evaluation is an iterative process.\nSometimes you may also want to define more information for each example - like the expected documents to fetch in RAG, or the expected steps to take as an agent.\nLangSmith datasets are very flexible and allow you to define arbitrary schemas.\nHow many: There's no hard and fast rule for how many you should gather.\nThe main thing is to make sure you