In [1]:
import getpass
import os

# Set LangSmith Environment Variables
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter API key for LangSmith")


In [2]:
# Set up google llm model for RAG usage
os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")


print("Initializing chat model!")
from langchain.chat_models import init_chat_model

llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
print("Chat model initialization over!")

Initializing chat model!
Chat model initialization over!


In [3]:
print("Initializing embedder model!")
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
print("Embedder model initialized!")

Initializing embedder model!


  from .autonotebook import tqdm as notebook_tqdm


Embedder model initialized!


In [4]:
print("Initialize vector store!")
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)
print("Vector store initialized!")

Initialize vector store!
Vector store initialized!


In [5]:
# Loading documents for RAG use

# get beautiful soup
import bs4
# get Document Loaders
from langchain_community.document_loaders import WebBaseLoader
desired_webpage_link = "https://lilianweng.github.io/posts/2023-06-23-agent/"
# the original tutorial only wanted us to keep html tags that
# contain any of the class names listed in the tuple assigned to the 
# class_ keyword, in the below variable. We discard all other html tags.
bs4_filtered = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))

webpage_loader = WebBaseLoader(
  web_paths=(desired_webpage_link,),
  bs_kwargs={"parse_only": bs4_filtered}
)

documents = webpage_loader.load()
#for key in documents[0]:
  #print(f"New key: {key}")
#print(f"how many characters are in our page: {len(documents[0].page_content)}")
#print(f"The first two thousand characters of the document: {documents[0].page_content[:2000]}")

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# create a text splitter instance
recursive_text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=200,
  add_start_index=True,
)

# obtain splitted text. our document is now a bunch of document shards.
splitted_text = recursive_text_splitter.split_documents(documents)
#print(f"The splits we got back:\n\n{splitted_text[4].page_content}")
#print(f"Number of splits:{len(splitted_text)}")

# We've loaded in our document. 
Split it up into text chunks, to allow our model to access bits of the data that fit into the LLM context window.

Now, it's time to put these chunks into a storage format the LLM can access.

In [7]:
# the vector store was previously initialized with our defined embeddings
# in one command sequence, we tell the vector store to embed each of our document chunks.
# we then store the embedded versions of these chunks into our vector store, for 
# later retrieval
vector_store__document_embedded_vectors = vector_store.add_documents(splitted_text)


In [8]:
print(f"vector store shape: {vector_store__document_embedded_vectors}")

vector store shape: ['3c76c953-85d1-4259-a476-1d2dd62a49ef', '0423c64b-bf95-49d9-acb5-fbbe88396bc9', '85e3f792-b3d3-4232-bdb7-ad6aca97cae6', 'ca24eaff-32c1-4288-be45-096e5ab6c3ff', '23a933a5-4fb7-4f28-b063-65d6b0a5d49c', 'aa2ca783-7b85-4ce8-813a-134551ad2188', '230b992a-9fb6-409d-ae84-07d0443c4549', '4a807761-bc9c-41dd-8813-d6a6bfc8d766', '417b929f-23e3-4cc8-8d44-824a4d4bc3af', '84b02693-64a3-4022-b94f-eb21bdc872cf', '54c246db-e38f-4a8a-8a99-4994e6875c64', '71a6923b-0fd0-4333-a734-2ae292c703ee', 'a5c38ce7-301c-4aac-991a-6a0603c7ae83', 'fa9e5f14-8049-424c-9414-163270f5c12d', '33a740b9-86b3-4ee4-a454-bc067092d3b7', '17e95539-2d28-4ce1-baba-814933724128', 'c2da44d5-4a93-40ef-9f7e-34adea180c6d', 'dec193b1-3dc9-46f9-882b-2d7525210cb8', 'fb5dfe54-ba31-49d6-83ce-ebfc1191b19d', '4c986bf8-f0d3-4711-a207-db84a9b8cbf1', 'f11b6967-c3c7-4a18-83bc-df296e58b0a1', '42a81ee6-4e75-4c81-ab83-cd72ce42cb41', 'e26bc43d-a23e-4e57-bda2-a8a02bde9a8c', 'a02f7c21-d607-4bc8-939e-33564a311508', 'f8df3097-ceeb-4e66

# We now should be able to answer User Queries
By accessing our vector store and returning the relevant chunks of text, from Lillian's blog post, that answers the user's questions.

## Onto Retrieval And Generation

In [9]:
# Get RAG prompt
from langchain import hub

# The actual RAG prompt
rag_prompt = hub.pull("rlm/rag-prompt")
#print(f"Actual template response: {rag_prompt.messages[0].prompt.template}.")

example_rag_message = rag_prompt.invoke({
  "context": "context here",
  "question": "question here"
}).to_messages() # Example of passing in context and question to a full RAG prompt.

print(f"Example rag message: {example_rag_message}")

Example rag message: [HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: question here \nContext: context here \nAnswer:", additional_kwargs={}, response_metadata={})]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
## Use LangGraph to generate for our RAG application.

# Begin the state definition phase, where we discuss what type of data we want our RAG model to process.

from langchain_core.documents import Document
from typing_extensions import List, TypedDict

# Create the actual state, to define what data our RAG model handles

class RAG_State(TypedDict):
  question: str
  context: List[Document] # remember, context should be several chunks of a document
  answer: str

def retrieve(rag_state: RAG_State):
  """
  Retrieves relevant data, based on our user question, from the vector store

  Args:
    rag_state: A dictionary which, for this problem, contains the user question, in the key 'question'. Has to be of type RAG_State
  
  Returns:
    All relevant context document chunks. Should be a dictionary of the following form: {str ; List[Document]}
  """
  similar_document_chunks = vector_store.similarity_search(rag_state["question"])
  return {"context" : similar_document_chunks}

def generate(rag_state: RAG_State):
  """
  Generate appropriate RAG response to the user query.

  Args:
    rag_state: A dictionary which, for this problem, contains the user question, in the key 'question', and the context of the question, in the key 'context'. Has to be of type RAG_State
  
  Returns:
    The model's response. Should be a dictionary of the following form: {str ; ?}
  """
  documents_contents = "\n\n".join(doc.page_content for doc in rag_state["context"])
  formalized_RAG_prompt = rag_prompt.invoke({"question": rag_state["question"], "context": rag_state["context"]})
  llm_RAG_response = llm.invoke(formalized_RAG_prompt)
  return {"answer" : llm_RAG_response.content}

In [11]:
# Create a graph object, to link together the retrieval and generate steps.
# Note that our retrieval and generate steps are simple as of now, but can be
# made more complex.

from langgraph.graph import START, StateGraph # START is a special Start Node

rag_agent_maker = StateGraph(RAG_State).add_sequence([retrieve, generate])
rag_agent_maker.add_edge(START, "retrieve") # links the special Start node to the retrieve node sequence we defined above.
rag_agent = rag_agent_maker.compile() # now we have made our RAG agent.

In [21]:
user_question = input("Please input your question to give to our RAG agent.")
rag_agent_response = rag_agent.invoke({"question": user_question})

print(f"Context: {rag_agent_response['context'][0].page_content}")
print(f"Answer: {rag_agent_response['answer']}")

Context: Illustration of the Reflexion framework. (Image source: Shinn & Labash, 2023)

The heuristic function determines when the trajectory is inefficient or contains hallucination and should be stopped. Inefficient planning refers to trajectories that take too long without success. Hallucination is defined as encountering a sequence of consecutive identical actions that lead to the same observation in the environment.
Self-reflection is created by showing two-shot examples to LLM and each example is a pair of (failed trajectory, ideal reflection for guiding future changes in the plan). Then reflections are added into the agent’s working memory, up to three, to be used as context for querying LLM.


Experiments on AlfWorld Env and HotpotQA. Hallucination is a more common failure than inefficient planning in AlfWorld. (Image source: Shinn & Labash, 2023)
Answer: Optimizing actions in the moment refers to making decisions based on immediate circumstances, while optimizing over time inv