In [5]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


os.environ["LANGSMITH_TRACING"] = "true"
# os.environ["LANGSMITH_API_KEY"] = getpass.getpass()

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

# embeddings 
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# in memory storage (more like database)
vector_store = InMemoryVectorStore(embeddings) # stores our embeddings in memory: for faster searches and retrievals

# loading documents
loader = WebBaseLoader(
  web_path=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
  bs_kwargs=dict(
    parse_only=bs4.SoupStrainer(class_=("post-content", "post-title", "post-loader"))
  ),
)

docs = loader.load()

print(f"loaded documents with {len(docs[0].page_content)} characters")


loaded documents with 42964 characters


In [6]:
# splitting the loaded document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap=200,
)
all_splits = text_splitter.split_documents(docs)
print(f"document split into {len(all_splits)} chunks")

document split into 63 chunks


In [None]:
# storing the chunks
# the document chunks have now been stored as searchable vetctors
document_ids = vector_store.add_documents(documents=all_splits)
print(f"stored {len(document_ids)} ducument chunks") 

In [None]:
# retrieval and generation 

#setup the application structure

from langchain import hub
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

# get premade prompt for RAG
prompt = hub.pull("rlm/rag-prompt")

# define what information flows through your app
class State (TypedDict):
    question: str # user's question
    context: List[Document] # retrieved documents
    answer = str # generted answer