In [None]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader, SitemapLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import nest_asyncio
import requests
from bs4 import BeautifulSoup

In [None]:
# Allow nested use of the notebook's already-running asyncio event loop.
nest_asyncio.apply()

# Load variables from the local .env file into the process environment.
load_dotenv(dotenv_path='.env')

# Kept for reference; None when the variable is not set. It is not referenced
# again in the cells shown here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Embedding model reused when the FAISS index is built and when it is reloaded.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [None]:
# Loader for the single "about" page; the next cell calls load() on it.
loader = WebBaseLoader(web_path="https://www.ehackacademy.com/about")

In [None]:
# Splitter targeting 500-character chunks with a 20-character overlap.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=500)

# Fetch the page as Document objects, then cut it into chunks.
docs = loader.load()
chunks = splitter.split_documents(documents=docs)

### XML Parsing: collect page URLs from the sitemap

In [None]:
# Download the site's sitemap and collect every URL listed in a <loc> element.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get("https://www.ehackacademy.com/sitemap.xml", timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "xml")

# Strip surrounding whitespace (pretty-printed sitemaps often wrap the URL in
# newlines/indentation) and drop any empty entries.
urls = [loc.get_text(strip=True) for loc in soup.find_all("loc")]
urls = [url for url in urls if url]

In [None]:
# Show how many URLs were collected plus a short preview, rather than dumping
# the entire list into the notebook output.
print(f"{len(urls)} URLs found")
print(urls[:10])

In [None]:
# Download every page listed in `urls` and gather the resulting Documents.
#
# load() is used instead of WebBaseLoader.aload(): that method is deprecated,
# and on the base loader interface `aload` is a coroutine, whose un-awaited
# result cannot be passed to list.extend().
# The per-URL loader gets its own name so the `loader` created in the earlier
# cell is not overwritten.
pages = []
for url in urls:
    page_loader = WebBaseLoader(url)
    pages.extend(page_loader.load())


In [None]:
# Sanity check: show the text extracted from the first downloaded page.
print(pages[0].page_content)

In [None]:
# Splitter for the whole site: the only separator is a single space, with a
# 500-character chunk size and a 20-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20, separators=[" "])

# Replace the earlier single-page `chunks` with chunks from every sitemap page.
chunks = splitter.split_documents(documents=pages)

# Report how many chunks will be embedded.
print(len(chunks))

In [None]:
# Embed every chunk into an in-memory FAISS index ...
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

# ... and persist it to the local "faiss_index" folder.
vector_store.save_local(folder_path="faiss_index")


In [None]:
# Reload the index from the local "faiss_index" folder.
# NOTE(security): allow_dangerous_deserialization=True permits pickle loading,
# which can execute arbitrary code. Only load index folders you created yourself.
vectorstore = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Retriever over the reloaded index, configured with k=3 results per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)

In [None]:
# Smoke-test retrieval with a sample question and preview each returned chunk.
results = retriever.invoke("Who is founder of eHack Academy")
for position, hit in enumerate(results, start=1):
    print(f"\n--- Chunk {position} ---")
    print(f"Source: {hit.metadata.get('source', 'N/A')}")
    # Only the first 300 characters, to keep the output short.
    print(hit.page_content[:300])

In [None]:
from langchain_openai import ChatOpenAI
from langchain.tools import tool

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    # The docstring above is intentionally left untouched: presumably `@tool`
    # exposes it to the model as the tool description -- confirm before editing.
    #
    # Returns a 2-tuple matching response_format="content_and_artifact":
    #   1. a text rendering of the hits (metadata + content per hit),
    #   2. the raw retrieved documents.
    #
    # NOTE(review): this searches `vector_store` (the index built in this
    # session), not `vectorstore` (the index reloaded from disk) -- confirm
    # which one is intended.
    matches = vector_store.similarity_search(query, k=2)

    rendered = []
    for match in matches:
        rendered.append(f"Source: {match.metadata}\nContent: {match.page_content}")

    return "\n\n".join(rendered), matches
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
# Chat model that drives the agent.
model = ChatOpenAI(model="gpt-4.1")

# The retrieval tool is the only tool the agent can call.
tools = [retrieve_context]

# Custom system instructions for the agent.
# (Fixed the typo "a webaite pages", which was being sent verbatim to the model.)
prompt = (
    "You have access to a tool that retrieves context from website pages. "
    "Use the tool to help answer user queries."
)

# Tool-calling agent: the model decides when to invoke `retrieve_context`.
agent = create_agent(model, tools, system_prompt=prompt)
# Two-part question: ask for the programs, then ask for a feature breakdown.
query = (
    "What programs does eHack Academy offer?\n\n"
    "Once you get the answer, list all programs and their features also."
)

# Stream the agent run in "values" mode and pretty-print the newest message
# from each streamed state.
user_turn = {"role": "user", "content": query}
for event in agent.stream({"messages": [user_turn]}, stream_mode="values"):
    newest = event["messages"][-1]
    newest.pretty_print()

In [None]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build a system prompt that embeds context retrieved for the latest message.

    Args:
        request: The model request; the text of the last entry in
            ``request.state["messages"]`` is used as the search query.

    Returns:
        A system prompt made of a fixed instruction followed by the retrieved
        documents' page contents, separated by blank lines.
    """
    # Text of the most recent message in the conversation state.
    latest_text = request.state["messages"][-1].text

    # Similarity search with the store's default number of results.
    hits = vector_store.similarity_search(latest_text)
    context = "\n\n".join(hit.page_content for hit in hits)

    return (
        "You are a helpful assistant. Use the following context in your response:"
        f"\n\n{context}"
    )


# Rebind `agent` to a tool-less variant: retrieval now happens inside the
# `prompt_with_context` middleware rather than through a tool call.
agent = create_agent(
    model,
    tools=[],
    middleware=[prompt_with_context],
)

In [None]:
# Ask the middleware-based agent a question and pretty-print the newest
# message from each streamed state.
query = "How many program eHack Academy offer??"
conversation = [{"role": "user", "content": query}]
for step in agent.stream({"messages": conversation}, stream_mode="values"):
    newest = step["messages"][-1]
    newest.pretty_print()