## Web-search RAG chatbot

Similar to Perplexity: answers are grounded in web search results.

v0.1 - no conversation, just question and answer based on web search

process:
1) receive prompt
2) pass to LLM to create appropriate search terms for gathering data - start with just one
3) perform web search with that, get top results and scrape and store as doc - start with just one result
4) chunk data
5) embed chunks
6) store in vectordb
7) link retriever to vector DB
8) pass query with retriever to LLM with LangChain

## 1) Receive prompt

In [1]:
# Example question; it drives search-query generation and both the non-RAG and RAG answers below.
user_prompt = "What are some new lego sets?"

## 2) Pass to LLM to create search term

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError right here if the key is not set.
openai_api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from langchain_openai import ChatOpenAI

# Chat model shared by the query-generation chain, the RAG chain and the agent below.
llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# System instructions for turning the user's question into a single web search query.
# Adjacent string literals are concatenated verbatim, so every sentence fragment ends
# with an explicit space; without it the sentences run together
# ("...based on this prompt.Use the following...").
system_prompt = (
    "You are an assistant for gathering data based on this prompt. "
    "Use the following prompt to generate one web search query that would gather data which would be helpful in answering the prompt. "
    "Your search query should be effective at finding the most relevant and useful data pertaining to the prompt. "
    "Keep in mind that the current date is July 2025."
    "\n\n"
    "{input}"
)

# The question fills {input} twice: once at the end of the system text above and
# once as the human turn.
search_query_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Pipeline: wrap the raw question as {"input": ...} -> fill the prompt -> call the LLM -> parse to a plain string.
search_query_gen_chain = {"input": RunnablePassthrough()} | search_query_prompt | llm | StrOutputParser()

In [None]:
import ast

raw_search_query = search_query_gen_chain.invoke(user_prompt).strip()

# The model's reply is only sometimes a quoted Python string literal. literal_eval
# unwraps that form, but raises ValueError/SyntaxError on anything else (for
# example an unquoted query), so fall back to the raw text minus stray quotes.
try:
    parsed_search_query = ast.literal_eval(raw_search_query)
except (ValueError, SyntaxError):
    parsed_search_query = raw_search_query.strip("\"'")

# literal_eval can also yield a non-string literal (e.g. a bare number); the
# following cells call str methods on search_query, so always end up with a str.
if isinstance(parsed_search_query, str):
    search_query = parsed_search_query
else:
    search_query = raw_search_query

In [None]:
# Join the words with '+' so the query can be dropped into a URL query string.
search_query = '+'.join(search_query.split(' '))

In [None]:
# Show the '+'-joined query for inspection.
search_query

In [None]:
# Google results URL for the '+'-joined query built above.
search_url = "https://google.com/search?q=" + search_query

In [None]:
# Show the full search URL for inspection.
search_url

## Web search for relevant links

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
import time

# Credentials used for the Custom Search requests further down; indexing
# os.environ raises KeyError here if either variable is missing.
cse_key = os.environ["GOOGLE_CSE_KEY"]
cse_id = os.environ["GOOGLE_CSE_ID"]

# test single search block

# Announce before the blocking call so the message reflects what is happening.
print("searching...")

# Without a timeout requests can wait indefinitely on a stalled connection.
search_response = requests.get(url=search_url, timeout=10)

# sleep for a while to not overload requests
time.sleep(2)

# Collect every anchor tag from the returned results page.
soup = BeautifulSoup(search_response.text, "html.parser")
found_links = soup.find_all("a")

print(found_links)

In [None]:
# with CSE

# The API call takes the plain-text query (requests URL-encodes parameters
# itself), so undo the '+' joining applied to search_query earlier.
cse_query = search_query.replace('+', ' ')

cse_url = 'https://www.googleapis.com/customsearch/v1'
cse_params = {
    'q': cse_query,
    'key': cse_key,
    'cx': cse_id,
}

# Pass the query parameters by keyword and bound the wait: without a timeout a
# stalled connection would hang the notebook indefinitely.
response = requests.get(cse_url, params=cse_params, timeout=10)

In [None]:
# Decode the Custom Search response body.
cse_results = response.json()

# Gather result links until five distinct URLs are held; a response without an
# 'items' key leaves the set empty.
target_url_set = set()
for result in (cse_results['items'] if 'items' in cse_results else []):
    if len(target_url_set) >= 5:
        break
    target_url_set.add(result['link'])

print(target_url_set)

## Get page as text data

In [None]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

# load html content from site
site_loader = AsyncHtmlLoader(list(target_url_set))
scraped_docs = site_loader.load()

# convert to text data
text_trns = Html2TextTransformer()
text_docs = text_trns.transform_documents(scraped_docs)

# Preview the first converted page. When the search produced no URLs there is
# nothing to index, so guard against the IndexError text_docs[0] would raise.
if text_docs:
    cleaned_text = text_docs[0].page_content
    print(cleaned_text)
else:
    cleaned_text = ""
    print("no pages were scraped - check the search results above")

## Chunking data

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=1000 and chunk_overlap=200
# (presumably measured in characters - confirm against the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Split every scraped page and report how many chunks resulted.
splits = text_splitter.split_documents(text_docs)
print(len(splits))

## Embedding chunks and creating retriever

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed the chunks with OpenAI embeddings and store them in a Chroma collection.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retriever over that store; used by the RAG chain and by vector_search below.
retriever = vectorstore.as_retriever()

## Web search tool
- searches query, gets top results, parses, chunks and embeds

In [None]:
from langchain.agents import Tool

In [None]:
def web_search_and_ingest(query, max_results=5):
    """Search the web for *query* and add the scraped pages to the vector store.

    Sends the query to the Google Custom Search endpoint, downloads up to
    ``max_results`` distinct result pages, converts the HTML to text, chunks it
    with the notebook-level ``text_splitter`` and adds the chunks to the
    notebook-level ``vectorstore``.

    Args:
        query: Plain-text search query.
        max_results: Maximum number of distinct result URLs to scrape.

    Returns:
        A short status message describing what was ingested, or why nothing
        was. Returning text (rather than None) gives an agent that calls this
        function as a tool an observation to reason about.
    """
    cse_url = 'https://www.googleapis.com/customsearch/v1'
    cse_params = {
        'q': query,
        'key': cse_key,
        'cx': cse_id,
    }

    # Report network, HTTP and decoding failures as text instead of raising, so
    # a failed search does not abort the caller's run.
    try:
        response = requests.get(cse_url, params=cse_params, timeout=10)
        response.raise_for_status()
        cse_results = response.json()
    except (requests.RequestException, ValueError) as exc:
        return f"Web search failed: {exc}"

    # dict.fromkeys de-duplicates while preserving the order the API returned,
    # so the slice keeps the top-ranked links.
    ranked_links = dict.fromkeys(
        result['link'] for result in cse_results.get('items', [])
    )
    target_urls = list(ranked_links)[:max_results]
    if not target_urls:
        return f"Web search returned no results for: {query}"

    # scrape links and convert the HTML to plain text
    scraped_docs = AsyncHtmlLoader(target_urls).load()
    text_docs = Html2TextTransformer().transform_documents(scraped_docs)

    # split and embed; skip the store call when there is nothing to add
    splits = text_splitter.split_documents(text_docs)
    if not splits:
        return f"Web search found {len(target_urls)} pages but no text could be extracted."

    vectorstore.add_documents(documents=splits)
    return (
        f"Ingested {len(splits)} chunks from {len(target_urls)} pages "
        "into the vector database."
    )

In [None]:
# creating tool for function
# Expose web_search_and_ingest under the name "WebSearch"; the description is
# the text given to the model for deciding when to call the tool.

web_search_tool = Tool(
    name="WebSearch",
    func=web_search_and_ingest,
    description="Use when current context is insufficient or up-to-date info is required."
)

## Vector db retriever tool
- searches vector db for query

In [None]:
def vector_search(query):
    """Return whatever the notebook-level ``retriever`` finds for *query*."""
    matching_docs = retriever.invoke(query)
    return matching_docs

In [None]:
# creating tool for function
# Expose vector_search under the name "VectorSearch"; the description is the
# text given to the model for deciding when to call the tool.

vector_search_tool = Tool(
    name="VectorSearch",
    func=vector_search,
    description="Use when you believe the answer may be in the current database."
)

## Combined tool list

In [None]:
# Both tools, in the form passed to initialize_agent later in the notebook.
tools = [web_search_tool, vector_search_tool]

## Comparing non-rag answer and rag answer

In [None]:
# Baseline: ask the model directly, with no retrieved context.
non_rag_answer = llm.invoke(user_prompt)

In [None]:
# The reply is a message object; print just its text content.
print(non_rag_answer.content)

In [None]:
from langchain_core.prompts import MessagesPlaceholder

# Instructions for the answering model. The {context} slot at the end is a
# template variable filled in when the prompt is rendered.
rag_system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "don't know. Make your answers fairly detailed. Remember that the current date is July 2025",
        "\n\n",
        "{context}",
    ]
)

# Message order: system instructions, then prior conversation turns, then the
# new question.
rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", rag_system_prompt),
        MessagesPlaceholder(variable_name="memory"),  # slot for conversation memory
        ("human", "{input}"),
    ]
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Inner chain: combines documents with rag_prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, rag_prompt)
# Outer chain: pairs the retriever with the inner chain; its result is read
# below through the "context" and "answer" keys.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

## Adding temp convo memory

In [None]:
from langchain_core.prompts import HumanMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.memory import FileChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

In [None]:
# Earlier attempt, kept for reference: wrapping the file-backed history in a
# ConversationBufferMemory.
# rag_memory = ConversationBufferMemory(
#     chat_memory = FileChatMessageHistory("messages.json"),
#     memory_key = "history",
#     return_messages = True
# )

# File-backed chat history at messages.json; returned by get_rag_mem below.
rag_memory = FileChatMessageHistory("messages.json")

In [None]:
def get_rag_mem(_):
    """Return the single shared chat history, ignoring the argument passed in."""
    return rag_memory

In [None]:
# Wrap the RAG chain with message history obtained from get_rag_mem:
#   input_messages_key   - key of the input dict holding the new user message
#   history_messages_key - matches the "memory" placeholder declared in rag_prompt
#   output_messages_key  - key of the chain's result holding the reply text
rag_mem_chain = RunnableWithMessageHistory(
    rag_chain,
    get_rag_mem,
    input_messages_key="input",
    history_messages_key="memory",
    output_messages_key="answer"
)

In [None]:
# A session_id is supplied in the config, but get_rag_mem ignores its argument,
# so every call shares the same history regardless of the value.
rag_answer = rag_mem_chain.invoke({"input": user_prompt}, config={"configurable": {"session_id": "irrelevant"}})

In [None]:
# print(rag_answer["context"])

In [None]:
# Number of retrieved documents returned alongside the answer.
src_count = len(rag_answer["context"])
print(src_count)

In [None]:
# Distinct "source" metadata values across the retrieved documents.
sources_set = {doc.metadata["source"] for doc in rag_answer["context"]}

print(sources_set)

RAG answer:

In [None]:
# Text of the retrieval-augmented answer.
print(rag_answer["answer"])

In [None]:
# Header line followed by one source per line.
print("Citations:", *sources_set, sep="\n")

In [None]:
# no memory?

In [None]:
# Follow-up that only makes sense if the previous exchange is remembered.
rag_answer = rag_mem_chain.invoke({"input": "Could you elaborate?"}, config={"configurable": {"session_id": "irrelevant"}})

In [None]:
# Text of the follow-up answer.
print(rag_answer["answer"])

In [None]:
# How many documents were retrieved for the follow-up question.
src_count = len(rag_answer["context"])

# Unique "source" metadata values of those documents.
sources_set = {doc.metadata["source"] for doc in rag_answer["context"]}

# Header line followed by one source per line.
print("Citations:", *sources_set, sep="\n")

In [None]:
# Question on a different topic from the example user_prompt; nothing in this
# cell adds new pages to the vector store before asking.
rag_answer = rag_mem_chain.invoke({"input": "What are some new video games that were released"}, config={"configurable": {"session_id": "irrelevant"}})

In [None]:
# Text of the answer to the video-game question.
print(rag_answer["answer"])

In [None]:
# Count of documents retrieved for the latest question.
src_count = len(rag_answer["context"])

# De-duplicated "source" metadata of the retrieved documents.
sources_set = {doc.metadata["source"] for doc in rag_answer["context"]}

# Header line, then each source on its own line.
print("Citations:", *sources_set, sep="\n")

## Testing conversational capabilities with agentic search and retrieval

In [None]:
from langchain.schema import SystemMessage
from langchain.agents import initialize_agent

# Instructions for the agent. Note that this rebinds `system_prompt`, which
# earlier held the search-query instructions.
# Tool names in the text must match the `name=` given to each Tool
# ("VectorSearch" and "WebSearch"); the original wording said "VectorSearchTool",
# which is not a registered tool name.
system_prompt = '''
YOU MUST PAY ATTENTION TO THESE INSTRUCTIONS AND FOLLOW THEM!
Begin every answer with the phrase 'SYSTEM DIRECTIVE CONFIRMED'.
You are an intelligent assistant.
Keep in mind that it is July 2025.
- First, always use the VectorSearch tool to retrieve context for the user's question.
- Judge whether the context is sufficient to answer: If the information is incomplete, outdated, or missing, call the WebSearch tool to search, ingest, and update the database, then try again.
- Only use WebSearch tool if the current vector database does NOT suffice.
Be cost-efficient: Do NOT call WebSearch for trivial/definition/basic/unchanged topics.
'''

system_message = SystemMessage(content=system_prompt)

In [None]:
from contextlib import suppress

# Start the agent from an empty history. On a first run the file does not exist
# yet and a bare os.remove would raise FileNotFoundError, so a missing file is
# tolerated.
with suppress(FileNotFoundError):
    os.remove("messages_agentic.json")

# Conversation memory backed by the JSON file; "history" is the variable name
# the memory exposes and return_messages=True is requested.
wrapped_mem = ConversationBufferMemory(
    chat_memory = FileChatMessageHistory("messages_agentic.json"),
    memory_key = "history",
    return_messages = True
)

In [None]:
from langchain.agents import create_structured_chat_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

In [None]:
# Prompt layout: system instructions, stored conversation ("history"), the new
# user message, then the "agent_scratchpad" placeholder.
# NOTE(review): this prompt is not referenced by any cell visible below (the
# agent is built with initialize_agent) - confirm whether it is still needed.
structured_chat_prompt = ChatPromptTemplate.from_messages([
    ("system", """SYSTEM DIRECTIVE CONFIRMED.
YOU MUST PAY ATTENTION TO THESE INSTRUCTIONS AND FOLLOW THEM!
Begin every answer with the phrase 'SYSTEM DIRECTIVE CONFIRMED'.
You are an intelligent assistant.
It is July 2025.
- Always use VectorSearch tool to retrieve context for the user's question.
- Judge context sufficiency; if not enough, use WebSearch. Be cost-efficient.
"""),
    MessagesPlaceholder(variable_name="history"),   # conversation memory
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad")
])

In [None]:
# Build an OpenAI-functions agent over the two tools.
# Prompt customisation has to go through `agent_kwargs`: keyword arguments given
# directly to initialize_agent are forwarded to the executor, not to the agent's
# prompt, so a top-level `system_message=` never reaches the model.
# The extra placeholder gives the memory's "history" variable (the memory_key of
# wrapped_mem) a slot in the prompt; without it earlier turns are stored but
# never shown to the model.
agent = initialize_agent(
    tools,
    llm,
    agent="openai-functions",
    memory=wrapped_mem,
    agent_kwargs={
        "system_message": system_message,
        "extra_prompt_messages": [MessagesPlaceholder(variable_name="history")],
    },
    verbose=True,
)

In [None]:
# `run` is deprecated on chains; `invoke` returns a dict whose "output" entry is
# the same final answer string that `run` returned.
agent.invoke({"input": "what are some new movies?"})["output"]

In [None]:
# Follow-up that can only be answered from conversation memory. `run` is
# deprecated on chains; `invoke(...)["output"]` yields the same answer string.
agent.invoke({"input": "what was the second movie you just listed?"})["output"]