In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import OpenAI
from langchain_core.output_parsers.string import StrOutputParser

gen_searches_sys_prompt = (
    """
    You are a specialized AI Assistant helping a user create an LLM agent for them to use.
    The user is presumably non technical. Given the user request, consider which web searches would yield
    the most relevant information to add to the vector store of the user's new agent. Do NOT answer the
    user's request, just create five web search queries based on it to help populate the vector store with data.
    Make the five requests relatively varied, each of the requests will have the top website result fully scraped so
    you should make them varied to grab a wide range of information. Output five strings of the search requests separated
    by new lines. Do NOT preceed your answer with any tags like System: or AI:.

    Here is an example (Do NOT use this directly):
    Question:
    Human: help me make assistant to apply to santa clara university
    
    Five Web Searches (separated by new lines):
    santa clara university main website
    \n
    applying to santa clara university
    \n
    santa clara university application requirements
    \n
    how to apply to santa clara university
    \n
    santa clara university information

    \n\n
    Question:
    """
)

gen_searches_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", gen_searches_sys_prompt),
        ("human", "{input}"),
        ("system", "\n\nFive Web Searches (separated by new lines):")
    ]
)

gen_searches_llm = OpenAI()
gen_searches_chain = gen_searches_prompt | gen_searches_llm | StrOutputParser()



In [3]:
searches = gen_searches_chain.invoke("please help make an assistant for me that will teach me to cook simple recipes for gym")
searches

'\nsimple gym recipes\n\neasy gym meals\n\ncooking for fitness\n\nsimple recipes for workout\n\nhealthy and easy gym recipes'

In [4]:
searches = gen_searches_chain.invoke("i want helper for cleaning my apartment")

In [5]:
searches = gen_searches_chain.invoke("teach me to ski")

In [6]:
from googlesearch import search

def get_top_google_results(query, num_results=5):
    # Perform the search and return the top links
    return [link for link in search(query, num_results=num_results)]

# Example usage
query = "santa clara university"
top_links = get_top_google_results(query)
for i, link in enumerate(top_links, 1):
    print(f"Result {i}: {link}")

Result 1: https://www.scu.edu/
Result 2: https://www.scu.edu/


In [7]:
lines = [line for line in searches.split('\n') if line.strip()]
lines

['    beginner skiing tips',
 '    how to ski for beginners',
 '    ski lessons for beginners',
 '    skiing techniques for beginners',
 '    skiing for beginners tutorial']

In [8]:
links = []
for line in lines:
    links.extend(get_top_google_results(line))

links

['https://www.reddit.com/r/skiing/comments/v9tl03/first_time_skiing_any_tips/',
 'https://www.rei.com/learn/expert-advice/how-to-ski.html',
 'https://bearfoottheory.com/beginner-ski-tips-for-adults/',
 'https://hea-www.harvard.edu/~fine/opinions/firsttimeski.html',
 'https://www.skimag.com/performance/top-10-tips-beginners/',
 'https://www.reddit.com/r/skiing/comments/v9tl03/first_time_skiing_any_tips/',
 'https://www.thesnowpros.org/take-a-lesson/beginners-guide-to-skiing/',
 'https://www.rei.com/learn/expert-advice/how-to-ski.html',
 'https://bearfoottheory.com/beginner-ski-tips-for-adults/',
 'https://www.thesnowpros.org/take-a-lesson/beginners-guide-to-skiing/',
 'https://www.northstarcalifornia.com/plan-your-trip/ski-and-ride-lessons/first-timers.aspx',
 'https://www.reneeroaming.com/beginners-guide-to-skiing/',
 'https://www.reddit.com/r/bayarea/comments/yy7bmu/cost_effective_option_to_learn_skiing_up_to_at/',
 'https://www.reddit.com/r/skiing/comments/sgtd73/hey_rskiing_any_tips

In [10]:
import re
import lxml
from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

docs = []
links = ["https://www.scu.edu/engineering/academic-programs/department-of-computer-engineering/"]
for link in links:
    loader = RecursiveUrlLoader(link, extractor=bs4_extractor, max_depth=3)
    docs.extend(loader.load())
    
print(docs[0].page_content[:200])

Department of Computer Science and Engineering - School of Engineering - Santa Clara University

Skip to main content 

Students

Faculty & Staff

Families

Alumni

Visitors


              Offices &


In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

In [12]:
splits = text_splitter.split_documents(docs)

In [13]:
splits[100].page_content

'Military Science\nModern Languages and Literatures\nMusic\nMusical Theatre\nNeuroscience\nPhilosophy\nPhysics\nPolitical Science\nPremodern Studies Program\nPsychology\nPublic Health\nReligious Studies\nSociology\nSustainability Program\nTheatre and Dance\nUrban Education Program'

In [14]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings_model.model

'text-embedding-3-small'

In [15]:
len(splits)

205

In [16]:
from langchain_community.vectorstores.utils import filter_complex_metadata

splits = filter_complex_metadata(splits)

In [17]:
len(splits)

205

In [18]:
from langchain_chroma import Chroma

vector_store = Chroma.from_documents(splits, embeddings_model)

In [19]:
from langchain_openai import OpenAI

retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 30, "fetch_k": 10, "lambda_mult": 0.5})
rephraser_llm = OpenAI(temperature=0, model="gpt-4o-mini")

In [20]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = (
    """
    You are a specialized AI Assistant. Given a chat history and the latest user question
    which might reference context in the chat history, 
    formulate a standalone question which can be understood 
    without the chat history. Do NOT answer the question, just 
    reformulate it if needed and otherwise return it as is.
    Do NOT preceed your answer with any tags like System: or AI:.

    Here is an example:
    History:
    Human: what is an LLM?
    AI: an LLM is a large language model, a type of transformer machine learning algorithm used to interpret images and speech.
    Question:
    Human: how do I make it?
    Rephrased Question:
    How do I create an LLM transformer?
    \n\n
    History:
    \n\n
    """

)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("system", "\n\nQuestion:\n\n"),
        ("human", "{input}"),
        ("system", "\n\nRephrased Question:\n\n")
    ]
)
history_aware_retriever = create_history_aware_retriever(
    rephraser_llm, retriever, contextualize_q_prompt
)

In [25]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

sys_prompt = (
    """
    You are an assistant for question-answering tasks with retrieval augmented generation.
    Use the following pieces of retrieved context and the history of your interactions
    with the human to answer the question. If you don't know the answer, say that you
    don't know. Be descriptive. Do not complete or further generate the
    user's question. Do not refer to the context or history of interaction, avoid phrases like
    "according to the context" or "from the given documents".
    \n\n
    Context:
    \n\n
    {context}
    \n\n
    History:
    \n\n
    """
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", sys_prompt),
        MessagesPlaceholder("chat_history"),
        ("system", "\n\nQuestion:"),
        ("human", "{input}"),
        ("system", "\n\nAnswer (Do not start your answer with AI or System):\n\n")
    ]
)
main_llm = OpenAI()
question_answer_chain = create_stuff_documents_chain(main_llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
main_llm.temperature

0.7

In [26]:
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

chat_history = []

def ask_question(question):
    global chat_history
    ai_msg = rag_chain.invoke({"input": question, "chat_history": chat_history})
    chat_history.append(HumanMessage(content = question))
    print(ai_msg)
    chat_history.append(AIMessage(content = ai_msg["answer"]))
    return(ai_msg["answer"])

In [27]:
chat_history

[]

In [28]:
ask_question("how do I prepare a sweet protein shake?")

{'input': 'how do I prepare a sweet protein shake?', 'chat_history': [HumanMessage(content='how do I prepare a sweet protein shake?', additional_kwargs={}, response_metadata={})], 'context': [Document(metadata={'content_type': 'text/html; charset=utf-8', 'description': '', 'language': 'en', 'source': 'https://www.scu.edu/engineering/academic-programs/department-of-computer-engineering/research/', 'title': 'Research - School of Engineering - Santa Clara University'}, page_content="Dr. Weijia Shang's Research\n\nDr. Shang's research interests include parallel processing, computer architecture, algorithm theory, and nonlinear programming.\n\n \n\nDepartment of Computer Science and Engineering Sections \nDepartment of Computer Science and Engineering\n\nDepartment of Computer Science and EngineeringAbout UsUndergraduateGraduateFaculty & StaffLaboratoriesResearch\n\nMultimedia Visual Processing Laboratory (MVP Lab)\nSCU Internet of Things Research Lab"), Document(metadata={'content_type': '

'\nTo prepare a sweet protein shake, you will need protein powder, milk or milk alternative, a sweetener of your choice (such as honey, agave nectar, or stevia), and any additional flavorings (such as cocoa powder or fruit). \n1. Start by adding a scoop of protein powder into a blender.\n2. Next, pour in your desired amount of milk or milk alternative. You can adjust the amount depending on how thick you want your shake to be.\n3. Add in your chosen sweetener, starting with a small amount and adjusting to taste.\n4. If desired, add in any additional flavorings.\n5. Blend all ingredients together until smooth.\n6. Pour into a glass and enjoy your sweet protein shake!'

In [29]:
chat_history

[HumanMessage(content='how do I prepare a sweet protein shake?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='\nTo prepare a sweet protein shake, you will need protein powder, milk or milk alternative, a sweetener of your choice (such as honey, agave nectar, or stevia), and any additional flavorings (such as cocoa powder or fruit). \n1. Start by adding a scoop of protein powder into a blender.\n2. Next, pour in your desired amount of milk or milk alternative. You can adjust the amount depending on how thick you want your shake to be.\n3. Add in your chosen sweetener, starting with a small amount and adjusting to taste.\n4. If desired, add in any additional flavorings.\n5. Blend all ingredients together until smooth.\n6. Pour into a glass and enjoy your sweet protein shake!', additional_kwargs={}, response_metadata={})]