In [3]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import OpenAI
from langchain_core.output_parsers.string import StrOutputParser

gen_searches_sys_prompt = (
    """
    You are a specialized AI Assistant helping a user create an LLM agent for them to use.
    The user is presumably non technical. Given the user request, consider which web searches would yield
    the most relevant information to add to the vector store of the user's new agent. Do NOT answer the
    user's request, just create five web search queries based on it to help populate the vector store with data.
    Make the five requests relatively varied, each of the requests will have the top website result fully scraped so
    you should make them varied to grab a wide range of information. Output five strings of the search requests separated
    by new lines. Do NOT preceed your answer with any tags like System: or AI:.

    Here is an example (Do NOT use this directly):
    Question:
    Human: help me make assistant to apply to santa clara university
    
    Five Web Searches (separated by new lines):
    santa clara university main website
    \n
    applying to santa clara university
    \n
    santa clara university application requirements
    \n
    how to apply to santa clara university
    \n
    santa clara university information

    \n\n
    Question:
    """
)

gen_searches_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", gen_searches_sys_prompt),
        ("human", "{input}"),
        ("system", "\n\nFive Web Searches (separated by new lines):")
    ]
)

gen_searches_llm = OpenAI(model="gpt-4o-mini")
gen_searches_chain = gen_searches_prompt | gen_searches_llm | StrOutputParser()

'gpt-4o-mini'

In [69]:
searches = gen_searches_chain.invoke("please help make an assistant for me that will teach me to cook simple recipes for gym")
searches

'\n\nsimple recipes for gym\ncooking for gym\nhealthy recipes for gym\neasy recipes for gym\ngym meal prep recipes'

In [5]:
searches = gen_searches_chain.invoke("i want helper for cleaning my apartment")

In [46]:
searches = gen_searches_chain.invoke("teach me to ski")

In [70]:
from googlesearch import search

def get_top_google_results(query, num_results=5):
    # Perform the search and return the top links
    return [link for link in search(query, num_results=num_results)]

# Example usage
query = "santa clara university"
top_links = get_top_google_results(query)
for i, link in enumerate(top_links, 1):
    print(f"Result {i}: {link}")

Result 1: https://www.scu.edu/
Result 2: https://www.scu.edu/
Result 3: https://santaclarabroncos.com/


In [72]:
lines = [line for line in searches.split('\n') if line.strip()]
lines

['simple recipes for gym',
 'cooking for gym',
 'healthy recipes for gym',
 'easy recipes for gym',
 'gym meal prep recipes']

In [73]:
links = []
for line in lines:
    links.extend(get_top_google_results(line))

links

['https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://www.reddit.com/r/fitmeals/comments/7sg5lz/quick_and_easy_fit_meals_that_are_actually_quick/',
 'https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://www.reddit.com/r/cookingforbeginners/comments/1dprzcq/cooking_guide_sources_for_gym_g/',
 'https://www.menshealth.com/uk/nutrition/a26128424/meal-preps-for-muscle-gain/',
 'https://www.youtube.com/user/fitmencook',
 'https://www.amazon.com/You-Are-Your-Own-Gym/dp/0553395009',
 'https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://healthyfitnessmeals.com/blog/',
 'https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://healthyfitnessmeals.com/',
 'https://www.reddit.com/r/fitmeals/comments/7sg5lz/quick_and_easy_fit_meals_that_are_actually_quick/',
 'https://www.instagram.com/gymratrecipes/?hl=en',
 'https://www.mypro

In [74]:
import re
from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

docs = []
for link in links:
    loader = RecursiveUrlLoader(link, extractor=bs4_extractor, max_depth=3)
    docs.extend(loader.load())
    
print(docs[0].page_content[:200])

  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


57 meal prep recipes for muscle building & fat loss | Myprotein           

     Skip to main content 

       
✕
      Protein    Discover Best SellersNew ProductsBundlesSamplesNew to supplements?Imp


In [75]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

In [76]:
splits = text_splitter.split_documents(docs)

In [77]:
splits[100].page_content

"Searching the internet for quick and easy meals turns up all these beautiful looking meals that are sprinkled with nuts and cranberries and made with all these strange exotic ingredients, etc.\n  \n    And maybe it would be quick if you had a prep chef who chopped up all the ingredients for you.\n  \n    Last night I made a 'quick and easy' meal and it took me 3 hours and it was uber bland."

In [4]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings_model.model

'text-embedding-3-small'

In [79]:
len(splits)

842

In [80]:
from langchain_community.vectorstores.utils import filter_complex_metadata

splits = filter_complex_metadata(splits)

In [81]:
len(splits)

842

In [82]:
from langchain_chroma import Chroma

vector_store = Chroma.from_documents(splits, embeddings_model)

In [83]:
from langchain_openai import OpenAI

retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 30, "fetch_k": 10, "lambda_mult": 0.5})
rephraser_llm = OpenAI(temperature=0, model="gpt-4o-mini")

In [84]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = (
    """
    You are a specialized AI Assistant. Given a chat history and the latest user question
    which might reference context in the chat history, 
    formulate a standalone question which can be understood 
    without the chat history. Do NOT answer the question, just 
    reformulate it if needed and otherwise return it as is.
    Do NOT preceed your answer with any tags like System: or AI:.

    Here is an example:
    History:
    Human: what is an LLM?
    AI: an LLM is a large language model, a type of transformer machine learning algorithm used to interpret images and speech.
    Question:
    Human: how do I make it?
    Rephrased Question:
    How do I create an LLM transformer?
    \n\n
    History:
    \n\n
    """

)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("system", "\n\nQuestion:\n\n"),
        ("human", "{input}"),
        ("system", "\n\nRephrased Question:\n\n")
    ]
)
history_aware_retriever = create_history_aware_retriever(
    rephraser_llm, retriever, contextualize_q_prompt
)

In [85]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

sys_prompt = (
    """
    You are an assistant for question-answering tasks with retrieval augmented generation.
    Use the following pieces of retrieved context and the history of your interactions
    with the human to answer the question. If you don't know the answer, say that you
    don't know. Be descriptive. Do not complete or further generate the
    user's question. Do not refer to the context or history of interaction, avoid phrases like
    "according to the context" or "from the given documents".
    \n\n
    Context:
    \n\n
    {context}
    \n\n
    History:
    \n\n
    """
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", sys_prompt),
        MessagesPlaceholder("chat_history"),
        ("system", "\n\nQuestion:"),
        ("human", "{input}"),
        ("system", "\n\nAnswer (Do not start your answer with AI or System):\n\n")
    ]
)
main_llm = OpenAI(model="gpt-4o-mini")
question_answer_chain = create_stuff_documents_chain(main_llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
main_llm.temperature

0.7

In [86]:
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

chat_history = []

def ask_question(question):
    global chat_history
    ai_msg = rag_chain.invoke({"input": question, "chat_history": chat_history})
    chat_history.append(HumanMessage(content = question))
    print(ai_msg)
    chat_history.append(AIMessage(content = ai_msg["answer"]))
    return(ai_msg["answer"])

In [87]:
chat_history

[]

In [88]:
ask_question("how do I prepare a sweet protein shake?")

{'input': 'how do I prepare a sweet protein shake?', 'chat_history': [HumanMessage(content='how do I prepare a sweet protein shake?', additional_kwargs={}, response_metadata={})], 'context': [Document(metadata={'content_type': 'text/html; charset=UTF-8', 'description': 'Indulge in crafting your own single-serving protein cookie recipe, a guilt-free treat packed with high-protein ingredients!', 'language': 'en-US', 'source': 'https://fitmencook.com/recipes/protein-cookie-recipe/', 'title': 'Single Serve Protein Cookie - Fit Men Cook'}, page_content='These ingredients are combined in a mixing dish to create a sticky batter that serves as the foundation for the protein cookie. Optional extras like chocolate chips with stevia sweetness enhance the flavor even further. The secret is to get the batter to the proper consistency, making sure that it is not too moist and adding more protein powder if necessary.'), Document(metadata={'content_type': 'text/html; charset=UTF-8', 'language': 'en-US

'\nTo prepare a sweet protein shake, you will need a few key ingredients. First, you will need one scoop of chocolate protein powder, six tablespoons of almond milk, and one small banana. You can also add in optional extras like a tablespoon of PB2 or some crushed almonds for extra texture. Simply blend all of the ingredients together until smooth and enjoy!'

In [65]:
chat_history

[HumanMessage(content='how about the Green Foundations award?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='\nAI: Yes, Santa Clara received the Green Foundations award for their pioneering work in creating water-saving systems for Africa. This project was led by the computer science department and focused on using eco-friendly neural networks.', additional_kwargs={}, response_metadata={})]