In [25]:
# run this cell OR next cell, not both

from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# DO NOT COMMIT YOUR API KEY!!

import os
os.environ["OPENAI_API_KEY"]="KEY_HERE"

In [26]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import OpenAI
from langchain_core.output_parsers.string import StrOutputParser

gen_searches_sys_prompt = (
    """
    You are a specialized AI Assistant helping a user create an LLM agent for them to use.
    The user is presumably non technical. Given the user request, consider which web searches would yield
    the most relevant information to add to the vector store of the user's new agent. Do NOT answer the
    user's request, just create five web search queries based on it to help populate the vector store with data.
    Make the five requests relatively varied, each of the requests will have the top website result fully scraped so
    you should make them varied to grab a wide range of information. Output five strings of the search requests separated
    by new lines. Do NOT preceed your answer with any tags like System: or AI:.

    Here is an example (Do NOT use this directly):
    Question:
    Human: help me make assistant to apply to santa clara university
    
    Five Web Searches (separated by new lines):
    santa clara university main website
    \n
    applying to santa clara university
    \n
    santa clara university application requirements
    \n
    how to apply to santa clara university
    \n
    santa clara university information

    \n\n
    Question:
    """
)

gen_searches_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", gen_searches_sys_prompt),
        ("human", "{input}"),
        ("system", "\n\nFive Web Searches (separated by new lines):")
    ]
)

gen_searches_llm = OpenAI()
gen_searches_chain = gen_searches_prompt | gen_searches_llm | StrOutputParser()

In [27]:
searches = gen_searches_chain.invoke("please help make an assistant for me that will teach me to cook simple recipes for gym")
searches

'\nsimple recipes for gym\ncooking tutorial for beginners\nhealthy recipes for gym goers\nhow to meal prep for gym\neasy healthy meals for fitness'

In [4]:
searches = gen_searches_chain.invoke("i want helper for cleaning my apartment")

In [5]:
searches = gen_searches_chain.invoke("teach me to ski")

In [28]:
from googlesearch import search

def get_top_google_results(query, num_results=5):
    # Perform the search and return the top links
    return [link for link in search(query, num_results=num_results)]

# Example usage
query = "santa clara university"
top_links = get_top_google_results(query)
for i, link in enumerate(top_links, 1):
    print(f"Result {i}: {link}")

Result 1: https://www.scu.edu/
Result 2: https://www.scu.edu/
Result 3: https://santaclarabroncos.com/


In [29]:
lines = [line for line in searches.split('\n') if line.strip()]
lines

['simple recipes for gym',
 'cooking tutorial for beginners',
 'healthy recipes for gym goers',
 'how to meal prep for gym',
 'easy healthy meals for fitness']

In [30]:
links = []
for line in lines:
    links.extend(get_top_google_results(line))

links

['https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://www.reddit.com/r/fitmeals/comments/7sg5lz/quick_and_easy_fit_meals_that_are_actually_quick/',
 'https://www.escoffier.edu/blog/value-of-culinary-education/learning-how-to-cook-a-guide-for-beginners/',
 'https://www.eater.com/2020/4/3/21203517/easy-cooking-recipes-tips-tricks-roast-chicken-vegetables-rice-beans',
 'https://fcs.tennessee.edu/wp-content/uploads/sites/23/2021/08/Cooking-Basics.pdf',
 'https://www.youtube.com/channel/UCzyMt1XGP50JLxzJ0-pluJg',
 'https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://www.muscleandfitness.com/nutrition/healthy-eating/beginners-guide-meal-prepping/',
 'https://www.myprotein.com/thezone/recipe/meal-prep-recipes-muscle-building-fat-loss/',
 'https://www.ladies-who-lift.com/motivation/mastering-the-art-of-meal-prep',
 'https://www.menshealth.com/uk/nutrition/a26128424/meal-preps-for-muscle-gain/',
 'https://

In [9]:
import re
import lxml
from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

docs = []
links = ["https://www.scu.edu/engineering/academic-programs/department-of-computer-engineering/"]
for link in links:
    loader = RecursiveUrlLoader(link, extractor=bs4_extractor, max_depth=3)
    docs.extend(loader.load())
    
print(docs[0].page_content[:200])

Department of Computer Science and Engineering - School of Engineering - Santa Clara University

Skip to main content 

Students

Faculty & Staff

Families

Alumni

Visitors


              Offices &


In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

In [11]:
splits = text_splitter.split_documents(docs)

In [12]:
splits[100].page_content

'AIMES Program\nAnthropology\nArt and Art History\nAsian Studies Program\nBiology\nCatholic Studies Program\nChemistry and Biochemistry\nChild Studies\nClassics\nCommunication\nEconomics\nEnglish\nEnvironmental Studies and Sciences\nEthnic Studies\nGender and Sexuality Studies\nGerontology Program\nHistory\nIndividual Studies\nLatin American Studies Program\nMathematics and Computer Science\nMedical and Health Humanities Program\nMilitary Science\nModern Languages and Literatures\nMusic\nMusical Theatre\nNeuroscience'

In [13]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings_model.model

'text-embedding-3-small'

In [14]:
len(splits)

192

In [15]:
from langchain_community.vectorstores.utils import filter_complex_metadata

splits = filter_complex_metadata(splits)

In [16]:
len(splits)

192

In [17]:
from langchain_chroma import Chroma

vector_store = Chroma.from_documents(splits, embeddings_model)

In [18]:
from langchain_openai import OpenAI

retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 30, "fetch_k": 10, "lambda_mult": 0.5})
rephraser_llm = OpenAI(temperature=0, model="gpt-4o-mini")

In [19]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = (
    """
    You are a specialized AI Assistant. Given a chat history and the latest user question
    which might reference context in the chat history, 
    formulate a standalone question which can be understood 
    without the chat history. Do NOT answer the question, just 
    reformulate it if needed and otherwise return it as is.
    Do NOT preceed your answer with any tags like System: or AI:.

    Here is an example:
    History:
    Human: what is an LLM?
    AI: an LLM is a large language model, a type of transformer machine learning algorithm used to interpret images and speech.
    Question:
    Human: how do I make it?
    Rephrased Question:
    How do I create an LLM transformer?
    \n\n
    History:
    \n\n
    """

)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("system", "\n\nQuestion:\n\n"),
        ("human", "{input}"),
        ("system", "\n\nRephrased Question:\n\n")
    ]
)
history_aware_retriever = create_history_aware_retriever(
    rephraser_llm, retriever, contextualize_q_prompt
)

In [20]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

sys_prompt = (
    """
    You are an assistant for question-answering tasks with retrieval augmented generation.
    Use the following pieces of retrieved context and the history of your interactions
    with the human to answer the question. If you don't know the answer, say that you
    don't know. Be descriptive. Do not complete or further generate the
    user's question. Do not refer to the context or history of interaction, avoid phrases like
    "according to the context" or "from the given documents".
    \n\n
    Context:
    \n\n
    {context}
    \n\n
    History:
    \n\n
    """
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", sys_prompt),
        MessagesPlaceholder("chat_history"),
        ("system", "\n\nQuestion:"),
        ("human", "{input}"),
        ("system", "\n\nAnswer (Do not start your answer with AI or System):\n\n")
    ]
)
main_llm = OpenAI()
question_answer_chain = create_stuff_documents_chain(main_llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
main_llm.temperature

0.7

In [21]:
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

chat_history = []

def ask_question(question):
    global chat_history
    ai_msg = rag_chain.invoke({"input": question, "chat_history": chat_history})
    chat_history.append(HumanMessage(content = question))
    print(ai_msg)
    chat_history.append(AIMessage(content = ai_msg["answer"]))
    return(ai_msg["answer"])

In [22]:
chat_history

[]

In [23]:
ask_question("what are techniques for skiing moguls?")

{'input': 'what are techniques for skiing moguls?', 'chat_history': [HumanMessage(content='what are techniques for skiing moguls?', additional_kwargs={}, response_metadata={})], 'context': [Document(metadata={'content_type': 'text/html; charset=utf-8', 'description': '', 'language': 'en', 'source': 'https://www.scu.edu/engineering/academic-programs/department-of-computer-engineering/about-us/', 'title': 'About Us - School of Engineering - Santa Clara University'}, page_content='function effectively as a member or leader of a team engaged in activities appropriate to the program’s discipline\napply computer science theory and software development fundamentals to produce computing-based solutions'), Document(metadata={'content_type': 'text/html; charset=utf-8', 'description': '', 'language': 'en', 'source': 'https://www.scu.edu/engineering/academic-programs/department-of-computer-engineering/coen-student-policies/', 'title': 'COEN Student Policies - School of Engineering - Santa Clara Un

'Off-piste skiing is a technique used for skiing moguls, which involves navigating through uneven terrain and avoiding obstacles. Another technique is to use short, sharp turns to absorb the bumps on the moguls. Skiers can also use a technique called "zipper lining" which involves making quick, tight turns while skiing down the fall line of the mogul. Additionally, skiers can use a technique called "hopping" where they use their legs to absorb the impact of the bumps and maintain control while skiing.'

In [24]:
chat_history

[HumanMessage(content='what are techniques for skiing moguls?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Off-piste skiing is a technique used for skiing moguls, which involves navigating through uneven terrain and avoiding obstacles. Another technique is to use short, sharp turns to absorb the bumps on the moguls. Skiers can also use a technique called "zipper lining" which involves making quick, tight turns while skiing down the fall line of the mogul. Additionally, skiers can use a technique called "hopping" where they use their legs to absorb the impact of the bumps and maintain control while skiing.', additional_kwargs={}, response_metadata={})]