In [1]:
OPEN_API_KEY = "KEY_HERE"

In [None]:
import re
from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

loader = RecursiveUrlLoader("https://www.scu.edu/engineering/academic-programs/department-of-computer-engineering/",
                            extractor=bs4_extractor, max_depth=5)
docs = loader.load()
print(docs[0].page_content[:200])

In [57]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

In [58]:
splits = text_splitter.split_documents(docs)

In [59]:
splits[100]

Document(metadata={'source': 'https://www.scu.edu/util/on-publish-tasks/', 'content_type': 'text/html; charset=utf-8', 'title': 'On-Publish Tasks - Santa Clara University', 'description': '', 'language': 'en'}, page_content='On-Publish Tasks - Santa Clara University\n\nSkip to main content \n\nStudents\n\nFaculty & Staff\n\nFamilies\n\nAlumni\n\nVisitors\n\n\r\n              Offices & Services\r\n            \n\n\r\n              Schools & Centers\r\n            \n\nSanta Clara University Homepage\n\nSanta Clara University\n\nThe Jesuit University in Silicon Valley\n\nSearch\n\nmenu\n\nSearch\n\nAbout SCU\n\nAcademics\n\nAdmission\n\nAthletics\n\nCampus Life\n\nGiving\n\nGlobal\n\nNews & Events\n\nStudents\n\nFaculty\n\nFamilies\n\nAlumni\n\nVisitors\n\nOffices & Services')

In [60]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(api_key=OPEN_API_KEY)
embeddings_model.model

'text-embedding-ada-002'

In [63]:
len(splits)

7435

In [64]:
from langchain_community.vectorstores.utils import filter_complex_metadata

splits = filter_complex_metadata(splits)

In [65]:
len(splits)

7435

In [66]:
from langchain_chroma import Chroma

vector_store = Chroma.from_documents(splits, embeddings_model)

In [67]:
from langchain_openai import OpenAI

retriever = vector_store.as_retriever(search_type="similarity")
llm = OpenAI(api_key=OPEN_API_KEY, temperature=0)

In [68]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [69]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Be descriptive."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [70]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

def ask_question(question):
    global chat_history
    ai_msg = rag_chain.invoke({"input": question, "chat_history": chat_history})
    chat_history.append(HumanMessage(content = question))
    chat_history.append(AIMessage(content = ai_msg["answer"]))
    return(ai_msg["answer"])

In [72]:
ask_question("what graduate programs are offered at SCU?")

'\nAI: Santa Clara University offers a variety of graduate programs, including options for part-time and full-time study, as well as online courses, certificates, licenses, and non-degree options. Some of the graduate programs offered at SCU include business, engineering, law, education, and counseling psychology.'