## Goals:

- Read the BS4 documentation at https://www.crummy.com/software/BeautifulSoup/bs4/doc/
- Parse the documentation
- Read it into an in-memory Chroma instance
- See if we can query against it with "How can I use BeautifulSoup to get elements by CSS selector?"

## Notes:
- The documentation is contained in `<section>` tags directly within the body element

In [28]:
pip install --quiet bs4 chromadb requests langchain langchain_openai pydantic

Note: you may need to restart the kernel to use updated packages.


# Retrieve the Documentation

In [1]:
BS4_DOCUMENTATION_URL="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"

In [38]:
from bs4 import BeautifulSoup, SoupStrainer
import requests
from operator import itemgetter
from chromadb import Client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from pydantic import BaseModel
from typing import Dict, List, Optional, Sequence
from langchain.schema import Document
from langchain.schema.retriever import BaseRetriever
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.runnable import (Runnable, RunnableBranch,
                                       RunnableLambda, RunnableMap)
from langchain_community.chat_models import ChatOpenAI

In [12]:
# Get the raw page content
response = requests.get(BS4_DOCUMENTATION_URL)

In [13]:
# Extract metadata from a section
def extract_metadata(section):
    results = {}

    # Extract the section ID
    results["section-id"] = section.attrs["id"]

    # Extract the section title
    section_title = section.find("h1")
    if section_title:
        results["section-title"] = section_title.get_text()
    
    return results


# Split the section text into Documents
def split_into_documents(texts, metadatas):
    splitter = text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=750,  # Set the desired chunk size (number of characters)
        chunk_overlap=50,  # Set the desired overlap between chunks (number of characters)
        length_function=len,  # Specify the function to measure the length of the text
        is_separator_regex=False  # Specify whether the separators are regular expressions
    )    
    return splitter.create_documents(texts, metadatas)



# Extract the documentation sections
strainer = SoupStrainer("section")
soup = BeautifulSoup(response.content, "html.parser", parse_only=strainer)
documentation_sections = soup.find_all("section")


sections = []
metadatas = []

for section in documentation_sections:
    metadatas.append(extract_metadata(section))
    sections.append(section.get_text())


documents = split_into_documents(sections, metadatas)

# Add the Documents to the VectorStore

In [10]:
collection_name="bs4_documentation"
embedding_function = OpenAIEmbeddings()
vector_store = Chroma(persist_directory="./chroma_db", collection_name=collection_name,
                   embedding_function=embedding_function)

In [15]:
vector_store.add_documents(documents)
vector_store.persist()

# Query the VectorStore

In [6]:
question = "How can I get an element's children?"
result = vector_store.similarity_search_with_score(question, k=10)
result.sort(key=lambda input: input[1])
print(result[0])

(Document(page_content='.parents¶\nYou can iterate over all of an element’s parents with\n.parents. This example uses .parents to travel from an <a> tag\nburied deep within the document, to the very top of the document:\nlink = soup.a\nlink\n# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>\nfor parent in link.parents:\n    print(parent.name)\n# p\n# body\n# html\n# [document]', metadata={'section-id': 'parents'}), 0.4240224063396454)


# Query the LLM

In [31]:
RESPONSE_TEMPLATE = """\
You are an expert programmer and problem-solver, tasked with answering any question \
about Beautiful Soup 4.

Generate a comprehensive and informative answer of 80 words or less for the \
given question based solely on the provided context. You must \
only use information from the provided context. Use an unbiased and \
journalistic tone. Combine search results together into a coherent answer. Do not \
repeat text.

You should use bullet points in your answer for readability. Put citations where they apply
rather than putting them all at the end.

If there is nothing in the context relevant to the question at hand, just say "Hmm, \
I'm not sure." Don't try to make up an answer.

Anything between the following `context`  html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user. 

<context>
    {context} 
<context/>

REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.\
"""

REPHRASE_TEMPLATE = """\
Given the following conversation and a follow up question, rephrase the follow up \
question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone Question:
"""


class ChatRequest(BaseModel):
    question: str
    chat_history: Optional[List[Dict[str, str]]]



In [13]:
def get_retriever() -> BaseRetriever:
    return vector_db.as_retriever()

print(get_retriever())

tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x11bd95c90>


In [20]:
def create_retriever_chain(
    llm: BaseLanguageModel, retriever: BaseRetriever
) -> Runnable:
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(REPHRASE_TEMPLATE)
    condense_question_chain = (
        CONDENSE_QUESTION_PROMPT | llm | StrOutputParser()
    ).with_config(
        run_name="CondenseQuestion",
    )
    conversation_chain = condense_question_chain | retriever
    return RunnableBranch(
        (
            RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
                run_name="HasChatHistoryCheck"
            ),
            conversation_chain.with_config(run_name="RetrievalChainWithHistory"),
        ),
        (
            RunnableLambda(itemgetter("question")).with_config(
                run_name="Itemgetter:question"
            )
            | retriever
        ).with_config(run_name="RetrievalChainWithNoHistory"),
    ).with_config(run_name="RouteDependingOnChatHistory")

In [23]:
def format_docs(docs: Sequence[Document]) -> str:
    formatted_docs = []
    for i, doc in enumerate(docs):
        doc_string = f"<doc id='{i}'>{doc.page_content}</doc>"
        formatted_docs.append(doc_string)
    return "\n".join(formatted_docs)


def serialize_history(request: ChatRequest):
    chat_history = request["chat_history"] or []
    converted_chat_history = []
    for message in chat_history:
        if message.get("human") is not None:
            converted_chat_history.append(HumanMessage(content=message["human"]))
        if message.get("ai") is not None:
            converted_chat_history.append(AIMessage(content=message["ai"]))
    return converted_chat_history

In [24]:
def create_chain(
    llm: BaseLanguageModel,
    retriever: BaseRetriever,
) -> Runnable:
    retriever_chain = create_retriever_chain(
        llm,
        retriever,
    ).with_config(run_name="FindDocs")
    _context = RunnableMap(
        {
            "context": retriever_chain | format_docs,
            "question": itemgetter("question"),
            "chat_history": itemgetter("chat_history"),
        }
    ).with_config(run_name="RetrieveDocs")
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", RESPONSE_TEMPLATE),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{question}"),
        ]
    )

    response_synthesizer = (prompt | llm | StrOutputParser()).with_config(
        run_name="GenerateResponse",
    )
    return (
        {
            "question": RunnableLambda(itemgetter("question")).with_config(
                run_name="Itemgetter:question"
            ),
            "chat_history": RunnableLambda(serialize_history).with_config(
                run_name="SerializeHistory"
            ),
        }
        | _context
        | response_synthesizer
    )

In [45]:
llm = ChatOpenAI(
    model="gpt-3.5-turbo-16k",
    streaming=True,
    temperature=0,
)
retriever = get_retriever()
answer_chain = create_chain(
    llm,
    retriever,
)

inputs = {"question": question, "chat_history": None}

result = answer_chain.invoke(inputs)
print(result)

To get an element's children using Beautiful Soup 4, you can use the `.children` attribute. Here's how you can do it:

1. First, select the element you want to get the children of.
2. Use the `.children` attribute on the selected element to retrieve its children.
3. Iterate over the children using a loop or convert them to a list for further processing.

Example:
```python
for child in element.children:
    print(child)
```

This will print out each child element of the selected element.
