In [4]:
# pip install beautifulsoup4-4.12.3
# pip install langgraph==0.2.3

import re
import requests

from typing import Annotated, Literal, TypedDict
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, END, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode
from langchain_openai import ChatOpenAI

In [5]:

def fetch_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

def extract_text(content):
    if content:
        soup = BeautifulSoup(content, 'html.parser')
        # Extract all text, stripping unnecessary whitespace
        text = soup.get_text(separator="\n")
        return text.strip()
    return None

def clean_text(text):
    # Remove leading and trailing whitespace
    text = text.strip()
    
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Replace multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    
    # Replace any remaining instances of " \n" or "\n " with just "\n"
    text = re.sub(r'(\s*\n\s*)+', '\n', text)
    
    return text

def scrape_website(base_url: str) -> str:
    """
    Scrapes the homepage, "About Us," and "Contact Us" pages from the given website URL, 
    consolidates the text content, and returns the cleaned text.

    base_url (str): The base URL of the website to scrape (e.g., "https://example.com/").

    Returns:
        str: The consolidated and cleaned text content from the homepage, "About Us," 
        and "Contact Us" pages of the website.
    
    Example Usage:
        scrape_website("https://example.com/")
    """

    collected_text = ""

    # Fetch homepage
    homepage_content = fetch_page_content(base_url)
    if homepage_content:
        homepage_text = extract_text(homepage_content)
        if homepage_text:
            collected_text += homepage_text + "\n"

    # Define potential About Us and Contact Us page names
    about_us_paths = ["om-oss"]
    contact_us_paths = ["kontakt"]

    # Scrape About Us page
    for path in about_us_paths:
        about_us_url = urljoin(base_url, path)
        about_us_content = fetch_page_content(about_us_url)
        if about_us_content:
            about_us_text = extract_text(about_us_content)
            if about_us_text:
                collected_text += about_us_text + "\n"
            break

    # Scrape Contact Us page
    for path in contact_us_paths:
        contact_us_url = urljoin(base_url, path)
        contact_us_content = fetch_page_content(contact_us_url)
        if contact_us_content:
            contact_us_text = extract_text(contact_us_content)
            if contact_us_text:
                collected_text += contact_us_text + "\n"
            break

    # Clean the collected text
    cleaned_text = clean_text(collected_text)
    return cleaned_text

tools = [scrape_website]

tool_node = ToolNode(tools)

# put your openai key here
openai_key = '...'
model = 'gpt-4o-2024-08-06'

model  = ChatOpenAI(model=model,
                 openai_api_key = openai_key)

model = model.bind_tools(tools)

class MyMessagesState(MessagesState):
    # New key for the system prompt
    system_prompt: str  

# Define the function that determines whether to continue or not
def should_continue(state: MessagesState) -> Literal["tools", END]:
    messages = state['messages']
    last_message = messages[-1]
    # If the LLM makes a tool call, then we route to the "tools" node
    if last_message.tool_calls:
        return "tools"
    # Otherwise, we stop (reply to the user)
    return END

# Define the function that calls the model
def call_model(state: MessagesState):
    # Use the system prompt from the state
    system_prompt = state.get('system_prompt', """You are a helpful assistant. Gather the necessary information to return a json file exactly like this:\n
                        ```json
                        business_name:
                        type_of_business: (e.g., restaurant, pizzeria, retail)
                        location: 
                        opening_days:
                        opening_hours:
                              Monday:
                              Tuesday:
                              Wednesday:
                              Thursday:
                              Friday:
                              Saturday:
                              Sunday:
                        email:
                        phone:
                        suggested_roles_needed: (i.e., only if not already provided, based on the business information, suggest the roles that may be necessary for this business (e.g., 2 cooks, 3 waiters, 1 receptionist)
                        ```  """)
    
    messages = state['messages']
    
    # If the system prompt is not already included in the messages, add it
    messages.insert(0, {"role": "system", "content": system_prompt})
    
    response = model.invoke(messages)
    
    # We return a list, because this will get added to the existing list
    return {"messages": [response]}


# Define a new graph
workflow = StateGraph(MessagesState)

# Define the two nodes we will cycle between
workflow.add_node("agent", call_model)
workflow.add_node("tools", tool_node)


# Set the entrypoint as `agent` = this means that this node is the first one called
workflow.set_entry_point("agent")

# We now add a conditional edge
workflow.add_conditional_edges(
    # These are the conditional edges taken after the `agent` node is called.
    "agent",
    # Pass in the function that will determine which node is called next.
    should_continue,
)

# We now add a normal edge from `tools` to `agent` so that after `tools` is called, `agent` node is called next.
workflow.add_edge("tools", 'agent')

# Initialize memory to persist state between graph runs
checkpointer = MemorySaver()

# We're (optionally) passing the memory when compiling the graph
app = workflow.compile(checkpointer=checkpointer)

config = {"configurable": {"thread_id": "1"}}

In [6]:
user_input = "I have 3 cooks, 4 waiters and 1 receptionist that I need to schedule in. Grab all the rest of the information from my website www.paganini.nu and do your job!"

# The config is the **second positional argument** to stream() or invoke()!
events = app.stream(
    {"messages": [("user", user_input)]}, config, stream_mode="values"
)
for event in events:
    event["messages"][-1].pretty_print()



I have 3 cooks, 4 waiters and 1 receptionist that I need to schedule in. Grab all the rest of the information from my website www.paganini.nu and do your job!
Tool Calls:
  scrape_website (call_VuelHUvjBJA7a4ziyu7VyIMP)
 Call ID: call_VuelHUvjBJA7a4ziyu7VyIMP
  Args:
    base_url: https://www.paganini.nu
Name: scrape_website

Paganini - Italiensk Restaurang Stockholm, sedan 1995 Paganini - Italiensk Restaurang Stockholm, sedan 1995 Toggle navigation SV | EN Startsida Boka Menyer Aktuellt Om oss Kontakt TEL: 08-406 06 07 E-post: reservation@paganini.nu Boka online Restaurang Paganini, Stockholm En äkta genuin italiensk restaurang i Gamla stan som har försett stockholmarna med god & kvalitativ mat sedan 1995. I en rustik och anrik atmosfär avnjuter ni autentiska italienska delikatesser, tillagade med både kärlek och respekt för den egna råvaran. Framför allt god service! Som gammelfarmor alltid sade -Sempre felice! Alla är välkomna stora som små, många eller få. Du hittar oss på Västerl