In [None]:
import json
import re
import time
import urllib.parse
from typing import Any, Dict, List, Optional

import validators
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from utils import (
    find_parent_element_text,
    get_all_text_elements,
    prettify_text,
    truncate_string_from_last_occurrence,
)

In [None]:
# Shared headless Chrome session; the helper functions in the cells below
# read this module-level `driver`.
chrome_options = Options()
for flag in ("--no-sandbox", "--headless"):
    chrome_options.add_argument(flag)
driver = webdriver.Chrome(options=chrome_options)

In [None]:
def find_form_fields(url: Optional[str] = None) -> str:
    """Find form fields on the website.

    When ``url`` is given, differs from the driver's current URL and starts
    with "http", the driver switches to its last window handle and loads
    that URL before scanning.

    Args:
        url: Optional page to load first. When omitted, the page currently
            open in the module-level ``driver`` is scanned.

    Returns:
        ``str()`` of the list of unique, prettified field labels (``"[]"``
        when none are found), or an error message string if loading ``url``
        raised a ``WebDriverException``.
    """
    if url and url != driver.current_url and url.startswith("http"):
        try:
            driver.switch_to.window(driver.window_handles[-1])
            driver.get(url)
            time.sleep(1)  # fixed pause to give the page time to load
        except WebDriverException as e:
            return f"Error loading url {url}, message: {e.msg}"

    fields: List[str] = []
    for element in driver.find_elements(By.XPATH, "//textarea | //input"):
        # Label preference: name attribute, then aria-label, then whatever
        # text the helper finds for the element's parent.
        label_txt = (
            element.get_attribute("name")
            or element.get_attribute("aria-label")
            or find_parent_element_text(element)
        )
        # Skip missing, multi-line or overly long labels (checked on raw text).
        if not label_txt or "\n" in label_txt or len(label_txt) >= 100:
            continue
        # De-duplicate on the prettified label: `fields` stores prettified
        # text, so comparing the raw label would let duplicates through.
        label_txt = prettify_text(label_txt)
        if label_txt and label_txt not in fields:
            fields.append(label_txt)
    return str(fields)

In [None]:
def get_website_main_content() -> str:
    """Summarise the text currently shown by the module-level ``driver``.

    Collects text via ``get_all_text_elements(driver)``, runs each entry
    through ``prettify_text`` and returns a hint sentence followed by the
    JSON-encoded list. Returns an empty string when no text is collected.
    """
    cleaned = [prettify_text(raw) for raw in get_all_text_elements(driver)]
    if not cleaned:
        return ""

    prefix = "Current window displays the following contents, try scrolling up or down to view more: "
    return prefix + json.dumps(cleaned)

In [None]:
def get_interactable_elements() -> str:
    """List the clickable elements on the current page as prompt text.

    Scans buttons, ``div`` elements with ``role='button'``, anchors and
    checkboxes. Each element's label comes from ``find_parent_element_text``
    and is prettified with a limit argument of 50. Only non-empty, unique
    labels of displayed and enabled elements are kept. Labels accepted by
    ``validators.url`` are reported as links, the rest as buttons.

    Returns:
        A string with an optional "Goto these links" line and an optional
        "Click on these buttons" part; empty when nothing qualifies.
    """
    candidates = driver.find_elements(
        By.XPATH,
        "//button | //div[@role='button'] | //a | //input[@type='checkbox']",
    )

    labels = []
    for candidate in candidates:
        label = prettify_text(find_parent_element_text(candidate), 50)
        # Cheap checks first; the element is only queried for new labels.
        if not label or label in labels:
            continue
        if not (candidate.is_displayed() and candidate.is_enabled()):
            continue
        labels.append(label)

    # Partition the labels into URL-looking links and everything else.
    links = []
    buttons = []
    for label in labels:
        (links if validators.url(label) else buttons).append(label)

    parts = []
    if links:
        parts.append(f"Goto these links: {json.dumps(links)}\n")
    if buttons:
        parts.append(f"Click on these buttons: {json.dumps(buttons)}")
    return "".join(parts)

In [None]:
def describe_website(url: Optional[str] = None) -> str:
    """Describe the website.

    Optionally loads ``url`` in the driver's last window handle, pauses for
    one second, then combines the page's main text, its interactable
    elements and its form fields into a single description.

    Args:
        url: Page to load before describing. When omitted, the page
            currently open in the module-level ``driver`` is described.

    Returns:
        The assembled description, or an explanatory message string if the
        page could not be loaded or its content could not be read yet.
    """
    output = ""
    if url:
        try:
            driver.switch_to.window(driver.window_handles[-1])
            driver.get(url)
        except Exception:
            return (
                f"Cannot load website {url}. Make sure you input the correct and"
                " complete url starting with http:// or https://."
            )

    # Fixed pause to give the page time to load.
    time.sleep(1)

    try:
        # Extract main content
        main_content = get_website_main_content()
    except WebDriverException:
        return "Website still loading, please wait a few seconds and try again."
    if main_content:
        output += f"{main_content}\n"

    # Extract interactable components (buttons and links)
    interactable_content = get_interactable_elements()
    if interactable_content:
        output += f"{interactable_content}\n"

    # Extract form inputs. find_form_fields returns str(list), so "no fields"
    # arrives as the truthy string "[]"; treat it as empty instead of
    # advertising an empty field list.
    form_fields = find_form_fields()
    if form_fields and form_fields != "[]":
        output += "You can input text in these fields using fill_form function: " + form_fields
    return output

In [None]:
# Example run: load the page and show the generated description as this cell's output.
describe_website("https://lumy.co/")

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Set Chrome options for headless operation.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")

# Use a dedicated, short-lived session for this demo. Rebinding and quitting
# the notebook-wide `driver` here would leave the helper functions and later
# cells holding a dead session.
wiki_driver = webdriver.Chrome(options=chrome_options)

try:
    # Navigate to Wikipedia main page.
    wiki_driver.get("https://en.wikipedia.org/wiki/Main_Page")

    # Extract title
    title = wiki_driver.find_element(By.ID, "firstHeading")
    print("Title: ", title.text)

    # Extract first paragraph
    first_paragraph = wiki_driver.find_element(By.XPATH, '//*[@id="mw-content-text"]/div[1]/p[1]')
    print("First Paragraph: ", first_paragraph.text)

    # Extract table of contents
    toc = wiki_driver.find_element(By.ID, "toc")
    print("Table of Contents: ", toc.text)
finally:
    # Always close the browser, even if one of the lookups above raises.
    wiki_driver.quit()


In [None]:
def fetch_wikipedia_description(driver, article_title):
    """Print the first non-empty paragraph of an English Wikipedia article.

    Parameters:
        driver: WebDriver instance used to load the page.
        article_title (str): Article title, appended as-is to the
            ``https://en.wikipedia.org/wiki/`` URL.

    Returns:
        None. The paragraph text is printed, not returned.
    """
    # Navigate to the Wikipedia page
    driver.get(f"https://en.wikipedia.org/wiki/{article_title}")

    # Let element lookups poll for up to 5 seconds before failing. This sets
    # the driver's implicit wait; it does not itself pause for page load.
    driver.implicitly_wait(5)

    # Find the first paragraph of the article. Uses find_element(By...)
    # because the find_element_by_* helpers were removed in Selenium 4.3.
    first_paragraph = driver.find_element(
        By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)"
    ).text

    # Print the first paragraph
    print(first_paragraph)

In [None]:
def get_wikipedia_article_summary(driver, article_title):
    """
    Load an English Wikipedia article and return one paragraph of its text.

    Parameters:
        driver (webdriver.Chrome): Browser session used to load the page.
        article_title (str): Title appended to the ``/wiki/`` base URL.

    Returns:
        str: Text of the first ``p`` element matched by the XPath
        ``//table/following-sibling::p``.
    """
    driver.get("https://en.wikipedia.org/wiki/{}".format(article_title))

    # Heuristic from the original author: the summary is expected to be a
    # 'p' element that follows a 'table' element on the page.
    paragraph = driver.find_element(By.XPATH, "//table/following-sibling::p")
    return paragraph.text

In [None]:
# Example run: `driver` must refer to a WebDriver session that has not been quit.
get_wikipedia_article_summary(driver, "Machine Learning")