Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarking Fuji-web #162

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
a46f48c
feat: build python script to interact with chrome extension
lynchee-owo May 8, 2024
0497266
feat: create python server to receive message from chrome extension
lynchee-owo May 8, 2024
3f00843
feat: get task history and save to local
lynchee-owo May 8, 2024
f7d6856
feat: store action history to local files
lynchee-owo May 8, 2024
3ecfa6b
feat: get screenshots and save locally
lynchee-owo May 8, 2024
ccdc159
feat: use message listener in python script to get data from chrome e…
lynchee-owo May 10, 2024
83e719d
fix: remove unnecessary permission in manifest
lynchee-owo May 10, 2024
267d6d3
fix: added test results and handled WebDriverException
lynchee-owo May 15, 2024
c8c2833
fix: temp
lynchee-owo May 16, 2024
3c24013
feat: added logger
lynchee-owo May 16, 2024
1757b8e
feat: logged task execution results
lynchee-owo May 16, 2024
8275852
fix: addressed comments
lynchee-owo May 16, 2024
207b200
feat: use zustand middleware to subscribe the status change
lynchee-owo May 16, 2024
f405fc7
fix: clean code and test result
lynchee-owo May 16, 2024
892367a
feat: added steps to run benchmarking in comments
lynchee-owo May 16, 2024
ab81888
fix: removed unnecessary photos
lynchee-owo May 16, 2024
1270f5b
fix: small fix
lynchee-owo May 16, 2024
2b4ec6a
fix: not remove listener after screenshot
lynchee-owo May 16, 2024
7172a10
feat: process log file using python script
lynchee-owo May 17, 2024
cc36e31
feat: script receive error message from chrome extension
lynchee-owo May 19, 2024
46ad15b
feat: send error message
lynchee-owo May 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -18,3 +18,9 @@
utils/reload/*.js
utils/reload/injections/*.js
public/manifest.json

# benchmarking
/results
tasks_test.jsonl
webwand_test_log.txt
tasks_status.txt
205 changes: 205 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
"""
To run the benchmark, copy the desired test cases from
https://github.com/MinorJerry/WebVoyager/blob/main/data/WebVoyager_data.jsonl
into the tasks_test.jsonl file (one JSON object per line).
Then, in a terminal, first run "pnpm dev" to start the webwand server,
and then run "python benchmark.py".
"""

import base64
import os
import json
import time
import pyautogui
import logging
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import WebDriverException

# Load environment variables via python-dotenv (presumably from a local .env
# file), then read the OpenAI key that is later sent to the extension.
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')

# File holding the benchmark tasks; it is read line by line as JSON.
dataset = 'tasks_test.jsonl'

# Directory that receives per-task results; created up front if missing.
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)

# Route log records at INFO level and above to a local text file.
logs_path = 'webwand_test_log.txt'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
    filename=logs_path,
)

def setup_driver():
    """Create a Chrome WebDriver with the unpacked webwand extension loaded.

    Returns:
        The ``webdriver.Chrome`` instance, configured with a 1600x1000
        window and a script timeout of 120 seconds.
    """
    options = Options()
    for flag in (
        "--load-extension=./dist",  # unpacked webwand chrome extension
        "--window-size=1600,1000",
    ):
        options.add_argument(flag)

    chrome = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    # Set script timeout to 120 seconds
    chrome.set_script_timeout(120)
    return chrome

def dispatch_event(driver, event_name, event):
    """Dispatch a ``CustomEvent`` on the current page's ``document``.

    The event name and payload are handed to the browser as script arguments
    rather than being spliced into the JavaScript source, so quotes,
    backslashes or newlines in either value cannot break (or inject code
    into) the executed script.

    Args:
        driver: WebDriver-like object exposing ``execute_script(script, *args)``.
        event_name: Name of the custom event to fire (e.g. ``'SetTask'``).
        event: JSON-serializable payload, exposed to listeners as
            ``event.detail``.
    """
    logging.info(f'Dispatched event {event_name}')
    script = (
        "document.dispatchEvent("
        "new CustomEvent(arguments[0], { detail: arguments[1] }));"
    )
    driver.execute_script(script, event_name, event)

def add_task_listener(driver, task_id, max_retries=3):
    """Wait for a running task to stop, saving its screenshots and history.

    Injects a ``TaskUpdate`` listener into the page through
    ``driver.execute_async_script`` and blocks until the page reports an
    event. Both event kinds carry the task status:

    * ``screenshot`` events are written to disk and listening resumes;
    * a ``history`` event with status ``success`` or ``error`` ends the wait
      and its history is written to disk.

    Args:
        driver: WebDriver-like object exposing ``execute_async_script(script)``.
        task_id: Identifier used in log messages and in the result paths.
        max_retries: Number of consecutive "document unloaded" failures
            tolerated before giving up. The counter is reset after every
            successfully handled screenshot.

    Returns:
        The final status reported by the page (``'success'`` or ``'error'``),
        or one of the harness codes ``'script-error'``,
        ``'doc-unload-max-retry'``, ``'js-script-timeout'``,
        ``'webdriver-error'`` or ``'python-script-error'``. An empty string is
        returned when the page reports no event data or when ``max_retries``
        is not positive.
    """
    logging.info(f'Adding task listeners for task {task_id}')

    # NOTE(review): every execution of this script registers a fresh listener,
    # and the listener is only removed on the final 'history' event, so
    # listeners from earlier screenshot rounds appear to stay attached with
    # stale callbacks - confirm this is intended.
    script = """
var callback = arguments[0];
var eventListener = function (e) {
    console.log("received evnet");
    console.log(e);
    if (e.detail.type == 'history') {
        if (e.detail.status === 'success' || e.detail.status === 'error') {
            callback({status: e.detail.status, type: 'history', data: e.detail.data, errorMessage: e.detail.errorMessage});
            document.removeEventListener('TaskUpdate', eventListener); // Optional: remove if you need continuous listening
            console.log("listener removed after history");
        }
        // Does not do anything when the status is 'running' or 'idle'.
        // The status 'interrupted' will never be triggered automatically.
    } else if (e.detail.type == 'screenshot') {
        callback({status: e.detail.status, type: 'screenshot', data: e.detail.data});
    } else {
        throw new Error("Invalid event type received: " + e.detail.type);
    }
};

document.addEventListener('TaskUpdate', eventListener);
console.log("added event listener");
"""

    attempts = 0
    while attempts < max_retries:
        try:
            logging.info("Setting up event listener...")
            # Iterate (instead of recursing once per screenshot) so that a
            # long-running task cannot exhaust the Python call stack.
            while True:
                event_data = driver.execute_async_script(script)
                if not event_data:
                    logging.info("No event data received")
                    return ""

                event_type = event_data['type']
                if event_type == 'history':
                    # Record history when task stops
                    status = event_data['status']
                    if status == 'error':
                        # .get(): the message may be absent from the payload;
                        # that must not prevent the history from being saved.
                        logging.error(f"Task {task_id} error: {event_data.get('errorMessage')}")
                    write_history(task_id, event_data['data'])
                    return status

                if event_type != 'screenshot':
                    logging.error(f"Unhandled event data type: {event_type}")
                    return "script-error"

                # Task is still running. Save the screenshot, reset the
                # failure counter and continue to listen for events.
                write_screenshots(task_id, event_data['data'])
                attempts = 0
        except WebDriverException as e:
            message = str(e)
            if "javascript error: document unloaded while waiting for result" in message:
                # The page navigated while we were waiting; re-attach the
                # listener unless the retry budget is used up.
                attempts += 1
                logging.warning(f'Document unloaded error during task {task_id} attempt {attempts}: {message}')
                if attempts == max_retries:
                    logging.error(f'Maximum retry attempts reached for task {task_id}.')
                    return 'doc-unload-max-retry'
                logging.info("Retrying...")
            elif "script timeout" in message:
                logging.error(f'Script timeout for task {task_id}: {message}')
                return 'js-script-timeout'
            else:
                logging.error(f'WebDriver exception for task {task_id}: {message}')
                return 'webdriver-error'
        except Exception as e:
            logging.error(f'Unhandled error for task {task_id}: {str(e)}')
            return 'python-script-error'
    return ""

def write_history(task_id, task_history):
    """Save a task's history as indented JSON.

    The data is written to
    ``<results_dir>/test<task_id>/interact_messages.json``; the task
    directory is created when it does not exist yet.

    Args:
        task_id: Identifier used to build the task directory name.
        task_history: JSON-serializable history data to store.
    """
    out_dir = os.path.join(results_dir, f"test{task_id}")
    os.makedirs(out_dir, exist_ok=True)

    target = os.path.join(out_dir, 'interact_messages.json')
    with open(target, 'w') as out:
        json.dump(task_history, out, indent=4)

    logging.info(f'History saved for task {task_id}')

def write_screenshots(task_id, image_data):
    """Decode a base64-encoded screenshot and save it as a PNG file.

    Files are named ``screenshot_<unix-seconds>.png`` inside
    ``<results_dir>/test<task_id>/``. When several screenshots arrive within
    the same second, later ones receive a ``_<n>`` suffix instead of silently
    overwriting the earlier file.

    Args:
        task_id: Identifier used to build the task directory name.
        image_data: Base64 text of the image bytes.
    """
    image_bytes = base64.b64decode(image_data)
    task_dir = os.path.join(results_dir, f"test{task_id}")
    os.makedirs(task_dir, exist_ok=True)

    timestamp = int(time.time())
    file_path = os.path.join(task_dir, f'screenshot_{timestamp}.png')
    # The timestamp only has second resolution; pick a free name so that
    # screenshots taken in quick succession are all kept.
    suffix = 1
    while os.path.exists(file_path):
        file_path = os.path.join(task_dir, f'screenshot_{timestamp}_{suffix}.png')
        suffix += 1

    with open(file_path, 'wb') as file:
        file.write(image_bytes)
    logging.info(f'Screenshot saved for task {task_id}')

def run_webwand_task(driver, task_id, task_description):
    """Run a single task through the extension and return its final status.

    Sends the API key and the task description to the page, triggers the
    run, then blocks in ``add_task_listener`` until the task stops. The
    elapsed wall-clock time is logged.

    Args:
        driver: WebDriver controlling the browser.
        task_id: Identifier of the task, used for logging and result paths.
        task_description: Natural-language task text sent to the extension.

    Returns:
        The status string returned by ``add_task_listener``.
    """
    logging.info(f'Start running task {task_id} {task_description}')
    started_at = time.time()

    setup_events = (
        ('SetAPIKey', {"value": api_key}),
        ('SetTask', {"value": task_description}),
        ('RunTask', {}),
    )
    for name, payload in setup_events:
        dispatch_event(driver, name, payload)

    outcome = add_task_listener(driver, task_id)
    elapsed = time.time() - started_at
    logging.info(f'Task {task_id} took {elapsed} seconds to complete.')
    return outcome

def click_extensions_icon(driver):
    """Open the webwand side panel by simulating mouse clicks.

    The click positions are fixed pixel offsets measured from the top-right
    corner of the browser window, as reported by
    ``driver.get_window_rect()``.
    """
    rect = driver.get_window_rect()
    top_edge = rect['y']
    right_edge = rect['x'] + rect['width']

    # (offset left of the right edge, offset below the top edge)
    click_offsets = (
        (165, 50),   # Extensions icon
        (300, 210),  # webwand entry
    )
    for dx, dy in click_offsets:
        pyautogui.click(right_edge - dx, top_edge + dy)

def main():
    """Run every task listed in the dataset file through the extension.

    Each non-blank line of ``dataset`` is parsed as a JSON object providing
    the keys ``id``, ``web`` (start URL) and ``ques`` (task description).
    The extension side panel is opened once per browser session. After a
    JavaScript script timeout the browser is restarted so the next task
    starts from a clean state. The browser is always shut down on exit, even
    when a task raises.
    """
    driver = setup_driver()
    initial_load = True

    try:
        # JSONL is UTF-8 by convention; do not depend on the platform default.
        with open(dataset, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue

                logging.info('-------------------------------------')
                task = json.loads(line)
                task_id = task["id"]
                driver.get(task['web'])

                if initial_load:
                    click_extensions_icon(driver)
                    initial_load = False

                result = run_webwand_task(driver, task_id, task['ques'])
                logging.info(f'Task {task_id} status: {result}')

                # If the task timed out, reset the driver to ensure a proper
                # state for the next task.
                if result == "js-script-timeout":
                    driver.quit()
                    # Forget the dead driver first so the cleanup below does
                    # not quit it again if the restart fails.
                    driver = None
                    driver = setup_driver()
                    initial_load = True
    finally:
        if driver is not None:
            driver.quit()

# Run the benchmark only when this file is executed directly as a script.
if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions example_results/testGoogle--0/interact_messages.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
[
{
"action": {
"operation": {
"args": {
"label": "8",
"value": "bananas"
},
"name": "setValueAndEnter"
},
"thought": "I will enter 'bananas' into the search bar and then press Enter to perform the search."
},
"prompt": "The user requests the following task:\n\n Search for bananas\n\n\nCurrent time: 5/16/2024, 4:24:11 PM\nCurrent URL: https://www.google.com/\nCurrent page scrolling position: 100.0%\n\n\nUse the following data as a reference of the annotated elements (using `===` as a delimiter between each annotation):\n\nlabel = 1\nname = About\ntagName = A\n===\nlabel = 2\nname = Store\ntagName = A\n===\nlabel = 3\nname = Gmail\ntagName = A\n===\nlabel = 4\nname = Search for Images\ntagName = A\n===\nlabel = 5\nname = Google apps\ntagName = A\nrole = button\n===\nlabel = 6\nname = Sign in\ntagName = A\n===\nlabel = 7\nname = Celebrating Hank Adams\ntagName = A\n===\nlabel = 8\nname = Search\ntagName = TEXTAREA\ncurrentValue = \nplaceholder = \nrole = combobox\n===\nlabel = 9\nname = Search by voice\ntagName = DIV\nrole = button\n===\nlabel = 10\nname = Search by image\ntagName = DIV\nrole = button\n===\nlabel = 11\nname = Google Search\ntagName = INPUT\ncurrentValue = Google Search\nplaceholder = \nrole = button\n===\nlabel = 12\nname = I'm Feeling Lucky\ntagName = INPUT\ncurrentValue = I'm Feeling Lucky\nplaceholder = \nrole = button\n===\nlabel = 13\nname = Advertising\ntagName = A\n===\nlabel = 14\nname = Business\ntagName = A\n===\nlabel = 15\nname = How Search works\ntagName = A\n===\nlabel = 16\nname = Our third decade of climate action: join us\ntagName = A\n===\nlabel = 17\nname = Privacy\ntagName = A\n===\nlabel = 18\nname = Terms\ntagName = A\n===\nlabel = 19\nname = Settings\ntagName = DIV\nrole = button\n\nThis textarea currently has focus:\nlabel = 8\nname = Search\ntagName = TEXTAREA\ncurrentValue = \nplaceholder = \nrole = combobox\n",
"response": "{\n \"thought\": \"I will enter 'bananas' into the search bar and then press Enter to perform the search.\",\n \"action\": {\n \"name\": \"setValueAndEnter\",\n \"args\": {\n \"label\": \"8\",\n \"value\": \"bananas\"\n }\n }\n}",
"usage": {
"completion_tokens": 66,
"prompt_tokens": 2599,
"total_tokens": 2665
}
},
{
"action": {
"operation": {
"args": {},
"name": "finish"
},
"thought": "The search results for 'bananas' are displayed. The first result is a Wikipedia link about bananas."
},
"prompt": "The user requests the following task:\n\n Search for bananas\nYou have already taken the following actions: \nThought: I will enter 'bananas' into the search bar and then press Enter to perform the search.\nAction:{\"name\":\"setValueAndEnter\",\"args\":{\"label\":\"8\",\"value\":\"bananas\"}}\n\n\n\nCurrent time: 5/16/2024, 4:24:20 PM\nCurrent URL: https://www.google.com/search\nCurrent page scrolling position: 22.2%\n\n\nUse the following data as a reference of the annotated elements (using `===` as a delimiter between each annotation):\n\nlabel = 1\nname = Celebrating Hank Adams\ntagName = A\n===\nlabel = 2\nname = Search\ntagName = TEXTAREA\ncurrentValue = bananas\nplaceholder = \nrole = combobox\n===\nlabel = 3\nname = Clear\ntagName = DIV\nrole = button\n===\nlabel = 4\nname = Search by voice\ntagName = DIV\nrole = button\n===\nlabel = 5\nname = Search by image\ntagName = DIV\nrole = button\n===\nlabel = 6\nname = Search\ntagName = BUTTON\n===\nlabel = 7\nname = Settings\ntagName = DIV\nrole = button\n===\nlabel = 8\nname = Google apps\ntagName = A\nrole = button\n===\nlabel = 9\nname = Sign in\ntagName = A\n===\nlabel = 10\nname = Images\ntagName = A\nrole = link\n===\nlabel = 11\nname = Shopping\ntagName = A\nrole = link\n===\nlabel = 12\nname = Videos\ntagName = A\nrole = link\n===\nlabel = 13\nname = Forums\ntagName = A\nrole = link\n===\nlabel = 14\nname = More\ntagName = DIV\nrole = button\n===\nlabel = 15\nname = Tools\ntagName = DIV\nrole = button\n===\nlabel = 16\nname = Add Nutrition facts\ntagName = A\nrole = link\n===\nlabel = 17\nname = Add Benefits\ntagName = A\nrole = link\n===\nlabel = 18\nname = Add Recipes\ntagName = A\nrole = link\n===\nlabel = 19\nname = Add In stock\ntagName = A\nrole = link\n===\nlabel = 20\nname = Add Types\ntagName = A\nrole = link\n===\nlabel = 21\nname = Add Open now\ntagName = A\nrole = link\n===\nlabel = 22\nname = Add Tree\ntagName = A\nrole = link\n===\nlabel = 23\nname = Add Fruit\ntagName = A\nrole = 
link\n===\nlabel = 24\nname = Add Plant\ntagName = A\nrole = link\n===\nlabel = 25\nname = SafeSearch\ntagName = DIV\nrole = button\n===\nlabel = 26\nname = Banana Wikipedia https://en.wikipedia.org \u203a wiki \u203a Banana\ntagName = A\n===\nlabel = 27\nname = About this result\ntagName = DIV\nrole = button\n===\nlabel = 28\nname = List of banana cultivars\ntagName = A\n===\nlabel = 29\nname = Cavendish banana\ntagName = A\n===\nlabel = 30\nname = Gros Michel\ntagName = A\n===\nlabel = 31\nname = Cooking banana\ntagName = A\n===\nlabel = 32\nname = About this result\ntagName = DIV\nrole = button\n===\nlabel = 33\nname = What are the benefits of eating bananas?\ntagName = DIV\nrole = button\n===\nlabel = 34\nname = Is it good to eat a banana every day?\ntagName = DIV\nrole = button\n===\nlabel = 35\nname = What are bananas high in?\ntagName = DIV\nrole = button\n===\nlabel = 36\nname = What is the best time to eat bananas?\ntagName = DIV\nrole = button\n===\nlabel = 37\nname = Feedback\ntagName = DIV\nrole = button\n===\nlabel = 38\nname = Food Nutrition\u2026Banana nutrition\ntagName = DIV\nrole = button\n===\nlabel = 39\nname = Protein Amount\u2026protein in banana\ntagName = DIV\nrole = button\n===\nlabel = 40\nname = Equivalent\u2026what fruit is equivalent to a banana\ntagName = DIV\nrole = button\n===\nlabel = 41\nname = Sugar Amount\u2026how much sugar in a banana\ntagName = DIV\nrole = button\n===\nlabel = 42\nname = 6 more\ntagName = DIV\nrole = button\n===\nlabel = 43\nname = Feedback\ntagName = DIV\nrole = button\n",
"response": "{\n \"thought\": \"The search results for 'bananas' are displayed. The first result is a Wikipedia link about bananas.\",\n \"action\": {\n \"name\": \"finish\",\n \"args\": {}\n }\n}",
"usage": {
"completion_tokens": 47,
"prompt_tokens": 3148,
"total_tokens": 3195
}
}
]
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Oops, something went wrong.
Loading
Oops, something went wrong.