In [1]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Image, GenerationConfig

PROJECT_ID="ak-flow"
REGION = "us-central1"
vertexai.init(project=PROJECT_ID, location=REGION)

conifg = GenerationConfig(
    temperature=0.1,
)
generative_multimodal_model = GenerativeModel("gemini-pro-vision")
text_model = GenerativeModel("gemini-pro")

In [2]:
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement

options = ChromeOptions()
# options.add_argument("force-device-scale-factor=0.5")
options.add_argument("window-size=1920,1080")
driver = Chrome(options=options)

driver.get('chrome://settings/')
driver.execute_script('chrome.settingsPrivate.setDefaultZoom(1.25);')
driver.get("https://services.sungarddx.com/LogOn/?redirectUrl=%2F")
# driver.get("https://iam.intralinks.com/idp/login/?applicationid=98e3df54-2ef1-48b4-82d4-02b83d273d23&hostname=services.intralinks.com")

In [3]:
import cv2
import bs4

def screenshot_with_highlights(filename: str):
    driver.save_screenshot(filename)
    image = cv2.imread(filename)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


    selectors = ["input", "button", "a", "select", "textarea"]
    elems: list[WebElement] = []
    for selector in selectors:
        candidates = driver.find_elements(By.CSS_SELECTOR, selector)

        # filter out elements with no size
        elems.extend(filter(lambda elem: elem.is_displayed(), candidates))

    for elem in elems:
        # draw a rectangle around the element
        h, w, x, y = map(lambda x: int(x * 1.25), elem.rect.values())
        cv2.rectangle(image, (x, y), (x+w, y+h), (255, 0, 0), 2)

    cv2.imwrite(filename, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)) 

    return Image.load_from_file(filename)

def get_cleaned_html():
    html = driver.find_element(
        By.CSS_SELECTOR, 'html').get_attribute('innerHTML')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # remove all the script and style tags
    for tag in soup(['head', 'title', 'meta', 'script', 'style']):
        tag.decompose()

    # remove comments
    comments = soup.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
        comment.extract()

    # remove all base64 images
    for img in soup.find_all('img'):
        if img.has_attr('src') and img['src'].startswith('data:image'):
            img.decompose()

    return str(soup)

def click(css_selector: str):
    driver.find_element(By.CSS_SELECTOR, css_selector).click()

def type(css_selector: str, text: str):
    driver.find_element(By.CSS_SELECTOR, css_selector).send_keys(text)

In [4]:
def get_step(previous):
    prompt = f"""Describe what is in the picture. Provide a description of the elements in the picture.
Answer yes or no to the following questions:
- Is there a popup on the screen?
- Is there a email address or username field in the picture?
- Does the username field contain text?
- Is there a password field in the picture?
- Does the password field have anything inside of it? (text, asterisks, dots)

If you were helping a person log in, what specific action would you recommend out of the following:
- Close popups because they are blocking the view
- Enter username
- Click Next
- Enter password
- Click login

You can only choose one action. For example you can say "Close popups because they are blocking the view" or "Enter username" or "Click Next" or "Enter password" or "Click login"

You've done these steps so far:
{previous}

Don't repeat steps.

Provide the response as a json object with the following keys:
- page_description: <description of the elements in the picture>
- <question>: <yes or no>
- action: <action to take>
- how_to: <how to take the action>
Thanks!
"""
    driver.save_screenshot("screenshot.png")
    image = Image.load_from_file("screenshot.png")

    return generative_multimodal_model.generate_content([image, prompt], generation_config=conifg)


In [11]:
def get_action(action):
    findCssPrompt = f"""
You are given html and a json object describing the page as a person would see it. The json is as follows:
{action}
You have two python functions you can call to help you.
- click(css_selector: string): void
- type(css_selector: string, text: string): void

The parameters must be in quotes. For example:
click("button")
type("input", "hello")

Your username is "user@example.ca"
Your password is "password123!"

What do you need to do to accomplish the "action" as described in the json object?
What css selector do you need to interact with?
If you're selecting by ID, use "[id='<id>']" as the css selector.

Please provide the following keys in a json object:
- thoughts: your thoughts on the task with answers to the questions
- step: step description
- specific css selector to interact with: css selector
- function: <funciton call with parameters filled>
"""
    return generative_multimodal_model.generate_content([get_cleaned_html(), findCssPrompt], generation_config=conifg)

In [6]:
import re
import json

prevSteps = []

def get_json(responseText):
    extractJson = re.search(r"\{.*\}", responseText, re.DOTALL)
    return json.loads(extractJson.group(0))

def update_prev_steps(responseJson):
    global prevSteps
    prevSteps.append((responseJson["step"], responseJson["function"]))

In [7]:
prevSteps = []

In [8]:
step = get_step("\n".join([f"{step[0]}: {step[1]}" for step in prevSteps]))
print(step.text)
action = get_action(step.text)
print(action.text)
actionJson = get_json(action.text)
eval(actionJson['function'])
update_prev_steps(actionJson)

 ```json
{
 "page_description": "There is a green background with a FIS logo in the top left corner. In the center of the page is a white box with a blue FIS logo and 'Digital Data Exchange' text at the top. Below that is a form with two input fields: 'Email' and 'Password'. The 'Email' field has focus. There is a checkbox next to the 'Password' field that says 'Remember Me'. Below the 'Password' field is a blue button that says 'Submit'. Below the button is a link that says 'Forgot Your Password?'. In the bottom right corner is a dropdown that says 'Change Language: English (United States)'.",
 "is_there_a_popup_on_the_screen": "no",
 "is_there_a_email_address_or_username_field_in_the_picture": "yes",
 "does_the_username_field_contain_text": "no",
 "is_there_a_password_field_in_the_picture": "yes",
 "does_the_password_field_have_anything_inside_of_it": "no",
 "action": "Enter username",
 "how_to": "Type your username into the 'Email' field."
}
```
```json
{
 "thoughts": "I need to cli

In [9]:
step = get_step("\n".join([f"{step[0]}: {step[1]}" for step in prevSteps]))
print(step.text)
action = get_action(step.text)
print(action.text)
actionJson = get_json(action.text)
eval(actionJson['function'])
update_prev_steps(actionJson)

 ```json
{
  "page_description": "There is a green background with a FIS logo in the top left corner. In the center of the page is a white box with a form to log in. There are two fields in the form, one labeled 'Email' and the other labeled 'Password'. There is a button below the form that says 'Submit'. Below the button is a link that says 'Activate Your Account'. To the right of the 'Activate Your Account' link is a link that says 'Forgot Your Email?'. Below the 'Forgot Your Email?' link is a link that says 'Forgot Your Password?'. Below the 'Forgot Your Password?' link is a link that says 'Unlock Account'. Below the 'Unlock Account' link is a link that says 'Change Language: English (United States)'.",
  "is_there_a_popup_on_the_screen": "no",
  "is_there_a_email_address_or_username_field_in_the_picture": "yes",
  "does_the_username_field_contain_text": "yes",
  "is_there_a_password_field_in_the_picture": "yes",
  "does_the_password_field_have_anything_inside_of_it": "no",
  "actio

In [13]:
step = get_step("\n".join([f"{step[0]}: {step[1]}" for step in prevSteps]))
print(step.text)
action = get_action(step.text)
print(action.text)
actionJson = get_json(action.text)
eval(actionJson['function'])
update_prev_steps(actionJson)

 ```json
{
  "page_description": "There is a green background with a FIS logo in the top left corner. In the center of the page is a white box with a form to login. There are two fields in the form, one labeled 'Email' and the other labeled 'Password'. The email field contains the text 'user@example.ca'. There is a button below the form that says 'Submit'. Below the button is a link that says 'Forgot Your Password?'",
  "Is there a popup on the screen?": "no",
  "Is there a email address or username field in the picture?": "yes",
  "Does the username field contain text?": "yes",
  "Is there a password field in the picture?": "yes",
  "Does the password field have anything inside of it? (text, asterisks, dots)": "yes",
  "action": "Click login",
  "how_to": "Click the 'Submit' button."
}
```
```json
{
  "thoughts": "I need to click the submit button to log in. The submit button has the css selector '#854e7817c6fc41dda6d1c8d43c5dd920'.",
  "step": "Click the submit button.",
  "specific 

InvalidSelectorException: Message: invalid selector: An invalid or illegal selector was specified
  (Session info: chrome=121.0.6167.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalid-selector-exception
Stacktrace:
#0 0x5629411bad93 <unknown>
#1 0x562940e9e337 <unknown>
#2 0x562940ea369a <unknown>
#3 0x562940ea596f <unknown>
#4 0x562940ea5a1c <unknown>
#5 0x562940ee9a6e <unknown>
#6 0x562940ee9e51 <unknown>
#7 0x562940f2dbc4 <unknown>
#8 0x562940f0c46d <unknown>
#9 0x562940f2b12d <unknown>
#10 0x562940f0c1e3 <unknown>
#11 0x562940edc135 <unknown>
#12 0x562940edd13e <unknown>
#13 0x56294117ee4b <unknown>
#14 0x562941182dfa <unknown>
#15 0x56294116b6d5 <unknown>
#16 0x562941183a6f <unknown>
#17 0x56294114f69f <unknown>
#18 0x5629411a8098 <unknown>
#19 0x5629411a8262 <unknown>
#20 0x5629411b9f34 <unknown>
#21 0x7fa1a397fac3 <unknown>
