### Prototype Code : Scraper

In [1]:
from selenium import webdriver

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

def print_list(group):
    for member in group:
        print(member)

In [None]:
# GET URL FUNCTION
def get_url():
    return input("Enter Website URL: ")

# DATA SCRAPING FUNCTION
def data_scrape(url):
    # Set up Selenium with a WebDriver, e.g., ChromeDriver
    driver = webdriver.Firefox()
    
    # Open the website
    driver.get(url)
    
    # Find all visible buttons
    buttons = driver.find_elements(By.CSS_SELECTOR, "button")
    visible_buttons = []
    for button in buttons:
        if button.is_displayed():
            store = f"Button Element: {button.text} with URL : {button.get_attribute('href')}"
            visible_buttons.append(store)
            
    # Find all visible links
    links = driver.find_elements(By.TAG_NAME,"a")
    visible_links = []
    for link in links:
        if link.is_displayed():
            store = f"Link Element: {link.text} with URL : {link.get_attribute('href')}"
            visible_links.append(store)
            
    # Find all visible forms (text inputs)
    visible_text_inputs = []
    # Scrape input fields
    input_fields = driver.find_elements(By.TAG_NAME,"input")
    for input_field in input_fields:
        if input_field.is_displayed():
            store = f"Input Field Name: {input_field.get_attribute("name")} Input field type: {input_field.get_attribute("type")}"
            visible_text_inputs.append(store)
    
    # Find all visible text
    visible_text = []
    # Scrape headings
    for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
        headings = driver.find_elements(By.TAG_NAME,f"h{level}")
        for heading in headings:
            if heading.is_displayed():
                store = f"Heading (h{level}): {heading.text}"
                visible_text.append(store)
    # Scrape text content
    paragraphs = driver.find_elements(By.TAG_NAME,"p")
    for paragraph in paragraphs:
        if paragraph.is_displayed():
            store = f"Paragraph Text: {paragraph.text}"
            visible_text.append(store)
            
    # Find all visible navigation menus
    visible_nav_menus = []
    # Scrape navigation menus
    navigation_menus = driver.find_elements(By.TAG_NAME,"nav")
    for nav in navigation_menus:
        if nav.is_displayed():
            store = f"Navigation Menu: {nav.text}"
            visible_nav_menus.append(store)
            
    # Close the browser
    driver.quit()

    # Return Data
    data = []
    
    # Data append
    data.append(visible_buttons)
    data.append(visible_links)
    data.append(visible_text_inputs)
    data.append(visible_text)
    data.append(visible_nav_menus)

    return data

In [None]:
# Possibly Outdated: Other methods
from selenium import webdriver

# Set up Selenium with a WebDriver, e.g., ChromeDriver
driver = webdriver.Chrome()

# Open the website
driver.get("https://example.com")

# Find an element (example button)
button = driver.find_element_by_css_selector("button")

# Get various properties
button_text = button.text
button_id = button.get_attribute("id")
button_class = button.get_attribute("class")
button_color = button.value_of_css_property("background-color")
button_size = button.size
button_location = button.location
button_tag_name = button.tag_name
button_visible = button.is_displayed()
button_enabled = button.is_enabled()
button_inner_html = button.get_attribute("innerHTML")

# Print properties
print("Button text:", button_text)
print("Button ID:", button_id)
print("Button class:", button_class)
print("Button color:", button_color)
print("Button size:", button_size)
print("Button location:", button_location)
print("Button tag name:", button_tag_name)
print("Button visible:", button_visible)
print("Button enabled:", button_enabled)
print("Button inner HTML:", button_inner_html)

# Close the browser
driver.quit()


### Version 1: Improvement According to Design

##### Planning

In [None]:
#####################################
# GET URL FUNCTION (non UI version) #
#####################################
def get_url():
    return input("Enter Website URL: ")

# DATA SCRAPING FUNCTION
def data_scrape(url):
    # Setup Selenium Webdriver
    # TODO : Add other drivers according to user settings. Detect user browser and use that as driver potentially
    driver = webdriver.Firefox()
    # Setup Return Data
    data = []

    # Open the website
    driver.get(url)

    #######################################
    ## GET UI ELEMENTS (must be visible) ##
    #######################################

    # BUTTONS
    # [~] TODO : "Button <button-name> with size <size> and color <color>. This button is <isEnabled>"
    """
    - Name (text)
    - Size 
    - Color 
    - isEnabled
    """
    buttons = driver.find_elements(By.CSS_SELECTOR, "button")
    visible_buttons = []
    for button in buttons:
        if button.is_displayed():
            
            # Variation a - Simple (only link assigned to button alongside button name)
            button_link = button.get_attribute('href')

            # Variation b - Complex (with additional properties)
            button_size = button.size
            button_color = button.value_of_css_property("background-color")
            button_enabled = "is clickable" if button.is_enabled() else "not clickable"
            button_template = f"with Size: {button_size} with Color: {button_color} and the button {button_enabled}"

            if (button_link != ""):
                store = f"Button Element: {button.text} with URL : {button_link} {button_template}"
            else:
                store = f"Button Element: {button.text} {button_template}"
            visible_buttons.append(store)
            
    # LINKS
    # [~] TODO : "Link <link-name> with url <href>"
    """
    - Name (text)
    - href
    """
    links = driver.find_elements(By.TAG_NAME,"a")
    visible_links = []
    for link in links:
        if link.is_displayed():
            
            link_url = f"with URL : {link.get_attribute('href')}" if link.get_attribute('href') != "" else ""

            store = f"Link Element: {link.text} {link_url}"
            visible_links.append(store)

    # VISIBLE TEXT
    # [~] TODO : 
    #   Separate Paragraphs From HNs [perhaps no need]
    #   ""<p> or <h> with content <text>""
    """
    - Content (text)
    """
    visible_text = []
    # HN
    for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
        headings = driver.find_elements(By.TAG_NAME,f"h{level}")
        for heading in headings:
            if heading.is_displayed():
                store = f"Heading Element (h{level}): {heading.text}"
                visible_text.append(store)
    # PARAGRAPHS
    paragraphs = driver.find_elements(By.TAG_NAME,"p")
    for paragraph in paragraphs:
        if paragraph.is_displayed():
            store = f"Paragraph Element: {paragraph.text}"
            visible_text.append(store)

    # FORMS : INPUT TYPE TEXT
    # [~] TODO : 
    #   Input Type Text : "form input of type text named <name>"
    #   Input Type Submit : "form input of type submit named <name>"
    """
    - input type text
        - Name
    - input type submit
        - Name
    """
    visible_text_inputs = []
    input_tags = driver.find_elements(By.TAG_NAME,"input")
    for input_tag in input_tags:
        if input_tag.is_displayed():
            if (input_tag.get_attribute("type") == "submit"):
                store = f"Form Submit Button Element: {input_tag.get_attribute("name")}"
            else:
                store = f"Input Field ELement: {input_tag.get_attribute("name")} and Input field type: {input_tag.get_attribute("type")}"
            visible_text_inputs.append(store)
            
    # Close the browser
    driver.quit()
    
    ###########################################
    ## COMPILE ELEMENTS INTO A LIST OF LISTS ##
    ###########################################

    data.append(visible_buttons)
    data.append(visible_links)
    data.append(visible_text_inputs)
    data.append(visible_text)

    return data

##### Design-Coding : V1 (Individual) [TO-TEST]

In [3]:
driver = webdriver.Firefox()
driver.get("https://en.wikipedia.org/wiki/Philippines")

buttons = driver.find_elements(By.CSS_SELECTOR, "button")
visible_buttons = []
for button in buttons:
    if button.is_displayed():
        # Button Element Attributes
        button_text = f"Button Element '{button.text}'" if (button.text != None and button.text != "") else "Button Element Without Name"
        button_size = f"With Size Attribute {button.size}" if button.size != None else "Without Size Attribute"
        button_color = f"With Background Color {button.value_of_css_property("background-color")}" if button.value_of_css_property("background-color") != None else "Without Background Color"
        button_enabled = "and the button is Clickable" if button.is_enabled() else "and the button is not Clickable"
        
        # Store Button Element
        store = f"{button_text} {button_size} {button_color} {button_enabled}"
        visible_buttons.append(store)

driver.quit()

## BUTTON TEST ##
print_list(visible_buttons)

Button Element 'Search' With Size Attribute {'height': 32.0, 'width': 70.2833251953125} With Background Color rgb(248, 249, 250) and the button is Clickable
Button Element 'hide' With Size Attribute {'height': 22.40000915527344, 'width': 36.80000305175781} With Background Color rgb(234, 236, 240) and the button is Clickable
Button Element 'Toggle History subsection' With Size Attribute {'height': 22.0, 'width': 22.0} With Background Color rgba(0, 0, 0, 0) and the button is Clickable
Button Element 'Toggle Geography subsection' With Size Attribute {'height': 22.0, 'width': 22.0} With Background Color rgba(0, 0, 0, 0) and the button is Clickable
Button Element 'Toggle Government and politics subsection' With Size Attribute {'height': 22.0, 'width': 22.0} With Background Color rgba(0, 0, 0, 0) and the button is Clickable
Button Element 'Toggle Economy subsection' With Size Attribute {'height': 22.0, 'width': 22.0} With Background Color rgba(0, 0, 0, 0) and the button is Clickable
Button E

In [6]:

driver = webdriver.Firefox()
driver.get("https://en.wikipedia.org/wiki/Philippines")

links = driver.find_elements(By.TAG_NAME,"a")
visible_links = []
for link in links:
    if link.is_displayed():
        # Link Element Attributes
        link_text = f"Link Element '{link.text}'" if (link.text != None and link.text != "") else "Link Element Without Name"
        link_url = f"With URL {link.get_attribute('href')}" if link.get_attribute('href') != None else "Without URL"
        # Additional Attributes
        link_rel = f"With rel attribute {link.get_attribute('rel')}" if link.get_attribute('rel') != None and link.get_attribute('rel') != "" else ""
        link_target = f"With target attribute {link.get_attribute('target')}" if link.get_attribute('target') != None and link.get_attribute('target') != "" else ""
        link_download = f"This is a download link to document {link.get_attribute('download')}" if link.get_attribute('download') != None and link.get_attribute('download') != "" else ""
        
        # Store Link Element
        store = f"{link_text} {link_url} {link_rel} {link_target} {link_download}"
        visible_links.append(store)

driver.quit()

## LINKS TEST ##
print_list(visible_links)

Link Element 'Jump to content' With URL https://en.wikipedia.org/wiki/Philippines#bodyContent   
Link Element Without Name With URL https://en.wikipedia.org/wiki/Main_Page   
Link Element 'Donate' With URL https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en   
Link Element 'Create account' With URL https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Philippines   
Link Element 'Log in' With URL https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Philippines   
Link Element '(Top)' With URL https://en.wikipedia.org/wiki/Philippines#   
Link Element 'Etymology' With URL https://en.wikipedia.org/wiki/Philippines#Etymology   
Link Element 'History' With URL https://en.wikipedia.org/wiki/Philippines#History   
Link Element 'Geography' With URL https://en.wikipedia.org/wiki/Philippines#Geography   
Link Element 'Government and politics' With URL https://en.wikipedia.org/wiki/Philippines#Government

In [None]:
driver = webdriver.Firefox()
driver.get("https://en.wikipedia.org/wiki/Philippines")

visible_text = []
# Heading Elements
for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
    headings = driver.find_elements(By.TAG_NAME,f"h{level}")
    for heading in headings:
        if heading.is_displayed():
            store = f"Heading Element (h{level}): '{heading.text}'"
            # Store
            visible_text.append(store)
# Paragraph Elements
paragraphs = driver.find_elements(By.TAG_NAME,"p")
for paragraph in paragraphs:
    if paragraph.is_displayed():
        store = f"Paragraph Element: '{paragraph.text}'"
        # Store
        visible_text.append(store)

driver.quit()

print_list(visible_text)

In [9]:
driver = webdriver.Firefox()
driver.get("https://en.wikipedia.org/wiki/Philippines")

visible_text_inputs = []
input_tags = driver.find_elements(By.TAG_NAME,"input")
for input_tag in input_tags:
    if input_tag.is_displayed():
        # Basic Attributes
        input_field_name = f"Input Field Element: '{input_tag.get_attribute('name')}'" if input_tag.get_attribute('name') != None else "Without Name"
        input_field_type = f"With Type {input_tag.get_attribute('type')}" if input_tag.get_attribute('name') != None else "Without Type"
        # Additional Attributes
        input_field_value = f"With Value {input_tag.get_attribute('value')}" if input_tag.get_attribute('name') != None else "Without Value"
        input_field_placeholder = f"With Placeholder '{input_tag.get_attribute('placeholder')}'" if input_tag.get_attribute('name') != None else "Without Placeholder"
        input_field_readonly = f"Is Readonly" if input_tag.get_attribute('readonly') != None else "Is Editable"
        input_field_disabled = f"Is Not Clickable" if input_tag.get_attribute('disabled') != None else "Is Clickable"
        input_field_required = f"Is Required" if input_tag.get_attribute('required') != None else "Not Required"
        input_field_autocomplete = f"Is Autocomplete" if input_tag.get_attribute('autocomplete') != None else "Not Autocomplete"

        # Store based on type
        if (input_tag.get_attribute('type') == "submit"):
            store = f"Form Submit Button Element: {input_tag.get_attribute("name")} {input_field_disabled}"
        else:
            store = f"{input_field_name} {input_field_type} {input_field_value} {input_field_placeholder} {input_field_readonly} {input_field_disabled} {input_field_disabled} {input_field_autocomplete}"
        visible_text_inputs.append(store)

driver.quit()

print_list(visible_text_inputs)

Input Field Element: 'search' With Type search With Value  With Placeholder 'Search Wikipedia' Is Editable Is Clickable Is Clickable Is Autocomplete


##### V1 Resulting Version

In [None]:
## SCRAPER IMPORTS ##
from selenium import webdriver

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

## GET URL FUNCTION (non UI version) ##
def get_url():
    return input("Enter Website URL: ")

# DATA SCRAPING FUNCTION
def data_scrape(url):
    # Setup Selenium Webdriver
    # TODO : For V2 Add other drivers according to user settings. Detect user browser and use that as driver potentially
    driver = webdriver.Firefox()
    # Setup Return Data
    data = []

    # Open the website
    driver.get(url)
    
    ## GET UI ELEMENTS (must be visible) ##

    # BUTTONS
    buttons = driver.find_elements(By.CSS_SELECTOR, "button")
    for button in buttons:
        if button.is_displayed():
            # Button Element Attributes
            button_text = f"Button Element '{button.text}'" if (button.text != None and button.text != "") else "Button Element Without Name"
            button_size = f"With Size Attribute {button.size}" if button.size != None else "Without Size Attribute"
            button_color = f"With Background Color {button.value_of_css_property("background-color")}" if button.value_of_css_property("background-color") != None else "Without Background Color"
            button_enabled = "and the button is Clickable" if button.is_enabled() else "and the button is not Clickable"
            
            # Store Button Element
            store = f"{button_text} {button_size} {button_color} {button_enabled}"
            data.append(store)
            
    # LINKS
    links = driver.find_elements(By.TAG_NAME,"a")
    for link in links:
        if link.is_displayed():
            # Link Element Attributes
            link_text = f"Link Element '{link.text}'" if (link.text != None and link.text != "") else "Link Element Without Name"
            link_url = f"With URL {link.get_attribute('href')}" if link.get_attribute('href') != None else "Without URL"
            # Additional Attributes
            link_rel = f"With rel attribute {link.get_attribute('rel')}" if link.get_attribute('rel') != None and link.get_attribute('rel') != "" else ""
            link_target = f"With target attribute {link.get_attribute('target')}" if link.get_attribute('target') != None and link.get_attribute('target') != "" else ""
            link_download = f"This is a download link to document {link.get_attribute('download')}" if link.get_attribute('download') != None and link.get_attribute('download') != "" else ""
            
            # Store Link Element
            store = f"{link_text} {link_url} {link_rel} {link_target} {link_download}"
            data.append(store)

    # VISIBLE TEXT
    # Heading Elements
    for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
        headings = driver.find_elements(By.TAG_NAME,f"h{level}")
        for heading in headings:
            if heading.is_displayed():
                store = f"Heading Element (h{level}): '{heading.text}'"
                # Store
                data.append(store)
    # Paragraph Elements
    paragraphs = driver.find_elements(By.TAG_NAME,"p")
    for paragraph in paragraphs:
        if paragraph.is_displayed():
            store = f"Paragraph Element: '{paragraph.text}'"
            # Store
            data.append(store)

    # INPUT
    input_tags = driver.find_elements(By.TAG_NAME,"input")
    for input_tag in input_tags:
        if input_tag.is_displayed():
            # Basic Attributes
            input_field_name = f"Input Field Element: '{input_tag.get_attribute('name')}'" if input_tag.get_attribute('name') != None else "Without Name"
            input_field_type = f"With Type {input_tag.get_attribute('type')}" if input_tag.get_attribute('name') != None else "Without Type"
            # Additional Attributes
            input_field_value = f"With Value {input_tag.get_attribute('value')}" if input_tag.get_attribute('name') != None else "Without Value"
            input_field_placeholder = f"With Placeholder '{input_tag.get_attribute('placeholder')}'" if input_tag.get_attribute('name') != None else "Without Placeholder"
            input_field_readonly = f"Is Readonly" if input_tag.get_attribute('readonly') != None else "Is Editable"
            input_field_disabled = f"Is Not Clickable" if input_tag.get_attribute('disabled') != None else "Is Clickable"
            input_field_required = f"Is Required" if input_tag.get_attribute('required') != None else "Not Required"
            input_field_autocomplete = f"Is Autocomplete" if input_tag.get_attribute('autocomplete') != None else "Not Autocomplete"

            # Store based on type
            if (input_tag.get_attribute('type') == "submit"):
                store = f"Form Submit Button Element: {input_tag.get_attribute("name")} {input_field_disabled}"
            else:
                store = f"{input_field_name} {input_field_type} {input_field_required} {input_field_value} {input_field_placeholder} {input_field_readonly} {input_field_disabled} {input_field_autocomplete}"
            data.append(store)
            
    # Close the browser
    driver.quit()
    
    ## RETURN COMPILED ELEMENT DATA ##
    return data

##### Version 2: Polished Context

In [None]:
# TODO :
# Polish Conditionals' results (make it more understandable to the llm)
# Consider adding better context to llm prompt

##### Back to the drawing board : individual testing

In [None]:
# Webdriver testing

def setup_driver():
    try :
        driver = webdriver.Firefox() # Default with Firefox
    except Exception:
        try : 
            driver = webdriver.Chrome() # Default with Chrome
        except Exception:
            try : 
                driver = webdriver.Edge() # Default with Edge
            except Exception:
                print("Error. No usable browser found for scraping")
    
    return driver

In [None]:
"""
Button Element
Name : 
Size Attribute : large/small, to be specific : {}
Background Color : 
Is Clickable : 
"""

In [None]:
# BUTTONS
buttons = driver.find_elements(By.CSS_SELECTOR, "button")
for button in buttons:
    if button.is_displayed():
        # Button Element Attributes
        button_text = f"Button Element '{button.text}'" if (button.text != None and button.text != "") else "Button Element Without Name"
        button_size = f"With Size Attribute {button.size}" if button.size != None else "Without Size Attribute"
        button_color = f"With Background Color {button.value_of_css_property("background-color")}" if button.value_of_css_property("background-color") != None else "Without Background Color"
        button_enabled = "and the button is Clickable" if button.is_enabled() else "and the button is not Clickable"
        
        # Store Button Element
        store = f"{button_text} {button_size} {button_color} {button_enabled}"

In [None]:
    # LINKS
links = driver.find_elements(By.TAG_NAME,"a")
for link in links:
    if link.is_displayed():
        # Link Element Attributes
        link_text = f"Link Element '{link.text}'" if (link.text != None and link.text != "") else "Link Element Without Name"
        link_url = f"With URL {link.get_attribute('href')}" if link.get_attribute('href') != None else "Without URL"
        # Additional Attributes
        link_rel = f"With rel attribute {link.get_attribute('rel')}" if link.get_attribute('rel') != None and link.get_attribute('rel') != "" else ""
        link_target = f"With target attribute {link.get_attribute('target')}" if link.get_attribute('target') != None and link.get_attribute('target') != "" else ""
        link_download = f"This is a download link to document {link.get_attribute('download')}" if link.get_attribute('download') != None and link.get_attribute('download') != "" else ""
        
        # Store Link Element
        store = f"{link_text} {link_url} {link_rel} {link_target} {link_download}"



In [None]:
# VISIBLE TEXT
# Heading Elements
for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
    headings = driver.find_elements(By.TAG_NAME,f"h{level}")
    for heading in headings:
        if heading.is_displayed():
            store = f"Heading Element (h{level}): '{heading.text}'"
# Paragraph Elements
paragraphs = driver.find_elements(By.TAG_NAME,"p")
for paragraph in paragraphs:
    if paragraph.is_displayed():
        store = f"Paragraph Element: '{paragraph.text}'"

In [None]:
# INPUT
input_tags = driver.find_elements(By.TAG_NAME,"input")
for input_tag in input_tags:
    if input_tag.is_displayed():
        # Basic Attributes
        input_field_name = f"Input Field Element: '{input_tag.get_attribute('name')}'" if input_tag.get_attribute('name') != None else "Without Name"
        input_field_type = f"With Type {input_tag.get_attribute('type')}" if input_tag.get_attribute('name') != None else "Without Type"
        # Additional Attributes
        input_field_value = f"With Value {input_tag.get_attribute('value')}" if input_tag.get_attribute('name') != None else "Without Value"
        input_field_placeholder = f"With Placeholder '{input_tag.get_attribute('placeholder')}'" if input_tag.get_attribute('name') != None else "Without Placeholder"
        input_field_readonly = f"Is Readonly" if input_tag.get_attribute('readonly') != None else "Is Editable"
        input_field_disabled = f"Is Not Clickable" if input_tag.get_attribute('disabled') != None else "Is Clickable"
        input_field_required = f"Is Required" if input_tag.get_attribute('required') != None else "Not Required"
        input_field_autocomplete = f"Is Autocomplete" if input_tag.get_attribute('autocomplete') != None else "Not Autocomplete"

        # Store based on type
        if (input_tag.get_attribute('type') == "submit"):
            store = f"Form Submit Button Element: {input_tag.get_attribute("name")} {input_field_disabled}"
        else:
            store = f"{input_field_name} {input_field_type} {input_field_required} {input_field_value} {input_field_placeholder} {input_field_readonly} {input_field_disabled} {input_field_autocomplete}"