### Prototype Code : Scraper

In [7]:
from selenium import webdriver

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

def print_list(group):
    """Print every member of ``group`` on its own line, in iteration order."""
    for entry in group:
        print(entry)

In [None]:
# GET URL FUNCTION
def get_url():
    """Prompt on the console for a website URL and return the text entered."""
    entered_url = input("Enter Website URL: ")
    return entered_url

# DATA SCRAPING FUNCTION
def data_scrape(url):
    """Open ``url`` in Firefox and describe the elements Selenium reports as displayed.

    Args:
        url: Address passed to ``driver.get``.

    Returns:
        A list of five lists of description strings, in this order: buttons,
        links, input fields, text (h1-h6 headings followed by paragraphs),
        and navigation menus.
    """
    # Set up Selenium with the Firefox WebDriver
    driver = webdriver.Firefox()

    def displayed(by, value):
        # Keep only the matches for which is_displayed() is true.
        return [
            element
            for element in driver.find_elements(by, value)
            if element.is_displayed()
        ]

    # try/finally so the browser is closed even if a lookup raises mid-scrape
    try:
        # Open the website
        driver.get(url)

        # Find all visible buttons
        visible_buttons = [
            f"Button Element: {button.text} with URL : {button.get_attribute('href')}"
            for button in displayed(By.CSS_SELECTOR, "button")
        ]

        # Find all visible links
        visible_links = [
            f"Link Element: {link.text} with URL : {link.get_attribute('href')}"
            for link in displayed(By.TAG_NAME, "a")
        ]

        # Find all visible input fields. The attribute names use single quotes
        # because reusing the enclosing double quote inside an f-string is a
        # SyntaxError before Python 3.12.
        visible_text_inputs = [
            f"Input Field Name: {input_field.get_attribute('name')} "
            f"Input field type: {input_field.get_attribute('type')}"
            for input_field in displayed(By.TAG_NAME, "input")
        ]

        # Find all visible text: headings first (h1 to h6), then paragraphs
        visible_text = []
        for level in range(1, 7):
            visible_text.extend(
                f"Heading (h{level}): {heading.text}"
                for heading in displayed(By.TAG_NAME, f"h{level}")
            )
        visible_text.extend(
            f"Paragraph Text: {paragraph.text}"
            for paragraph in displayed(By.TAG_NAME, "p")
        )

        # Find all visible navigation menus
        visible_nav_menus = [
            f"Navigation Menu: {nav.text}"
            for nav in displayed(By.TAG_NAME, "nav")
        ]
    finally:
        # Close the browser
        driver.quit()

    # Return Data
    return [
        visible_buttons,
        visible_links,
        visible_text_inputs,
        visible_text,
        visible_nav_menus,
    ]

In [None]:
# Possibly Outdated: Other methods
from selenium import webdriver
from selenium.webdriver.common.by import By

# Set up Selenium with a WebDriver, e.g., ChromeDriver
driver = webdriver.Chrome()

# try/finally so the browser is closed even when the lookup below raises
try:
    # Open the website
    driver.get("https://example.com")

    # Find an element (example button).
    # The find_element_by_* helpers were removed in Selenium 4; the By locator
    # form matches what the other cells in this notebook already use.
    button = driver.find_element(By.CSS_SELECTOR, "button")

    # Get various properties
    button_text = button.text
    button_id = button.get_attribute("id")
    button_class = button.get_attribute("class")
    button_color = button.value_of_css_property("background-color")
    button_size = button.size
    button_location = button.location
    button_tag_name = button.tag_name
    button_visible = button.is_displayed()
    button_enabled = button.is_enabled()
    button_inner_html = button.get_attribute("innerHTML")

    # Print properties
    print("Button text:", button_text)
    print("Button ID:", button_id)
    print("Button class:", button_class)
    print("Button color:", button_color)
    print("Button size:", button_size)
    print("Button location:", button_location)
    print("Button tag name:", button_tag_name)
    print("Button visible:", button_visible)
    print("Button enabled:", button_enabled)
    print("Button inner HTML:", button_inner_html)
finally:
    # Close the browser
    driver.quit()


### Version 1: Improvement According to Design

In [None]:
#####################################
# GET URL FUNCTION (non UI version) #
#####################################
def get_url():
    """Ask for a website URL on the console and return what was typed."""
    url_prompt = "Enter Website URL: "
    return input(url_prompt)

# DATA SCRAPING FUNCTION
def data_scrape(url):
    """Open ``url`` in Firefox and describe its displayed UI elements.

    Args:
        url: Address passed to ``driver.get``.

    Returns:
        A list of four lists of description strings, in this order: buttons,
        links, form inputs, and text (h1-h6 headings followed by paragraphs).
    """
    # Setup Selenium Webdriver
    # TODO : Add other drivers according to user settings. Detect user browser and use that as driver potentially
    driver = webdriver.Firefox()

    #######################################
    ## GET UI ELEMENTS (must be visible) ##
    #######################################
    def displayed(by, value):
        # Keep only the matches for which is_displayed() is true.
        return [
            element
            for element in driver.find_elements(by, value)
            if element.is_displayed()
        ]

    # try/finally so the browser is closed even if a lookup raises mid-scrape
    try:
        # Open the website
        driver.get(url)

        # BUTTONS
        # [~] TODO : "Button <button-name> with size <size> and color <color>. This button is <isEnabled>"
        #   - Name (text)
        #   - Size
        #   - Color
        #   - isEnabled
        visible_buttons = []
        for button in displayed(By.CSS_SELECTOR, "button"):
            # Variation a - Simple (only link assigned to button alongside button name)
            button_link = button.get_attribute('href')

            # Variation b - Complex (with additional properties)
            button_size = button.size
            button_color = button.value_of_css_property("background-color")
            button_enabled = "is clickable" if button.is_enabled() else "not clickable"
            button_template = f"with Size: {button_size} with Color: {button_color} and the button {button_enabled}"

            # get_attribute gives None (not "") for a missing attribute, so
            # test truthiness; comparing against "" emitted "with URL : None".
            if button_link:
                store = f"Button Element: {button.text} with URL : {button_link} {button_template}"
            else:
                store = f"Button Element: {button.text} {button_template}"
            visible_buttons.append(store)

        # LINKS
        # [~] TODO : "Link <link-name> with url <href>"
        #   - Name (text)
        #   - href
        visible_links = []
        for link in displayed(By.TAG_NAME, "a"):
            # Read href once; a missing href is None and must not be printed.
            href = link.get_attribute('href')
            link_url = f"with URL : {href}" if href else ""

            store = f"Link Element: {link.text} {link_url}"
            visible_links.append(store)

        # VISIBLE TEXT
        # [~] TODO :
        #   Separate Paragraphs From HNs [perhaps no need]
        #   ""<p> or <h> with content <text>""
        #   - Content (text)
        visible_text = []
        # HN
        for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
            for heading in displayed(By.TAG_NAME, f"h{level}"):
                visible_text.append(f"Heading Element (h{level}): {heading.text}")
        # PARAGRAPHS
        for paragraph in displayed(By.TAG_NAME, "p"):
            visible_text.append(f"Paragraph Element: {paragraph.text}")

        # FORMS : INPUT TYPE TEXT
        # [~] TODO :
        #   Input Type Text : "form input of type text named <name>"
        #   Input Type Submit : "form input of type submit named <name>"
        #   - input type text
        #       - Name
        #   - input type submit
        #       - Name
        visible_text_inputs = []
        for input_tag in displayed(By.TAG_NAME, "input"):
            # Attributes are read into locals first: nesting the enclosing
            # double quote inside an f-string is a SyntaxError before
            # Python 3.12.
            input_type = input_tag.get_attribute('type')
            input_name = input_tag.get_attribute('name')
            if input_type == "submit":
                store = f"Form Submit Button Element: {input_name}"
            else:
                store = f"Input Field ELement: {input_name} and Input field type: {input_type}"
            visible_text_inputs.append(store)
    finally:
        # Close the browser
        driver.quit()

    ###########################################
    ## COMPILE ELEMENTS INTO A LIST OF LISTS ##
    ###########################################
    return [visible_buttons, visible_links, visible_text_inputs, visible_text]

### Test : V1 (Batch)

In [None]:
# Batch test: prompt for a URL, scrape it, and print the returned list of lists.
url = get_url()
print(data_scrape(url))

### Test : V1 (Individual)

In [16]:
driver = webdriver.Firefox()
visible_buttons = []

# try/finally so the browser is closed even if a lookup raises mid-scrape
try:
    driver.get("https://en.wikipedia.org/wiki/Philippines")

    buttons = driver.find_elements(By.CSS_SELECTOR, "button")
    for button in buttons:
        # Only displayed buttons are described.
        if not button.is_displayed():
            continue

        button_link = button.get_attribute('href')
        button_size = button.size
        button_color = button.value_of_css_property("background-color")
        button_enabled = "is clickable" if button.is_enabled() else "not clickable"
        button_template = f"with Size: {button_size} with Color: {button_color} and the button {button_enabled}"

        # Identity comparison is the idiomatic test for None.
        if button_link is not None:
            store = f"Button Element: {button.text} with URL : {button_link} {button_template}"
        else:
            store = f"Button Element: '{button.text}' {button_template}"
        visible_buttons.append(store)
finally:
    driver.quit()

## BUTTON TEST ##
print_list(visible_buttons)


Button Element: 'Search' with Size: {'height': 32.0, 'width': 70.2833251953125} with Color: rgb(248, 249, 250) and the button is clickable
Button Element: 'hide' with Size: {'height': 22.40000915527344, 'width': 36.80000305175781} with Color: rgb(234, 236, 240) and the button is clickable
Button Element: 'Toggle History subsection' with Size: {'height': 22.0, 'width': 22.0} with Color: rgba(0, 0, 0, 0) and the button is clickable
Button Element: 'Toggle Geography subsection' with Size: {'height': 22.0, 'width': 22.0} with Color: rgba(0, 0, 0, 0) and the button is clickable
Button Element: 'Toggle Government and politics subsection' with Size: {'height': 22.0, 'width': 22.0} with Color: rgba(0, 0, 0, 0) and the button is clickable
Button Element: 'Toggle Economy subsection' with Size: {'height': 22.0, 'width': 22.0} with Color: rgba(0, 0, 0, 0) and the button is clickable
Button Element: 'Toggle Infrastructure subsection' with Size: {'height': 22.0, 'width': 22.0} with Color: rgba(0, 0,

In [None]:

driver = webdriver.Firefox()
visible_links = []

# try/finally so the browser is closed even if a lookup raises mid-scrape
try:
    driver.get("https://en.wikipedia.org/wiki/Philippines")

    links = driver.find_elements(By.TAG_NAME, "a")
    for link in links:
        # Only displayed links are described.
        if not link.is_displayed():
            continue

        link_url = link.get_attribute('href')
        # TODO : Add better conditioning if there is no url

        link_text = link.text
        # TODO : use link_text for determining if a link <a> has a name

        # Reuse the values fetched above instead of querying the element again.
        if link_url is not None:
            store = f"Link Element: {link_text} with URL : {link_url}"
        else:
            store = f"Link Element: {link_text} without URL"

        visible_links.append(store)
finally:
    driver.quit()

## LINKS TEST ##
print_list(visible_links)

# Test results:
# The run took 3min 11.1s, which is slow, but run time is not a major concern for now.

Link Element: Jump to content with URL : https://en.wikipedia.org/wiki/Philippines#bodyContent
Link Element:  with URL : https://en.wikipedia.org/wiki/Main_Page
Link Element: Donate with URL : https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
Link Element: Create account with URL : https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Philippines
Link Element: Log in with URL : https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Philippines
Link Element: (Top) with URL : https://en.wikipedia.org/wiki/Philippines#
Link Element: Etymology with URL : https://en.wikipedia.org/wiki/Philippines#Etymology
Link Element: History with URL : https://en.wikipedia.org/wiki/Philippines#History
Link Element: Geography with URL : https://en.wikipedia.org/wiki/Philippines#Geography
Link Element: Government and politics with URL : https://en.wikipedia.org/wiki/Philippines#Government_and_politics
Link Element:

In [21]:
driver = webdriver.Firefox()
visible_text = []

# try/finally so the browser is closed even if a lookup raises mid-scrape
try:
    driver.get("https://en.wikipedia.org/wiki/Philippines")

    # TODO : what if link element is inside an <a>

    # Headings are collected level by level, so all h1 entries precede all h2
    # entries, and so on; paragraphs are appended after every heading.
    for level in range(1, 7):  # HTML has 6 levels of headings (h1 to h6)
        headings = driver.find_elements(By.TAG_NAME, f"h{level}")
        for heading in headings:
            if heading.is_displayed():
                store = f"Heading Element (h{level}): {heading.text}"
                visible_text.append(store)
    paragraphs = driver.find_elements(By.TAG_NAME, "p")
    for paragraph in paragraphs:
        if paragraph.is_displayed():
            store = f"Paragraph Element: {paragraph.text}"
            visible_text.append(store)
finally:
    driver.quit()

print_list(visible_text)

Heading Element (h1): Philippines
Heading Element (h2): Contents
Heading Element (h2): Etymology
Heading Element (h2): History
Heading Element (h2): Geography
Heading Element (h2): Government and politics
Heading Element (h2): Economy
Heading Element (h2): Infrastructure
Heading Element (h2): Demographics
Heading Element (h2): Culture
Heading Element (h2): See also
Heading Element (h2): Notes
Heading Element (h2): References
Heading Element (h2): Further reading
Heading Element (h2): External links
Heading Element (h3): Prehistory (pre–900)
Heading Element (h3): Early states (900–1565)
Heading Element (h3): Spanish and American colonial rule (1565–1934)
Heading Element (h3): Commonwealth and World War II (1935–1946)
Heading Element (h3): Independence (1946–present)
Heading Element (h3): Biodiversity
Heading Element (h3): Climate
Heading Element (h3): Foreign relations
Heading Element (h3): Military
Heading Element (h3): Administrative divisions
Heading Element (h3): Science and technol

In [None]:
driver = webdriver.Firefox()
visible_text_inputs = []

# try/finally so the browser is closed even if a lookup raises mid-scrape
try:
    driver.get("https://en.wikipedia.org/wiki/Philippines")

    # TODO : Form Get Elements

    input_tags = driver.find_elements(By.TAG_NAME, "input")
    for input_tag in input_tags:
        # Only displayed inputs are described.
        if not input_tag.is_displayed():
            continue

        # Attributes are read into locals first: nesting the enclosing double
        # quote inside an f-string is a SyntaxError before Python 3.12.
        input_type = input_tag.get_attribute("type")
        input_name = input_tag.get_attribute("name")

        # TODO : Form Get Elements
        if input_type == "submit":
            store = f"Form Submit Button Element: {input_name}"
        else:
            store = f"Input Field Element: {input_name} and Input Field Type: {input_type}"
        visible_text_inputs.append(store)
finally:
    driver.quit()

print_list(visible_text_inputs)

Input Field ELement:  and Input field type: search
Input Field ELement:  and Input field type: radio
Input Field ELement:  and Input field type: radio
Input Field ELement:  and Input field type: radio


### V1 Variations
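- V1.a - Simple: only the link assigned to the button alongside the button name
- V1.b - Complex: adds the button's size, color, and whether it is clickable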