In [1]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------LIBRARIES-------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

#---------------------------------------------------------------------SELENIUM LIBRARIES--------------------------------------------------------------------
# !pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

#----------------------------------------------------------------------PYTHON LIBRARIES---------------------------------------------------------------------
import time
import random
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor

In [2]:
#-----------------------------------------------------------------JUPYTER NOTEBOOK SETTINGS-----------------------------------------------------------------
# Inject a CSS rule that forces elements with class "container" to the full page width.
from IPython.display import HTML, display

display(
    HTML("<style>.container { width:100% !important; }</style>")
)

In [144]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#-----------------------------------------------------------------------MAIN CODE---------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

#-------------------------------------------------------------------SELENIUM SETTINGS-----------------------------------------------------------------------
# Directory that Chrome is told to save downloads into.
rel_download_path = "downloaded_reports/"                                                         # specify the relative download directory path
abs_download_path = os.path.abspath(rel_download_path)                                            # convert to an absolute path for the download directory
print("Absolute Path for Downloaded Reports:", abs_download_path)

# Browser preferences passed to Chrome through the experimental "prefs" option.
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "plugins.plugins_disabled": ["Chrome PDF Viewer"],                                            # disable Chrome's PDF Viewer
    "download.default_directory": abs_download_path,
    # NOTE(review): "applications/pdf" looks like a misspelt MIME type ("application/pdf");
    # confirm what this preference expects before changing the value.
    "download.extensions_to_open": "applications/pdf",
    "download.prompt_for_download": False,                                                        # disable download prompt
    "plugins.always_open_pdf_externally": True,                                                   # automatically download PDFs instead of opening them
    "profile.default_content_settings.popups": 0,
    "profile.content_settings.exceptions.automatic_downloads.*.setting": 1
})

# Start exactly ONE Chrome session, already configured with the options above
# (Firefox is not good for this). Previously an unconfigured `webdriver.Chrome()`
# was launched first and then overwritten here, which left an orphaned browser
# window/process that was never quit.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)

#------------------------------------------------------------------WEBPAGE AUTOMATION-----------------------------------------------------------------------
webpage = "https://es.catapult.org.uk/reports/?posttypes=report"
driver.get(webpage)
# NOTE: implicitly_wait() is not a sleep. It configures, for the whole session, how long
# find_element()/find_elements() keep polling for a match before giving up.
driver.implicitly_wait(2)

# Find the "Allow cookies" button by its classes and click on it.
# The banner is treated as optional: if it is not on the page the run continues
# instead of aborting with NoSuchElementException.
try:
    allow_cookies_button = driver.find_element(
        By.XPATH,
        "//a[contains(@class, 'cc-btn') and contains(@class, 'cc-allow')]"
    )
except NoSuchElementException:
    print("'Allow cookies' button not found; continuing without clicking it.")
else:
    allow_cookies_button.click()
    print("Clicked on 'Allow cookies' button.")

# Initialize WebDriverWait instance with the driver and timeout
wait = WebDriverWait(driver, 5)

# Find all elements matching the CSS selector representing the page selection
element_class = "page-number.page-numbers"
number_element = wait.until(EC.presence_of_all_elements_located((
    By.CSS_SELECTOR,                                                                              # get all of the elements within the CSS selector
    f"li.{element_class} > a"))
)
number_of_pages = int(number_element[-1].text)                                                    # get the final page element and convert to integer
print(number_of_pages)

report_links_found = []                                                                           # create an empty list to store all the story links
driver.implicitly_wait(1)                                                                         # lookup timeout used while paging (set once, it is loop-invariant)
for page_number in range(1, number_of_pages + 1):
    print("Gathering story links from page:", page_number)

    url = f"https://es.catapult.org.uk/reports/page/{page_number}/?posttypes=report"              # go to specific page number based on loop iteration
    driver.get(url)

    # The element with class 'cta' carries the link to the big report center-left of the page.
    # A page without it is reported and skipped rather than stopping the whole crawl.
    try:
        link_big_story = driver.find_element(By.CLASS_NAME, "cta").get_attribute("href")
    except NoSuchElementException:
        link_big_story = None
        print("No featured report link found on page:", page_number)
    if link_big_story:                                                                            # never store a missing href
        report_links_found.append(link_big_story)

    # Elements with the class name "card_noBackground" hold the links to the small stories
    report_cards = driver.find_elements(By.CSS_SELECTOR, "div.card_noBackground")
    for card in report_cards:
        try:
            link_small_story = card.find_element(By.TAG_NAME, "a").get_attribute("href")          # assuming each card contains only one `a` tag inside it
        except (NoSuchElementException, StaleElementReferenceException):
            continue                                                                              # card without a usable link; Ctrl-C is no longer swallowed here
        if link_small_story:
            report_links_found.append(link_small_story)

print(report_links_found)

# # # # # # # # THE FOLLOWING CODE IS A VERY SPECIFIC AND not-good WAY OF ACCESSING SOME OF THE ELEMENTS
# Number of report pages to process. Kept at 2 to match the previous debugging limit;
# set to None to process every collected link.
MAX_REPORTS = 2

# Values typed into the download form; each key is the `id` of the input it is sent to.
form_fields = {
    "first_name": "Mike",
    "last_name": "Wazowski",
    "email": "mike.wazowski@monsterinc.com",
    "title": "Director of Comedy Resources",
    "company": "Monster Inc."
}

# User-defined text to be inserted into the textarea
user_text = "Nothing to write"


def fill_form_field(field_id, value):
    """Type `value` into the input whose id is `field_id`.

    If the page contains any iframe, the lookup is done inside the first one.
    Lookup and typing failures are printed and swallowed so that the remaining
    fields are still attempted. The driver is switched back to the top-level
    document on success AND on failure; previously a failed lookup left the
    driver inside the iframe, which broke every later lookup on the page.
    """
    switched_to_iframe = False
    try:
        # Check for iframe and switch if necessary
        # NOTE(review): this assumes the form lives in the FIRST iframe whenever any
        # iframe exists - confirm against the page structure.
        iframes = driver.find_elements(By.TAG_NAME, 'iframe')
        if iframes:
            print(f"Found {len(iframes)} iframes. Attempting to switch to the first iframe.")
            driver.switch_to.frame(iframes[0])
            switched_to_iframe = True

        # Ensure the element is in the viewport
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, field_id))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", element)

        # Wait for the element to be clickable
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, field_id))
        )

        # Clear any pre-existing value and fill in the new value
        element.clear()
        element.send_keys(value)
    except Exception as e:
        print(f"Could not fill the {field_id} field. Error: {str(e)}")
    finally:
        # Always return to the main content if we entered an iframe
        if switched_to_iframe:
            driver.switch_to.default_content()


def random_select_from_multiselect(select_element):
    """Click one randomly chosen option of `select_element` (a selenium `Select`).

    A leading option whose text is "--None--" is excluded; nothing happens when
    no selectable option remains.
    """
    options = select_element.options
    if options and options[0].text == "--None--":
        options = options[1:]
    if options:                                                                                   # ensure there are selectable options
        random.choice(options).click()


def fill_textarea(element_id, user_text):
    """Send `user_text` to the element whose id is `element_id`.

    Raises whatever `find_element` raises when the element is missing.
    """
    textarea = driver.find_element(By.ID, element_id)
    textarea.send_keys(user_text)


def select_random_option(select_element):
    """Click one randomly chosen option of `select_element` that has a non-empty `value`.

    Options with an empty value attribute (the "default" placeholder) are excluded.
    """
    options = [option for option in select_element.options if option.get_attribute("value")]
    if options:                                                                                   # ensure there are selectable options
        random.choice(options).click()


# NOTE(review): this loop submits placeholder identity data and clicks the reCAPTCHA
# checkbox automatically - confirm this is permitted by the site's terms of use.
try:
    driver.implicitly_wait(5)                                                                     # element-lookup timeout for the report pages (session-wide setting)

    for report_link in report_links_found[:MAX_REPORTS]:
        # Go to each report link found previously
        driver.get(report_link)

        try:
            # Gate: only continue when the page has a 'textContainer' div with a 'Read the Report' h3
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((
                    By.XPATH, "//div[@class='page-news-single']//div[@class='textContainer'][.//h3[contains(text(), 'Read the Report')]]"
                ))
            )

            # The <summary> element under the same 'page-news-single' parent opens the download form
            download_summary = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((
                    By.XPATH, "//div[@class='page-news-single']//summary[contains(@class, 'foreground-color_white')]"
                ))
            )
            download_summary.click()
            print("Clicked the download button.")

            # Loop through each form field and attempt to fill it
            for field_id, value in form_fields.items():
                fill_form_field(field_id, value)
                time.sleep(1)                                                                     # adding a slight delay to mimic human interaction

            # Optional multi-selects: if the first one is missing the second is not attempted
            try:
                themes_select = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.ID, "00N4I00000FBoZC")))
                random_select_from_multiselect(Select(themes_select))

                tools_labs_select = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.ID, "00N4I00000FBoZD")))
                random_select_from_multiselect(Select(tools_labs_select))
            except Exception as e:
                print(f"Skipped the multi-select fields ({type(e).__name__}).")

            # Optional free-text field
            try:
                fill_textarea("description", user_text)
            except Exception as e:
                print(f"Skipped the 'description' textarea ({type(e).__name__}).")

            # Optional first select input
            try:
                select_random_option(Select(driver.find_element(By.ID, "00N5800000Bh8yj")))
            except Exception as e:
                print(f"Skipped the '00N5800000Bh8yj' select ({type(e).__name__}).")

            # Optional second select input
            try:
                select_random_option(Select(driver.find_element(By.ID, "lead_source")))
            except Exception as e:
                print(f"Skipped the 'lead_source' select ({type(e).__name__}).")

            # Wait up to 3 seconds for the iframe with title 'reCAPTCHA' to be available and switch to it
            WebDriverWait(driver, 3).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[@title='reCAPTCHA']")))

            # Find the element with the specified ID and click on it
            # NOTE(review): the click may open an interactive challenge, which is not handled here.
            check = driver.find_element(By.ID, "recaptcha-anchor")
            check.click()

            # Switch back to the default content
            driver.switch_to.default_content()

            # Find the submit button by its class name and click on it
            submit_button = driver.find_element(By.CLASS_NAME, "contactSubmitBtn")
            submit_button.click()

            print("Form submitted.")

        except Exception as e:
            # A failure on one report is reported and the loop moves on to the next link
            print("Error:", e)
finally:
    # Close the driver even when the run is interrupted (e.g. KeyboardInterrupt)
    driver.quit()

Clicked on 'Allow cookies' button.
15
['https://es.catapult.org.uk/report/innovating-beyond-retail-reaching-net-zero-consumer-energy/', 'https://es.catapult.org.uk/report/electric-heavy-goods-vehicles-industry-perspectives-on-the-prospects-for-their-adoption-in-the-uk/', 'https://es.catapult.org.uk/report/alternative-energy-markets-innovation-portfolio-report/', 'https://es.catapult.org.uk/report/making-energy-performance-certificates-work-for-net-zero/', 'https://es.catapult.org.uk/report/starting-a-hydrogen-dialogue/', 'https://es.catapult.org.uk/report/a-sectoral-review-of-indias-energy-transition/', 'https://es.catapult.org.uk/report/assessment-of-locational-wholesale-electricity-market-design-options-in-gb/', 'https://es.catapult.org.uk/report/future-energy-grids-for-wales/', 'https://es.catapult.org.uk/report/heat-the-streets-project-findings/', 'https://es.catapult.org.uk/report/data-for-good-smart-meter-data-access/', 'https://es.catapult.org.uk/report/improving-the-business-ca

Error: Message: 
Stacktrace:
0   chromedriver                        0x0000000100bdc524 chromedriver + 3966244
1   chromedriver                        0x0000000100bd4ab0 chromedriver + 3934896
2   chromedriver                        0x0000000100857da0 chromedriver + 277920
3   chromedriver                        0x000000010089a394 chromedriver + 549780
4   chromedriver                        0x00000001008d2bf0 chromedriver + 781296
5   chromedriver                        0x000000010088efb0 chromedriver + 503728
6   chromedriver                        0x000000010088fa28 chromedriver + 506408
7   chromedriver                        0x0000000100ba170c chromedriver + 3725068
8   chromedriver                        0x0000000100ba5c00 chromedriver + 3742720
9   chromedriver                        0x0000000100b8a1f4 chromedriver + 3629556
10  chromedriver                        0x0000000100ba66fc chromedriver + 3745532
11  chromedriver                        0x0000000100b7d56c chromedriver + 

KeyboardInterrupt: 