In [2]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------LIBRARIES-------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

#---------------------------------------------------------------------SELENIUM LIBRARIES--------------------------------------------------------------------
# !pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

#----------------------------------------------------------------------PYTHON LIBRARIES---------------------------------------------------------------------
import time
import random
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor

In [3]:
#-----------------------------------------------------------------JUPYTER NOTEBOOK SETTINGS-----------------------------------------------------------------
# Inject a CSS override so the notebook's `.container` element spans the full browser width.
from IPython.display import HTML, display

full_width_style = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_style))

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
#-----------------------------------------------------------------------MAIN CODE---------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

#-------------------------------------------------------------------SELENIUM SETTINGS-----------------------------------------------------------------------
# NOTE: the browser is started exactly once, at the end of this section, after the download
# preferences have been assembled. Starting a default `webdriver.Chrome()` up front and then
# rebinding `driver` would leave an extra browser/driver process running that is never quit.

rel_download_path = "downloaded_reports/"                                                         # specify the relative download directory path
abs_download_path = os.path.abspath(rel_download_path)                                            # convert to an absolute path for the download directory
os.makedirs(abs_download_path, exist_ok=True)                                                     # make sure the directory exists before Chrome / the archiver use it
print("Absolute Path for Downloaded Reports:", abs_download_path)

chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "plugins.plugins_disabled": ["Chrome PDF Viewer"],                                            # disable Chrome's PDF Viewer
    "download.default_directory": abs_download_path,                                              # save downloads into the directory prepared above
    "download.extensions_to_open": "applications/pdf",
    "download.prompt_for_download": False,                                                        # disable download prompt
    "plugins.always_open_pdf_externally": True,                                                   # automatically download PDFs instead of opening them
    "profile.default_content_settings.popups": 0,
    "profile.content_settings.exceptions.automatic_downloads.*.setting": 1
})

# Start the single Chrome instance used by the rest of the notebook, with the options above applied
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)

#------------------------------------------------------------------WEBPAGE AUTOMATION-----------------------------------------------------------------------
webpage = "https://es.catapult.org.uk/reports/?posttypes=report"
driver.get(webpage)
# implicitly_wait() does not pause the script: it sets how long element look-ups keep
# polling for a match before giving up (here up to 2 seconds).
driver.implicitly_wait(2)

# Find the "Allow cookies" button by its class and click on it.
# The banner may be absent; that must not abort the run.
try:
    allow_cookies_button = driver.find_element(
        By.XPATH,
        "//a[contains(@class, 'cc-btn') and contains(@class, 'cc-allow')]"
    )
    allow_cookies_button.click()
    print("Clicked on 'Allow cookies' button.")
except NoSuchElementException:
    print("No 'Allow cookies' button found; continuing without it.")

# Initialize WebDriverWait instance with the driver and timeout
wait = WebDriverWait(driver, 5)

# Find all elements matching the CSS selector representing the page selection
element_class = "page-number.page-numbers"
number_element = wait.until(EC.presence_of_all_elements_located((
    By.CSS_SELECTOR,                                                                              # get all of the elements within the CSS selector
    f"li.{element_class} > a"))
)
number_of_pages = int(number_element[-1].text)                                                    # get the final page element and convert to integer
print(number_of_pages)

report_links_found = []                                                                           # create an empty list to store all the story links
driver.implicitly_wait(1)                                                                         # look-up timeout for the listing pages; set once, it is loop-invariant
for page_number in range(1, number_of_pages + 1):
    print("Gathering story links from page:", page_number)

    url = f"https://es.catapult.org.uk/reports/page/{page_number}/?posttypes=report"              # go to specific page number based on loop iteration
    driver.get(url)

    # Find all elements with the class name "card_noBackground"
    # Which represent the links to the reports for the small stories
    report_element = "div.card_noBackground"
    report_cards = driver.find_elements(By.CSS_SELECTOR, report_element)

    # Find the element with the class 'cta' and get its 'href' attribute
    # Which represents the link to the big report center-left of the page.
    # A page without such an element is skipped for this link only.
    try:
        link_big_story = driver.find_element(By.CLASS_NAME, "cta").get_attribute("href")
        report_links_found.append(link_big_story)
    except NoSuchElementException:
        print("No featured report link found on page:", page_number)

    # Save all the links from the page
    for card in report_cards:
        try:
            link_small_story = card.find_element(By.TAG_NAME, "a").get_attribute("href")          # assuming each card contains only one `a` tag inside it
            report_links_found.append(link_small_story)
        except (NoSuchElementException, StaleElementReferenceException):
            continue                                                                              # card without a usable link: skip it

# Drop missing hrefs (None/empty) and repeated links while keeping first-seen order,
# so the download step never navigates to None or fetches the same report twice.
report_links_found = list(dict.fromkeys(link for link in report_links_found if link))

print(report_links_found)

#---------------------------------------------------------------------DOWNLOAD PDFS-------------------------------------------------------------------------
pages_no_pdfreport = []                                                                           # create an empty list to store all the URLs with no PDFs
driver.implicitly_wait(1)                                                                         # element look-up timeout (not a pause); set once for the whole loop

for report_link in report_links_found:                                                            # go to each report link found previously
    try:
        driver.get(report_link)
        download_links = []

        # Method 1: the download URL is embedded in the id of the 'report-form' element
        # found inside the page's 'details-dialog' ("report-form-<url>").
        try:
            details_dialog = WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((
                    By.CSS_SELECTOR,
                    "section.pageCTA .details-dialog")
                )
            )
            form = details_dialog.find_element(By.CLASS_NAME, "report-form")
            form_id = form.get_attribute("id") or ""
            # Keep everything after the first "report-form-"; ignore ids without that prefix
            # instead of trying to navigate to something that is not a URL.
            _, prefix_found, embedded_link = form_id.partition("report-form-")
            if prefix_found and embedded_link:
                download_links.append(embedded_link)
        except (TimeoutException, NoSuchElementException):
            pass                                                                                  # dialog or form missing: fall through to method 2

        # Method 2: plain "Download" buttons on the page.
        if not download_links:
            button_elements = driver.find_elements(
                By.XPATH,
                "//a[contains(@class, 'button') and contains(text(), 'Download')]"
            )
            # Read every href BEFORE navigating: navigating can invalidate the element
            # references, and buttons without an href cannot be followed.
            button_hrefs = [button.get_attribute("href") for button in button_elements]
            download_links = [href for href in button_hrefs if href]

        if not download_links:                                                                    # if nothing was found after both attempts
            pages_no_pdfreport.append(driver.current_url)                                         # save the current page URL
            continue

        for download_link in download_links:                                                     # navigating to the link triggers the download
            driver.get(download_link)

    except Exception as e:                                                                        # this catch-all exception is for other unexpected errors
        print(f"Unexpected error while processing {report_link}: {e!r}")
        pages_no_pdfreport.append(report_link)                                                    # record the report page itself; current_url may be unavailable here
        
#-----------------------------------------------------------------CLOSE SELENIUM DRIVER---------------------------------------------------------------------
# Give the last download(s) a moment to start, then wait (bounded) until Chrome has no
# partial download left in the download directory before quitting. A fixed sleep alone can
# cut off a large file that is still being written.
download_grace_seconds = 5                                                                        # time allowed for the final download(s) to start
download_timeout_seconds = 60                                                                     # upper bound on the extra wait, so a stale partial file cannot hang the run

time.sleep(download_grace_seconds)
download_deadline = time.monotonic() + download_timeout_seconds
while time.monotonic() < download_deadline:
    if not os.path.isdir(abs_download_path):
        break                                                                                     # nothing was ever downloaded into the directory
    # Chrome keeps in-progress downloads as '*.crdownload' files until they complete
    if not any(name.endswith(".crdownload") for name in os.listdir(abs_download_path)):
        break
    time.sleep(1)

driver.quit()                                                                                     # quit the chrome driver

#---------------------------------------------------------------HANDLE URLS WITH NO PDFS--------------------------------------------------------------------
if pages_no_pdfreport:
    print("Failed to find PDFs on the following pages:", pages_no_pdfreport)                      # inform user of the URLs with no PDFs

    log_path = 'website_log/pages_no_pdfreport.txt'
    os.makedirs(os.path.dirname(log_path), exist_ok=True)                                         # open() does not create missing parent directories
    with open(log_path, 'w', encoding="utf-8") as file:                                           # create (or overwrite) the txt file
        file.writelines(page + "\n" for page in pages_no_pdfreport)                               # write one URL per line
    print("The URLs for pages with no PDF reports have been saved to 'pages_no_pdfreport.txt'.")  # inform the user that the operation has been executed

Absolute Path for Downloaded Reports: /Users/ciprianifrim/_projects/esc_report_scraper/downloaded_reports
Clicked on 'Allow cookies' button.
15
Gathering story links from page: 1
Gathering story links from page: 2
Gathering story links from page: 3
Gathering story links from page: 4
Gathering story links from page: 5
Gathering story links from page: 6
Gathering story links from page: 7
Gathering story links from page: 8
Gathering story links from page: 9
Gathering story links from page: 10
Gathering story links from page: 11
Gathering story links from page: 12
Gathering story links from page: 13
Gathering story links from page: 14
Gathering story links from page: 15
['https://es.catapult.org.uk/report/innovating-beyond-retail-reaching-net-zero-consumer-energy/', 'https://es.catapult.org.uk/report/electric-heavy-goods-vehicles-industry-perspectives-on-the-prospects-for-their-adoption-in-the-uk/', 'https://es.catapult.org.uk/report/alternative-energy-markets-innovation-portfolio-report/'

In [None]:
#-----------------------------------------------------------------ARCHIVING THE REPORTS---------------------------------------------------------------------
def prepare_file_info(directory_path):
    """
    Collect the regular files located directly inside ``directory_path``.

    Sub-directories are skipped and the directory is not walked recursively.
    The function performs a plain, sequential directory listing.

    Parameters
    ----------
    directory_path : str
        Directory whose immediate files should be listed.

    Returns
    -------
    list of (str, str)
        ``(file_path, arcname)`` pairs sorted by file name. ``file_path`` is
        ``directory_path`` joined with the file name and ``arcname`` is the bare
        file name (the name the file gets inside the archive).

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the directory cannot be read.
    """
    files = []                                                                                    # (file_path, arcname) pairs to be archived
    # os.listdir() returns entries in arbitrary order; sorting keeps the archive layout reproducible
    for name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, name)
        if os.path.isfile(file_path):                                                             # skip sub-directories and other non-file entries
            files.append((file_path, name))                                                       # the entry name is already the basename of file_path
    return files

def zip_directory_contents(directory_path, output_zipfile):
    """
    Write every file found directly inside ``directory_path`` into a zip archive.

    The file list comes from ``prepare_file_info`` and the files are added one
    after another with DEFLATE compression at level 9. Each file is stored
    under its bare file name.

    Parameters
    ----------
    directory_path : str
        Directory whose immediate files are archived.
    output_zipfile : str
        Path of the archive to create; an existing file is overwritten and
        missing parent directories are created.

    Returns
    -------
    None
        When ``directory_path`` is not an existing directory a message is
        printed and nothing is written.
    """

    # Ensure the directory exists
    if not os.path.isdir(directory_path):
        print("Directory does not exist:", directory_path)
        return

    # A direct call: submitting a single job to a thread pool and immediately waiting for
    # its result gives no concurrency, so no executor is used here.
    files_to_zip = prepare_file_info(directory_path)

    # ZipFile does not create missing parent directories for the archive
    archive_path = os.path.abspath(output_zipfile)
    os.makedirs(os.path.dirname(archive_path), exist_ok=True)

    # Sequentially add files to the zip archive
    with zipfile.ZipFile(output_zipfile, 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as myzip:
        for file_path, arcname in files_to_zip:
            if os.path.abspath(file_path) == archive_path:
                continue                                                                          # never add the archive (from a previous run) to itself
            myzip.write(file_path, arcname)

    print(f"All files in {directory_path} have been zipped into {output_zipfile}")                # inform the user of the completed operation

# Execute the function
rel_output_path = "scraped_reports_archived/"                                                    # set the relative output directory
output_filename = "esc_scraped_reports.zip"                                                      # set the output filename and extension
abs_output_path = os.path.abspath(os.path.join(rel_output_path, output_filename))                # build the path with os.path.join, then make it absolute
os.makedirs(os.path.dirname(abs_output_path), exist_ok=True)                                     # the archive cannot be created inside a missing directory
zip_directory_contents(abs_download_path, abs_output_path)                                       # archive everything in the download directory