In [1]:
import datetime as dt
import pandas as pd
import re
import os
from pathlib import Path
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.remote.errorhandler import NoSuchElementException
from seleniumwire import webdriver
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager

from config import ConfigReader

In [2]:
# Read the Ariba scraper settings through the project's ConfigReader.
configReader = ConfigReader()
# Directory that wait_for_download() polls for new entries; wrapped in Path so it can be iterated.
# NOTE(review): presumably Chrome must be configured to save downloads here — confirm.
DOWNLOAD_DIRECTORY = Path(configReader.load_key('scrapers')['ariba']['CONFIG_DOWNLOAD_DIRECTORY'])
# Base directory main_loop() uses when building its data/ paths.
REPO_DIRECTORY = configReader.get_working_directory()
# URL the driver is pointed at before scanning for postings and after each posting is handled.
CONFIG_TORONTO_ARIBA_URL = configReader.load_key('scrapers')['ariba']['CONFIG_TORONTO_ARIBA_URL']
# Seconds the top-level retry loop sleeps after main_loop() raises.
CONFIG_SCRAPING_RETRY_INTERVAL_SEC = configReader.load_key('scrapers')['ariba']['CONFIG_SCRAPING_RETRY_INTERVAL_SEC']
# NOTE(review): not referenced by any of the visible cells — confirm whether the HTML
# snapshots written by main_loop() were meant to go here.
CONFIG_OUTPUT_HTML_DIRECTORY = configReader.load_key('scrapers')['ariba']['CONFIG_OUTPUT_HTML_DIRECTORY']

# Start a selenium-wire Chrome session; webdriver_manager supplies the chromedriver binary path.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

[WDM] - Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7.05M/7.05M [00:00<00:00, 82.5MB/s]


In [3]:
# Load the Ariba listing page that main_loop() scans for postings.
driver.get(CONFIG_TORONTO_ARIBA_URL)

In [4]:
# Keys (first 100 characters of the title) of postings that must not be opened again.
clicked = set()

# Seed the set from a previous run's summary, when one is present in the working directory.
summary_csv = Path('data_summary.csv')
if summary_csv.exists():
    df = pd.read_csv(summary_csv)
    clicked.update(
        # Drop the '...' truncation marker before taking the 100-character key.
        known_title.replace('...', '')[0:100]
        for known_title in df.title.values
        # NaN is the only value that differs from itself, so this skips missing titles.
        if known_title == known_title
    )

In [5]:
# Module-level flag. NOTE(review): main_loop() has its own `has_clicked` parameter
# (default False) and the visible driver loop calls it without arguments, so nothing
# shown in this notebook reads this global — confirm whether it can be removed.
has_clicked = False

In [6]:
def patiently_click(driver, button, wait_after=0, timeout=60):
    """Wait until an element is clickable, click it, then optionally pause.

    Args:
        driver: Selenium WebDriver controlling the browser.
        button: XPath expression locating the element to click.
        wait_after: Seconds to sleep after the click; values <= 0 skip the pause.
        timeout: Maximum number of seconds to wait for the element to become
            clickable (defaults to the previously hard-coded 60).

    Raises:
        Whatever ``WebDriverWait.until`` raises when the element does not become
        clickable within ``timeout`` (Selenium's TimeoutException).
    """
    # until() hands back the element the condition located, so click that one
    # directly instead of looking it up a second time (the page could change
    # between the wait and a fresh find_element call).
    element = WebDriverWait(driver, timeout=timeout).until(
        EC.element_to_be_clickable((By.XPATH, button))
    )
    element.click()
    if wait_after > 0:
        sleep(wait_after)

In [7]:
def wait_for_download(command, max_wait=1200, poll_interval=15):
    """Run ``command`` and wait for a finished new file in DOWNLOAD_DIRECTORY.

    The directory is snapshotted before ``command`` runs and then polled until
    it contains a file name that was not in the snapshot. Files that still
    carry an in-progress download suffix are ignored, so the function does not
    report success while the browser is still writing the file.

    Args:
        command: Zero-argument callable that triggers the download.
        max_wait: Give up once the accumulated wait exceeds this many seconds.
        poll_interval: Seconds to sleep between directory checks (defaults to
            the previously hard-coded 15).

    Returns:
        True when a new finished file appeared, False when ``max_wait`` was
        exceeded first.
    """
    # Chrome writes to "<name>.crdownload" until a download completes.
    # NOTE(review): '.tmp' is included as a precaution for other temp files — confirm.
    in_progress_suffixes = ('.crdownload', '.tmp')

    def finished_names():
        return {
            entry.name
            for entry in DOWNLOAD_DIRECTORY.iterdir()
            if not entry.name.endswith(in_progress_suffixes)
        }

    names_before = finished_names()
    command()
    total_wait = 0
    # Compare names rather than counts so a temp file appearing (or an old
    # file disappearing) is not mistaken for a completed download.
    while not (finished_names() - names_before):
        sleep(poll_interval)
        total_wait += poll_interval
        if total_wait > max_wait:
            return False
    return True

In [8]:
def patiently_find_regex(driver, regex, max_attempts=30, wait_sec=15):
    """Poll ``driver.page_source`` until ``regex`` matches, and return the first match.

    Each attempt sleeps ``wait_sec`` seconds and then searches the current page
    source, so the first search happens only after one pause.
    NOTE(review): the pause-before-read order is kept from the original,
    presumably so a navigation started by the caller can finish — confirm.

    Args:
        driver: Object exposing the current page HTML as ``page_source``.
        regex: Pattern passed to ``re.findall``.
        max_attempts: Maximum number of sleep-then-search rounds (defaults to
            the previously hard-coded 30).
        wait_sec: Seconds to sleep before each search (defaults to the
            previously hard-coded 15).

    Returns:
        The first element of ``re.findall(regex, page_source)`` from the first
        round that produced any match, or None if every round came up empty.
    """
    for _ in range(max_attempts):
        sleep(wait_sec)
        matches = re.findall(regex, driver.page_source)
        if matches:
            return matches[0]
    return None

In [9]:
def count_directory_files(root: Path):
    """Return how many entries sit directly inside ``root``; 0 if ``root`` does not exist.

    Only the top level is counted (sub-directories count as one entry each and
    are not descended into).
    """
    return sum(1 for _ in root.iterdir()) if root.exists() else 0

In [10]:
def _archive_posting(document_id, request_expired):
    """Save the currently open posting's HTML and, if it is still open, download its attachments."""
    # Build every path from <REPO_DIRECTORY>/data. The previous os.path.join calls
    # passed segments starting with '/', which makes os.path.join discard
    # REPO_DIRECTORY, and one call passed a set ({REPO_DIRECTORY}), which raises
    # TypeError.
    data_dir = Path(REPO_DIRECTORY) / 'data'
    posting_dir = data_dir / document_id
    html_path = data_dir / f'{document_id}.html'

    html_exists = html_path.exists() or (posting_dir / f'{document_id}.html').exists()
    # Attachments count as present when there is a zip, or when the posting's
    # own directory holds more than one entry.
    zip_exists = (data_dir / f'{document_id}.zip').exists() or count_directory_files(posting_dir) > 1

    print('\tHTML exists' if html_exists else '\tHTML does not exist')

    if zip_exists:
        print('\tZip exists')
    elif not request_expired:
        print('\tZip does not exist')
    else:
        print('\tZip does not exist, but RFP is expired')

    if not html_exists:
        data_dir.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: page text can contain characters (e.g. typographic
        # quotes) that the platform default encoding may not be able to write.
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)

    if (not zip_exists) and (not request_expired):
        patiently_click(driver, '//*[@id="_hfdr9c"]')  # respond to posting
        patiently_click(driver, '//*[@id="_xjqay"]')  # download content
        patiently_click(driver, '//*[@id="_hgesab"]', wait_after=15)  # click download attachment
        patiently_click(driver, '//*[@id="_h_l$m"]/span/div/label', wait_after=5)  # click select all
        downloaded = wait_for_download(
            lambda: patiently_click(driver, '//*[@id="_5wq_j"]')
        )  # download attachments (for real)
        if not downloaded:
            print('\tNo new download appeared before the timeout')


def main_loop(has_clicked=False):
    """Open the first not-yet-visited posting on the listing, archive it, and return.

    Scans the result rows on the current page (classes 'ADTableBodyWhite' and
    'ADHiliteBlock'). Rows whose title key (first 100 characters) is already in
    ``clicked`` are skipped. The first remaining row is opened, its document id
    is read from the page source, its HTML/attachments are archived, and the
    driver is sent back to the listing URL. If every row on the page was
    already visited, the element with id 'next' is clicked and the scan repeats.

    Args:
        has_clicked: Loop guard; when True the function returns immediately.

    Returns:
        None.
    """
    while not has_clicked:
        elements = driver.find_elements(By.CLASS_NAME, 'ADTableBodyWhite')
        elements += driver.find_elements(By.CLASS_NAME, 'ADHiliteBlock')
        for element in elements:
            try:
                title = element.find_element(By.CLASS_NAME, 'QuoteSearchResultTitle')
            except NoSuchElementException:
                # Row without a title link: nothing to open.
                continue
            title_text = title.text
            title_key = title_text[0:100]
            if title_key in clicked:
                continue

            print(f'Accessing {title_text}')
            try:
                date = element.find_elements(By.CLASS_NAME, 'paddingRight5')[2].text
                # date[:-4] drops the trailing timezone abbreviation (e.g. ' PDT').
                # NOTE(review): the comparison is against naive local time, so it
                # is only exact when the machine runs in the listing's timezone.
                request_expired = dt.datetime.strptime(date[:-4], '%d %b %Y %I:%M %p') < dt.datetime.now()
                print(f'\tdate: {date}')
            except IndexError:
                request_expired = True
                print('\tNo date found')
            except ValueError:
                # Unparseable date text is treated like a missing date instead of
                # aborting before the posting is recorded as visited.
                request_expired = True
                print(f'\tCould not parse date: {date!r}')

            clicked.add(title_key)
            title.click()
            has_clicked = True

            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            document_id = patiently_find_regex(driver, r'(Doc\d{10})')
            print(f'\tDocument id is {document_id}')

            if document_id is None:
                # Without an id there is no file name to check or write.
                print('\tNo document id found, skipping')
            else:
                _archive_posting(document_id, request_expired)

            driver.get(CONFIG_TORONTO_ARIBA_URL)
            sleep(2)
            # The listing was reloaded, so the remaining row elements are no
            # longer usable; handle one posting per call.
            break
        if not has_clicked:
            patiently_click(driver, '//*[@id="next"]', wait_after=5)

In [None]:
# Keep scraping until the cell is interrupted: each main_loop() call handles one
# posting; on failure, wait and reload the listing before trying again.
while True:
    try:
        main_loop()
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt (kernel interrupt) can still stop the cell, and
        # report the failure instead of discarding it silently.
        print(f'main_loop failed: {exc!r}; retrying in {CONFIG_SCRAPING_RETRY_INTERVAL_SEC} s')
        sleep(CONFIG_SCRAPING_RETRY_INTERVAL_SEC)
        driver.get(CONFIG_TORONTO_ARIBA_URL)

Accessing RFT for  Ashbridges Bay Treatment Plant Boiler Demolition and Installation Contract
	date: 9 May 2022 8:59 PM PDT
	Document id is Doc3374301730
	HTML exists
	Zip does not exist, but RFP is expired
Accessing RFP for Actuarial Valuation for the Toronto Fire Department Superannuation and Benefit  Fund
	date: 29 Jul 2022 9:03 AM PDT
	Document id is Doc3313053012
	HTML exists
	Zip does not exist, but RFP is expired
Accessing Doc3528836035 -  RFQ - Thermal Imaging Cameras and Related Accessories for the City of Toronto’s Fire Services Division
	date: 20 Jul 2022 8:59 PM PDT
	Document id is Doc3528836035
	HTML exists
	Zip does not exist, but RFP is expired
Accessing RFP for Compactor Replacement at  Victoria Park, Dufferin, Commissioners and Ingram Transfer Stations
	date: 10 Mar 2023 9:00 AM PST
	Document id is Doc3800872667
	HTML exists
	Zip exists
Accessing N/A
	No date found
	Document id is Doc3385356228
	HTML exists
	Zip does not exist, but RFP is expired
Accessing Request for 