In [1]:
import datetime as dt
import pandas as pd
import re
from pathlib import Path
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from seleniumwire import webdriver
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager

# Directory that wait_for_download() polls for newly created files.
# NOTE(review): presumably the browser's default download folder — confirm it matches the Chrome profile in use.
DOWNLOAD_DIRECTORY = Path('/Users/alex/Downloads')
# Root of the scrapers checkout; the crawl loop reads and writes files under its data/ subdirectory.
# NOTE(review): both paths are machine-specific absolute paths; adjust before running elsewhere.
REPO_DIRECTORY = '/Users/alex/repos/toronto-bids/scrapers'

In [2]:
# Start a Chrome session through selenium-wire's webdriver, using the chromedriver
# binary that webdriver_manager installs (and downloads when needed, as the output below shows).
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

[WDM] - Downloading: 100%|██████████| 8.72M/8.72M [00:00<00:00, 21.4MB/s]


In [3]:
# Open the Ariba Discovery profile page; the crawl loop returns to this same URL after each posting.
driver.get("https://service.ariba.com/Discovery.aw/ad/profile?key=AN01050912625#b0")

In [4]:
# Load the summary CSV (path is relative to the kernel's working directory);
# its `title` column seeds the set of postings to skip.
df = pd.read_csv('data_summary.csv')

In [5]:
# Keys of postings the crawl loop should skip: each title from the summary CSV
# with any '...' removed, truncated to its first 100 characters.
clicked = {title.replace('...', '')[0:100] for title in df.title.values}

In [6]:
# Flag shared with the crawl loop: set to True once a posting has been opened
# during the current pass over the listing, and reset to False afterwards.
has_clicked = False

In [7]:
def patiently_click(driver, button, wait_after=0):
    """Wait for the element at XPath ``button`` to become clickable, click it, then optionally pause.

    Args:
        driver: Selenium WebDriver instance to operate on.
        button: XPath expression locating the element to click.
        wait_after: Seconds to sleep after the click; values <= 0 skip the pause.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            become clickable within 60 seconds.
    """
    # until() returns the condition's result, which for element_to_be_clickable is
    # the element itself. Clicking that object avoids a second DOM lookup, during
    # which the page could have changed between the wait and the click.
    element = WebDriverWait(driver, timeout=60).until(EC.element_to_be_clickable((By.XPATH, button)))
    element.click()
    if wait_after > 0:
        sleep(wait_after)

In [8]:
def wait_for_download(command, max_wait=600, directory=None, poll_interval=15):
    """Run ``command`` and wait until it has produced a finished download.

    The directory listing is captured before ``command`` runs. The download is
    considered finished once at least one new, non-hidden entry exists and none
    of the new entries is a partial download. Comparing names instead of entry
    counts matters because the browser creates a temporary file (Chrome uses a
    ``.crdownload`` suffix) as soon as the transfer starts; a bare count check
    would report success before the file is complete.

    Args:
        command: Zero-argument callable that triggers the download.
        max_wait: Maximum number of seconds to keep polling.
        directory: Directory to watch; defaults to ``DOWNLOAD_DIRECTORY``.
        poll_interval: Seconds to sleep between directory checks.

    Returns:
        True if a finished download appeared, False if ``max_wait`` elapsed first.
    """
    watched = DOWNLOAD_DIRECTORY if directory is None else Path(directory)
    # Suffixes browsers use for in-progress downloads.
    partial_suffixes = ('.crdownload', '.part', '.tmp')
    names_before = {entry.name for entry in watched.iterdir()}
    command()

    def download_finished():
        new_names = {entry.name for entry in watched.iterdir()} - names_before
        # Hidden entries (e.g. OS metadata or browser scratch files) are neither
        # evidence of a finished download nor a reason to keep waiting.
        visible = [name for name in new_names if not name.startswith('.')]
        in_progress = [name for name in visible if name.endswith(partial_suffixes)]
        return bool(visible) and not in_progress

    total_wait = 0
    while not download_finished():
        if total_wait >= max_wait:
            return False
        sleep(poll_interval)
        total_wait += poll_interval
    return True

In [9]:
def patiently_find_regex(driver, regex, max_attempts=30, poll_interval=15):
    """Poll the driver's page source until ``regex`` matches, and return the first match.

    Each attempt sleeps ``poll_interval`` seconds *before* reading the page
    source, so the page gets time to change before the first check as well.

    Args:
        driver: Object exposing a ``page_source`` string (a Selenium WebDriver).
        regex: Pattern handed to ``re.findall``.
        max_attempts: Number of sleep-then-search attempts before giving up.
        poll_interval: Seconds to sleep before each attempt.

    Returns:
        The first element of ``re.findall``'s result (a string, or a tuple when
        the pattern has several groups), or None if no attempt matched.
    """
    for _ in range(max_attempts):
        sleep(poll_interval)
        results = re.findall(regex, driver.page_source)
        if results:
            return results[0]
    return None

In [11]:
# Crawl the listing: on each pass, open the first posting that is neither in
# `clicked` nor expired, save its HTML and attachments if they are not already
# on disk, then reload the listing. When no row on the current page qualifies,
# move to the next page. The loop has no exit condition of its own; it runs
# until interrupted or until patiently_click raises (e.g. no clickable "next").
while True:
    while not has_clicked:
        elements = driver.find_elements(By.CLASS_NAME, 'ADTableBodyWhite')
        for element in elements:
            title = element.find_element(By.CLASS_NAME, 'QuoteSearchResultTitle')
            title_text = title.text
            # NOTE(review): the keys seeded from the CSV have '...' stripped before
            # truncation, while this lookup uses the raw text — confirm they line up.
            if title_text[0:100] in clicked:
                continue

            date = element.find_elements(By.CLASS_NAME, 'paddingRight5')[2].text
            # The last four characters are dropped before parsing — presumably a
            # space plus a three-letter time-zone abbreviation; TODO confirm.
            request_expired = dt.datetime.strptime(date[:-4], '%d %b %Y %I:%M %p') < dt.datetime.now()
            if request_expired:
                continue

            print(f'Accessing {title_text}')
            print(f'\tdate: {date}')

            clicked.add(title_text[0:100])
            title.click()
            has_clicked = True

            # Raw string: '\d' in a plain string literal is an invalid escape sequence.
            document_id = patiently_find_regex(driver, r'(Doc\d{10})')
            print(f'\tDocument id is {document_id}')

            if document_id is None:
                # Without an id there is no sensible file name; do not create "None.html".
                print('\tNo document id found; skipping this posting')
            else:
                data_directory = Path(f'{REPO_DIRECTORY}/data')
                html_path = data_directory / f'{document_id}.html'
                html_exists = html_path.exists() or (
                    data_directory / document_id / f'{document_id}.html'
                ).exists()
                zip_exists = (data_directory / f'{document_id}.zip').exists() or (
                    data_directory / document_id
                ).exists()

                print('\tZip exists' if zip_exists else '\tZip does not exist')
                print('\tHTML exists' if html_exists else '\tHTML does not exist')

                if not html_exists:
                    # Explicit encoding so non-ASCII page content is written the
                    # same way regardless of the platform default.
                    with open(html_path, 'w', encoding='utf-8') as f:
                        f.write(driver.page_source)
                # Expired postings were already skipped above, so only the zip check remains.
                if not zip_exists:
                    patiently_click(driver, '//*[@id="_hfdr9c"]')  #respond to posting
                    patiently_click(driver, '//*[@id="_xjqay"]')  #download content
                    patiently_click(driver, '//*[@id="_hgesab"]', wait_after=15)  #click download attachment
                    patiently_click(driver, '//*[@id="_h_l$m"]/span/div/label', wait_after=5)  #click select all
                    downloaded = wait_for_download(
                        lambda: patiently_click(driver, '//*[@id="_5wq_j"]')
                    )  #download attachments (for real)
                    if not downloaded:
                        print('\tDownload did not complete before the timeout')
            # Return to the listing; the element references from this pass are no
            # longer usable after navigating, so restart the scan from the top.
            driver.get("https://service.ariba.com/Discovery.aw/ad/profile?key=AN01050912625#b0")
            sleep(2)
            break
        if not has_clicked:
            # Nothing on this page qualified; advance to the next page of results.
            patiently_click(driver, '//*[@id="next"]', wait_after=5)
    has_clicked = False

KeyboardInterrupt: 