In [1]:
import datetime as dt
import re
from pathlib import Path
from time import sleep

import pandas as pd
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.errorhandler import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

from ariba_driver import Ariba

# Directory that wait_for_download() watches for new entries
# (presumably the browser's download folder -- confirm against the Chrome profile).
DOWNLOAD_DIRECTORY = Path('/home/alex') / 'Downloads'
# Root of the scraper checkout. Kept as a plain string because main_loop()
# interpolates it into f-string paths under `<root>/data/`.
REPO_DIRECTORY = '/home/alex/repos/toronto-bids/scrapers/ariba'

In [24]:
# ChromeDriverManager().install() yields the chromedriver location (per webdriver_manager);
# it is wrapped in a Selenium Service and handed to the Ariba driver from ariba_driver.
_chromedriver_path = ChromeDriverManager().install()
driver = Ariba(service=ChromeService(_chromedriver_path))

In [25]:
# Sign in to Ariba Discovery, then reload the profile listing as a logged-in user.
# The fixed sleeps give each page time to render before the next element lookup.
driver.get("https://service.ariba.com/Discovery.aw/ad/profile?key=AN01050912625#b0")
sleep(5)
driver.find_element(By.CSS_SELECTOR, ".sap-icon--log").click()
sleep(5)
# Credentials are read from local key files instead of being hardcoded here.
# Line terminators are removed before typing: a trailing newline passed to
# send_keys() is sent as a key press and can submit the form early (for the
# user name, that would be before the password has been entered).
with open('username.key', 'r') as f:
    driver.find_element(By.NAME, "UserName").send_keys(f.read().rstrip('\r\n'))
# Look the password field up once and reuse it for both the text and the Enter key.
password_field = driver.find_element(By.NAME, "Password")
with open('password.key', 'r') as f:
    password_field.send_keys(f.read().rstrip('\r\n'))
password_field.send_keys(Keys.ENTER)
sleep(5)
driver.get("https://service.ariba.com/Discovery.aw/ad/profile?key=AN01050912625#b0")

In [4]:
# Keys of postings that main_loop() should skip: the first 100 characters of each
# title listed in data_summary.csv, with any '...' removed.
clicked = set()

if Path('data_summary.csv').exists():
    df = pd.read_csv('data_summary.csv')
    for raw_title in df.title.values:
        if raw_title != raw_title:
            # Only NaN compares unequal to itself, i.e. the title cell was empty.
            continue
        clicked.add(raw_title.replace('...', '')[0:100])

In [26]:
# Module-level flag. main_loop() does not read it: that function has its own
# `has_clicked` parameter (default False) and the driver loop calls it without arguments.
has_clicked = False

In [6]:
# File-name suffixes treated as "download still in progress". `.crdownload` is the
# suffix Chrome uses for partial files; `.part` and `.tmp` are included defensively.
_PARTIAL_DOWNLOAD_SUFFIXES = ('.crdownload', '.part', '.tmp')


def _finished_entries(directory):
    """Return the names of entries in `directory` that are not partial downloads."""
    if not directory.exists():
        return set()
    return {
        entry.name
        for entry in directory.iterdir()
        if not entry.name.endswith(_PARTIAL_DOWNLOAD_SUFFIXES)
    }


def wait_for_download(command, max_wait=1200, poll_interval=15, directory=None):
    """Run `command` and block until a new finished file shows up in the download directory.

    The directory is snapshotted before `command` runs and then polled. A download
    counts as finished once a new entry appears whose name does not end in one of
    the partial-download suffixes. Comparing names rather than entry counts keeps an
    unrelated deletion, or the appearance of a temporary partial file, from being
    mistaken for a completed download.

    Args:
        command: Zero-argument callable that triggers the download.
        max_wait: Maximum number of seconds to wait after `command` returns.
        poll_interval: Seconds to sleep between directory checks; must be positive.
        directory: Directory to watch; defaults to the module-level DOWNLOAD_DIRECTORY.

    Returns:
        True if a new finished entry appeared within `max_wait` seconds, else False.

    Raises:
        ValueError: If `poll_interval` is not positive.
    """
    if poll_interval <= 0:
        raise ValueError('poll_interval must be positive')
    watched = DOWNLOAD_DIRECTORY if directory is None else Path(directory)

    before = _finished_entries(watched)
    command()

    waited = 0
    while not (_finished_entries(watched) - before):
        # Check the budget before sleeping so the total wait never exceeds max_wait.
        if waited >= max_wait:
            return False
        sleep(poll_interval)
        waited += poll_interval
    return True

In [7]:
def count_directory_files(root: Path):
    """Count the direct entries (files and sub-directories) of `root`.

    Returns 0 when `root` does not exist.
    """
    return sum(1 for _ in root.iterdir()) if root.exists() else 0

In [8]:
def _request_expired(element):
    """Print the closing date shown on a listing row and report whether it has passed.

    Returns True when the row has no date cell or the date cannot be parsed, so the
    caller skips the attachment-download flow for postings whose state is unknown.
    """
    cells = element.find_elements(By.CLASS_NAME, 'paddingRight5')
    if len(cells) < 3:
        print('\tNo date found')
        return True
    date = cells[2].text
    try:
        # The last four characters are dropped before parsing -- presumably a space
        # plus a three-letter time-zone abbreviation; TODO confirm against the page.
        closes_at = dt.datetime.strptime(date[:-4], '%d %b %Y %I:%M %p')
    except ValueError:
        # Letting this escape would stop the posting from ever being marked as
        # visited, so every retry would fail on the same row.
        print(f'\tCould not parse date: {date!r}')
        return True
    print(f'\tdate: {date}')
    # Compared against the naive local clock; assumes listing times are local -- TODO confirm.
    return closes_at < dt.datetime.now()


def main_loop(has_clicked=False, max_stale_rescans=5):
    """Open the first not-yet-visited posting on the listing and archive it.

    Scans the result rows of the current page. For the first row whose title key
    (first 100 characters) is not in the module-level `clicked` set, it:

    1. opens the posting and records the key in `clicked`;
    2. downloads every linked ``.pdf``;
    3. saves the page HTML under ``<REPO_DIRECTORY>/data`` if it is not on disk yet;
    4. if no attachments are on disk and the posting is still open, walks the
       "respond to posting" flow to download the attachments;
    5. reloads the listing page and returns.

    When every row on the page has already been visited it clicks "next" and scans
    the following page.

    Args:
        has_clicked: Loop guard; passing True makes the call return immediately.
        max_stale_rescans: How many times in a row the listing may be re-read after
            a row went stale (the page re-rendered under us) before giving up.

    Raises:
        StaleElementReferenceException: If rows keep going stale after
            `max_stale_rescans` re-reads. Other driver errors propagate unchanged.

    Relies on the globals `driver`, `clicked`, `REPO_DIRECTORY`,
    `wait_for_download` and `count_directory_files`.
    """
    stale_rescans = 0
    while not has_clicked:
        elements = driver.find_elements(By.CLASS_NAME, 'ADTableBodyWhite')
        elements += driver.find_elements(By.CLASS_NAME, 'ADHiliteBlock')
        listing_went_stale = False
        for element in elements:
            try:
                try:
                    title = element.find_element(By.CLASS_NAME, 'QuoteSearchResultTitle')
                except NoSuchElementException:
                    # Row without a posting title: nothing to open.
                    continue
                title_text = title.text
                title_key = title_text[0:100]
                if title_key in clicked:
                    continue

                print(f'Accessing {title_text}')
                request_expired = _request_expired(element)
                title.click()
            except StaleElementReferenceException:
                # The listing re-rendered while it was being read; re-query the rows
                # instead of failing the whole call.
                stale_rescans += 1
                if stale_rescans > max_stale_rescans:
                    raise
                listing_went_stale = True
                break

            # Mark the posting only after the click succeeded, so a failed click does
            # not cause it to be skipped for the rest of the session.
            clicked.add(title_key)
            has_clicked = True

            document_id = driver.patiently_find_regex(r'(Doc\d{10})')
            print(f'\tDocument id is {document_id}')

            noip = driver.find_elements(By.XPATH, '//a[contains(text(),".pdf")]')
            for link in noip:
                print(f'\tPDF found, downloading {link.text}')
                if not wait_for_download(link.click):
                    print('\tTimed out waiting for the PDF download')

            data_dir = Path(REPO_DIRECTORY) / 'data'
            posting_dir = data_dir / f'{document_id}'
            html_path = data_dir / f'{document_id}.html'
            html_exists = html_path.exists() or (posting_dir / f'{document_id}.html').exists()
            # A posting directory with more than one entry is taken to already hold
            # the extracted attachments.
            zip_exists = (
                (data_dir / f'{document_id}.zip').exists()
                or count_directory_files(posting_dir) > 1
            )

            print('\tHTML exists' if html_exists else '\tHTML does not exist')

            if zip_exists:
                print('\tZip exists')
            elif not request_expired:
                print('\tZip does not exist')
            else:
                print('\tZip does not exist, but RFP is expired')

            if not html_exists:
                data_dir.mkdir(parents=True, exist_ok=True)
                # Explicit encoding so non-ASCII page content is written the same
                # way regardless of the machine's locale.
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)
            if (not zip_exists) and (not request_expired):
                # NOTE(review): these element ids look auto-generated and may change
                # between Ariba releases -- verify when the flow breaks.
                driver.patiently_click('//*[@id="_hfdr9c"]')  #respond to posting
                driver.patiently_click('//*[@id="_xjqay"]')  #download content
                driver.patiently_click('//*[@id="_hgesab"]', wait_after=15)  #click download attachment
                driver.patiently_click('//*[@id="_h_l$m"]/span/div/label', wait_after=5)  #click select all
                downloaded = wait_for_download(
                    lambda: driver.patiently_click('//*[@id="_5wq_j"]')
                )  #download attachments (for real)
                if not downloaded:
                    print('\tTimed out waiting for the attachments download')
            driver.get("https://service.ariba.com/Discovery.aw/ad/profile?key=AN01050912625#b0")
            sleep(2)
            break
        if listing_went_stale:
            # Give the page a moment to settle, then read the same page again.
            sleep(2)
            continue
        if not has_clicked:
            stale_rescans = 0
            driver.patiently_click('//*[@id="next"]', wait_after=5)

In [27]:
# Drive the scraper indefinitely; it runs until the kernel is interrupted.
while True:
    try:
        main_loop()
    except Exception as error:
        # Report the failure, back off for ten minutes, then return to the listing page.
        print(error)
        sleep(10 * 60)
        driver.get("https://service.ariba.com/Discovery.aw/ad/profile?key=AN01050912625#b0")

Accessing RFSQ - Supplier Prequalification for Upgrades to Group 7 Stormwater-Sewage Pumping Stations
	No date found
	Document id is Doc3092353323
	HTML does not exist
	Zip does not exist, but RFP is expired
Accessing RFSQ for Masonry and Architectural Maintenance Improvements
	No date found
	Document id is Doc3704136229
	HTML does not exist
	Zip does not exist, but RFP is expired
Message: stale element reference: element is not attached to the page document
  (Session info: chrome=110.0.5481.177)
Stacktrace:
#0 0x556668bc4d93 <unknown>
#1 0x5566689932d7 <unknown>
#2 0x5566689968d3 <unknown>
#3 0x556668996642 <unknown>
#4 0x55666899695c <unknown>
#5 0x5566689cf887 <unknown>
#6 0x5566689cfdb1 <unknown>
#7 0x5566689c4986 <unknown>
#8 0x5566689f361d <unknown>
#9 0x5566689c4873 <unknown>
#10 0x5566689f381e <unknown>
#11 0x556668a0b619 <unknown>
#12 0x5566689f3353 <unknown>
#13 0x5566689c2e40 <unknown>
#14 0x5566689c4038 <unknown>
#15 0x556668c188be <unknown>
#16 0x556668c1c8f0 <unknown>
#1

KeyboardInterrupt: 