In [None]:
import pandas as pd
import random
from tqdm.notebook import tqdm
# Load the tenders spreadsheet into a DataFrame.
# NOTE: this is an absolute path on one specific machine; adjust it before running elsewhere.
Tenders = pd.read_excel(
    io="C:/Users/Mitch/git/UWACapstoneG2/data/UpdatedTenders.xlsx",
)

In [None]:
# Keep only the reference/link pair, discard rows that have no link, and
# collapse exact duplicate (reference, link) rows.
CleanTenders = (
    Tenders.loc[:, ["Reference Number", "TenderLink"]]
    .dropna(subset=["TenderLink"])
    .drop_duplicates()
)

# Map reference number -> tender link. If a reference number appears with more
# than one distinct link, the last row wins.
TenderDict = {
    ref: url
    for ref, url in zip(CleanTenders["Reference Number"], CleanTenders["TenderLink"])
}

In [None]:
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import logging

# User-agent string sent by the headless browser instead of Chrome's default one.
CUSTOM_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"

# Download button label -> Selenium locator strategy used to find that button.
BUTTON_LOCATORS = {
    "Download Now": By.LINK_TEXT,
    "Download for Information Only": By.XPATH,
    "Download Documents": By.XPATH,
}

# Errors raised while downloading are written to this file via the root logger.
# NOTE: logging.basicConfig is a no-op when the root logger already has handlers.
LOG_FILENAME = 'download_error_log.txt'
logging.basicConfig(filename=LOG_FILENAME)

def click_button(driver, wait, button_text, ref):
    """Wait for one download button to become clickable, then click it.

    Args:
        driver: Selenium WebDriver for the page. Not used directly (the lookup
            goes through ``wait``); kept so existing callers keep working.
        wait: Object whose ``until`` method performs the wait (a
            ``WebDriverWait`` in ``download_tender``).
        button_text: Label of the button; must be a key of ``BUTTON_LOCATORS``.
        ref: Tender reference number, used only to tag the error log entry.

    Returns:
        True if the button was found and clicked. False if waiting or clicking
        raised; the exception is written to the error log in that case.

    Raises:
        KeyError: If ``button_text`` is not a key of ``BUTTON_LOCATORS``.
    """
    locator = BUTTON_LOCATORS[button_text]

    # Compare against the By constant that BUTTON_LOCATORS was built from
    # rather than a hand-typed string. Buttons located by XPath are matched as
    # <input> elements whose value attribute equals the label; the others are
    # matched on the label itself.
    if locator == By.XPATH:
        selector = f"//input[@value='{button_text}']"
    else:
        selector = button_text

    try:
        button = wait.until(EC.element_to_be_clickable((locator, selector)))
        button.click()
    except Exception as e:
        error_message = f"Error: {ref}, {e}"
        logging.error(error_message)
        return False
    return True

def open_link(driver, link, ref):
    """Navigate ``driver`` to ``link``; on failure, log the error and quit the driver.

    Args:
        driver: Object exposing ``get(url)`` and ``quit()`` (a Selenium WebDriver).
        link: URL to load.
        ref: Tender reference number, used to tag the log entry and the
            console message.

    Returns:
        True if ``driver.get`` completed without raising. False if it raised;
        in that case the error has been logged and ``driver.quit()`` has
        already been called, so the driver must not be reused.
    """
    # A single attempt only: there is no retry, so no loop is needed.
    try:
        driver.get(link)
    except Exception as e:
        error_message = f"Error: {ref}, {e}"
        logging.error(error_message)
        print(f"Quitting {ref} Driver")
        driver.quit()
        return False
    return True
def download_tender(link, ref, path):
    """Download one tender's documents with a dedicated headless Chrome session.

    Opens ``link``, then clicks the download buttons in a fixed order. The
    first button that cannot be clicked aborts the sequence. When every click
    succeeds, the function sleeps for a fixed period to let the browser finish
    writing the files before the session is closed.

    Args:
        link: URL of the tender page.
        ref: Tender reference number, used in log entries and console messages.
        path: Directory passed to Chrome as ``download.default_directory``.

    Returns:
        None.
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument(f"user-agent={CUSTOM_USER_AGENT}")
    options.add_experimental_option("prefs", {"download.default_directory": path})

    driver = webdriver.Chrome(options=options)
    try:
        # open_link quits the driver itself when navigation fails. If it
        # reports that with an explicit False, stop here; any other return
        # value (including None) carries no failure signal, so carry on and
        # let the first click attempt surface the problem.
        if open_link(driver, link, ref) is False:
            return

        # Each button gets up to 10 seconds to become clickable.
        wait = WebDriverWait(driver, 10)

        for button in ("Download Now", "Download for Information Only", "Download Documents"):
            if not click_button(driver, wait, button, ref):
                # Nothing was started, so skip the download wait entirely.
                print(f"Quitting {ref} Driver")
                return

        # Wait for downloads to complete
        time.sleep(15)
    finally:
        # Always release the browser, including when an unexpected exception
        # escapes; this may be a second quit() if open_link already quit it.
        driver.quit()

def download_multiple_tenders(max_workers, tender_dict, path):
    """Download every tender in ``tender_dict`` concurrently with a progress bar.

    Args:
        max_workers: Maximum number of worker threads (one browser per running task).
        tender_dict: Mapping of tender reference number -> tender page URL.
        path: Download directory forwarded to ``download_tender``.

    Returns:
        None. An exception raised by a worker is written to the error log,
        tagged with its reference number, and does not stop the remaining
        downloads.
    """
    from concurrent.futures import as_completed

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which reference each future belongs to so a failure can be
        # attributed in the log.
        future_to_ref = {
            executor.submit(download_tender, link, ref, path): ref
            for ref, link in tender_dict.items()
        }

        with tqdm(total=len(future_to_ref), desc="Downloading Tenders", colour='green') as pbar:
            # Consume futures as they finish so the bar reflects real progress
            # instead of stalling behind a slow early submission.
            for future in as_completed(future_to_ref):
                try:
                    future.result()
                except Exception as e:
                    logging.error(f"Error: {future_to_ref[future]}, {e}")
                finally:
                    pbar.update(1)
In [None]:
# Production environment: 60 files (links that do not contain "qas")
ProTenders = {
    ref: url
    for ref, url in TenderDict.items()
    if "qas" not in url
}
# QA environment: 3312 files (links that contain "qas")
#QATenders = {key: value for key, value in TenderDict.items() if "qas" in value}
# QA test: random sample of n QA tenders
#n = 20
#RandKeys = random.sample(list(QATenders.keys()), n)
#TestTendersQA = {key: QATenders[key] for key in RandKeys}
# Test error handling of a tender without a download button
#TestBadTender = {key: value for key, value in ProTenders.items() if "DOC202324623" in key}

In [None]:
# Download every production tender using 10 parallel browser sessions.
# NOTE: the download directory is an absolute path on one specific machine.
download_multiple_tenders(
    max_workers=10,
    tender_dict=ProTenders,
    path="C:\\Users\\Mitch\\Capstone\\Tenders",
)