In [39]:
import os
import time
import re
import requests
import shutil
from math import ceil
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pygame

In [40]:
# Function to convert month name or abbreviation to two-digit numeral
def month_to_number(month):
    months = {
        'January': '01', 'February': '02', 'March': '03', 'April': '04',
        'May': '05', 'June': '06', 'July': '07', 'August': '08',
        'September': '09', 'October': '10', 'November': '11', 'December': '12',
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05',
        'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10',
        'Nov': '11', 'Dec': '12'
    }
    return months[month]


In [41]:
# Set the download directory
download_directory = r"C:\Users\potat\Downloads"  # Set this to your default download directory
final_directory = r"T:\1911-1920 (8516)"

if not os.path.exists(final_directory):
    os.makedirs(final_directory)

user_data_dir = r"C:\Users\potat\AppData\Local\Google\Chrome\User Data"
profile_dir = "Profile 4"


options = webdriver.ChromeOptions()
options.add_argument(f"user-data-dir={user_data_dir}")
options.add_argument(f"profile-directory={profile_dir}")
options.add_experimental_option("prefs", {
    "download.default_directory": download_directory,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True
})



In [42]:
def download_and_rename_pdf(item_number):
    try:
        # Attempt to find the element and click on it
        retry_count = 3
        while retry_count > 0:
            try:
                pdf_button = driver.find_element(By.XPATH, f'//a[@name="item_{item_number}"]/following-sibling::div[@id="mlditem{item_number}"]//a[contains(@aria-label, "Full text - PDF")]')
                driver.execute_script("window.open(arguments[0].href, '_blank');", pdf_button)
                break
            except StaleElementReferenceException:
                retry_count -= 1
                if retry_count == 0:
                    raise
                time.sleep(2)  # Wait before retrying

        driver.switch_to.window(driver.window_handles[-1])

        wait_for_captcha_to_be_resolved(driver)

        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//a[contains(@class, "pdf-download")]')))

        download_link = driver.find_element(By.XPATH, '//a[contains(@class, "pdf-download")]')
        pdf_url = download_link.get_attribute('href')

        details = driver.find_element(By.XPATH, '//span[@class="newspaperArticle"]/span').text
        href_link = driver.find_element(By.XPATH, '//span[@class="newspaperArticle"]/span/a').get_attribute('href')
        match = re.search(r'(\d{2})\s+(\w+)\s+(\d{4})', details)
        unique_id_match = re.search(r'PagePdf/(\d+)/fulltextPDF', href_link)
        if match and unique_id_match:
            day, month, year = match.groups()
            month_num = month_to_number(month)
            unique_id = unique_id_match.group(1)

            filename = f"{year}-{month_num}-{day}--{unique_id}--{item_number}.PDF"
            final_path = os.path.join(final_directory, filename)

            response = requests.get(pdf_url)
            response.raise_for_status()
            download_path = os.path.join(download_directory, 'ProQuestDocument.pdf')
            with open(download_path, 'wb') as file:
                file.write(response.content)

            if wait_for_download_to_complete(download_path):
                shutil.copy2(download_path, final_path)
                print(f"Copied to: {final_path}")
                os.remove(download_path)
            else:
                raise Exception(f"Download failed or took too long for item {item_number}")

        driver.close()
        driver.switch_to.window(driver.window_handles[0])

        # Only write to checkpoint file if no exceptions were raised
        with open('checkpoint.txt', 'w') as f:
            f.write(f"{item_number}")

    except StaleElementReferenceException as e:
        print(f"StaleElementReferenceException for item {item_number}: {e}")
    except requests.exceptions.RequestException as e:
        print(f"RequestException for item {item_number}: {e}")
    except NoSuchElementException as e:
        print(f"NoSuchElementException for item {item_number}: {e}")
    except TimeoutException as e:
        print(f"TimeoutException for item {item_number}: {e}")
    except Exception as e:
        print(f"Exception for item {item_number}: {e}")


In [43]:
def play_sound(file_path):
    pygame.mixer.music.load(file_path)
    pygame.mixer.music.play()

import pygame
pygame.mixer.init()
sound_file = r"C:\Users\potat\Downloads\sound.wav"


In [44]:
def wait_for_captcha_to_be_resolved(driver):
    sound_play_count = 0
    max_retries = 3

    while sound_play_count < max_retries:
        try:
            # Check if CAPTCHA is present
            driver.find_element(By.XPATH, '//iframe[contains(@src, "recaptcha")]')
            play_sound(sound_file)  # Play sound when CAPTCHA is detected
            print("CAPTCHA detected, please resolve it.")
            sound_play_count += 1

            
        except:
            # CAPTCHA not found, continue with the script
            break
        time.sleep(30)  # Wait for 30 seconds before checking again

    if sound_play_count >= max_retries:
        print("CAPTCHA detected multiple times. Aborting mission.")
        return False


    return True


In [46]:
def jump_to_page_and_item(start_item_number):
    try:
        items_per_page = 20
        current_item_number = start_item_number


        while True:
            page_number = (current_item_number - 1) // items_per_page + 1

            page_field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "pageNbrField"))
            )
            page_field.clear()
            page_field.send_keys(str(page_number))
            jump_button = driver.find_element(By.ID, "submit_5")
            jump_button.click()

            wait_for_captcha_to_be_resolved(driver)

            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, f'//a[@name="item_{current_item_number}"]'))
            )

            while True:
                try:
                    download_and_rename_pdf(current_item_number)
                    time.sleep(2)
                    current_item_number += 1

                    with open('checkpoint.txt', 'w') as f:
                        f.write(f"{current_item_number}")

                except Exception as e:
                    print(f"Error downloading item {current_item_number}: {e}")

                if (current_item_number - 1) % items_per_page == 0:
                    if not go_to_next_page():
                        return
                    break

    except Exception as e:
        print(f"Error in jump_to_page_and_item: {e}")

In [47]:
def jump_to_page_and_item_list(item_list):
    try:
        items_per_page = 20
        current_item_number = start_item_number


        for last_item_number in item_list:
            page_number = (current_item_number - 1) // items_per_page + 1

            page_field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "pageNbrField"))
            )
            page_field.clear()
            page_field.send_keys(str(page_number))
            jump_button = driver.find_element(By.ID, "submit_5")
            jump_button.click()


            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, f'//a[@name="item_{current_item_number}"]'))
            )

            try:
                download_and_rename_pdf(current_item_number)
                time.sleep(2)

                with open('checkpoint.txt', 'w') as f:
                    f.write(f"{current_item_number}")

            except Exception as e:
                print(f"Error downloading item {current_item_number}: {e}")

    except Exception as e:
        print(f"Error in jump_to_page_and_item_list: {e}")

In [48]:
def wait_for_download_to_complete(filepath, timeout=60):
    start_time = time.time()
    while True:
        if os.path.exists(filepath):
            if not os.path.exists(filepath + '.crdownload'):
                return True
        if time.time() - start_time > timeout:
            return False
        time.sleep(1)

In [50]:
def go_to_next_page():
    try:
        next_button = driver.find_element(By.XPATH, '//a[@title="Next Page"]')
        next_button.click()
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, f'//a[@name="item_{current_item_number}"]'))
        )
        wait_for_captcha_to_be_resolved(driver)

        return True
    except Exception as e:
        print(f"Error during next page navigation: {e}")
        return False

In [69]:
import subprocess
subprocess.run(["taskkill", "/F", "/IM", "chrome.exe"], capture_output=True)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

initial_url = "https://www.proquest.com/results/6D8BDD4CC543422APQ?accountid=10932"

# Open the initial page
driver.get(initial_url)



In [29]:
recent_searches_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[@id='gaRecentSearches']"))
)
recent_searches_button.click()

# Click the second button: "Saved Searches"
saved_searches_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[@href='https://www.proquest.com/myresearch/savedsearches?accountid=10932']"))
)
saved_searches_button.click()

# Click the third button: "Sign In"
try:
    sign_in_button = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[@class='auth0-lock-submit']"))
    )
    sign_in_button.click()
except:
    print("Sign In button not found within 5 seconds. Proceeding to the next step.")

# Click the fourth button: specific search link
specific_search_link = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[@href='https://www.proquest.com/myresearch/savedsearches.checkdbssearchlink:rerunsearch/2647518/SavedSearches/$N?t:ac=SavedSearches']"))
)
specific_search_link.click()



KeyboardInterrupt: 

In [70]:
if os.path.exists('checkpoint.txt'):
    with open('checkpoint.txt', 'r') as f:
        lines = f.readlines()
        last_item_number = int(lines[0].strip())
else:
    current_page = 1
    last_item_number = 0
last_item_number

3440

In [64]:
def download_from_checkpoint_loop():
    while True:
        if os.path.exists('checkpoint.txt'):
            with open('checkpoint.txt', 'r') as f:
                lines = f.readlines()
                last_item_number = int(lines[0].strip())
        else:
            last_item_number = 1  # Start from the beginning if no checkpoint exists

        jump_to_page_and_item(last_item_number)

In [None]:
download_from_checkpoint_loop()

CAPTCHA detected, please resolve it.
Copied to: T:\1911-1920 (8516)\1915-08-25--97773504--3440.PDF
Error during next page navigation: name 'current_item_number' is not defined
Copied to: T:\1911-1920 (8516)\1915-08-26--97770727--3441.PDF
Copied to: T:\1911-1920 (8516)\1915-08-26--97775830--3442.PDF
Copied to: T:\1911-1920 (8516)\1915-08-27--97785173--3443.PDF
Copied to: T:\1911-1920 (8516)\1915-08-27--97785809--3444.PDF
Copied to: T:\1911-1920 (8516)\1915-08-29--97784333--3445.PDF
Copied to: T:\1911-1920 (8516)\1915-08-29--97767098--3446.PDF
Copied to: T:\1911-1920 (8516)\1915-08-29--97767164--3447.PDF
Copied to: T:\1911-1920 (8516)\1915-08-29--97769274--3448.PDF
Copied to: T:\1911-1920 (8516)\1915-08-30--97787080--3449.PDF
Copied to: T:\1911-1920 (8516)\1915-08-30--97791004--3450.PDF
Copied to: T:\1911-1920 (8516)\1915-08-31--97648705--3451.PDF
Copied to: T:\1911-1920 (8516)\1915-09-01--97641781--3452.PDF
Copied to: T:\1911-1920 (8516)\1915-09-01--97646229--3453.PDF
Copied to: T:\1911

Copied to: T:\1911-1920 (8516)\1915-10-21--97749384--3557.PDF
Copied to: T:\1911-1920 (8516)\1915-10-21--97754670--3558.PDF
Copied to: T:\1911-1920 (8516)\1915-10-21--97745680--3559.PDF
Copied to: T:\1911-1920 (8516)\1915-10-21--97749166--3560.PDF
Error during next page navigation: name 'current_item_number' is not defined
Error downloading item 3561: name 'StaleElementReferenceException' is not defined
Error during next page navigation: name 'current_item_number' is not defined
Copied to: T:\1911-1920 (8516)\1915-10-21--97748912--3561.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97762050--3562.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97764756--3563.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97761343--3564.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97762336--3565.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97763133--3566.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97764014--3567.PDF
Copied to: T:\1911-1920 (8516)\1915-10-22--97762143--3568.PDF
Copied to: T:\1911-1

Copied to: T:\1911-1920 (8516)\1915-11-21--97760299--3674.PDF
Copied to: T:\1911-1920 (8516)\1915-11-21--97760130--3675.PDF
Copied to: T:\1911-1920 (8516)\1915-11-22--97635790--3676.PDF
Copied to: T:\1911-1920 (8516)\1915-11-22--97639568--3677.PDF
Copied to: T:\1911-1920 (8516)\1915-11-22--97685274--3678.PDF
Copied to: T:\1911-1920 (8516)\1915-11-22--97644852--3679.PDF
Copied to: T:\1911-1920 (8516)\1915-11-25--97670784--3680.PDF
Error during next page navigation: name 'current_item_number' is not defined
Error downloading item 3681: name 'StaleElementReferenceException' is not defined
Error during next page navigation: name 'current_item_number' is not defined
Copied to: T:\1911-1920 (8516)\1915-11-26--97765385--3681.PDF
Copied to: T:\1911-1920 (8516)\1915-11-27--97764646--3682.PDF
Copied to: T:\1911-1920 (8516)\1915-11-27--97766233--3683.PDF
Copied to: T:\1911-1920 (8516)\1915-11-27--97763817--3684.PDF
Copied to: T:\1911-1920 (8516)\1915-11-27--97764058--3685.PDF
Copied to: T:\1911-1