In [1]:
import ast
import getpass
import os
import time
import traceback

import pandas as pd
import pyautogui
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# CSV describing the papers to fetch; its "DOI", "Title" and "Authors" columns are read below.
DATAFILE_PATH = "third_dataset.csv"
# Page the browser session opens first and returns to before searching.
STARTER_LINK ='https://my.usc.edu/service/library/'

In [3]:
def encodeString(string):
    """Return ``string`` encoded as UTF-8 bytes.

    The ``strict`` error handler is used, so text that cannot be encoded
    raises ``UnicodeEncodeError`` rather than being altered.
    """
    return string.encode("UTF-8", "strict")

def makefileName(title):
    """Turn a title into a string usable as a file name.

    Every "/", space and "." in ``title`` is replaced by an underscore; all
    other characters are kept as they are.
    """
    to_underscore = str.maketrans({"/": "_", " ": "_", ".": "_"})
    return title.translate(to_underscore)

In [4]:
class Paper:
    """Record for one paper: DOI, encoded DOI, title, authors and an optional content table."""

    def __init__(self, doi, codedDOI, title, authors):
        """Store the identifying fields; the content table starts out unset."""
        self.doi, self.encodedDOI = doi, codedDOI
        self.title, self.authors = title, authors
        self.table = None  # populated later through setContent()

    # ---- read accessors ----

    def getDOI(self):
        """Return the DOI as given to the constructor."""
        return self.doi

    def getCoded(self):
        """Return the encoded form of the DOI."""
        return self.encodedDOI

    def getTitle(self):
        """Return the paper title."""
        return self.title

    def getAuthors(self):
        """Return the authors value."""
        return self.authors

    def getContent(self):
        """Return the stored content table, or None if none was set."""
        return self.table

    # ---- mutator ----

    def setContent(self, table):
        """Replace the stored content table with ``table``."""
        self.table = table

In [5]:
def createPaperObjects(filedata_path):
    """Build a ``Paper`` for every row of the CSV at ``filedata_path``.

    Args:
        filedata_path: Path of a CSV file that has "DOI", "Title" and
            "Authors" columns.

    Returns:
        dict mapping each row's DOI to the ``Paper`` built from that row.
        When a DOI occurs more than once, the last such row wins.

    Raises:
        KeyError: if one of the three expected columns is missing.
    """
    df = pd.read_csv(filedata_path)
    # Walk the three columns in lockstep rather than using
    # ``df.loc[i, [col]][0]``: that form depends on positional ``Series[0]``
    # lookups (deprecated in pandas 2.x) and on a default 0..n-1 row index.
    rows = zip(df["DOI"], df["Title"], df["Authors"])
    return {
        doi: Paper(doi, encodeString(doi), title, authors)
        for doi, title, authors in rows
    }

# Load every paper listed in the dataset once, keyed by DOI, for use by the cells below.
papers = createPaperObjects(DATAFILE_PATH)

In [6]:
def getButton(title, driver, xpath):
    """Locate a single page element by XPath.

    Args:
        title: Human-readable label, used only in the failure message.
        driver: Object exposing ``find_element(by, value)``.
        xpath: XPath expression identifying the element.

    Returns:
        Whatever ``driver.find_element`` returns for the XPath.

    Raises:
        Exception: the exception raised by ``driver.find_element``, re-raised
            unchanged after the failure message has been printed.
    """
    try:
        return driver.find_element("xpath", xpath)
    except Exception:
        print(f"Failed to find button {title}")
        # A bare ``raise`` keeps the original exception type and traceback;
        # wrapping it in a fresh ``Exception`` would hide both from callers.
        raise

In [7]:
def goToLogin(driver):
    """Click the page's "Sign In" link and hand the driver back."""
    sign_in_xpath = '//a[@class="main-navigation__navbar "]/span[text()="Sign In"]'
    getButton("SignIn Button", driver, sign_in_xpath).click()
    return driver

In [8]:
def douLogin(driver):
    """Fill in and submit the login form, then switch into the Duo frame.

    The user id is read with ``input`` and the password with
    ``getpass.getpass`` (so it is not echoed); both are typed into the form
    fields located by id.

    Args:
        driver: Browser driver currently showing the login form.

    Returns:
        The same driver, with its frame context switched to ``duo_iframe``.

    Raises:
        Exception: whatever was raised while locating a field, clicking the
            submit button, or switching frames; re-raised unchanged.
    """
    username_slot = getButton("Username Button", driver, "//input[@id='username']")
    uid= input('Enter the User id: ')
    username_slot.send_keys(uid)

    password_slot = getButton("Password Button", driver, "//input[@id='password']")
    pswd = getpass.getpass('Password:')
    password_slot.send_keys(pswd)

    try:
        submit_button = getButton("Submit Button", driver, "//button[@type='submit']")
        submit_button.click()
    except Exception:
        print('Wrong user id or password')
        # Re-raise as-is so the original exception type and traceback survive.
        raise

    time.sleep(2)
    iframe='duo_iframe'
    driver.switch_to.frame(iframe)
    # NOTE(review): presumably this pause leaves time to approve the second
    # factor by hand -- confirm; nothing here checks that it succeeded.
    time.sleep(10)
    return driver

In [9]:
def downloadPaper(driver, counter):
    """Try to download the document shown in the driver's current tab.

    Two attempts are made in order:

    1. click the page element with id ``documentDownload``;
    2. if that raises, send Ctrl+S through ``pyautogui`` and type
       ``fileNumber_<counter>`` as the file name.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        counter: Value embedded in the fallback file name.

    Returns:
        bool: True if one of the attempts ran without raising, False if both
        raised (a message is printed in that case).
    """
    # Try to see if there is a download button.
    try:
        driver.find_element("xpath", "//button[@id='documentDownload']").click()
        return True
    except Exception:
        # No usable download button; fall through to the save-as shortcut.
        pass

    # Try whether the page can be stored through the save dialog instead.
    try:
        pyautogui.hotkey('ctrl', 's')
        time.sleep(1)
        pyautogui.write(f'fileNumber_{counter}')
        pyautogui.press('enter')
        return True
    except Exception:
        print("Failed All attempts to download")
        return False


In [10]:
def searchByDOI(driver, doi):
    """Enter ``doi`` in the library search box, submit it, and return the driver."""
    query_box = getButton("For Search Bar", driver, '//input[@id="library-query"]')
    query_box.clear()
    # Type the DOI first, then press Return to submit the query.
    for keys in (doi, Keys.RETURN):
        query_box.send_keys(keys)
    time.sleep(2)
    return driver

In [11]:
def closeTabs(driver):
    """Close every window handle except the first and focus the one that is left.

    Returns the same driver so calls can be chained by reassignment.
    """
    extra_handles = driver.window_handles[1:]
    for handle in extra_handles:
        # A window has to be focused before it can be closed.
        driver.switch_to.window(handle)
        driver.close()
    remaining = driver.window_handles
    driver.switch_to.window(remaining[0])
    return driver

def findRightPaper(driver, doi, counter, title):
    """Search for one DOI, open its article link, and try to download it.

    Args:
        driver: Browser driver positioned on the search page.
        doi: DOI typed into the search box.
        counter: Passed to ``downloadPaper`` for the fallback file name.
        title: Accepted for the caller's convenience; not used here.

    Returns:
        tuple ``(driver, downloaded)``. ``downloaded`` is False when any step
        raised or when ``downloadPaper`` explicitly returned False, True
        otherwise. Extra tabs are closed in both cases.
    """
    downloaded = False
    try:
        driver = searchByDOI(driver, doi)
        # The search results are expected in a second window/tab.
        driver.switch_to.window(driver.window_handles[1])
        time.sleep(6)
        xpath_download_button = "//a[contains(@class, 'article-link')]//span[@class='dl-article']"
        download_button = getButton("Library Download Button", driver, xpath_download_button)
        download_button.click()
        # Switch to the new tab opened by the article link.
        driver.switch_to.window(driver.window_handles[2])
        time.sleep(15)  # give the article page time to load
        # Only an explicit False counts as failure, so a download helper that
        # returns nothing is still treated as having succeeded.
        downloaded = downloadPaper(driver, counter) is not False
        time.sleep(2)
    except Exception as error:
        # Report the failure instead of discarding it silently.
        print(f"Failed: {doi} ({type(error).__name__})")
    driver = closeTabs(driver)
    return driver, downloaded

In [12]:
def preStartUp():
    """Start Chrome, open the starter page, log in, and return the driver."""
    browser = uc.Chrome()
    time.sleep(2)
    browser.get(STARTER_LINK)
    browser.maximize_window()
    # The separate goToLogin() step is currently skipped; douLogin() runs directly.
    logged_in = douLogin(browser)
    time.sleep(10)
    return logged_in

In [13]:
# Download status per DOI: every paper starts at 0 (not downloaded yet).
tracked_done = dict.fromkeys(papers, 0)

In [14]:
def GetAllPaper(tracked_done):
    """Download every paper in ``papers`` that is not yet marked as done.

    Progress is kept in ``downloaded_papers.txt`` as the ``str()`` of a dict
    mapping DOI to 0 (pending) or 1 (downloaded). If that file exists it
    replaces the ``tracked_done`` argument; it is rewritten when the function
    exits, whether it finished or raised.

    Args:
        tracked_done: dict mapping DOI to 0/1, used when no progress file
            exists. It is updated in place in that case.

    Raises:
        Exception: anything raised while starting the browser or reaching the
            search tab propagates unchanged after progress has been saved.
    """
    progress_file = "downloaded_papers.txt"
    if os.path.isfile(progress_file):
        with open(progress_file, 'r') as f:
            # literal_eval accepts only Python literals, so a tampered or
            # corrupted progress file cannot run code the way eval() would.
            tracked_done = ast.literal_eval(f.read())
    try:
        driver = preStartUp()
        if driver.current_url != STARTER_LINK:
            driver.get(STARTER_LINK)
        switch_to_search_tab = driver.find_element(By.XPATH, "//li[@class='search']//a")
        switch_to_search_tab.click()
        time.sleep(2)
        # counter is 1-based and advances for skipped papers too, so the
        # fallback file number always matches the paper's position.
        for counter, (doi, obj) in enumerate(papers.items(), start=1):
            try:
                # .get(): a DOI missing from an older progress file is
                # treated as pending instead of raising KeyError.
                if tracked_done.get(doi, 0) == 0:
                    driver, downloaded = findRightPaper(driver, obj.getDOI(), counter, obj.getTitle())
                    if downloaded:
                        tracked_done[doi] = 1
                else:
                    print(f"Done: {doi}")
            except Exception:
                driver = closeTabs(driver)
                print(f"Failed: {doi}")
    finally:
        # Single save point: runs on success and on error alike, and lets any
        # exception continue with its original type and traceback.
        with open(progress_file, 'w') as f:
            f.write(str(tracked_done))

In [15]:
# Run the full download pass; on error, print the traceback rather than letting it end the cell.
try:
    GetAllPaper(tracked_done)
except Exception:
    traceback.print_exc()

Done: 10.1371/journal.pone.0053510
Done: 10.1371/journal.pone.0053940
Done: 10.1371/journal.pone.0057285
Done: 10.1371/journal.pone.0062170
Done: 10.1371/journal.pone.0064904
Done: 10.1371/journal.pone.0063581
Done: 10.1371/journal.pone.0070508
Done: 10.1371/journal.pone.0068296
Done: 10.1371/journal.pone.0069485
Done: 10.1371/journal.pone.0069583
Done: c
Done: 10.1371/journal.pone.0069134
Done: 10.1371/journal.pone.0071127
Done: 10.1371/journal.pone.0084982
Done: 10.1371/journal.pone.0086342
Done: 10.1371/journal.pone.0087003
Done: 10.1371/journal.pone.0089546
Done: 10.1371/journal.pone.0090318
Done: 10.1371/journal.pone.0092066
Done: 10.1371/journal.pone.0092305
Done: 10.1371/journal.pone.0092626
Done: 10.1371/journal.pone.0090608
Done: 10.1371/journal.pone.0094830
Done: 10.1371/journal.pone.0095434
Done: 10.1371/journal.pone.0092189
Done: 10.1371/journal.pone.0094472
Done: 10.1371/journal.pone.0095926
Done: 10.1371/journal.pone.0093995
Done: 10.1371/journal.pone.0093563
Done: 10.137