In [98]:
from selenium import webdriver

from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import time
import os
import shutil

from concurrent.futures import ThreadPoolExecutor
from argparse import ArgumentParser
from utilities.loaders import download_dataset

import re

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Start the Chrome session used by every scraping cell below.
# The chromedriver path comes from ChromeDriverManager().install(); to use a
# locally installed chromedriver instead, pass its path as executable_path.
chrome_options = ChromeOptions()
service = ChromeService(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

In [3]:
# census.gov population-estimate pages, one per period:
# 2000-2010 intercensal, 2010s state detail and 2020s state detail.
populations_01_10_url = (
    "https://www.census.gov/data/tables/time-series/demo/popest/"
    "intercensal-2000-2010-state.html"
)
populations_10_19_url = (
    "https://www.census.gov/data/tables/time-series/demo/popest/"
    "2010s-state-detail.html"
)
populations_20_23_url = (
    "https://www.census.gov/data/tables/time-series/demo/popest/"
    "2020s-state-detail.html"
)

In [4]:
# Table titles searched for in the page text to pick the right section on each
# page (despite the `_url` suffix these are titles, not URLs).
keyword_01_10_url = (
    "Intercensal Estimates of the Resident Population by Sex and Age "
    "for States: April 1, 2000 to July 1, 2010"
)
keyword_10_19_url = (
    "Annual Estimates of the Resident Population by Single Year of Age "
    "and Sex: April 1, 2010 to July 1, 2019"
)
keyword_20_23_url = (
    "Annual Estimates of the Resident Population by Single Year of Age "
    "and Sex: April 1, 2020 to July 1, 2023 (SC-EST2023-SYASEX)"
)

In [5]:
# Open the 2000-2010 intercensal page and block until the state-list component
# queried by the following cells is present in the DOM.
# An explicit wait replaces the fixed 5 s sleep: it returns as soon as the
# elements exist and raises TimeoutException after 30 s instead of silently
# continuing on a page that never finished loading.
driver.get(populations_01_10_url)
WebDriverWait(driver, timeout=30).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#data-uscb-state-list-selector"))
);  # trailing ';' keeps the returned element list out of the cell output

#### Select only the sections with certain keywords

In [6]:
# Index (into `sections`) of the section whose text contains the wanted table
# title; -1 means "nothing matched yet".
# NOTE(review): -1 is also a valid list index (the last item), so a later
# sections[section_index] will not fail when no section matched.
section_index = -1

In [7]:
# Every element matching the state-list selector is a candidate section.
sections = driver.find_elements(
    by=By.CSS_SELECTOR,
    value="#data-uscb-state-list-selector",
)

In [8]:
# Remember the index of the section that has the wanted table title in one of
# its <p> elements. There is no early exit, so the last matching section wins.
for idx, candidate in enumerate(sections):
    paragraph_texts = [
        paragraph.text
        for paragraph in candidate.find_elements(by=By.CSS_SELECTOR, value="p")
    ]
    if any(keyword_01_10_url in text for text in paragraph_texts):
        section_index = idx

In [9]:
# section_index keeps its -1 sentinel when no section contained the table
# title. Indexing with -1 would silently select the LAST section and the
# wrong files would be downloaded, so fail loudly instead.
if section_index < 0:
    raise LookupError(
        f"No section on the page mentions the table title: {keyword_01_10_url!r}"
    )

chosen_section = sections[section_index]

# Show the section text to confirm the right table and its state list were picked.
chosen_section.text

'Sex and Age\nIntercensal Estimates of the Resident Population by Sex and Age for States: April 1, 2000 to July 1, 2010\nAlabama\nAlaska\nArizona\nArkansas\nCalifornia\nColorado\nConnecticut\nDelaware\nDistrict of Columbia\nFlorida\nGeorgia\nHawaii\nIdaho\nIllinois\nIndiana\nIowa\nKansas\nKentucky\nLouisiana\nMaine\nMaryland\nMassachusetts\nMichigan\nMinnesota\nMississippi\nMissouri\nMontana\nNebraska\nNevada\nNew Hampshire\nNew Jersey\nNew Mexico\nNew York\nNorth Carolina\nNorth Dakota\nOhio\nOklahoma\nOregon\nPennsylvania\nRhode Island\nSouth Carolina\nSouth Dakota\nTennessee\nTexas\nUtah\nVermont\nVirginia\nWashington\nWest Virginia\nWisconsin\nWyoming'

In [10]:
# Anchor tags of the per-state attachments inside the chosen section.
states = chosen_section.find_elements(
    by=By.CSS_SELECTOR,
    value="ul.uscb-margin-5 li.uscb-list-attachment a",
)

In [39]:
# Inspect the first match to confirm the selector returned WebElement objects.
states[0]

<selenium.webdriver.remote.webelement.WebElement (session="d00ed53b094183665d4244c62e4af6af", element="f.A280F4EC4F6F4982858C7CE22991810E.d.EC483D2BB742D2C8AC801C52DC657D67.e.77")>

In [11]:
# Number of state links found (the recorded run shows 51: the 50 states plus
# the District of Columbia listed in the section text).
len(states)

51

In [103]:
def _wait_for_new_download(downloads_path, known_names, timeout, poll_interval):
    """Return the path of a finished download that is not listed in ``known_names``.

    Polls ``downloads_path`` until a regular file appears whose name was not
    present before the click and which does not carry an in-progress suffix.

    Raises:
        TimeoutError: if no finished file shows up within ``timeout`` seconds.
    """
    # Suffixes browsers commonly give to files that are still being written.
    partial_suffixes = (".crdownload", ".tmp", ".part")
    deadline = time.monotonic() + timeout

    while True:
        finished = []
        for name in os.listdir(downloads_path):
            if name in known_names or name.lower().endswith(partial_suffixes):
                continue
            candidate = os.path.join(downloads_path, name)
            if os.path.isfile(candidate):
                finished.append(candidate)

        if finished:
            # Normally exactly one; if several appeared, take the newest.
            return max(finished, key=os.path.getctime)

        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"no finished download appeared in {downloads_path!r} within {timeout} seconds"
            )
        time.sleep(poll_interval)


def download_files(
    elements: list,
    downloads_path="C:/Users/LARRY/Downloads",
    output_dir="./data/population-data",
    timeout: float = 60.0,
    poll_interval: float = 0.25,
) -> list:
    """Click each download link and file the result as ``<state>_<years><ext>``.

    For every element the ``href`` attribute and visible text are read, the
    element is clicked, and the file that subsequently appears in
    ``downloads_path`` is moved into ``output_dir`` under its new name.

    Args:
        elements: anchor elements exposing ``get_attribute("href")``, ``.text``
            and ``.click()`` (e.g. Selenium WebElements).
        downloads_path: directory the browser saves downloads to.
        output_dir: destination directory; created if it does not exist.
        timeout: seconds to wait for each download to finish.
        poll_interval: seconds between checks of ``downloads_path``.

    Returns:
        The paths of the relocated files, in the order of ``elements``.

    Raises:
        ValueError: if an element has no ``href`` or the URL's directory part
            contains no ``<digits>-<digits>`` year range.
        TimeoutError: if a download does not finish within ``timeout``.
    """
    os.makedirs(output_dir, exist_ok=True)

    relocated_paths = []
    for element in elements:
        link = element.get_attribute("href")
        state = element.text
        if not link:
            raise ValueError(f"element {state!r} has no href to download")

        # derive the pieces of the new file name from the URL path
        file_dir, _, file_name = link.rpartition("/")
        years_match = re.search(r"\d+-\d+", file_dir)
        if years_match is None:
            raise ValueError(f"no year range (e.g. 2000-2010) found in download URL: {link!r}")
        years = years_match[0]
        # splitext yields "" for extension-less names; the previous pattern
        # r".[A-Za-z]+$" had an unescaped dot and matched trailing letters of
        # names without an extension.
        extension = os.path.splitext(file_name)[1]

        # Snapshot the directory first so the new file is identified by name,
        # not by guessing that the most recently created file is ours.
        names_before_click = set(os.listdir(downloads_path))

        # download file by clicking element
        element.click()
        downloaded_path = _wait_for_new_download(
            downloads_path, names_before_click, timeout, poll_interval
        )

        # move the download straight to its final, renamed location
        new_file_name = f"{state}_{years}{extension}"
        relocated_path = os.path.join(output_dir, new_file_name)
        shutil.move(downloaded_path, relocated_path)
        relocated_paths.append(relocated_path)

    return relocated_paths

In [104]:
# Download the file behind every state link, relying on download_files'
# default downloads_path and output_dir.
downloaded_files = download_files(states)

In [105]:
# Open the 2010s state-detail page and block until the grid containers
# queried by the following cell are present in the DOM.
# An explicit wait replaces the fixed 5 s sleep: it returns as soon as the
# elements exist and raises TimeoutException after 30 s instead of silently
# continuing on a page that never finished loading.
driver.get(populations_10_19_url)
WebDriverWait(driver, timeout=30).until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "div.responsivegrid.aem-GridColumn--tablet--12")
    )
);  # trailing ';' keeps the returned element list out of the cell output

In [106]:
# Candidate sections on the 2010s page are the responsive-grid containers.
sections = driver.find_elements(
    by=By.CSS_SELECTOR,
    value="div.responsivegrid.aem-GridColumn--tablet--12",
)

# How many candidate sections were found.
len(sections)

10

In [None]:
# Find the section of the 2010s page that contains the wanted table title.
#
# Two fixes over the plain loop:
# * section_index is reset first, so a value left over from the previous page
#   cannot be mistaken for a match on this one.
# * If the page re-renders, previously located elements go stale and reading
#   `.text` raises StaleElementReferenceException. The elements are therefore
#   re-located and the scan retried a bounded number of times.
section_index = -1
for attempt in range(1, 4):
    try:
        sections = driver.find_elements(
            By.CSS_SELECTOR, value="div.responsivegrid.aem-GridColumn--tablet--12"
        )
        section_index = -1
        for i, section in enumerate(sections):
            if keyword_10_19_url in section.text:
                section_index = i  # no break: the last matching section wins, as before
        break
    except StaleElementReferenceException:
        if attempt == 3:
            raise
        # give the page a moment to settle before locating the elements again
        time.sleep(1)

StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome=134.0.6998.178); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x0114C7F3+24435]
	(No symbol) [0x010D2074]
	(No symbol) [0x00FA06E3]
	(No symbol) [0x00FB2321]
	(No symbol) [0x00FB1400]
	(No symbol) [0x00FA7B12]
	(No symbol) [0x00FA5F1A]
	(No symbol) [0x00FA935A]
	(No symbol) [0x00FA93D7]
	(No symbol) [0x00FE383E]
	(No symbol) [0x0100D7BC]
	(No symbol) [0x00FDE114]
	(No symbol) [0x0100DA34]
	(No symbol) [0x0102F20A]
	(No symbol) [0x0100D5B6]
	(No symbol) [0x00FDC54F]
	(No symbol) [0x00FDD894]
	GetHandleVerifier [0x014570A3+3213347]
	GetHandleVerifier [0x0146B0C9+3295305]
	GetHandleVerifier [0x0146558C+3271948]
	GetHandleVerifier [0x011E7360+658144]
	(No symbol) [0x010DB27D]
	(No symbol) [0x010D8208]
	(No symbol) [0x010D83A9]
	(No symbol) [0x010CAAC0]
	BaseThreadInitThunk [0x76A35D49+25]
	RtlInitializeExceptionChain [0x7753CE3B+107]
	RtlGetAppContainerNamedObjectPath [0x7753CDC1+561]
