In [None]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException

def download_data():
    """
    Automates the process of downloading all datasets and their feature descriptions
    from the Harvard Atlas data downloads page.

    For every row of the paginated table the function opens the row's download
    dialog, saves the dialog's feature table as ``<file stem>_features.csv``,
    clicks the dialog's download button and then polls the download directory
    (up to 60 seconds) until the data file shows up. Pages are advanced with the
    "Go to next page" button until that button is missing or disabled.

    All files are written to ``./data`` (created when missing). Unexpected
    exceptions are printed instead of raised, and the browser is always closed.

    Returns:
        None
    """
    # Local import: pd.read_html expects a file-like object for literal HTML.
    from io import StringIO

    # --- 1. Setup WebDriver and Download Directory ---
    data_dir = os.path.abspath('data')
    os.makedirs(data_dir, exist_ok=True)
    print(f"Data will be downloaded to: {data_dir}")

    # Configure Chrome to automatically download files to the specified directory
    prefs = {
        'download.default_directory': data_dir,
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'safebrowsing.enabled': True
    }
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', prefs)
    # To run without opening a browser window, uncomment the following lines:
    # chrome_options.add_argument("--headless")
    # chrome_options.add_argument("--window-size=1920,1080")

    # No Service/driver path is passed, so locating chromedriver is left to
    # Selenium's defaults.
    driver = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(driver, 20)

    try:
        # --- 2. Navigate to the Website ---
        url = 'https://atlas.hks.harvard.edu/data-downloads'
        driver.get(url)

        page_num = 1
        while True:
            print(f"\n--- Processing Page {page_num} ---")
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.MuiTable-root')))
            time.sleep(3)  # Allow time for the page's JavaScript to stabilize

            # Get the number of rows on the current page
            download_buttons = driver.find_elements(By.CSS_SELECTOR, 'button.css-1gbc8ep')
            num_rows = len(download_buttons)
            print(f"Found {num_rows} datasets on this page.")

            # --- 3. Iterate Through Each Dataset on the Page ---
            for i in range(num_rows):
                # Re-find buttons each time to avoid StaleElementReferenceException
                download_buttons = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'button.css-1gbc8ep'))
                )

                # The re-query may return fewer buttons than the initial count;
                # stop this page instead of letting an IndexError abort the run.
                if i >= len(download_buttons):
                    print(f"  Row {i+1}: download button no longer present. Skipping remaining rows.")
                    break

                button_to_click = download_buttons[i]
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button_to_click)
                time.sleep(0.5)

                try:
                    button_to_click.click()
                except ElementClickInterceptedException:
                    print(f"  Row {i+1}: Click was intercepted, trying JavaScript click.")
                    driver.execute_script("arguments[0].click();", button_to_click)

                # --- 4. Process the Modal Dialog ---
                modal_selector = "div.MuiDialog-paper"
                try:
                    modal = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, modal_selector)))
                    print(f"  Processing row {i+1}...")

                    # Extract the filename to name the features file and track download
                    file_name_element = wait.until(EC.presence_of_element_located((By.XPATH, "//p[starts-with(., 'File Name:')]")))
                    data_filename = file_name_element.text.replace('File Name:', '').strip()
                    features_csv_name = f"{os.path.splitext(data_filename)[0]}_features.csv"
                    features_csv_path = os.path.join(data_dir, features_csv_name)

                    # Extract the features table into a pandas DataFrame and save as CSV.
                    # Best-effort: a failure here must not prevent the data download.
                    try:
                        features_table_el = modal.find_element(By.CSS_SELECTOR, 'table.MuiTable-stickyHeader')
                        table_html = features_table_el.get_attribute('outerHTML')
                        # Passing a raw HTML string to read_html is deprecated;
                        # wrap it in a file-like object.
                        features_df = pd.read_html(StringIO(table_html))[0]
                        features_df.to_csv(features_csv_path, index=False)
                        print(f"    - Saved features to {features_csv_name}")
                    except Exception as e:
                        print(f"    - Could not extract or save features table: {e}")

                    # Click the final download button inside the modal
                    modal.find_element(By.CSS_SELECTOR, 'button.css-rp01fk').click()
                    print(f"    - Download command issued for {data_filename}")

                    # Wait for the download to complete: the target file must exist,
                    # be non-empty, and have no sibling '<name>.crdownload' file.
                    data_file_path = os.path.join(data_dir, data_filename)
                    timeout = 60
                    start_time = time.time()
                    downloaded = False
                    while time.time() - start_time < timeout:
                        if os.path.exists(data_file_path) and not os.path.exists(data_file_path + '.crdownload'):
                            if os.path.getsize(data_file_path) > 0:
                                print(f"    - Download complete: {data_filename}")
                                downloaded = True
                                time.sleep(2)
                                break
                        time.sleep(1)
                    if not downloaded:
                        print(f"    - Download timed out or failed for {data_filename}")

                    # Close the modal
                    modal.find_element(By.CSS_SELECTOR, 'button.css-wnmj7d').click()
                    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, modal_selector)))
                    time.sleep(1)

                except TimeoutException:
                    # Any of the waits above timed out: reload and give up on
                    # the remaining rows of this page.
                    print(f"  Modal did not appear for row {i+1}. Skipping.")
                    driver.refresh()
                    break

            # --- 5. Navigate to the Next Page ---
            try:
                next_page_button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Go to next page"]')
                if 'Mui-disabled' in next_page_button.get_attribute('class'):
                    print("\n--- Last page reached. ---")
                    break
                else:
                    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_page_button)
                    time.sleep(0.5)
                    next_page_button.click()
                    page_num += 1
            except NoSuchElementException:
                print("\n--- No more pages found. ---")
                break

    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
    finally:
        print("Script finished. Closing browser.")
        driver.quit()

# Run the scraper defined above: it opens a Chrome session and writes files into ./data.
download_data()

Processing page 1...
Found 7 datasets on page 1
  Processing dataset 1/7...
    Saved feature description: growth_proj_eci_rankings_features.csv
    Error processing dataset 1: Message: invalid selector: An invalid or illegal selector was specified
  (Session info: chrome=138.0.7204.183); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidselectorexception
Stacktrace:
#0 0x56ebda7992ca <unknown>
#1 0x56ebda240550 <unknown>
#2 0x56ebda247198 <unknown>
#3 0x56ebda249a28 <unknown>
#4 0x56ebda249ab3 <unknown>
#5 0x56ebda291b25 <unknown>
#6 0x56ebda2922e1 <unknown>
#7 0x56ebda285d76 <unknown>
#8 0x56ebda2b7bed <unknown>
#9 0x56ebda285c6a <unknown>
#10 0x56ebda2b7d8e <unknown>
#11 0x56ebda2dd9e6 <unknown>
#12 0x56ebda2b7993 <unknown>
#13 0x56ebda283d6b <unknown>
#14 0x56ebda285141 <unknown>
#15 0x56ebda75e2ab <unknown>
#16 0x56ebda7620b9 <unknown>
#17 0x56ebda745139 <unknown>
#18 0x56ebda762c68 <unknown>
#19 0x56ebda7

In [3]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def download_data():
    """Interactively download datasets and their feature descriptions.

    Opens the Harvard Atlas data-downloads page in Chrome, waits for the user
    to press Enter, then walks every page of the table. For each download
    button found it opens the dialog, records the dataset metadata
    (``extract_dataset_info``), saves the feature table
    (``save_feature_description``) and clicks the dialog's download button.
    Paging is delegated to ``go_to_next_page``.

    Files are written to ``./data`` (created when missing). When at least one
    dataset was processed, a ``downloaded_datasets_summary.csv`` with the
    collected metadata is written there too. The browser is always closed.

    Returns:
        None
    """
    data_dir = os.path.abspath('data')
    os.makedirs(data_dir, exist_ok=True)

    # Make Chrome save downloads straight into data_dir without prompting.
    prefs = {
        'download.default_directory': data_dir,
        'download.prompt_for_download': False,
        'download.directory_upgrade': True,
        'safebrowsing.enabled': True
    }
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(driver, 10)

    try:
        url = 'https://atlas.hks.harvard.edu/data-downloads'
        driver.get(url)

        # Blocks until the user confirms in the notebook/terminal.
        input("Select the datasets you want to download, then press Enter to continue...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.MuiTable-root')))

        page_num = 1
        downloaded_datasets = []
        while True:
            print(f"Processing page {page_num}...")
            time.sleep(1)

            # Download buttons are located through their icon's SVG path; the
            # clickable <button> is two levels above the <path> (path -> svg -> button).
            download_buttons = driver.find_elements(By.CSS_SELECTOR, 'button[type="button"] svg[viewBox="0 0 24 24"] path[d="M5 20h14v-2H5zM19 9h-4V3H9v6H5l7 7z"]')
            download_buttons = [btn.find_element(By.XPATH, '../..') for btn in download_buttons]
            print(f"Found {len(download_buttons)} datasets on page {page_num}")

            # Process each dataset on the current page
            for i, button in enumerate(download_buttons):
                try:
                    print(f"  Processing dataset {i+1}/{len(download_buttons)}...")
                    driver.execute_script("arguments[0].click();", button)
                    modal = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="dialog"]')))

                    dataset_info = extract_dataset_info(driver, modal)
                    dataset_file_name = dataset_info['filename']
                    save_feature_description(driver, modal, dataset_file_name, data_dir)

                    final_download_button = modal.find_element(By.CSS_SELECTOR, "div[aria-labelledby] button.css-rp01fk")
                    final_download_button.click()

                    # A fixed pause is used instead of polling the download
                    # directory; completion of the download is not verified.
                    time.sleep(3)

                    downloaded_datasets.append(dataset_info)
                    print(f"    Downloaded: {dataset_info['name']}")

                    # Close modal by clicking the X button
                    close_btn = modal.find_element(By.CSS_SELECTOR, 'button svg[viewBox="0 0 24 24"] path[d*="19 6.41"]')
                    close_btn = close_btn.find_element(By.XPATH, '../..')
                    driver.execute_script("arguments[0].click();", close_btn)

                    # Wait for modal to close
                    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div[role="dialog"]')))

                except Exception as e:
                    print(f"    Error processing dataset {i+1}: {str(e)}")
                    # Try to close modal if it's still open
                    try:
                        close_btn = driver.find_element(By.CSS_SELECTOR, 'div[role="dialog"] button svg[viewBox="0 0 24 24"] path[d*="19 6.41"]')
                        close_btn = close_btn.find_element(By.XPATH, '../..')
                        driver.execute_script("arguments[0].click();", close_btn)
                        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div[role="dialog"]')))
                    except Exception:
                        # Best-effort cleanup only. Deliberately not a bare
                        # `except:` so KeyboardInterrupt/SystemExit still
                        # stop the run.
                        pass
                    continue

            # Check if there's a next page
            if not go_to_next_page(driver, wait):
                break

            page_num += 1

        # Save summary of downloaded datasets
        if downloaded_datasets:
            summary_df = pd.DataFrame(downloaded_datasets)
            summary_df.to_csv(os.path.join(data_dir, 'downloaded_datasets_summary.csv'), index=False)
            print(f"\nDownload complete! Downloaded {len(downloaded_datasets)} datasets.")
            print(f"Summary saved to: {os.path.join(data_dir, 'downloaded_datasets_summary.csv')}")
        else:
            print("\nNo datasets were downloaded.")

    finally:
        driver.quit()

def extract_dataset_info(driver, modal):
    """Extract dataset information from the modal.

    Args:
        driver: Not used by this function; kept so callers can keep passing it.
        modal: The dialog element to read. It must support ``find_element`` and
            ``find_elements``.

    Returns:
        dict with the keys ``name``, ``classification``, ``description``,
        ``filename``, ``file_size`` and ``last_update``. Fields that cannot be
        found fall back to ``''`` (``'unknown'`` for ``filename``). If reading
        the title or the file details raises, the error is printed and a dict
        of placeholder values (``name`` = ``'Unknown'``) is returned.
    """
    try:
        # Get dataset name from modal title
        title_element = modal.find_element(By.CSS_SELECTOR, 'h2[id*=":"]')
        full_title = title_element.text

        # Split title into name and classification info
        title_spans = title_element.find_elements(By.TAG_NAME, 'span')
        if len(title_spans) >= 2:
            name = title_spans[0].text.strip()
            # Trim whitespace first so surrounding parentheses are really at the ends.
            classification = title_spans[1].text.strip().strip('()')
        else:
            name = full_title
            classification = ""

        # Get file information from the "Label: value" paragraphs
        file_info = {}
        for elem in modal.find_elements(By.CSS_SELECTOR, 'p.MuiTypography-body1'):
            text = elem.text
            if 'File Name:' in text:
                file_info['filename'] = text.split('File Name:')[1].strip()
            elif 'File Size:' in text:
                file_info['file_size'] = text.split('File Size:')[1].strip()
            elif 'Last Update:' in text:
                file_info['last_update'] = text.split('Last Update:')[1].strip()

        # The description is optional: use find_elements so that a missing
        # paragraph does not discard the name/filename gathered above.
        description_elems = modal.find_elements(By.CSS_SELECTOR, 'p.MuiTypography-paragraph')
        description = description_elems[0].text if description_elems else ''

        return {
            'name': name,
            'classification': classification,
            'description': description,
            'filename': file_info.get('filename', 'unknown'),
            'file_size': file_info.get('file_size', ''),
            'last_update': file_info.get('last_update', '')
        }

    except Exception as e:
        print(f"    Error extracting dataset info: {str(e)}")
        return {
            'name': 'Unknown',
            'classification': '',
            'description': '',
            'filename': 'unknown',
            'file_size': '',
            'last_update': ''
        }

def save_feature_description(driver, modal, filename, data_dir):
    """Save the feature description table shown in the modal as a CSV file.

    The first ``table.MuiTable-root`` inside ``modal`` is read: header cells
    become the column names and every body row with at least one cell becomes
    a record. The result is written to ``<data_dir>/<base>_features.csv``,
    where ``<base>`` is ``filename`` without a trailing ``.csv`` extension.
    Nothing is written when the table has no headers or no rows.

    Args:
        driver: Not used by this function; kept so callers can keep passing it.
        modal: Dialog element containing the table.
        filename: Name of the dataset file the features belong to.
        data_dir: Directory the CSV is written to.

    Returns:
        None. Any exception is printed instead of raised.
    """
    try:
        # Find the table in the modal
        table = modal.find_element(By.CSS_SELECTOR, 'table.MuiTable-root')

        # Extract table headers
        headers = [
            cell.text.strip()
            for cell in table.find_elements(By.CSS_SELECTOR, 'thead th')
        ]

        # Extract table rows
        rows = []
        for row in table.find_elements(By.CSS_SELECTOR, 'tbody tr'):
            row_data = [cell.text.strip() for cell in row.find_elements(By.CSS_SELECTOR, 'td')]
            if row_data:  # Only add non-empty rows
                rows.append(row_data)

        # Create DataFrame and save
        if headers and rows:
            df = pd.DataFrame(rows, columns=headers)

            # Drop only the trailing '.csv'; str.replace would also remove any
            # '.csv' occurring earlier in the name.
            suffix = '.csv'
            base_name = filename[:-len(suffix)] if filename.endswith(suffix) else filename
            feature_filename = f"{base_name}_features.csv"
            feature_path = os.path.join(data_dir, feature_filename)

            df.to_csv(feature_path, index=False)
            print(f"    Saved feature description: {feature_filename}")

    except Exception as e:
        print(f"    Error saving feature description: {str(e)}")

def go_to_next_page(driver, wait):
    """Advance the paginated table by one page when possible.

    Args:
        driver: Browser driver used to locate and click the pager button.
        wait: Wait helper whose ``until`` is used to confirm the table exists
            after the click.

    Returns:
        True after the "next" button was clicked and the table was found
        again; False when there is no such button, when it carries the
        ``Mui-disabled`` class, or when any step raised (the error is printed).
    """
    try:
        # The pager button is identified through the <path> of its arrow icon.
        arrow_paths = driver.find_elements(
            By.CSS_SELECTOR, 'button[aria-label*="next"] svg path[d*="10 6"]'
        )
        if not arrow_paths:
            return False

        # Two levels up from the <path>: path -> svg -> button.
        pager_button = arrow_paths[0].find_element(By.XPATH, '../..')

        if 'Mui-disabled' in pager_button.get_attribute('class'):
            return False

        # Click through JavaScript, then give the page a fixed pause to update.
        driver.execute_script("arguments[0].click();", pager_button)
        time.sleep(3)

        # Confirm the dataset table is present before reporting success.
        table_locator = (By.CSS_SELECTOR, 'table.MuiTable-root')
        wait.until(EC.presence_of_element_located(table_locator))
    except Exception as err:
        print(f"Error navigating to next page: {str(err)}")
        return False

    return True

# Print a progress marker, then start the scrape. download_data() blocks on
# input() until Enter is pressed, so interrupting the cell at that prompt
# raises KeyboardInterrupt.
print('starting')
download_data()

starting


KeyboardInterrupt: Interrupted by user

In [16]:
# Open the Harvard Atlas data-downloads page in a new, maximized Chrome window.
# NOTE(review): the previous heading here read "Scrape product list", which does
# not match the URL below — presumably left over from another scraper; confirm.
# NOTE(review): this cell has no imports of its own (it uses `webdriver` from an
# earlier cell) and never calls driver.quit(), so the browser stays open.

driver = webdriver.Chrome()
url = 'https://atlas.hks.harvard.edu/data-downloads'
driver.get(url)
driver.maximize_window()



# # Start scraping tables
# def extract_table_data():
#     table = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, 'tableContacts')))
#     headers = [header.text for header in table.find_elements(By.TAG_NAME, 'th')]
#     rows = table.find_elements(By.TAG_NAME, 'tr')
#     data = []
#     for row in rows[1:]:  # Skip the header row
#         cells = row.find_elements(By.TAG_NAME, 'td')
#         data.append([cell.text for cell in cells])
#     return pd.DataFrame(data, columns=headers)


# master_df = extract_table_data()
# total_pages = int(driver.find_element(By.CSS_SELECTOR, '.form-pagination span').text.strip())
# for i in range(1, total_pages):
#     forward_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn.btn-xs.btn-default.btn-forward')))
#     driver.execute_script("arguments[0].click();", forward_button)

#     page_df = extract_table_data()
#     master_df = pd.concat([master_df, page_df], ignore_index=True)

# products_found_text = driver.find_element(By.CSS_SELECTOR, '.text-start.fs-9.text-primary.d-inline strong').text
# products_found = int(products_found_text.replace(',', ''))
# driver.quit()

# if len(master_df) == products_found:
#     try:
#         existing_df = load('data/ib_products.csv')
#         master_df = pd.concat([existing_df, master_df]).drop_duplicates()
#         print('Updating previous scrape')
#     except FileNotFoundError:
#         print('Previous scrape file not found. Saving this scrape')
#         pass
#     master_df.to_csv('data/ib_products.csv', index=False)
# else:
#     print(f"Number listed in site({products_found}) doesn't match number extracted({len(master_df)}). Nothing will be saved")