In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import pandas as pd
import time
import random

# Browser identity presented to the site; a custom User-Agent is used to reduce bot detection.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

# Build the Chrome options first, then start the Selenium WebDriver with them.
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={USER_AGENT}")
driver = webdriver.Chrome(options=options)

# Function to scroll safely and collect object numbers
def safe_scroll_and_collect_objects(url, min_pause=10, max_pause=20, target_object_count=None, max_scrolls=5, max_stalled_scrolls=None):
    """Scroll a page step by step and collect object numbers from its links.

    Loads ``url`` in the module-level ``driver``, then repeatedly scrolls to the
    bottom in five increments, pauses, and scans every ``<a>`` element for an
    ``href`` containing ``/objects/<digits>``.

    Args:
        url: Address of the page to load.
        min_pause: Lower bound, in seconds, of the random pause after each
            full scroll pass.
        max_pause: Upper bound, in seconds, of that pause.
        target_object_count: When truthy, stop as soon as at least this many
            unique object numbers have been collected.
        max_scrolls: Maximum number of scroll passes to perform.
        max_stalled_scrolls: Optional positive integer. When given, stop once
            this many consecutive passes have found no new object number.
            ``None`` (the default) disables the check.

    Returns:
        A list of the unique object numbers (as strings), sorted by numeric
        value so the result is the same from run to run.
    """
    driver.get(url)
    object_numbers = set()  # Use a set to avoid duplicates
    scrolls = 0
    stalled_scrolls = 0  # Consecutive passes that added nothing new

    # Slowly scroll down and collect links
    while scrolls < max_scrolls:
        # Scroll down in small increments for gradual loading
        for step in range(1, 6):
            driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {step / 5});")
            time.sleep(random.uniform(1, 3))  # Short delay between increments

        # After the incremental scroll, wait longer to simulate human reading time
        time.sleep(random.uniform(min_pause, max_pause))

        count_before = len(object_numbers)

        # Scan every link on the page for the "/objects/{number}" pattern
        for link in driver.find_elements(By.TAG_NAME, 'a'):
            href = link.get_attribute('href')
            if not href:
                continue
            match = re.search(r'/objects/(\d+)', href)
            if match:
                object_numbers.add(match.group(1))

        scrolls += 1
        print(f"Scroll {scrolls}/{max_scrolls}: Collected {len(object_numbers)} unique object numbers.")

        # Optional early stop if we have enough objects
        if target_object_count and len(object_numbers) >= target_object_count:
            print(f"Collected {target_object_count} objects; stopping early.")
            break

        # Optional early stop when scrolling no longer reveals anything new,
        # so a large max_scrolls cannot keep the loop spinning for nothing.
        if max_stalled_scrolls is not None:
            if len(object_numbers) == count_before:
                stalled_scrolls += 1
            else:
                stalled_scrolls = 0
            if stalled_scrolls >= max_stalled_scrolls:
                print(f"No new object numbers after {stalled_scrolls} consecutive scrolls; stopping early.")
                break

    # Sets have no defined order; sort numerically so the output is reproducible.
    return sorted(object_numbers, key=int)

# URL of the catalog page with lazy loading
url = "https://collections.carlos.emory.edu/objects/table"

# Run the scrape inside try/finally so the browser is always closed,
# even when the scrape or the CSV write raises.
try:
    # Collect object numbers by scrolling
    # NOTE(review): 14306 is presumably the total number of catalog objects — confirm.
    object_numbers = safe_scroll_and_collect_objects(url, min_pause=15, max_pause=30, target_object_count=14306, max_scrolls=100000)

    # Save results to a CSV file
    df = pd.DataFrame(object_numbers, columns=['Object Number'])
    df.to_csv('2_all_object_numbers.csv', index=False)

    print(f"Scraping complete. {len(object_numbers)} unique object numbers found.")
finally:
    # Close the browser
    driver.quit()

ModuleNotFoundError: No module named 'selenium'

In [None]:
pip install selenium

Collecting selenium
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.26.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.27.0-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.7/481.7 kB[0m [31m29.3 MB/s

Extract the image ID and `<h1>` title for the first 10 objects

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re

# User-Agent string sent by the automated browser.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

# Configure Chrome with that User-Agent, then launch the Selenium WebDriver.
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={USER_AGENT}")
driver = webdriver.Chrome(options=options)

# Define the base URL
base_url = "https://collections.carlos.emory.edu/objects/"

# Matches the number in a `src` URL like "/internal/media/dispatcher/74430/preview".
# Compiled once here because it is the same for every image on every page.
img_id_pattern = re.compile(r'/dispatcher/(\d+)/')

# List to hold results
data = []

# Everything that can fail runs inside try/finally so the browser is always
# closed, even if the CSV is missing or a page fails to load.
try:
    # Load the object numbers from the CSV file and limit to the first 10
    object_numbers = pd.read_csv('2_all_object_numbers.csv')['Object Number'].head(10)

    # Visit each object page and extract <h1> text and exactly one img id from <img> src attribute
    for obj_num in object_numbers:
        url = f"{base_url}{obj_num}"
        driver.get(url)
        time.sleep(2)  # Pause to allow the page to load

        # Extract the <h1> text. find_elements returns an empty list when there
        # is no <h1>, so no exception handling is needed for the not-found case.
        h1_elements = driver.find_elements(By.TAG_NAME, 'h1')
        h1_text = h1_elements[0].text if h1_elements else None

        # Extract exactly one img ID from the `src` attribute
        img_id = None
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            src = img.get_attribute('src')
            if not src:
                continue
            match = img_id_pattern.search(src)
            if match:
                img_id = match.group(1)  # Extracted number
                break  # Stop after the first valid img id is found

        # Add extracted data to the list
        data.append({
            'Object Number': obj_num,
            'H1 Text': h1_text,
            'Image ID': img_id
        })

    # Convert the data to a DataFrame and save to a new CSV
    df = pd.DataFrame(data)
    df.to_csv('first_10_object_h1_single_img_id.csv', index=False)
finally:
    # Close the browser
    driver.quit()

print("Data extraction complete for first 10 objects. Saved to 'first_10_object_h1_single_img_id.csv'.")

ModuleNotFoundError: No module named 'selenium'