In [15]:
import os

import requests
import pandas as pd

# Browser-like User-Agent string; sent as the "User-Agent" header on the search API requests below
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"


# Function to extract fields from a hit
def extract_fields(hit):
    """Flatten the ``fields`` mapping of one search hit into a plain dict.

    The ``id`` entry must have length 1; its single element is stored first
    under ``"id"``. Every other value that is a one-element list is unwrapped
    to that element, and anything else is kept unchanged.

    Args:
        hit: Mapping with a ``"fields"`` key holding the per-field values.

    Returns:
        A new dict with ``"id"`` first, followed by the remaining fields.

    Raises:
        AssertionError: If ``fields["id"]`` does not have exactly one element.
    """
    raw = hit["fields"]
    # The id must be unambiguous before it is unwrapped
    assert len(raw["id"]) == 1, "Expected exactly one id field"

    def _unwrap(item):
        # Only single-element lists collapse; longer/empty lists and scalars pass through
        if isinstance(item, list) and len(item) == 1:
            return item[0]
        return item

    flattened = {"id": raw["id"][0]}
    flattened.update(
        (name, _unwrap(item)) for name, item in raw.items() if name != "id"
    )
    return flattened


# Accumulates one flattened record per search hit across all pages
all_records = []

# Pagination parameters
start = 0
step = 48  # Same as size parameter in the search URL
total_found = float("inf")  # Unknown until the first response reports it

while start < total_found:
    # Request the current page: published entries of collection 33, newest first
    url = f"https://3d.nih.gov/api/search/type:entry%20AND%20submissionstatus:%22Published%22%20AND%20collectionid:33?start={start}&size={step}&sort=created%20desc"
    # Without a timeout, requests waits indefinitely on a stalled connection
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch data from {url}. Status code: {response.status_code}")
        break
    obj = response.json()

    # Process hits from current page
    hits = obj["hits"]["hit"]
    if not hits:
        # An empty page means there is nothing more to read; stop instead of
        # requesting further offsets that would also come back empty
        break
    for hit in hits:
        record = extract_fields(hit)
        all_records.append(record)

    start += step
    # The server-reported total bounds the loop from the second iteration on
    total_found = obj["hits"]["found"]

# Create DataFrame from all records
df = pd.DataFrame(all_records)


# Display first few rows and shape of the DataFrame
print(f"DataFrame shape: {df.shape}")
print("\nFirst few rows:")
df.head(5)

DataFrame shape: (207, 36)

First few rows:


Unnamed: 0,id,type,title,lowercasetitle,submissionstatus,collection,collectionid,collectionbadge,description,category,...,threedpxid,paddedentryid,hasdicom,hasdata,hasmesh,hasmedicalimaging,hasbuild,publisheddate,_score,source
0,21628,entry,Cowslip (Primula veris) pollen grain: shrunken,cowslip (primula veris) pollen grain: shrunken,Published,"[3D Pollen Library, Amgueddfa Cymru, National ...","[33, 39]",/images/collections/badges/pollen_collection_i...,"<p class=""ql-align-justify"">3D reconstructed&n...",Cells and Organelles,...,3DPX-021628,21628,False,False,True,False,False,2024-12-19T11:27:56.496Z,4.2853723,
1,21618,entry,Slender barb grass (Parapholis strigosa) polle...,slender barb grass (parapholis strigosa) polle...,Published,"[3D Pollen Library, Amgueddfa Cymru, National ...","[33, 39]",/images/collections/badges/pollen_collection_i...,"<p class=""ql-align-justify"">3D reconstructed&n...",Cells and Organelles,...,3DPX-021618,21618,False,False,True,False,False,2024-12-18T14:53:59.2Z,4.2853723,
2,21611,entry,European pear (Pyrus communis) pollen grain,european pear (pyrus communis) pollen grain,Published,"[3D Pollen Library, Amgueddfa Cymru, National ...","[33, 39]",/images/collections/badges/pollen_collection_i...,"<p class=""ql-align-justify"">3D reconstructed&n...",Cells and Organelles,...,3DPX-021611,21611,False,False,True,False,False,2024-12-12T10:41:07.198Z,4.2853723,
3,21603,entry,Common wintergreen (Pyrola minor) pollen grain...,common wintergreen (pyrola minor) pollen grain...,Published,"[3D Pollen Library, Amgueddfa Cymru, National ...","[33, 39]",/images/collections/badges/pollen_collection_i...,"<p class=""ql-align-justify"">3D reconstructed&n...",Cells and Organelles,...,3DPX-021603,21603,False,False,True,False,False,2024-12-06T11:31:37.286Z,4.2853723,
4,21602,entry,Common fleabane (Pulicaria dysenterica) pollen...,common fleabane (pulicaria dysenterica) pollen...,Published,"[3D Pollen Library, Amgueddfa Cymru, National ...","[33, 39]",/images/collections/badges/pollen_collection_i...,"<p class=""ql-align-justify"">3D reconstructed&n...",Cells and Organelles,...,3DPX-021602,21602,False,False,True,False,False,2024-12-06T11:21:12.477Z,4.2853723,


In [16]:
# save to csv
# to_csv does not create missing parent directories, so make sure data/ exists
# (on a fresh checkout it is otherwise only created by a later cell)
os.makedirs("data", exist_ok=True)
df.to_csv("data/3d_pollen_library.csv", index=False)

In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
import time
import os
import glob
import shutil


def _click_when_ready(driver, locator, wait_seconds=10):
    """Wait until the element at ``locator`` is clickable, then click it."""
    WebDriverWait(driver, wait_seconds).until(
        EC.element_to_be_clickable(locator)
    ).click()


def download_model(row, driver, download_path, timeout=30):
    """Download the STL file for one entry through the browser and store it in data/models.

    Opens ``https://3d.nih.gov/entries/<id>``, clicks through the download
    dialog (Download link, "stl" option, download-files button, terms
    checkbox, final Download button), waits for a new ``*.stl`` file to appear
    in ``download_path`` and moves it to ``data/models/<id>_<original name>``.

    Args:
        row: Entry metadata containing an ``id`` field. Either a named tuple
            (anything exposing ``_asdict()``) or an object accepted by
            ``dict()``, such as a mapping or a pandas Series.
        driver: Selenium WebDriver instance used to control the browser.
        download_path: Directory the browser saves downloads into.
        timeout: Maximum time in seconds to wait for the STL file to appear
            after the final Download click (default 30). Each page-element
            wait uses its own fixed 10 second limit.

    Returns:
        None on success. On any failure the error is printed and ``row`` is
        returned so the caller can collect it for a retry; exceptions raised
        during the download are not propagated.
    """
    # Named tuples expose _asdict(); other row types are converted via dict()
    row_dict = row._asdict() if hasattr(row, "_asdict") else dict(row)

    try:
        driver.get(f"https://3d.nih.gov/entries/{row_dict['id']}")

        # Wait for the page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Walk through the download dialog up to (not including) the final button
        for locator in (
            (By.XPATH, "//a[text()='Download']"),
            (By.XPATH, "//label[text()='stl']"),
            (By.CLASS_NAME, "downloadfilesBtn"),
            (By.ID, "termsCheckbox"),
        ):
            _click_when_ready(driver, locator)

        # Remember STL files already sitting in the download directory (e.g. a
        # late arrival from an earlier timed-out entry on the same driver) so
        # they are not mistaken for this entry's download.
        stl_pattern = os.path.join(download_path, "*.stl")
        already_present = set(glob.glob(stl_pattern))

        _click_when_ready(driver, (By.XPATH, "//button[text()='Download']"))

        # Poll for a newly created STL file until the deadline passes
        stl_file = None
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            fresh = sorted(set(glob.glob(stl_pattern)) - already_present)
            if fresh:
                stl_file = fresh[0]
                break
            time.sleep(0.5)

        if not stl_file:
            raise TimeoutError("Download timed out - no STL file found")

        # Create models directory if it doesn't exist
        models_dir = os.path.join("data", "models")
        os.makedirs(models_dir, exist_ok=True)

        # Prefix the original filename with the entry id and move it into place
        new_name = f"{row_dict['id']}_{os.path.basename(stl_file)}"
        shutil.move(stl_file, os.path.join(models_dir, new_name))

        # Add small delay between requests to be polite to the server
        time.sleep(2)

    except Exception as e:
        # Failures are reported, not raised: the row itself is the failure signal
        print(f"Error processing entry {row_dict['id']}: {str(e)}")
        return row

    return None


# Make sure the destination folder for the renamed STL files is present up front
models_dir = os.path.join("data", "models")
os.makedirs(name=models_dir, exist_ok=True)

In [18]:
# Function to create and configure Chrome driver
def create_driver(row_dict):
    """Start a Chrome WebDriver that saves downloads into its own temp folder.

    Args:
        row_dict: Mapping with an ``"id"`` key; the id names the download
            folder ``temp_<id>`` under the current working directory.

    Returns:
        A ``(driver, download_path)`` tuple: the Chrome WebDriver and the
        directory (created if missing) its downloads are written to.
    """
    # Folder name is derived from the id so each driver gets a separate directory
    download_path = os.path.join(os.getcwd(), "temp_{}".format(row_dict["id"]))
    os.makedirs(download_path, exist_ok=True)

    # Download preferences passed to Chrome via the "prefs" experimental option
    download_prefs = {
        "download.default_directory": download_path,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
    }

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("prefs", download_prefs)
    # Add these options to prevent Chrome from closing immediately
    for flag in ("--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options), download_path

In [19]:
# Test with a single row
first_row = next(df.head(1).itertuples())
driver, download_path = create_driver(first_row._asdict())
try:
    download_model(first_row, driver, download_path)
finally:
    try:
        driver.quit()
    finally:
        # Best-effort removal of the temp download folder, attempted even if
        # quitting the browser fails. Unlike a bare `except:`, ignore_errors
        # does not swallow KeyboardInterrupt or SystemExit.
        shutil.rmtree(download_path, ignore_errors=True)

In [20]:
# Use ThreadPoolExecutor to parallelize downloads
max_workers = 8  # Adjust this number based on your system's capabilities


def _download_batch(rows, driver, download_path, timeout):
    """Download ``rows`` one after another on a single driver; return the rows that failed."""
    failed = []
    for row in rows:
        # download_model returns the row on failure and None on success
        if download_model(row, driver, download_path, timeout) is not None:
            failed.append(row)
    return failed


# Create a driver for each worker thread
drivers = []
for i in range(max_workers):
    driver, download_path = create_driver({"id": f"worker_{i}"})
    drivers.append((driver, download_path))

rows_to_retry = list(df.itertuples())
max_retries = 3
retry_count = 0
timeout = 60
# Defined up front so later cells can read it even when no download round runs
failed_rows = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    while retry_count <= max_retries and rows_to_retry:
        # Submit exactly one task per driver, each working through its own
        # slice of rows sequentially. Submitting one task per row would let
        # the pool start a row on a driver that another thread is still
        # using, and a WebDriver session (and its download directory) must
        # not be driven from two threads at once.
        futures = [
            executor.submit(
                _download_batch,
                rows_to_retry[worker_idx::max_workers],
                worker_driver,
                worker_path,
                timeout,
            )
            for worker_idx, (worker_driver, worker_path) in enumerate(drivers)
        ]

        # Collect the rows that failed on any driver in this round
        failed_rows = [row for future in futures for row in future.result()]

        if not failed_rows:
            break

        rows_to_retry = failed_rows
        retry_count += 1
        timeout *= 2  # exponential backoff
        if retry_count <= max_retries and failed_rows:
            print(
                f"\nRetrying {len(failed_rows)} failed downloads (attempt {retry_count}/{max_retries}):"
            )


Error processing entry 21611: Download timed out - no STL file found
Error processing entry 21536: Download timed out - no STL file found
Error processing entry 21464: Download timed out - no STL file found
Error processing entry 21285: Download timed out - no STL file found
Error processing entry 21258: Download timed out - no STL file found
Error processing entry 21252: Download timed out - no STL file found
Error processing entry 21191: Download timed out - no STL file found
Error processing entry 20944: Download timed out - no STL file found
Error processing entry 20473: Download timed out - no STL file found
Error processing entry 20858: Download timed out - no STL file found
Error processing entry 20610: Download timed out - no STL file found
Error processing entry 20096: Download timed out - no STL file found
Error processing entry 20057: Download timed out - no STL file found
Error processing entry 17900: Download timed out - no STL file found
Error processing entry 17833: Mess

In [21]:

# Clean up drivers
for driver, download_path in drivers:
    try:
        driver.quit()
    except Exception as e:
        # Keep going: one browser that fails to shut down must not prevent
        # the remaining drivers and temp folders from being cleaned up
        print(f"Error quitting driver for {download_path}: {e}")
    # Best-effort removal; ignore_errors replaces the bare `except: pass`
    # without also swallowing KeyboardInterrupt or SystemExit
    shutil.rmtree(download_path, ignore_errors=True)

# Report the rows that still failed after the last retry round
if failed_rows:
    print(
        f"\nFailed downloads after {max_retries} attempts ({len(failed_rows)} total):"
    )
    for failed_row in failed_rows:
        print(f"- Entry {failed_row}")