# Download datasets with images hosted on website
Most of the benthic habitat datasets have image URLs stored along with other metadata in tabular format (dataframe). However, some datasets only host the images on a web page, so the image URLs have to be scraped from that page.

In [None]:
import os

import pandas as pd
import pangaeapy
import requests
from bs4 import BeautifulSoup
from requests.compat import urljoin

These are the URLs/DOIs of some datasets that do not provide image URLs in tabular format. We will scrape one of them to test the algorithm.

In [None]:
# Christiansen, B (2006)
# All three DOIs share the same prefix and differ only in the numeric dataset ID.
dois = [
    f"https://doi.org/10.1594/PANGAEA.{pangaea_id}"
    for pangaea_id in (371062, 371063, 371064)
]

In [None]:
# The numeric dataset ID is whatever follows the last "." in each DOI URL
ds_ids = [int(doi_url.rpartition(".")[2]) for doi_url in dois]
ds_ids

## 1. Request dataset url

In [None]:
# Pick one dataset to work on; `doi` and `ds_id` refer to the same entry
idx = 0
doi, ds_id = dois[idx], ds_ids[idx]

In [None]:
# Load the dataset record via pangaeapy (used here for its title, later for event info)
dataset = pangaeapy.PanDataSet(ds_id)
print("Dataset title:", dataset.title)
print("Requesting:", doi)
# Use a timeout and fail fast on HTTP errors, so a hung connection or an
# error page is reported as such instead of being parsed as the dataset page
resp = requests.get(doi, timeout=60)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

# Get download link to photos page
download_link = soup.find("div", attrs={"class": "text-block top-border"}).a["href"]
print("URL to photos page:", download_link)
# Get to photos page (page 1)
resp = requests.get(download_link, timeout=60)
resp.raise_for_status()
photos_page = BeautifulSoup(resp.text, "lxml")

## 2. Get metadata

In [None]:
def get_metadata(page_soup):
    """
    Take a BeautifulSoup object and return the (latitude, longitude) pair as floats.

    The values are read from the "latitude" and "longitude" <span> tags inside
    the <div class="hanging geo"> element of the page.
    """
    geo_block = page_soup.find("div", attrs={"class": "hanging geo"})
    # Latitude first, then longitude - the order of the returned tuple
    latitude, longitude = (
        float(geo_block.find("span", attrs={"class": css_class}).text)
        for css_class in ("latitude", "longitude")
    )
    return latitude, longitude

In [None]:
# Coordinates are read from the dataset landing page (`soup`), not the photos page
lat, long = get_metadata(soup)
print(f"Lat: {lat}, Long: {long}")

## 3. Get pagination info

In [None]:
def get_pagination(page_soup, src_url="https://www.pangaea.de/helpers/Benthos.php"):
    """
    Take a BeautifulSoup object and return a dictionary mapping page numbers
    (as strings) to absolute page URLs.

    The information is read from the <p class="navigation"> element; relative
    links are resolved against `src_url`.
    """
    nav = page_soup.find("p", attrs={"class": "navigation"})
    # The navigation text is "|"-separated; each piece is stripped of whitespace
    labels = [piece.strip() for piece in nav.text.split("|")]
    # Every link in the navigation block, made absolute
    links = [urljoin(src_url, anchor["href"]) for anchor in nav.find_all("a")]
    # Drop the first two and the last label, and the last link, before pairing
    # (presumably the heading/current page and a trailing "next" entry - verify against the site)
    return dict(zip(labels[2:-1], links[:-1]))

In [None]:
# Show which page number maps to which URL
pagination = get_pagination(photos_page)
for k, page_url in pagination.items():
    print(k, ":", page_url)

## 4. Get image URLs from page

In [None]:
def get_image_urls(page_soup, verbose=False):
    """
    Take a BeautifulSoup object and return list of image urls.

    Each <td> of the <table class="pictable"> element is expected to hold a
    link to one image; "https:" is prepended to the link's href. Cells
    without a link are skipped with a warning.

    If `verbose` is true, the number of <td> cells found is printed.
    """
    table = page_soup.find("table", class_="pictable")
    photos = table.find_all("td")
    if verbose:
        print("[INFO] Number of photos on page:", len(photos))

    urls = []
    for td in photos:
        try:
            href = td.a["href"]
        except TypeError:
            # The last <td> of the last page is sometimes empty
            # No photos, just a blank <td> tag (td.a is None)
            print("[WARNING] Empty <td> tag encountered!")
            # Skip the cell: appending here would duplicate the previous URL
            continue
        urls.append("https:" + href)

    return urls

In [None]:
# Try the extractor on the first photos page only; verbose prints the number of <td> cells found
img_urls = get_image_urls(photos_page, verbose=True)

In [None]:
# for url in img_urls:
#     print(url)

## 5. Scrape all pages

In [None]:
def scrape_dataset(page_soup, timeout=60):
    """
    Take the BeautifulSoup object of a photos page and return the list of image
    urls found on it and on every page listed in its pagination.

    `page_soup` is treated as page 1. Each page from `get_pagination` is
    downloaded with `requests` and parsed with the "lxml" parser.

    `timeout` is the number of seconds passed to `requests.get` for each page.
    A `requests` exception is raised if a page request times out or returns an
    HTTP error status.
    """
    pagination = get_pagination(page_soup)
    # Scrape current page
    print("[INFO] Processing Page 1...")
    img_urls = get_image_urls(page_soup, verbose=True)
    # Scrape subsequent pages
    for page_num, page_url in pagination.items():
        print(f"[INFO] Processing Page {page_num}...")
        resp = requests.get(page_url, timeout=timeout)
        # Stop on an HTTP error instead of trying to parse an error page
        resp.raise_for_status()
        page = BeautifulSoup(resp.text, "lxml")
        img_urls.extend(get_image_urls(page, verbose=True))
    return img_urls

In [None]:
# Scrape every page of the dataset, starting from the first photos page
data = scrape_dataset(photos_page)
print(f"[INFO] Total {len(data)} images scraped.")

In [None]:
# Store data: one row per image URL; the remaining columns repeat dataset-level values
df = pd.DataFrame(data, columns=["url"])
# File name is the last path component of the URL
df["image"] = df["url"].apply(lambda url: url.split("/")[-1])
df["long"] = long
df["lat"] = lat
# Site and campaign are taken from the first event of the dataset only
event = dataset.events[0]
df["site"] = event.label
# Column name was previously misspelled as "campagin"
df["campaign"] = event.campaign
df["dataset"] = dataset.title

# Rearranging columns: dataset-level info first, image URL last
df = df[["dataset", "campaign", "site", "lat", "long", "image", "url"]]
df.head()

In [None]:
# Output location: one CSV per dataset, named after the dataset ID
out_dir = "../outputs"
file = f"{out_dir}/[scraped]{ds_id}.csv"
# Create the directory if needed, then write the table without the index column
os.makedirs(out_dir, exist_ok=True)
df.to_csv(file, index=False)
print(f"Saved at: {file}")

## References:

In [None]:
# Print the citation of every dataset listed in `dois`, separated by blank lines.
# The loop variable is deliberately not called `doi`, so the dataset selected
# earlier in the notebook is not overwritten.
for ref_doi in dois:
    print(pangaeapy.PanDataSet(ref_doi).citation, end="\n\n")