In [None]:
import re
from typing import Optional, Tuple

import pandas as pd
import pangaeapy
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from tqdm.auto import tqdm

### Load list of dataset IDs

In [None]:
# CSV file listing the Pangaea datasets whose details should be fetched.
file = "../pangaea_2022-01-27.csv"
df = pd.read_csv(file, low_memory=False)

# The dataset ID is taken to be whatever follows the last "-" in each unique
# value of the `dataset` column (the whole value when it contains no "-").
ds_ids = [name.rpartition("-")[2] for name in df.dataset.unique()]
print(f"Total {len(ds_ids)} dataset licenses to fetch.")

### Functions to extract dataset metadata
- Paper citation
- Project name and URL

In [None]:
def get_dataset_url(ds_id):
    """Build the Pangaea DOI landing-page URL for the given dataset ID."""
    url = f"https://doi.pangaea.de/10.1594/PANGAEA.{ds_id}"
    return url

In [None]:
def get_info_tag(soup: BeautifulSoup, field: str) -> Tag:
    """
    Locate the row of a Pangaea dataset page that holds the given data field.

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html to search within.
    field: str
        Name of a field shown on the left of a Pangaea dataset webpage,
        e.g. "citation", "project", "license", "size". It is matched
        case-insensitively as a substring of each <div class="title"> text.

    Returns
    -------
    row: bs4.element.Tag or None
        The grandparent of the first matching title div (expected to be the
        div of class="row" for that field), or None if no title div matches.
    """
    matching_rows = (
        title_div.parent.parent
        for title_div in soup.find_all("div", class_="title")
        if field.lower() in title_div.text.lower()
    )
    # Only the first match is wanted; the generator is not consumed beyond it.
    return next(matching_rows, None)

In [None]:
def get_paper_citation(soup: BeautifulSoup) -> Optional[str]:
    """
    Return the citation of the research paper that the dataset supplements.

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html of a Pangaea dataset webpage.

    Returns
    -------
    citation: str or None
        The text following "supplement to:" (matched case-insensitively) on
        the first line of the citation heading that contains it. None when
        the page has no citation row, no citation heading, or no such line.
    """
    row = get_info_tag(soup, "citation")
    if row is None:
        return None
    heading = row.find("h1", class_="hanging citation")
    if heading is None:
        return None

    word = "supplement to:"
    # The marker is searched for AND split on case-insensitively, so that a
    # capitalised "Supplement to:" prefix is actually removed from the result.
    marker = re.compile(re.escape(word), flags=re.IGNORECASE)
    for line in heading.text.split("\n"):
        if marker.search(line):
            return marker.split(line)[-1].strip()
    return None

In [None]:
def get_project_info(
    soup: BeautifulSoup,
) -> Optional[Tuple[Optional[str], Optional[str]]]:
    """
    Return the dataset's project name and project URL.

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html of a Pangaea dataset webpage.

    Returns
    -------
    info: tuple or None
        None when the page has no "Project" row. Otherwise a tuple
        (proj_name, proj_href) where proj_name is the stripped text of the
        row's <div class="descr"> and proj_href is the "href" of the row's
        <a class="popover-link">; either item is None if it is missing.
    """
    proj = get_info_tag(soup, "Project")
    if proj is None:
        return None

    # A project row without a description div yields no name instead of
    # raising AttributeError.
    descr = proj.find("div", class_="descr")
    proj_name = descr.text.strip() if descr is not None else None

    # `.get` returns None for an anchor without an href instead of raising
    # KeyError.
    pop_link = proj.find("a", class_="popover-link")
    proj_href = pop_link.get("href") if pop_link is not None else None
    return proj_name, proj_href

### Scrape information for one dataset

In [None]:
# Scraper: download and parse the landing page of one example dataset.
# Another dataset ID that can be tried here: 865440
ds_id = 227308
ds_url = get_dataset_url(ds_id)
print(ds_url)
# Without a timeout, `requests` may wait indefinitely on an unresponsive server.
resp = requests.get(ds_url, timeout=30)
# Fail right away on an HTTP error instead of parsing an error page.
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

### Single example

In [None]:
# Load the example dataset through pangaeapy and start its metadata record.
ds = pangaeapy.PanDataSet(ds_url)
assert ds is not None
info = dict(id=ds_id, citation_dataset=ds.citation)

# Only the first listed license is recorded; both fields are None without one.
if len(ds.licenses) > 0:
    info["license"] = ds.licenses[0].label.text
    info["license_url"] = ds.licenses[0].URI.text
else:
    info["license"] = None
    info["license_url"] = None

In [None]:
# Citation of the research article that this dataset is a supplement to
info["citation_paper"] = get_paper_citation(soup)

# Name and URL of the project that this dataset was a part of;
# both are None when the page has no project row.
proj = get_project_info(soup)
if proj is None:
    info["proj_name"], info["proj_url"] = None, None
else:
    info["proj_name"], info["proj_url"] = proj

In [None]:
# Display the metadata dictionary assembled for the example dataset.
info

## For all datasets

In [None]:
# Collect one metadata dictionary per dataset ID.
info_list = []
for ds_id in tqdm(ds_ids):
    ds_url = get_dataset_url(ds_id)

    # Dataset-level metadata (citation, licenses) comes from pangaeapy.
    ds = pangaeapy.PanDataSet(ds_url)
    # The landing page is fetched separately for the paper citation and
    # project details. The timeout keeps one unresponsive request from
    # stalling the whole loop, and HTTP errors are raised rather than parsed.
    resp = requests.get(ds_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    assert ds is not None
    info = {
        "id": ds_id,
        "citation_dataset": ds.citation,
    }
    # Only the first listed license is recorded; None when there is none.
    info["license"] = ds.licenses[0].label.text if len(ds.licenses) > 0 else None
    info["license_url"] = ds.licenses[0].URI.text if len(ds.licenses) > 0 else None
    # Extract paper citation and project info from the landing page
    info["citation_paper"] = get_paper_citation(soup)
    proj = get_project_info(soup)
    info["proj_name"], info["proj_url"] = proj if proj is not None else (None, None)
    # Add to list
    info_list.append(info)

### Save info to file

In [None]:
# Turn the list of per-dataset dictionaries into a table (one row per dataset).
info_df = pd.DataFrame(info_list)
# Bare last expression so the notebook renders the table.
info_df

In [None]:
# Destination CSV for the collected dataset details.
output_file = "../pangaea-dataset-details.csv"
# The DataFrame index is left out of the file.
info_df.to_csv(path_or_buf=output_file, index=False)
print(f"Saved successfully to '{output_file}'!")