# Dataset details scraper
**Input:** Pangaea benthic image dataset csv file.

**Output:**
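- `.csv` file with:
    - "dataset" (the dataset name, `pangaea-<ID>`, where `<ID>` is the 6 digit pangaea dataset ID)
    - "bibtex_tag" (the citation key of the dataset's BibTeX entry)
    - "citation_dataset" (the dataset citation as reported by `pangaeapy`)
    - "citation_paper" (plain text citation for the paper)
    - "license" (the license code)
    - "license_url" (the url to the full license text)
    - "proj_name" (the name of the project the dataset is part of)
    - "proj_url" (url to the project website/wiki etc.)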

- `.bib` file with all the bibtex citations for each dataset

- `.txt` file with information for Table 2 of the BenthicNet Dataset Paper (not yet implemented; see the TODO at the end of this notebook).

In [None]:
import re
from typing import Optional, Tuple, Union

import pandas as pd
import pangaeapy
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from tqdm.auto import tqdm

### Load list of dataset IDs

In [None]:
# Input CSV listing the Pangaea images; its "dataset" column names the source dataset of each row.
file = "../full-dataset/pangaea_2022-01-27.csv"
df = pd.read_csv(file, low_memory=False)

# Keep only the part after the last "-" of every distinct dataset name
# (presumably names look like "pangaea-<ID>" -- confirm against the CSV).
unique_names = df["dataset"].unique()
ds_ids = [name.rsplit("-", 1)[-1] for name in unique_names]
print(f"Total {len(ds_ids)} dataset licenses to fetch.")

### Functions to extract dataset metadata
- Dataset citation (BibTex)
- Paper citation (Plain text)
- Project URL

In [None]:
def get_dataset_url(ds_id: Union[str, int]) -> str:
    """
    Build the Pangaea DOI URL of a dataset.

    Parameters
    ----------
    ds_id: str or int
        The dataset ID, either bare (e.g. 778725 or "778725") or as a name
        starting with "pangaea" (e.g. "pangaea-778725"), in which case only
        the part after the last "-" is used.

    Returns
    -------
    url: str
        The URL "https://doi.pangaea.de/10.1594/PANGAEA.<ID>".
    """
    ident = str(ds_id) if isinstance(ds_id, int) else ds_id
    if ident.startswith("pangaea"):
        # Drop the "pangaea-" style prefix, keeping the trailing ID.
        ident = ident.rsplit("-", 1)[-1]
    return "https://doi.pangaea.de/10.1594/PANGAEA." + ident

In [None]:
def get_bibtex(ds_id: Union[str, int], verbose: bool = False, timeout: float = 60) -> str:
    """
    Get the BibTex Citation of a Pangaea dataset using the dataset ID.

    Parameters
    ----------
    ds_id: str or int
        The dataset ID, in any form accepted by `get_dataset_url`.
    verbose: bool
        If True, print the HTTP status code of the response.
    timeout: float
        Seconds to wait for the server before `requests` gives up, so that a
        stalled connection cannot hang the scrape indefinitely.

    Returns
    -------
    bibtex: str
        The body of the response, returned as-is. The HTTP status is not
        checked here; use `verbose=True` to inspect it.
    """
    bib_url = get_dataset_url(ds_id) + "?format=citation_bibtex"
    resp = requests.get(bib_url, timeout=timeout)
    if verbose:
        print("\tStatus code:", resp.status_code)
    return resp.text

In [None]:
def get_info_tag(soup: BeautifulSoup, field: str) -> Optional[Tag]:
    """
    Find and return the div tag of class="row" containing the given data field.

    The first div of class="title" whose text contains `field`
    (case-insensitive) is located, and its grandparent tag is returned.

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html to search within.
    field: str
        The fields to the left of each Pangaea dataset webpage.
        Possible values: ["citation", "project", "license", "size" etc.]

    Returns
    -------
    row: bs4.element.Tag or None
        The tag two levels above the matching title div, containing the
        information relating to the given field. None if no title div
        mentions the field.
    """
    wanted = field.lower()
    for div in soup.find_all("div", class_="title"):
        if wanted in div.text.lower():
            # The enclosing row is the grandparent of the title div.
            return div.parent.parent
    return None

In [None]:
def get_paper_citation(soup: BeautifulSoup) -> Optional[str]:
    """
    Given a parsed html object return the dataset research paper citation.

    The text of the <h1 class="hanging citation"> heading inside the
    "citation" row is scanned line by line for the marker "Supplement to:"
    (matched case-insensitively).

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html of a Pangaea dataset webpage.

    Returns
    -------
    citation: str or None
        The text following the last marker on the first line that contains
        it, stripped of surrounding whitespace. None if the citation row,
        the heading or the marker is missing.
    """
    row = get_info_tag(soup, "citation")
    if row is None:
        return None
    heading = row.find("h1", class_="hanging citation")
    if heading is None:
        return None
    # Detect and split with the same case-insensitive pattern, so a marker
    # written with different capitalisation is still removed from the result.
    marker = re.compile(re.escape("Supplement to:"), flags=re.IGNORECASE)
    for line in heading.text.split("\n"):
        if marker.search(line):
            return marker.split(line)[-1].strip()
    return None

In [None]:
def get_project_info(soup: BeautifulSoup) -> Tuple[Optional[str], Optional[str]]:
    """
    Given a parsed html object return a tuple with the dataset project name and URL (if available).

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html of a Pangaea dataset webpage.

    Returns
    -------
    proj_name: str or None
        Stripped text of the div of class="descr" in the "Project" row.
        None if the row or the div is missing.
    proj_href: str or None
        URL extracted from the "data-content" attribute of the row's
        popover link. None if the link, the attribute, or the expected
        layout of its content is missing.
    """
    proj_name, proj_href = None, None
    proj = get_info_tag(soup, "Project")
    if proj is None:
        return proj_name, proj_href

    descr = proj.find("div", class_="descr")
    if descr is not None:
        proj_name = descr.text.strip()

    pop_link = proj.find("a", class_="popover-link")
    content = pop_link.get("data-content") if pop_link is not None else None
    if content is not None:
        try:
            # The 5th space-separated token on the 2nd line of the popover
            # content is presumably an href="<url>" attribute; the URL is
            # the text between its first pair of double quotes.
            href_tag = content.split("\n")[1].split(" ")[4]
            proj_href = href_tag.split('"')[1]
        except IndexError:
            # Popover content does not have the expected layout: no URL.
            proj_href = None
    return proj_name, proj_href

### Scrape information for one dataset

In [None]:
# Try the scraper on a single dataset; alternative IDs are kept for quick switching.
# ds_id = 865440
# ds_id = 227308
ds_id = 778725
ds_url = get_dataset_url(ds_id)
print(ds_url)

# Download the dataset webpage and parse it.
resp = requests.get(ds_url)
soup = BeautifulSoup(resp.text, "lxml")

# Download the BibTeX entry of the dataset.
bibtex = get_bibtex(ds_id)
print(bibtex)

# The tag is the text between the first "{" and the first following ",".
after_brace = bibtex.split("{", 2)[1]
bib_tag = after_brace.split(",", 1)[0]
print(bib_tag)

In [None]:
# Fetch the PanDataSet object for the dataset chosen above.
ds = pangaeapy.PanDataSet(ds_url)

# Only the first listed license is recorded; None when the dataset lists none.
first_license = ds.licenses[0] if ds.licenses else None

# get_project_info always returns a (name, url) tuple; either element may be None.
proj_name, proj_url = get_project_info(soup)

# Extract and store dataset info in dict
info = {
    "dataset": f"pangaea-{ds_id}",
    "bibtex_tag": bib_tag,
    "citation_dataset": ds.citation,
    "citation_paper": get_paper_citation(soup),
    "license": first_license.label.text if first_license is not None else None,
    "license_url": first_license.URI.text if first_license is not None else None,
    "proj_name": proj_name,
    "proj_url": proj_url,
}

In [None]:
# Display the metadata dict collected for the single test dataset.
info

## For all datasets

In [None]:
# Collected results: one BibTeX entry and one info dict per dataset, in ds_ids order.
bibtex_list = []
info_list = []

for ds_id in tqdm(ds_ids):
    # Produce the dataset URL from the ID
    ds_url = get_dataset_url(ds_id)

    # Fetch the PanDataSet object
    ds = pangaeapy.PanDataSet(ds_url)

    # Request the dataset webpage and parse; the timeout keeps a single
    # stalled connection from hanging the whole scrape.
    resp = requests.get(ds_url, timeout=60)
    soup = BeautifulSoup(resp.text, "lxml")

    # Fetch the bibtex citation for the dataset; the tag is the text between
    # the first "{" and the first following ",".
    bibtex = get_bibtex(ds_id)
    bib_tag = bibtex.split("{")[1].split(",")[0]

    # Only the first listed license is recorded; None when the dataset lists none.
    first_license = ds.licenses[0] if ds.licenses else None

    # get_project_info always returns a (name, url) tuple; either element may be None.
    proj_name, proj_url = get_project_info(soup)

    # Extract and store dataset info in dict
    info = {
        "dataset": f"pangaea-{ds_id}",
        "bibtex_tag": bib_tag,
        "citation_dataset": ds.citation,
        "citation_paper": get_paper_citation(soup),
        "license": first_license.label.text if first_license is not None else None,
        "license_url": first_license.URI.text if first_license is not None else None,
        "proj_name": proj_name,
        "proj_url": proj_url,
    }

    # Add info to list
    bibtex_list.append(bibtex)
    info_list.append(info)

In [None]:
# Print the URL most recently assigned to ds_url (the last dataset the loop reached;
# presumably used to identify the dataset on which the loop stopped -- confirm).
print(ds_url)

### Save dataset details to file

In [None]:
# Build a table with one row per collected info dict.
info_df = pd.DataFrame(info_list)
# Display the table (last expression of the notebook cell).
info_df

In [None]:
# Write the dataset details table to CSV, without the DataFrame index column.
output_file = "../pangaea-dataset-details.csv"
info_df.to_csv(output_file, index=False)
print(f"[INFO] All dataset details written to file: '{output_file}'")

### Save bibtex citations to file

In [None]:
# Write citations to file
bibtex_file = "../pangaea-citations.bib"
# Citation text may contain non-ASCII characters, so the encoding is fixed to
# UTF-8 instead of relying on the platform default.
with open(bibtex_file, "w", encoding="utf-8") as f:
    # End every entry with a newline so consecutive entries never run together.
    f.writelines(
        entry if entry.endswith("\n") else entry + "\n" for entry in bibtex_list
    )
print(f"[INFO] All dataset BibTex citations written to file: '{bibtex_file}'")

In [None]:
# Print the contents of the written BibTeX file (IPython shell escape; needs `cat` on the host).
! cat "../pangaea-citations.bib"

# TODO:
- write code to generate the LaTeX code for Table 2 (dataset details) in the BenthicNet Paper
- generate a `.txt` file with the following delimited values (LaTeX formatted):
    - Dataset (pangaea ID)
    - Repository
    - NoSites
    - NoImages
    - License
    - Citations