# Dataset details scraper
**Input:** Pangaea benthic image dataset csv file.

**Output:**
- `.csv` file with:
    - "dataset" (the dataset identifier: `pangaea-` followed by the 6 digit pangaea dataset ID)
    - "license" (the license code)
    - "license_url" (the url to the full license text)
    - "citation_paper" (plain text citation for the paper)
    - "proj_name" (the name of the project the dataset is part of)
    - "proj_url" (url to the project website/wiki etc.)

- `.bib` file with all the bibtex citations for each dataset

- `.tex` file with the LaTeX table for Table 2 of the BenthicNet Dataset Paper.

In [None]:
# Install the AnyStyle command line tool with `gem` and print its version.
# `anystyle` is called further down in this notebook to convert the plain text
# paper citations to BibTex.
! gem install anystyle-cli
! anystyle --version

In [None]:
import datetime as dt
import os
import subprocess
import sys
import time
from typing import Optional, Tuple, Union

import pandas as pd
import pangaeapy
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from tqdm.auto import tqdm

sys.path.append("..")
from pangaea_downloader.tools.checker import is_url

## 1. Load Pangaea Benthic Image dataset

In [None]:
# Alternative input files are kept below (commented out) for reference.
# pangaea_file = "../full-dataset/pangaea_2022-03-03_filtered_no-repeats_sorted-first_subsampled-1.25m-40b-200-40m_100-200m_fewfact2-nonspa-exh2.csv"
pangaea_file = "../full-dataset/pangaea_2022-01-24_filtered.csv"
# pangaea_file = "../full-dataset/pangaea_2022-01-24.csv"
df = pd.read_csv(pangaea_file, low_memory=False)

# Keep only the part after the last "-" of each dataset label so that the IDs
# sort numerically, then rebuild the "pangaea-<id>" labels in that order.
numeric_ids = (int(label.split("-")[-1]) for label in df.dataset.unique())
sorted_ids = sorted(numeric_ids)
ds_ids = ["pangaea-" + str(id_) for id_ in sorted_ids]
print(f"Total {len(ds_ids)} dataset licenses to fetch.")

## 2. Scrape dataset details

### 2.1 Functions to extract dataset metadata
- Dataset citation (BibTex)
- Paper citation (Plain text)
- Project URL

In [None]:
def get_dataset_url(ds_id: Union[str, int]) -> str:
    """
    Build the Pangaea DOI URL of a dataset.

    Parameters
    ----------
    ds_id: str or int
        Either the bare dataset ID (as an int or a str) or a label starting
        with "pangaea", in which case only the part after the last "-" is used.

    Returns
    -------
    url: str
        "https://doi.pangaea.de/10.1594/PANGAEA." followed by the dataset ID.
    """
    suffix = str(ds_id) if isinstance(ds_id, int) else ds_id
    if suffix.startswith("pangaea"):
        # "pangaea-198643" -> "198643"
        suffix = suffix.rsplit("-", 1)[-1]
    return f"https://doi.pangaea.de/10.1594/PANGAEA.{suffix}"

In [None]:
def get_bibtex(ds_id: str, verbose: bool = False, timeout: float = 30.0) -> str:
    """
    Get the BibTex Citation of a Pangaea dataset using the dataset ID.

    Parameters
    ----------
    ds_id: str
        Dataset ID, passed to ``get_dataset_url`` to build the request URL.
    verbose: bool
        If True, print the HTTP status code of the response.
    timeout: float
        Number of seconds to wait for the server before ``requests`` raises
        a timeout exception. Without it a stalled connection blocks forever.

    Returns
    -------
    text: str
        The body of the response. The status code is not checked here, so on
        a failed request this may be an error page rather than BibTex.
    """
    bib_url = get_dataset_url(ds_id) + "?format=citation_bibtex"
    resp = requests.get(bib_url, timeout=timeout)
    if verbose:
        print("\tStatus code:", resp.status_code)
    return resp.text

In [None]:
def get_info_tag(soup: BeautifulSoup, field: str) -> Optional[Tag]:
    """
    Find and return the div tag of class="row" containing the given data field.

    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed html to search within.
    field: str
        The fields to the left of each Pangaea dataset webpage.
        Possible values: ["citation", "project", "license", "size" etc.]
        Matching is case-insensitive and by substring.

    Returns
    -------
    row: bs4.element.Tag or None
        The tag two levels above the first div of class="title" whose text
        contains the given field, or None if no title div matches.
    """
    wanted = field.lower()
    for div in soup.find_all("div", class_="title"):
        if wanted in div.text.lower():
            # The row is the grandparent of the matching title div.
            return div.parent.parent
    return None

In [None]:
def get_paper_citation(soup: BeautifulSoup) -> Optional[str]:
    """
    Given a parsed html object return the dataset research paper citation.

    The citation is taken from the first line of the citation heading that
    contains "Supplement to:" (case-insensitive).

    Returns
    -------
    citation: str or None
        The text following "Supplement to:", with any "In supplement to: "
        prefix removed. None if the page has no citation row, no citation
        heading, or no line mentioning a supplement.
    """
    row = get_info_tag(soup, "citation")
    if row is None:
        return None
    heading = row.find("h1", class_="hanging citation")
    if heading is None:
        # Citation row present but without the expected heading element.
        return None
    word = "Supplement to:"
    for line in heading.text.split("\n"):
        if word.lower() in line.lower():
            # The split is case-sensitive; lines that only contain the
            # lower-case variant are handled by the replace below.
            citation = line.split(word)[-1].strip()
            citation = citation.replace("In supplement to: ", "")
            return citation
    return None

In [None]:
def get_project_info(
    ds: pangaeapy.PanDataSet, soup: BeautifulSoup
) -> Tuple[Optional[str], Optional[str]]:
    """
    Return a tuple with the dataset project name and URL (if available).

    The first project listed on the PanDataSet object is used when there is
    one. Otherwise the "Project" row of the parsed dataset webpage is used.

    Parameters
    ----------
    ds: pangaeapy.PanDataSet
        The dataset object; only its ``projects`` attribute is read.
    soup: bs4.BeautifulSoup
        The parsed html of the dataset webpage (fallback source).

    Returns
    -------
    name, url: tuple of (str or None, str or None)
        Each element is None when it could not be found. The URL is only
        returned if it passes ``is_url``.
    """
    name, url = None, None
    if len(ds.projects) > 0:
        first = ds.projects[0]
        name = first.name.text if first.name is not None else None
        href = first.URL.text if first.URL is not None else None
        if isinstance(href, str) and is_url(href):
            url = href
        return name, url

    proj = get_info_tag(soup, "Project")
    if proj is None:
        return name, url

    descr = proj.find("div", class_="descr")
    if descr is not None:
        name = descr.text.strip()
    pop_link = proj.find("a", class_="popover-link")
    if pop_link is not None:
        try:
            # The link is embedded in the popover markup: second line,
            # fifth space-separated token, between the first pair of quotes.
            href_tag = pop_link["data-content"].split("\n")[1].split(" ")[4]
            href = href_tag.split('"')[1]
        except (IndexError, KeyError):
            # Popover missing or laid out differently than expected.
            href = None
        # Previously the parsed link was dropped; validate it like the
        # PanDataSet branch above and return it.
        if isinstance(href, str) and is_url(href):
            url = href
    return name, url

### 2.2 Scrape information for one dataset

In [None]:
# Example dataset used to try out the scraping functions.
ds_id = 198643
ds_id = f"pangaea-{ds_id}"
ds_url = get_dataset_url(ds_id)
print(ds_url)

# Request the dataset webpage and parse it.
resp = requests.get(ds_url)
print("Status_code:", resp.status_code)
soup = BeautifulSoup(resp.text, "lxml")

# Fetch the BibTex entry and re-key it with our own dataset ID.
bibtex = get_bibtex(ds_id)
# The citation key is the text between the first "{" and the following ",".
bib_tag = bibtex.split("{")[1].split(",")[0]
# Replace the first occurrence only (the key itself), so that the same text
# appearing later in the entry (e.g. in a doi or url field) stays intact.
bibtex = bibtex.replace(bib_tag, ds_id, 1)
print(bibtex)
print(bib_tag)

In [None]:
# Load the dataset through pangaeapy, skipping the data download (include_data=False).
ds = pangaeapy.PanDataSet(ds_url, include_data=False)
assert ds is not None
# Extract and store dataset info in dict
info = {
    "dataset": ds_id,
    "bibtex_tag": bib_tag,
    "citation_dataset": ds.citation,
    "parent": None,
    "is_parent": False,
    "children": [],
    "license": None,
    "license_url": None,
}
# Parent dataset: record its children as "pangaea-<id>", where <id> is the
# text after the last "." of each child entry.
if len(ds.children) > 0:
    children = [f"pangaea-{child.split('.')[-1]}" for child in ds.children]
    info["is_parent"] = True
    info["children"] = children
# Child: Identify parents. A citation containing "In:" is treated as a child
# dataset; the parent ID is taken as the text after the last "." of the citation.
if "In:" in info["citation_dataset"]:
    info["parent"] = f"pangaea-{info['citation_dataset'].split('.')[-1]}"
info["citation_paper"] = get_paper_citation(soup)
# License: use the first license listed, unless pangaeapy reported the dataset as protected.
if (ds.error != "Data set is protected") and (len(ds.licenses) > 0):
    info["license"] = ds.licenses[0].label.text
    info["license_url"] = ds.licenses[0].URI.text
elif ds.error == "Data set is protected":
    info["license"] = "Protected (License Unknown)"
    info["license_url"] = None
# Project name and URL, falling back to (None, None) if nothing is returned.
proj = get_project_info(ds, soup)
info["proj_name"], info["proj_url"] = proj if proj is not None else (None, None)

In [None]:
# Display the details collected for the example dataset.
info

In [None]:
def correct_citation_paper(citation: str, ds: pangaeapy.PanDataSet) -> str:
    """
    Replace 'author et al.' in input string with full author list.

    Parameters
    ----------
    citation: str
        Plain text citation, possibly containing "et al.".
    ds: pangaeapy.PanDataSet
        The dataset object; each entry of its ``authors`` attribute is read
        for ``lastname`` and ``firstname``.

    Returns
    -------
    corrected: str
        The author list ("Last, F., Last, F., and Last, F.") followed by the
        text after the last "et al." in the citation. The citation is
        returned unchanged if it has no "et al." or the dataset has no authors.
    """
    if "et al." not in citation:
        return citation

    authors = []
    for auth in ds.authors:
        if auth.firstname:
            authors.append(f"{auth.lastname}, {auth.firstname[0]}.")
        else:
            # No first name available: fall back to the last name alone
            # rather than failing on the initial.
            authors.append(f"{auth.lastname}")
    if not authors:
        # Nothing to substitute; keep the abbreviated citation.
        return citation

    if len(authors) == 1:
        # Avoid a dangling ", and" in front of a lone author.
        auth_str = authors[0]
    else:
        auth_str = ", ".join(authors[:-1]) + f", and {authors[-1]}"
    return auth_str + citation.split("et al.")[-1]

In [None]:
# Compare the scraped paper citation of the example dataset with its
# corrected version (only when a citation was actually found).
paper_citation = info["citation_paper"]
if isinstance(paper_citation, str):
    print("Original:", paper_citation)
    print()
    print("Corrected:", correct_citation_paper(paper_citation, ds))

### 2.3 For all datasets

In [None]:
bibtex_list = []
info_list = []
parent_child_mappings = {}

errors = []
# Upper bound on webpage requests per dataset, so that a URL that keeps
# returning a non-200 status cannot stall the whole run.
max_attempts = 5
for i, ds_id in enumerate(tqdm(ds_ids)):
    try:
        time.sleep(0.005)
        # Produce the dataset URL from the ID
        ds_url = get_dataset_url(ds_id)
        print(f"[{i+1}/{len(ds_ids)}] Requesting: {ds_url}")

        # Fetch the PanDataSet object
        ds = pangaeapy.PanDataSet(ds_url, include_data=False)

        # Request the dataset webpage and parse
        resp = requests.get(ds_url)
        attempt = 1
        while resp.status_code != 200:
            if attempt >= max_attempts:
                # Handled by the except below: logged and recorded in `errors`.
                raise RuntimeError(
                    f"Failed to fetch {ds_url} after {max_attempts} attempts "
                    f"(last status code: {resp.status_code})"
                )
            print(f"[ERROR] Status code: {resp.status_code}! Retrying...")
            # Back off exponentially (1, 2, 4, ... seconds) between retries.
            time.sleep(2 ** (attempt - 1))
            resp = requests.get(ds_url)
            attempt += 1
        soup = BeautifulSoup(resp.text, "lxml")

        # Fetch the bibtex citation for the dataset
        bibtex = get_bibtex(ds_id)
        # The citation key is the text between the first "{" and the following ","
        bib_tag = bibtex.split("{")[1].split(",")[0]
        # Re-key the entry with our dataset ID. Only the first occurrence (the
        # key itself) is replaced, so the same text appearing later in the
        # entry (e.g. in a doi or url field) stays intact.
        bibtex = bibtex.replace(bib_tag, ds_id, 1)

        # Extract and store dataset info in dict
        info = {
            "dataset": ds_id,
            "bibtex_tag": bib_tag,
            "citation_dataset": ds.citation,
            "parent": None,
            "is_parent": False,
            "children": [],
            "license": None,
            "license_url": None,
        }
        if len(ds.children) > 0:  # Parent dataset
            children = [f"pangaea-{child.split('.')[-1]}" for child in ds.children]
            parent_child_mappings[ds_id] = children
            info["is_parent"] = True
            info["children"] = children
        # Child: Identify parents (skipped when no citation text is available)
        citation_dataset = info["citation_dataset"]
        if isinstance(citation_dataset, str) and "In:" in citation_dataset:
            info["parent"] = f"pangaea-{citation_dataset.split('.')[-1]}"
        info["citation_paper"] = get_paper_citation(soup)
        if isinstance(info["citation_paper"], str):
            info["citation_paper"] = correct_citation_paper(info["citation_paper"], ds)
        # License: first one listed, unless the dataset is reported as protected
        if (ds.error != "Data set is protected") and (len(ds.licenses) > 0):
            info["license"] = ds.licenses[0].label.text
            info["license_url"] = ds.licenses[0].URI.text
        elif ds.error == "Data set is protected":
            info["license"] = "Protected (License Unknown)"
            info["license_url"] = None
        proj = get_project_info(ds, soup)
        info["proj_name"], info["proj_url"] = proj if proj is not None else (None, None)

        # Add info to list
        bibtex_list.append(bibtex)
        info_list.append(info)
    except Exception as e:
        # Best effort: record the failure and carry on with the next dataset.
        print("[ERROR]", e)
        errors.append({"dataset": ds_id, "error": e})

## 3. Outputs

In [None]:
# Directory that receives the output files; created if it does not exist yet.
out_dir = "../dataset_details/"
os.makedirs(out_dir, exist_ok=True)

### Error logs

In [None]:
# Save errors
if len(errors) > 0:
    error_logs = os.path.join(out_dir, f"errors_{dt.date.today()}.csv")
    err_df = pd.DataFrame(errors)
    err_df.to_csv(error_logs)
    print(f"[INFO] Error logs saved to: {error_logs}")

Check datasets that raised errors

In [None]:
# Re-fetch one of the datasets that raised an error so it can be inspected
# by hand. `idx` selects which entry of `errors` to look at.
idx = 0
if len(errors) > 0:
    err = errors[idx]
    print(err)
    ds_id = err["dataset"]
    ds_url = get_dataset_url(ds_id)
    print(ds_url)
    # NOTE: unlike the scraping loop, include_data=False is not passed here.
    ds = pangaeapy.PanDataSet(ds_url)

    # Request and parse the dataset webpage as well.
    resp = requests.get(ds_url)
    print("Status_code:", resp.status_code)
    soup = BeautifulSoup(resp.text, "lxml")

In [None]:
# Issue: many datasets have > 1 project
for i, proj in enumerate(ds.projects):
    print(f"[{i}] {proj.name.text}")
    print(f" URL: {proj.URL}")
    print()

### 3.1 `.csv` file: dataset details/metadata

In [None]:
# One row per successfully scraped dataset, one column per key of the info dicts.
info_df = pd.DataFrame(info_list)
# The output file name carries today's date.
output_file = os.path.join(out_dir, f"pangaea-dataset-details_{dt.date.today()}.csv")
info_df.to_csv(output_file, index=False)
print(f"[INFO] All {len(info_df)} dataset details written to file: '{output_file}'")
# Display the table as the cell output.
info_df

#### 3.1.1 Missing values

In [None]:
# Number of missing values in each column of the details table.
info_df.isna().sum()

#### 3.1.2 License types and counts

In [None]:
# Number of datasets per license. Missing values are replaced with the string
# "NaN" first so that datasets without a license get their own count.
info_df.fillna("NaN").license.value_counts()

### 3.2 `.bib` file: dataset bibtex citations

In [None]:
# Write citations to file
bibtex_file = os.path.join(out_dir, f"pangaea-datasets_{dt.date.today()}.bib")
with open(bibtex_file, "w") as f:
    f.writelines(bibtex_list)
print(
    f"[INFO] All {len(bibtex_list)} dataset BibTex citations written to file: '{bibtex_file}'"
)

In [None]:
# Print every dataset BibTex entry for a visual check.
for bib in bibtex_list:
    print(bib)

### 3.3 `.txt` file: plain text research paper citations

In [None]:
# Save the distinct, non-missing paper citations as plain text, one per line.
text_citations_file = "../dataset_details/pangaea-refs.txt"
paper_citations = info_df.citation_paper.dropna().unique()
with open(text_citations_file, "w", encoding="UTF-8") as f:
    f.writelines(citation + "\n" for citation in paper_citations)
print(
    f"[INFO] All {len(paper_citations)} dataset research paper citations written to file: '{text_citations_file}'"
)

In [None]:
# Show the contents of the plain text citations file.
! cat "../dataset_details/pangaea-refs.txt"

### 3.4 Convert plain text research paper citations to bibtex and save to `.bib` file

In [None]:
# Convert to bibtex
command = "anystyle"
txt_file = "../dataset_details/pangaea-refs.txt"
args = ["-f", "bib", "parse", txt_file]
ret = subprocess.run([command, *args], shell=True, capture_output=True)
out = ret.stdout.decode("utf-8").replace("\r", "").replace("date =", "year =")
paper_bibtex_list = ["@" + bib for bib in out.split("@")][1:]

# Write to file
bib_file = "../dataset_details/pangaea-refs.bib"
with open(bib_file, "w", encoding="UTF-8") as f:
    f.writelines(paper_bibtex_list)
print(
    f"[INFO] All {len(paper_bibtex_list)} dataset research paper BibTex citations written to file: '{bib_file}'"
)

In [None]:
# Show the contents of the converted BibTex file.
! cat "../dataset_details/pangaea-refs.bib"

In [None]:
# Keep track of BibTex tags for each plain text citation
paper_bib_mappings = {}
for text, bib in zip(paper_citations, paper_bibtex_list):
    bib_tag = bib.split("{")[1].split(",")[0]
    paper_bib_mappings[bib_tag] = text

In [None]:
# Replace plain text citations with paper BibTex citation tags
for info in info_list:
    if info["citation_paper"] is not None:
        for bib_tag, text in paper_bib_mappings.items():
            if text == info["citation_paper"]:
                info["citation_paper_tag"] = bib_tag

### 3.5 Dataset details latex table
###### Table columns:
- Dataset (pangaea ID)
- Repository
- NoSites
- NoImages
- License
- Citations

In [None]:
# Fixed header of the table.
top = r"""\begin{table}[tbhp]
  \centering
  \caption{
Dataset details.
}
\label{tab:datasets-appendix}
\centerline{
\begin{tabular}{llrrll}
\toprule
Dataset & Repository & \textnumero{} Sites & \textnumero{} Images & License & Citations \\
\midrule
"""

# Generate latex table row entries
rows = []
for info in info_list:
    ds_id = info["dataset"]
    # License info: plain label, or a hyperlink when the URL is known too.
    # (Named license_cell so the builtin `license` is not shadowed.)
    license_cell = "License Missing!".upper()
    if info["license"] is not None:
        license_cell = info["license"]
        if info["license_url"] is not None:
            license_cell = r"\href{" + info["license_url"] + "}{" + info["license"] + "}"
    # Citation info: always cite the dataset; add the paper when it has a tag.
    cite_keys = [ds_id]
    # The tag is only present for citations that were matched to a BibTex
    # entry, hence .get() rather than indexing.
    paper_tag = info.get("citation_paper_tag")
    if info["citation_paper"] is not None and paper_tag is not None:
        cite_keys.append(paper_tag)
    citations = r"\citet{" + ",".join(cite_keys) + "}"
    # "x" and "y" are placeholders in the site and image count columns.
    row = "{} & PANGAEA & x & y & {} & {} \\\\ \n".format(ds_id, license_cell, citations)
    rows.append(row)

# Fixed footer of the table.
bot = r"""\bottomrule
\end{tabular}
}
\end{table}
"""

In [None]:
# Write table to tex file
tex_path = os.path.join(out_dir, f"table_2-dataset-details_{dt.date.today()}.tex")
with open(tex_path, "w") as f:
    f.write(top)
    f.writelines(rows)
    f.write(bot)
print(f"All {len(rows)} rows written to {tex_path}")

In [None]:
# Print the complete table (header, rows, footer) exactly as written to file.
print(top + "".join(rows) + bot, end="")

## 4. Validation

In [None]:
# Report every string field that holds an HTML page instead of the expected value.
for info in info_list:
    for key, value in info.items():
        if isinstance(value, str) and value.startswith("<!DOCTYPE html>"):
            print(info["dataset"], key, value)

### 4.1 Check year field for dataset bibtex citations

In [None]:
# True only if every dataset entry contains the text "year={".
datasets_have_year = all("year={" in bib for bib in bibtex_list)
print(
    f"""All {len(bibtex_list)} dataset BibTex citations have "year" field:""",
    datasets_have_year,
)

In [None]:
# True only if every paper entry contains the text "year = {".
papers_have_year = all("year = {" in bib for bib in paper_bibtex_list)
print(
    f"""All {len(paper_bibtex_list)} paper BibTex citations have "year" field:""",
    papers_have_year,
)

### 4.2 HTML instead of bibtex

In [None]:
# Count the dataset "BibTex" entries that are actually HTML pages.
n_faulty = sum(bib.startswith("<!DOCTYPE html>") for bib in bibtex_list)

In [None]:
# Report how many entries started with an HTML doctype.
print("Number of datasets with html instead of bibtex:", n_faulty)

### 4.3 Check URLs

In [None]:
# Check if the license and project URLs are valid
license_url_is_valid = info_df.license_url.dropna().apply(is_url).all()
proj_url_is_valid = info_df.proj_url.dropna().apply(is_url).all()
print("All URLs are valid:")
print("-------------------")
print("license_url:\t", license_url_is_valid)
print("proj_url:\t", proj_url_is_valid)

In [None]:
# Show invalid license URLs
if not license_url_is_valid:
    for idx, url in (
        info_df[~info_df.license_url.apply(is_url)].license_url.dropna().items
    ):
        dataset = info_df.loc[idx, "dataset"]
        print(
            f"Dataset: {dataset}, License URL: {url}\nDataset URL: {get_dataset_url(dataset)}"
        )
        print("-" * 85)

In [None]:
# Show invalid project URLs
if not proj_url_is_valid:
    dds = []
    for idx, url in info_df[~info_df.proj_url.apply(is_url)].proj_url.dropna().items():
        dataset = info_df.loc[idx, "dataset"]
        print(
            f"Dataset: {dataset}, Project URL: {url}\nDataset URL: {get_dataset_url(dataset)}"
        )
        print("-" * 85)
        dds.append(dataset)