In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

### Load list of dataset IDs

In [None]:
# Read the dataset table and keep, for every distinct dataset name, only the
# part after its last "-" as the dataset ID.
file = "../pangaea_2022-01-27.csv"
df = pd.read_csv(file, low_memory=False)
unique_names = df["dataset"].unique()
ds_ids = [name.rpartition("-")[2] for name in unique_names]
print(f"Total {len(ds_ids)} dataset licenses to fetch.")

### Functions to extract license info

In [None]:
def get_dataset_url(ds_id):
    """Build the PANGAEA DOI landing-page URL for a dataset ID.

    The ID is appended verbatim after the fixed ``PANGAEA.`` DOI prefix.
    """
    url_template = "https://doi.pangaea.de/10.1594/PANGAEA.{}"
    return url_template.format(ds_id)

In [None]:
def get_license_info(url, verbose=False, timeout=30):
    """Return a dictionary with license information given the dataset URL.

    Args:
        url: Address of the dataset page to download and parse.
        verbose: If True, print a short note when the server answers with a
            non-success status or when the page has no license link.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a single stalled connection would block forever.

    Returns:
        A dict with keys ``"url"`` (the ``href`` of the license link, or None
        if the link has no ``href``) and ``"text"`` (the link text), taken
        from the first ``<a>`` tag whose ``rel`` is ``license``. Returns None
        when the page contains no such tag.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Make a request to the URL and parse the html
    resp = requests.get(url, timeout=timeout)
    if verbose and not resp.ok:
        print(f"HTTP {resp.status_code} returned for {url}")
    soup = BeautifulSoup(resp.text, "lxml")
    # Get the tag containing the license info
    license_tag = soup.find("a", attrs={"rel": "license"})
    if license_tag is None:
        if verbose:
            print(f"No license link found at {url}")
        return None
    # Extract components from tag; .get avoids a KeyError on a license
    # anchor that carries no href attribute.
    return {
        "url": license_tag.get("href"),
        "text": license_tag.text,
    }

### Scrape license information for each dataset

In [None]:
# Fetch the license record of every dataset, one request per ID.
license_list = []
for ds_id in tqdm(ds_ids):
    info = get_license_info(get_dataset_url(ds_id))
    # Datasets without license information still get a row, with empty fields.
    record = {"url": None, "text": None} if info is None else info
    record["id"] = "pangaea-" + ds_id
    license_list.append(record)

### Save license info to file

In [None]:
# Turn the collected records into a table (one row per dataset) and write it
# to CSV without the index column.
license_df = pd.DataFrame(license_list)
license_df.to_csv("../dataset-licenses.csv", index=False)
print(f"License information scraped for {len(license_df)} datasets.")