# PoC for fetching data from Zenodo based on a DOI

In [1]:
import intake
import requests
import pycurl
from urllib.parse import urlparse
import os
from pathlib import Path
import logging

In [2]:
# Configure the root logger: show everything from DEBUG upward as "LEVEL:message".
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s:%(message)s",
)

In [3]:
!pwd

/work/esm-vfc-catalogs/examples


In [4]:
# parameters
# Relative path of the catalog file that is handed to intake.open_catalog below.
catalog_file = "../catalogs/fesom2_catalog.yaml"
# Local data directory; resolve() turns the relative path into an absolute one.
data_path = Path("../esm_vfc_data/").resolve()

In [5]:
# Export the data directory to this process's environment as ESM_VFC_DATA_DIR.
# NOTE(review): presumably the catalog YAML reads this variable to locate the
# data files -- confirm against the catalog.
os.environ["ESM_VFC_DATA_DIR"] = str(data_path)

In [6]:
# Open the intake catalog named by `catalog_file`.
cat = intake.open_catalog(catalog_file)
# cat["FESOM2_Pi_mesh_a_ice"].read()  # No data yet

In [7]:
# Read the Zenodo access token from the environment so that no credential is
# stored in (or committed with) the notebook. Set ZENODO_ACCESS_TOKEN before
# starting the kernel; when it is unset the value is None.
ACCESS_TOKEN = os.environ.get("ZENODO_ACCESS_TOKEN")

In [8]:
def _download_file(url, target_file):
    """Download ``url`` to ``target_file`` with curl without leaving partial files."""
    # Write to a sibling ".part" file first: download_zenodo_files skips
    # targets that already exist, so a truncated file under the final name
    # would never be fetched again.
    partial_file = target_file.with_name(target_file.name + ".part")
    curl = pycurl.Curl()
    try:
        with open(partial_file, "wb") as f:
            curl.setopt(curl.URL, url)
            # follow redirects instead of saving the redirect response
            curl.setopt(curl.FOLLOWLOCATION, True)
            # make HTTP errors (>= 400) fail instead of saving the error page
            curl.setopt(curl.FAILONERROR, True)
            curl.setopt(curl.WRITEDATA, f)
            curl.perform()
    except BaseException:
        # also covers KeyboardInterrupt: drop the incomplete file, then re-raise
        if partial_file.exists():
            partial_file.unlink()
        raise
    finally:
        curl.close()
    # only a completed download is moved to its final name
    partial_file.replace(target_file)


def download_zenodo_files(
    zenodo_doi, target_directory=None,
    force_download=False, filter_files=None):
    """Download all files of a Zenodo record identified by its DOI.

    The record ID is taken as the part of ``zenodo_doi`` after the last dot.
    The record is fetched from the Zenodo REST API and every file listed in it
    is downloaded with curl, unless the target file already exists.

    Parameters
    ----------
    zenodo_doi : str
        DOI of the Zenodo record; the text after the last "." is used as the
        record ID.
    target_directory : str or path-like
        Directory the files are written to. It is created if necessary.
        Required, even though the signature defaults it to None.
    force_download : bool
        If True, download files even if the target file already exists.
    filter_files
        Not implemented yet; must be None.

    Returns
    -------
    list of pathlib.Path
        Target paths of all files in the record, whether they were downloaded
        now or already present.

    Raises
    ------
    NotImplementedError
        If ``filter_files`` is not None.
    ValueError
        If ``target_directory`` is None.
    requests.HTTPError
        If the Zenodo API answers with an error status.
    pycurl.error
        If the download of a file fails.
    """
    # check if we filter files
    if filter_files is not None:
        raise NotImplementedError("Filtering is not implemented yet")

    # fail early with a clear message instead of a TypeError from Path(None)
    if target_directory is None:
        raise ValueError("target_directory must be given")
    target_directory = Path(target_directory)

    # get zenodo record ID from doi
    zenodo_record = zenodo_doi.split('.')[-1]
    logging.debug(f"will download record {zenodo_record}")

    # get full record from zenodo
    # see https://developers.zenodo.org/#quickstart-upload for pointers
    # The token goes into a header rather than the query string, because
    # urllib3 logs the full request URL at DEBUG level.
    headers = {"Authorization": f"Bearer {ACCESS_TOKEN}"} if ACCESS_TOKEN else {}
    r = requests.get(
        f"https://zenodo.org/api/records/{zenodo_record}",
        headers=headers,
    )
    logging.debug(f"got status code {r.status_code}")
    # stop here on 4xx/5xx instead of failing later on a missing "files" key
    r.raise_for_status()

    # TODO: Check that we got the correct DOI

    # get list of source urls
    all_urls = [file["links"]["self"] for file in r.json()["files"]]
    # the target file name is the last component of the URL path
    all_target_files = [
        target_directory / Path(urlparse(url).path).name
        for url in all_urls
    ]

    # ensure target dir exists
    target_directory.mkdir(exist_ok=True, parents=True)

    # download all wanted files with curl
    for url, file in zip(all_urls, all_target_files):
        if force_download or not file.exists():
            logging.debug(f"will download {url} to {file}")
            _download_file(url, file)
            logging.debug(f"download of {url} to {file} done")

    return all_target_files

In [9]:
# Fetch every file of the record referenced by the catalog entry's
# "zenodo_doi" metadata into the FESOM2_Pi_mesh sub-directory of data_path.
download_zenodo_files(
    target_directory=data_path.joinpath("FESOM2_Pi_mesh"),
    zenodo_doi=cat["FESOM2_Pi_mesh_a_ice"].metadata["zenodo_doi"],
)

DEBUG:will download record 3819896
DEBUG:Starting new HTTPS connection (1): zenodo.org:443
DEBUG:https://zenodo.org:443 "GET /api/records/3819896?access_token=[REDACTED] HTTP/1.1" 200 None
DEBUG:got status code 200


[PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/temp.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/salt.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/u.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/v.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/w.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/a_ice.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/m_ice.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/vice.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/uice.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/sst.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/ssh.fesom.1948.nc'),
 PosixPath('/work/esm-vfc-catalogs/esm_vfc_data/FESOM2_Pi_mesh/MLD1.fesom.

In [10]:
# Read the catalog entry, now that the download cell above has run.
cat["FESOM2_Pi_mesh_a_ice"].read()