Simple resolver using YAML source file.

Schemes, shoulders, and NAANs are handled differently.

Given identifier I, the process is:

1. Split I on the first colon, yielding (SCHEME, VALUE).
2. Create SCHEME_L = lower(SCHEME).
3. If SCHEME_L == "ark" and VALUE[0] is not "/", then VALUE = "/" + VALUE.
4. Create I2 = SCHEME_L + ":" + VALUE.
5. Return the longest prefix key that matches the start of I2, otherwise None.

Typical times (sec) for loading:
```
Downloading YAML source...
Elapsed: 0.856654167175293

JSON cache created
Elapsed: 4.084223985671997

Json prefixes loaded
Elapsed: 0.00966191291809082
```

Note: to search DataCite (Fabrica) for identifiers under a given prefix, use a query like:
```
curl -v "https://api.datacite.org/dois?query=10.34945/F5*" | jq '.'
```

In [1]:
import os
import shutil
import time
import json
import requests
import yaml

PREFIXES_SOURCE = "https://n2t.net/e/n2t_full_prefixes.yaml"
PREFIXES_CACHE = "data/n2t_full_prefixes.yaml"
PREFIXES_JSON = "data/n2t_full_prefixes.json"


def cleanRedirect(r, default_protocol="https"):
    """Return ``r`` as a URL that carries an explicit protocol.

    The text before the first ":" in ``r`` is compared, case-insensitively,
    against a short list of known protocols. When it is one of them, ``r``
    is returned unchanged. Otherwise (including when ``r`` has no ":" at
    all) any leading "/" characters are stripped from ``r`` and
    ``default_protocol`` plus "://" is placed in front of it.
    """
    known_protocols = ("http", "https", "ftp", "ftps", "sftp")
    leading, colon, _remainder = r.partition(":")
    if colon and leading.lower() in known_protocols:
        return r
    # Either no ":" was present or the leading part is not a known
    # protocol (e.g. "host:8080/path"), so prefix the default protocol.
    return f"{default_protocol}://{r.lstrip('/')}"


# Download the YAML prefix source if it is not already present locally.
# Delete the cached file to force a fresh download.
if not os.path.exists(PREFIXES_CACHE):
    t0 = time.time()
    # Create the folder the cache path actually points at.
    os.makedirs(os.path.dirname(PREFIXES_CACHE) or ".", exist_ok=True)
    print("Downloading YAML source...")
    # Download to a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file that the
    # existence check above would then accept as a valid cache.
    partial_cache = f"{PREFIXES_CACHE}.part"
    with requests.get(PREFIXES_SOURCE, stream=True, timeout=60) as src:
        # Fail before writing anything when the server reports an error;
        # otherwise the error page would be cached as if it were YAML.
        src.raise_for_status()
        # The raw stream is not content-decoded by default; ask for gzip /
        # deflate responses to be decompressed while copying.
        src.raw.decode_content = True
        with open(partial_cache, "wb") as dest:
            shutil.copyfileobj(src.raw, dest)
    os.replace(partial_cache, PREFIXES_CACHE)
    print(f"Elapsed: {time.time() - t0}")

if not os.path.exists(PREFIXES_JSON):
    # One-time conversion: parse the YAML source into a python dictionary
    # and store it as JSON, which loads much faster on later runs.
    # Dictionary insertion order is preserved in python 3.7+, see:
    # https://docs.python.org/3/whatsnew/3.7.html
    t0 = time.time()
    with open(PREFIXES_CACHE, "r") as yaml_file:
        yprefixes = yaml.load(yaml_file, Loader=yaml.SafeLoader)
    # Normalize each redirect target in place so it carries a protocol.
    for entry in yprefixes.values():
        target = entry.get("redirect", None)
        if target is None:
            continue
        entry["redirect"] = cleanRedirect(target)
    with open(PREFIXES_JSON, "w") as json_file:
        json.dump(yprefixes, json_file)
    print("JSON cache created")
    print(f"Elapsed: {time.time() - t0}")

# Read the prefix table from the JSON cache into the module-level
# ``prefixes`` dictionary and report how long the load took.
prefixes = {}
t0 = time.time()
with open(PREFIXES_JSON, "r") as cache_file:
    prefixes = json.loads(cache_file.read())
print("Json prefixes loaded")
print(f"Elapsed: {time.time() - t0}")

JSON cache created
Elapsed: 4.6434149742126465
Json prefixes loaded
Elapsed: 0.008626222610473633


In [2]:


def longestMatch(keys, test):
    """Return the longest entry of ``keys`` that ``test`` starts with.

    Empty keys never count as a match. Returns None when no non-empty
    key is a prefix of ``test``.
    """
    candidates = [
        key for key in keys
        if test.startswith(key) and len(key) > 0
    ]
    # max() keeps the first of equally long candidates, like a strict
    # "longer than the best so far" scan would.
    return max(candidates, key=len, default=None)

def getResolvers(I2):
    """Return the longest key of the loaded ``prefixes`` that ``I2`` starts with.

    Delegates to ``longestMatch``; the result is None when no key matches.
    """
    return longestMatch(prefixes.keys(), I2)

def resolve(identifier):
    """Resolve an identifier to its resolver entry and target URL.

    The identifier is split on the first ":" into a scheme and a value,
    and the scheme is lower-cased. For the "ark" scheme the value is
    given a leading "/" if it lacks one, and only the NAAN (the first
    path segment of the value) is used for the prefix lookup. The longest
    key of ``prefixes`` matching "scheme:value" selects the resolver. An
    entry of type "synonym" is followed once to the entry named by its
    "for" field.

    Args:
        identifier: Identifier string such as "doi:10.1038/nbt1156" or
            "ark:/81431/p3". An identifier without ":" is treated as a
            scheme with an empty value.

    Returns:
        A tuple ``(resolver_key, resolver, url)``. ``url`` is the
        resolver's "redirect" template with "$id" replaced by the value,
        or None when the resolver has no "redirect". When no prefix
        matches the identifier the result is ``(None, None, None)``.

    Raises:
        KeyError: If a synonym entry names a target that is not present
            in ``prefixes``.
    """
    # partition() never raises: a missing ":" simply yields an empty value.
    scheme, _, value = identifier.partition(":")
    scheme = scheme.lower()
    lookup_value = value
    if scheme == "ark":
        # startswith() is safe on an empty value, unlike value[0].
        if not value.startswith("/"):
            value = f"/{value}"
        # Only the NAAN takes part in the prefix lookup.
        naan = value[1:].split("/", 1)[0]
        lookup_value = f"/{naan}"
    I2 = f"{scheme}:{lookup_value}"
    resolver_key = getResolvers(I2)
    if resolver_key is None:
        # No registered prefix matches this identifier.
        return None, None, None
    resolver = prefixes[resolver_key]
    if resolver.get("type") == "synonym":
        resolver_key = resolver.get("for")
        resolver = prefixes[resolver_key]
    url = None
    target = resolver.get("redirect", None)
    if target is not None:
        url = target.replace("$id", value)
    return resolver_key, resolver, url


# Sample identifiers used to exercise resolve(); each one is resolved and
# the resulting URL, elapsed time and resolver key are printed.
tests = [
    "do:",
    "doi:",
    "doi:10.1038/nbt1156",
    "ark:/81431/p3",
    "ark:81431",
    "IGSN:AU1243",
    "igsn:AU1243",
    "interpro:IPR000100",
    "ebi/ensembl.fungi:CADAFLAT00006211",
    "doi:10.34945/f50p40",
]
for test in tests:
    t0 = time.time()
    resolver_key, resolver, url = resolve(test)
    t1 = time.time()
    # One print call; the trailing empty string yields the blank
    # separator line between entries.
    print(
        f"{test} -> {url}",
        f"  Elapsed: {t1-t0}",
        f"  Resolver key: {resolver_key}",
        "",
        sep="\n",
    )


do: -> http://disease-ontology.org/term/
  Elapsed: 0.0009489059448242188
  Resolver key: do

doi: -> https://doi.org/
  Elapsed: 0.0009031295776367188
  Resolver key: doi

doi:10.1038/nbt1156 -> https://doi.org/10.1038/nbt1156
  Elapsed: 0.0007753372192382812
  Resolver key: doi

ark:/81431/p3 -> http://www.library.upenn.edu/ark:/81431/p3
  Elapsed: 0.0005772113800048828
  Resolver key: ark:/81431

ark:81431 -> http://www.library.upenn.edu/ark:/81431
  Elapsed: 0.00067901611328125
  Resolver key: ark:/81431

IGSN:AU1243 -> https://hdl.handle.net/10273/AU1243
  Elapsed: 0.0006031990051269531
  Resolver key: igsn

igsn:AU1243 -> https://hdl.handle.net/10273/AU1243
  Elapsed: 0.0006000995635986328
  Resolver key: igsn

interpro:IPR000100 -> https://www.ebi.ac.uk/interpro/entry/IPR000100
  Elapsed: 0.0009160041809082031
  Resolver key: interpro

ebi/ensembl.fungi:CADAFLAT00006211 -> https://fungi.ensembl.org/id/CADAFLAT00006211
  Elapsed: 0.0009329319000244141
  Resolver key: ensembl.fung