# Analysis of N2T prefixes.yaml Source

The prefixes file is located at `https://n2t.net/e/n2t_full_prefixes.yaml`

Here, the file is retrieved, parsed, and inspected.

In [1]:
import html
import os
import shutil

import requests
import yaml

PREFIXES_SOURCE = "https://n2t.net/e/n2t_full_prefixes.yaml"
PREFIXES_CACHE = "data/n2t_full_prefixes.yaml"

# Download the file if it is not already present at PREFIXES_CACHE.
# Delete the cached file to download a new copy.
if not os.path.exists(PREFIXES_CACHE):
    # Derive the folder from the cache path so the two cannot drift apart.
    os.makedirs(os.path.dirname(PREFIXES_CACHE) or ".", exist_ok=True)
    # Write to a temporary name first: an interrupted or failed download must
    # not leave a partial file that later runs would mistake for a valid cache.
    partial_path = PREFIXES_CACHE + ".part"
    with requests.get(PREFIXES_SOURCE, stream=True, timeout=60) as src:
        # Fail loudly on HTTP errors instead of caching an error page.
        src.raise_for_status()
        with open(partial_path, "wb") as dest:
            # iter_content (unlike the raw stream) undoes any gzip/deflate
            # transfer encoding, so the cached bytes are the actual YAML text.
            for chunk in src.iter_content(chunk_size=65536):
                dest.write(chunk)
    os.replace(partial_path, PREFIXES_CACHE)

# Parse the yaml source into a python dictionary
# The load order of dictionary items is preserved in python 3.7+
# See: https://docs.python.org/3/whatsnew/3.7.html
# The encoding is given explicitly so parsing does not depend on the locale.
with open(PREFIXES_CACHE, "r", encoding="utf-8") as prefix_source:
    # safe_load is the SafeLoader shortcut; an empty document parses to None,
    # so fall back to an empty dict to keep later cells working.
    prefixes = yaml.safe_load(prefix_source) or {}


In [2]:
from IPython.display import display,Markdown

# Tally, for every field name, how many prefix entries define it.
print(f"There are {len(prefixes.keys())} keys.")
fields = {}
for entry in prefixes.values():
    for field in entry:
        fields[field] = fields.get(field, 0) + 1
print(f"There are {len(fields.keys())} distinct fields:")

# Render the tallies as a markdown table: one numbered row per field,
# ordered by field name.
_md = (
    "| Row | Field | Occurrences |\n"
    "| --- | --- | --- |\n"
)
_md += "".join(
    f"| {i} | {k} | {fields[k]} |\n" for i, k in enumerate(sorted(fields))
)
display(Markdown(_md))


There are 4208 keys.
There are 32 distinct fields:


| Row | Field | Occurrences |
| --- | --- | --- |
| 0 | active | 9 |
| 1 | alias | 837 |
| 2 | datacenter | 138 |
| 3 | date | 1417 |
| 4 | description | 827 |
| 5 | for | 120 |
| 6 | forward | 827 |
| 7 | how | 23 |
| 8 | institution | 827 |
| 9 | is_supershoulder | 12 |
| 10 | location | 827 |
| 11 | manager | 1411 |
| 12 | minter | 461 |
| 13 | more | 837 |
| 14 | na_policy | 939 |
| 15 | name | 2271 |
| 16 | norm | 3 |
| 17 | pattern | 826 |
| 18 | prefix_shares_datacenter | 137 |
| 19 | prefixed | 827 |
| 20 | primary | 837 |
| 21 | probe | 837 |
| 22 | provider | 827 |
| 23 | provider_id | 826 |
| 24 | redirect | 2938 |
| 25 | registration_agency | 176 |
| 26 | sort_score | 826 |
| 27 | state | 826 |
| 28 | subject | 826 |
| 29 | synonym | 826 |
| 30 | test | 837 |
| 31 | type | 4208 |


In [3]:
# Start with an empty value tally for every field name collected in `fields`.
examine_fields = {name: {} for name in fields}

# For each field, count how many entries carry each distinct value.
# Entries that lack the field (or hold None for it) are not counted.
for entry in prefixes.values():
    for name, tally in examine_fields.items():
        value = entry.get(name)
        if value is None:
            continue
        tally[value] = tally.get(value, 0) + 1

md = [
    "Number of distinct values in each field:",
    "",
    "| Field | Distinct |",
    "| -- | -- |",
]
# Field names are unique, so sorting the (name, tally) pairs orders them by
# name alone and never needs to compare the tally dictionaries.
md.extend(
    f"| {ef} | {len(f.keys())} |" for ef, f in sorted(examine_fields.items())
)
display(Markdown("\n".join(md)))



Number of distinct values in each field:

| Field | Distinct |
| -- | -- |
| active | 1 |
| alias | 6 |
| datacenter | 11 |
| date | 816 |
| description | 658 |
| for | 120 |
| forward | 792 |
| how | 10 |
| institution | 429 |
| is_supershoulder | 1 |
| location | 51 |
| manager | 4 |
| minter | 424 |
| more | 697 |
| na_policy | 41 |
| name | 2169 |
| norm | 3 |
| pattern | 418 |
| prefix_shares_datacenter | 1 |
| prefixed | 2 |
| primary | 2 |
| probe | 835 |
| provider | 81 |
| provider_id | 826 |
| redirect | 2311 |
| registration_agency | 2 |
| sort_score | 6 |
| state | 127 |
| subject | 350 |
| synonym | 252 |
| test | 667 |
| type | 6 |

In [35]:
# How many example values to list per field. Values are listed in the order
# they were first seen while scanning the prefixes, so 1 shows the first
# value encountered for each field together with its occurrence count.
MAX_EXAMPLES = 1

md = [
    "| Field | Values | Count |",
    "| -- | -- | -- |",
]
for ef in sorted(examine_fields):
    for shown, (value, count) in enumerate(examine_fields[ef].items()):
        if shown >= MAX_EXAMPLES:
            # Stop early rather than formatting values that are never shown.
            break
        # The values come from a downloaded file and are placed inside an
        # HTML <code> element, so escape <, > and & first to keep them from
        # being interpreted as markup.
        text = html.escape(str(value), quote=False)
        # A literal pipe would end the markdown table cell; this must run
        # after html.escape so the entity's own "&" is left intact.
        text = text.replace("|", "&#124;")
        # A table row must stay on one line; fold embedded line breaks.
        text = " ".join(text.splitlines())
        md.append(f"| {ef} | <code>{text}</code> | {count} |")
display(Markdown("\n".join(md)))

| Field | Values | Count |
| -- | -- | -- |
| active | <code>false</code> | 9 |
| alias | <code>handle</code> | 1 |
| datacenter | <code>CDL.UCSC</code> | 4 |
| date | <code>2017.02.17</code> | 1 |
| description | <code>Uniform Resource Names (URNs) are intended to serve as persistent, location-independent, resource identifiers.</code> | 1 |
| for | <code>chebi</code> | 1 |
| forward | <code>http://nbn-resolving.org/resolver?identifier=urn:${ac}&verb=redirect</code> | 1 |
| how | <code>NP &#124; NR, OP, CC &#124; 2010 &#124;</code> | 1 |
| institution | <code>Deutsche Nationalbibliothek, Frankfurt</code> | 2 |
| is_supershoulder | <code>true</code> | 12 |
| location | <code>Germany</code> | 38 |
| manager | <code>n2t</code> | 939 |
| minter | <code>https://n2t.net/a/ezid/m/ark/63614/w1</code> | 1 |
| more | <code>https://wiki.ucop.edu/display/Curation/ARK</code> | 1 |
| na_policy | <code>NP &#124; (:unkn) unknown &#124; 2001 &#124;</code> | 3 |
| name | <code>Archival Resource Key</code> | 1 |
| norm | <code>mc</code> | 1 |
| pattern | <code>^urn\:nbn\:[A-Za-z_0-9]+\:([A-Za-z_0-9]+)-[A-Za-z_0-9]+$</code> | 1 |
| prefix_shares_datacenter | <code>true</code> | 137 |
| prefixed | <code>false</code> | 751 |
| primary | <code>true</code> | 116 |
| probe | <code>http://n2t.net/ark:/88435/hq37vq534</code> | 1 |
| provider | <code>n2t</code> | 1 |
| provider_id | <code>MIR:00100488</code> | 1 |
| redirect | <code>n2t.net/ark:$id</code> | 1 |
| registration_agency | <code>crossref</code> | 38 |
| sort_score | <code>1</code> | 267 |
| state | <code>99:Unknown</code> | 3 |
| subject | <code>bibliography</code> | 10 |
| synonym | <code>NBN</code> | 1 |
| test | <code>/88435/hq37vq534</code> | 1 |
| type | <code>scheme</code> | 837 |