# Unit Analysis: Download

Download the standard and custom units used in the EDI Repository, count them, and export the counts as JSON.

## Create Package ID dictionary

Create a python dictionary of all packages organized by scope

In [1]:
import requests

# Root of the PASTA listing endpoints: the bare URL lists scopes, and
# "<url>/<scope>" lists the package identifiers registered under that scope.
PASTA_EML_URL = "https://pasta.lternet.edu/package/eml"
# Seconds to wait on a request before giving up; without it a stalled
# connection would hang the notebook indefinitely.
PASTA_TIMEOUT = 60

r_scopes = requests.get(PASTA_EML_URL, timeout=PASTA_TIMEOUT)
# Fail loudly instead of treating an error page as a list of scopes.
r_scopes.raise_for_status()
# The response is one entry per line; drop blank lines (e.g. from a trailing newline).
scopes = [line.strip() for line in r_scopes.text.splitlines() if line.strip()]

# Map every scope name to the list of package identifiers (as strings) under it.
pid_dict = {}
for s in scopes:
    r_ids = requests.get(PASTA_EML_URL + "/" + s, timeout=PASTA_TIMEOUT)
    r_ids.raise_for_status()
    pid_dict[s] = [line.strip() for line in r_ids.text.splitlines() if line.strip()]

## Create list of all EDI package identifiers (metadata is fetched at the latest version)

In [2]:
# Package identifiers registered under the "edi" scope; the metadata of each
# one is downloaded in the parsing cell below.
edi_pids = pid_dict["edi"]

## Parse metadata to units and custom units

Standard units are located in the `<standardUnit>` element of an `<attribute>`.

Custom units are referenced by the `<customUnit>` element of an `<attribute>`; their definitions are located in the `<unitList>` of an `<additionalMetadata>` element.

## Metrics

- Create a dictionary of unit names and overall counts (one for standard and one for custom).
- Create a dictionary of unit names and dataset counts (one tally per unit per dataset).


In [40]:
import xml.etree.ElementTree as ET

# Overall tallies: unit name -> number of attributes using it, across all packages.
standardUnits = {}
customUnits = {}
# Per-dataset tallies: unit name -> number of packages that use it at least once.
standardUnitsDatasetCount = {}
customUnitsDatasetCount = {}
# packageId -> {"standardCount", "customCount", "ezEML"} summary for each package.
unitCount = {}

# Identifiers whose metadata could not be downloaded, read or parsed.
failed_packages = []

# Seconds to wait on a request before giving up on that package.
REQUEST_TIMEOUT = 60
# Text found in the response body when the public user may not read a document.
PERMISSION_DENIED = 'User public does not have permission to read this metadata document:'

# Units are carried by both the interval and the ratio measurement scale, and
# attributes can belong to any entity type, so match every <attribute> under the
# dataset rather than only dataTable/.../ratio.
STANDARD_UNIT_PATH = './/attribute/measurementScale/*/unit/standardUnit'
CUSTOM_UNIT_PATH = './/attribute/measurementScale/*/unit/customUnit'


def _unit_names(dataset, path):
    """Return the unit names found under ``dataset`` at ``path``.

    Names are whitespace-stripped so that formatting differences in the XML do
    not create distinct keys; empty elements are skipped.
    """
    names = []
    for unit in dataset.findall(path):
        name = (unit.text or "").strip()
        if name:
            names.append(name)
    return names


def _tally(counts, names):
    """Add one to ``counts[name]`` for every entry of ``names`` (in place)."""
    for name in names:
        counts[name] = counts.get(name, 0) + 1


print("progress: ", end="")

# One session reuses the HTTP connection across the whole crawl.
with requests.Session() as session:
    for p in edi_pids:

        print(p, end=", ")

        try:
            response = session.get(
                "https://pasta.lternet.edu/package/metadata/eml/edi/" + p + "/newest",
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            # Network trouble on one package should not abort the whole run.
            failed_packages.append(p)
            continue

        if not response.ok or PERMISSION_DENIED in response.text:
            failed_packages.append(p)
            continue

        try:
            # Parse the raw bytes so the encoding declared in the XML is honoured.
            root = ET.fromstring(response.content)
        except ET.ParseError:
            failed_packages.append(p)
            continue

        # Look <dataset> up by name: its position among the root's children
        # depends on whether the optional <access> element is present.
        dataset = root.find('dataset')
        if dataset is None:
            failed_packages.append(p)
            continue

        # Custom units are tallied from the <customUnit> references on attributes,
        # not from the <unitList> definitions in <additionalMetadata>.
        standard_set = _unit_names(dataset, STANDARD_UNIT_PATH)
        custom_set = _unit_names(dataset, CUSTOM_UNIT_PATH)

        _tally(standardUnits, standard_set)
        _tally(customUnits, custom_set)
        # De-duplicate so each unit counts once per dataset.
        _tally(standardUnitsDatasetCount, set(standard_set))
        _tally(customUnitsDatasetCount, set(custom_set))

        # .get() tolerates an <emlEditor> element without an "app" attribute.
        ezEML = any(
            editor.get('app') == 'ezEML'
            for editor in root.findall('additionalMetadata/metadata/emlEditor')
        )

        unitCount[root.attrib['packageId']] = {
            "standardCount": len(standard_set),
            "customCount": len(custom_set),
            "ezEML": ezEML}

print("Done.")


progress: 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 133, 134, 135, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 153, 155, 157, 160, 163, 164, 165, 166, 167, 168, 169, 170, 176, 179, 181, 184, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 197, 198, 199, 200, 201, 202, 206, 207, 208, 210, 213, 214, 215, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 246, 247, 248, 250, 251, 252, 253, 2

In [41]:
import json
# Each tally is written to its own JSON file in the working directory.
exports = (
    ("standardUnitsCount.json", standardUnits),
    ("customUnitsCount.json", customUnits),
    ("standardUnitsDatasetCount.json", standardUnitsDatasetCount),
    ("customUnitsDatasetCount.json", customUnitsDatasetCount),
    ("unitCount.json", unitCount),
)

for filename, tally in exports:
    with open(filename, "w") as outfile:
        json.dump(tally, outfile)