## Read occurrence of Darwin Core Archive

In [76]:
import requests
import time
from dwca.read import DwCAReader

with DwCAReader('../data/0115308-200613084148143.zip') as dwca:
    occ = dwca.pd_read('occurrence.txt', parse_dates=False)\

occ.head(5)

Unnamed: 0,id,abstract,accessRights,accrualMethod,accrualPeriodicity,accrualPolicy,alternative,audience,available,bibliographicCitation,...,identifiedByID,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,geodeticDatum
0,144763223,,,,,,,,,"A. Atkinson, V. Siegel , E. Pakhomov, P. Rothe...",...,,,,,,,,,,WGS84
1,144763222,,,,,,,,,"A. Atkinson, V. Siegel , E. Pakhomov, P. Rothe...",...,,,,,,,,,,WGS84
2,144763221,,,,,,,,,"A. Atkinson, V. Siegel , E. Pakhomov, P. Rothe...",...,,,,,,,,,,WGS84
3,144763220,,,,,,,,,"A. Atkinson, V. Siegel , E. Pakhomov, P. Rothe...",...,,,,,,,,,,WGS84
4,144763219,,,,,,,,,"A. Atkinson, V. Siegel , E. Pakhomov, P. Rothe...",...,,,,,,,,,,WGS84


## Get unique taxonKey from occurrences

In [77]:
occ_taxa = occ[["scientificName", "taxonKey", "kingdomKey", "phylumKey", "classKey", "orderKey", "familyKey",
                "genusKey", "subgenusKey", "speciesKey", "kingdom", "phylum", "class", "order", "family", "genus",
                "subgenus", "specificEpithet"]]
# drop duplicated rows based on taxonKey
occ_taxa.drop_duplicates(['taxonKey'])

# to get a set of unique taxon keys (of all taxonomic ranks) of all the occurrence records in this dataset
fields = ["kingdomKey", "phylumKey", "classKey", "orderKey", "familyKey", "genusKey", "subgenusKey", "speciesKey"]
gbif_taxon_keys = list()
for field in fields:
    # drop all NA and all duplicates of that column and convert the column to list
    gbif_taxon_keys.extend(occ_taxa[field].dropna().drop_duplicates().to_list())
gbif_taxon_keys = set(gbif_taxon_keys)  # a set of unique taxon keys based on GBIF taxonomic backbone
gbif_taxon_keys

{1, 54, 229, 868, 2073, 2228004, 2228010}

## Get AphiaID of taxonKey through GBIF taxonomic backbone

1. Using GBIF species API `/species/{taxonKey}/related` to link all related name usages in other checklists.
2. Loop through results to check if the `datasetKey` is identical to the uuid of WoRMS dataset UUID in GBIF.
3. If identical, use regex to get AphiaID from the taxonID (e.g. LSID)

In [78]:
GBIF_BASE_URL = "https://api.gbif.org/v1/species/"
WORMS_DATASET_KEY = "2d59e5db-57ad-41ff-97d6-11f5fb264527"  # this is the datasetKey of WoRMS checklist in GBIF
aphia_ids = set()
gbif_worms_map = dict()  # a dictionary with key = gbif taxonKey, value = aphiaID

for gbif_taxon_key in gbif_taxon_keys:
    time.sleep(0.5)
    # url to get related name usages in other checklists
    taxon_url = "{}{}/related".format(GBIF_BASE_URL, gbif_taxon_key)
    print("URL for taxon {}: {}".format(gbif_taxon_key, taxon_url))
    response = requests.get(taxon_url)
    if response.status_code == 200:
        results = response.json().get("results")
        # might need to loop through next page if count is more than the limit
        for taxon in results:
            if taxon.get("datasetKey") == WORMS_DATASET_KEY:
                worms_taxon_id = taxon.get("taxonID")  # lsid looks like 'urn:lsid:marinespecies.org:taxname:394119'
                aphia_id = worms_taxon_id.split("taxname:")[1]  # the integers behind 'taxname:' is the aphiaID
                aphia_ids.add(aphia_id)
                gbif_worms_map[gbif_taxon_key] = aphia_id

print('Dictionary with key = gbif taxonKey, value = aphiaID\n', gbif_worms_map)

URL for taxon 1: https://api.gbif.org/v1/species/1/related
URL for taxon 868: https://api.gbif.org/v1/species/868/related
URL for taxon 229: https://api.gbif.org/v1/species/229/related
URL for taxon 2228004: https://api.gbif.org/v1/species/2228004/related
URL for taxon 2228010: https://api.gbif.org/v1/species/2228010/related
URL for taxon 54: https://api.gbif.org/v1/species/54/related
URL for taxon 2073: https://api.gbif.org/v1/species/2073/related
Dictionary with key = gbif taxonKey, value = aphiaID
 {868: '1128', 2228004: '110673', 2228010: '236217', 2073: '110671'}


## Get Aphia Attributes from WoRMS

In [79]:
WORMS_ATTR_BASE_URL = 'http://marinespecies.org/rest/AphiaAttributesByAphiaID/'
aphia_attribute_dict = dict()  # key = AphiaID, value = attributes of the taxon

for aphia_id in aphia_ids:
    time.sleep(0.5)
    worms_taxon_attr_url = "{}{}?include_inherited=false".format(WORMS_ATTR_BASE_URL,aphia_id)
    print('URL to attributes of taxon with AphiaID = {}: {}'.format(aphia_id, worms_taxon_attr_url))
    response = requests.get(worms_taxon_attr_url)
    if response.status_code == 200:
        results = response.json()
        aphia_attribute_dict[aphia_id] = results

# key = AphiaID, value = attributes of the taxon
print('key = AphiaID, value = attributes of the taxon\n', aphia_attribute_dict)



URL to attributes of taxon with AphiaID = 236217: http://marinespecies.org/rest/AphiaAttributesByAphiaID/236217?include_inherited=false
URL to attributes of taxon with AphiaID = 1128: http://marinespecies.org/rest/AphiaAttributesByAphiaID/1128?include_inherited=false
URL to attributes of taxon with AphiaID = 110671: http://marinespecies.org/rest/AphiaAttributesByAphiaID/110671?include_inherited=false
URL to attributes of taxon with AphiaID = 110673: http://marinespecies.org/rest/AphiaAttributesByAphiaID/110673?include_inherited=false
key = AphiaID, value = attributes of the taxon
 {'236217': [{'AphiaID': '236217', 'measurementTypeID': 23, 'measurementType': 'Species importance to society', 'measurementValue': 'IUCN Red List', 'source_id': 127093, 'reference': 'IUCN Red List of Threatened Species', 'qualitystatus': 'unreviewed', 'AphiaID_Inherited': 236217, 'CategoryID': 13, 'children': [{'AphiaID': '236217', 'measurementTypeID': 1, 'measurementType': 'IUCN Red List Category', 'measurem