Example code for using the iDigBio API to harvest specimen images

In [1]:
# imports for record identification
import pandas as pd
import idigbio

# imports for media retrieval
import requests
import shutil
import os
import time

# Imports for cleaning the retrieved media
from PIL import Image
from PIL import UnidentifiedImageError
from glob import glob

In [2]:
# Create the pandas flavour of the iDigBio client; the record search made with
# it later in this notebook hands its results back as a DataFrame.
api = idigbio.pandas()

# iDigBio's Python API documentation:
#      https://github.com/iDigBio/idigbio-python-client
# iDigBio's general API documentation:
#      https://github.com/idigbio/idigbio-search-api/wiki#records

#### iDigBio has a pretty complicated query format.
see: https://github.com/iDigBio/idigbio-search-api/wiki/Query-Format

For example, you can query for the existence of data by placing a nested dict after a field name:

>  "scientificname": {
    "type": "exists"
      }

Or query nested data within records by delineating hierarchy with a period.
For example, the 'data' col contains nested record data, such as reproductive condition, so one could query for its existence:

> data.dwc:reproductiveCondition : {
        "type":"exists"}

The 'flags' col indicates data quality issues, for example any changes which were performed upon ingestion, such as a change of specificepithet in order to align with GBIF taxonomy. Flags are provided as a list of terms. Since the default field query is a partial match, you can query it by just checking for your flag.

Flags documentation: https://github.com/iDigBio/idigbio-search-api/wiki/Data-Quality-Flags

> "flags":"dwc_specificepithet_replaced"

Begin by setting a few variables

In [10]:
# set variables for the query
sciName = 'Tsuga canadensis'
#sciName = 'Iris setosa'
basisOfRecord = 'PreservedSpecimen' # PreservedSpecimen indicates a vouchered record

# for testing purposes restrict the query to a small number of results
limit = 7

Build and submit the query

*Note: It is probably necessary to modify this query somewhat depending on the use case.

In [15]:
# assemble the record query ("rq") as a plain dictionary of search terms
my_query = dict(
    scientificname=sciName,
    hasImage=True,
    basisofrecord=basisOfRecord,
)
# keys containing '.' or ':' cannot be written as keyword arguments, so they are added afterwards
my_query['indexData.dwc:reproductiveCondition'] = {'type': 'exists'}  # could be useful but only here as an example
my_query['flags'] = 'dwc_parentnameusageid_added'  # only really here as an example


# submit the query through iDigBio's api; the result comes back as a dataframe
df = api.search_records(rq=my_query, limit=limit)

# spot check what was returned:
# first the columns that are available ...
display(df.columns)
# ... then the (rows, columns) shape, i.e. the total quantity of results
display(df.shape)

Index(['startdayofyear', 'continent', 'country', 'collectioncode', 'dqs',
       'countrycode', 'datecollected', 'county', 'flags', 'recordset',
       'hasImage', 'indexData', 'hasMedia', 'taxonid', 'catalognumber',
       'collector', 'basisofrecord', 'datemodified', 'taxonrank', 'class',
       'order', 'highertaxon', 'mediarecords', 'locality', 'geopoint',
       'specificepithet', 'scientificname', 'occurrenceid', 'stateprovince',
       'recordnumber', 'kingdom', 'taxonomicstatus', 'eventdate',
       'canonicalname', 'phylum', 'genus', 'minelevation',
       'coordinateuncertainty', 'datasetid', 'etag', 'institutioncode',
       'family', 'collectionid', 'recordids', 'verbatimeventdate'],
      dtype='object')

(7, 45)

# Expand nested data

This example requested data containing "reproductiveCondition" information nested in the indexData col. It will be easier to work with if that nested data is expanded to a top-level column.

In [16]:
# quick helper function will keep this code easier to read
def expand_reproductiveCondition(indexData):
    """Return the 'dwc:reproductiveCondition' entry of one record's indexData mapping.

    An empty string is returned when the mapping has no such key.
    """
    field_name = 'dwc:reproductiveCondition'
    return indexData.get(field_name, "")

# apply the expansion function to the parent col (i.e., 'indexData') and
# store the extracted values in a new top-level 'reproductiveCondition' column
df['reproductiveCondition'] = df['indexData'].apply(expand_reproductiveCondition)
# check out the results (could be fun); no random_state is passed to .sample(),
# so which four rows are shown can differ from run to run
df['reproductiveCondition'].sample(4)

uuid
abca404a-ce55-4c65-b5a8-442f2a260a1d               seed cone
258210cd-51f8-40f7-b88b-d7c0ce5eb417    Female cones present
5def2622-4165-4ff3-a1d8-110729e07740     Female cone present
2b525612-0a31-43bf-814b-7365f915d467                fruiting
Name: reproductiveCondition, dtype: object

#### Retrieve media from query results

Modified from: https://github.com/iDigBio/idigbio-python-client/blob/master/examples/fetch_media/fetch_media.py

In [24]:

def get_media_with_naming (output_dir, media_url, uuid, SIZE, timeout=60):
    """
    Download a media file to a directory and name it based on the input parameters.

    'output_dir' controls where the download is placed (created if missing).
    'media_url' is the url / link to the media that will be downloaded.
    'uuid' is used to uniquely identify the output filename.
    'SIZE' is the class of image derivative, useful in the output filename.
    'timeout' is the number of seconds requests may wait on the server before
        giving up; without it a stalled connection would block forever.

    Returns True when the file was written, False when the download failed.
    If a url containing "fullsize" fails, one retry is made against the url
    with the "?size=fullsize" suffix removed; that file is named with "na"
    in place of the size.
    """
    # Output filenames will be of the form: {mediarecord_uuid}_{SIZE}.jpg
    local_filepath = os.path.join(output_dir,  uuid + '_' + SIZE + '.jpg')
    # stream into a scratch file first so an interrupted transfer can never
    # leave a truncated .jpg (or clobber an earlier good download)
    partial_filepath = local_filepath + '.part'
    # open() does not create missing directories
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    try:
        with requests.get(media_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # the file is closed and complete at this point; publish it atomically
        os.replace(partial_filepath, local_filepath)
    # RequestException is the base class of HTTPError and ConnectionError and
    # also covers read timeouts and broken chunked transfers
    except requests.exceptions.RequestException as e:
        if os.path.isfile(partial_filepath):
            os.remove(partial_filepath)
        if "fullsize" in media_url:
            # fall back to the default derivative; the stripped url no longer
            # contains "fullsize", so this recurses at most once
            media_url = media_url.split("?size=fullsize")[0]
            return get_media_with_naming(output_dir, media_url, uuid, "na", timeout)
        print('*** HTTP ERROR: {0}'.format(e))
        return False

    # after a successful download pause briefly to avoid an ip ban
    time.sleep(0.7)

    return True

def retrieve_media(mediarecords, output_directory='./output', SIZE='fullsize'):
    """
    Retrieve every media file listed in one cell of the 'mediarecords' column.

    'mediarecords' is an iterable of media uuids. A single uuid given as a
        plain string is treated as a one-item list. None or a float (pandas'
        NaN placeholder for a missing cell) means "no media" and is skipped.
    'output_directory' is where the downloads are placed.
    'SIZE' is the image derivative to request.

    Nothing is returned; success / failure is printed per media file.
    """
    # SIZES = ["thumbnail", "webview", "fullsize"]
    # records without media must not abort a whole DataFrame.apply() run
    if mediarecords is None or isinstance(mediarecords, float):
        return
    # iterating a bare string would yield single characters, not uuids
    if isinstance(mediarecords, str):
        mediarecords = [mediarecords]
    print(mediarecords)
    for media_uuid in mediarecords:
        # construct media URL
        media_url = 'https://api.idigbio.org/v2/media/' + media_uuid + '?size=' + SIZE
        # The retrieval function returns bool based on success
        # could modify this to store report somewhere... for now just print success / fail
        if get_media_with_naming(output_directory, media_url, media_uuid, SIZE):
            print ("Downloaded: '{0}'".format(media_url))
        else:
            print("Failed to retrieve: '{0}'".format(media_url))
            print(media_uuid)

#### Extract the media from the identified records 

In [25]:
# download the media of every record; retrieve_media returns nothing, so the
# Series displayed afterwards holds only None (the printed lines are the report)
df['mediarecords'].apply(retrieve_media)

['8c3d5920-b675-4c32-987a-4ab7dabe1658']
Downloaded: 'https://api.idigbio.org/v2/media/8c3d5920-b675-4c32-987a-4ab7dabe1658?size=fullsize'
['431f2252-b713-4a6b-89c3-a16456ccded8']
Downloaded: 'https://api.idigbio.org/v2/media/431f2252-b713-4a6b-89c3-a16456ccded8?size=fullsize'
['f03e708a-0485-4eb7-9e67-4bc7fca62392']
Downloaded: 'https://api.idigbio.org/v2/media/f03e708a-0485-4eb7-9e67-4bc7fca62392?size=fullsize'
['9c38451a-adfd-4165-8a69-4e69a9e2c817']
Downloaded: 'https://api.idigbio.org/v2/media/9c38451a-adfd-4165-8a69-4e69a9e2c817?size=fullsize'
['d286d4ff-82a3-431f-96c0-bdc37777f4df']
Downloaded: 'https://api.idigbio.org/v2/media/d286d4ff-82a3-431f-96c0-bdc37777f4df?size=fullsize'
['3bf31e60-88f8-4be1-b944-6c9d6d380d80']
Downloaded: 'https://api.idigbio.org/v2/media/3bf31e60-88f8-4be1-b944-6c9d6d380d80?size=fullsize'
['a005b1a7-b8bc-4f54-b5ca-6b792da288ea']
Downloaded: 'https://api.idigbio.org/v2/media/a005b1a7-b8bc-4f54-b5ca-6b792da288ea?size=fullsize'


uuid
ca458b5b-98b0-438a-a976-5820f60bb8f4    None
258210cd-51f8-40f7-b88b-d7c0ce5eb417    None
3c138af5-01fd-428a-a994-fbe15d49cef2    None
2d3d3e64-b24c-44a9-8d4a-c5cdb8d5e440    None
2b525612-0a31-43bf-814b-7365f915d467    None
5def2622-4165-4ff3-a1d8-110729e07740    None
abca404a-ce55-4c65-b5a8-442f2a260a1d    None
Name: mediarecords, dtype: object

#### Store record data
Just in case it is useful later

In [5]:
# write the record table to 'output.csv' in the current working directory
df.to_csv('output.csv')

#### Cleanup the output directory
__Note:__ the `output_directory` set in the cell below should match the `output_directory` that `retrieve_media` used when the media were downloaded

__Note:__ this does not remove records from the record data if all the associated images are removed.

In [26]:
# minimum acceptable px size of an image's largest dimension (modify as needed)
# for reference, "full size" is typically ~5,760
image_height_thres = 3000

# directory whose contents will be checked
output_directory = './output'
# every (non-hidden) entry directly inside that directory
output_files = glob(f"{output_directory}/*")

def remove_file(fp):
    """Delete the regular file at path fp; do nothing when no such file exists."""
    # guard clause: anything that is not an existing regular file is left alone
    if not os.path.isfile(fp):
        return
    os.remove(fp)

# iterate over all identified files in the glob
# remove those not meeting herbarium-sheet-like image conditions
for fn in output_files:
    # the glob can also match sub-directories; only regular files are candidates
    if not os.path.isfile(fn):
        continue
    try:
        # the context manager closes the file handle before any removal is
        # attempted (an open handle leaks and blocks deletion on Windows)
        with Image.open(fn) as im:
            w, h = im.size
    # This exception means the file is probably corrupted or incomplete
    except UnidentifiedImageError:
        remove_file(fn)
        continue
    # assign height to the larger variable (robust to rotated images)
    if w > h:
        w, h = h, w
    # establish 2 disqualifying conditions
    # make sure the height meets a pixel threshold
    height_condition = h < image_height_thres
    # verify the aspect ratio is in the ball park of expected values
    aspect_condition = not( 1.2 < (h/w) < 1.95)
    # if the image in question meets either of those conditions, remove it from the drive.
    if height_condition or aspect_condition:
        remove_file(fn)