Example code for using the iDigBio API to harvest specimen images

In [1]:
# imports for record identification
import pandas as pd
import idigbio

# imports for media retrieval
import requests
import shutil
import os
import time

# Imports for cleaning the retrieved media
from PIL import Image
from PIL import UnidentifiedImageError
from glob import glob

In [2]:
# create iDigBio's pandas-flavoured api client; searches made through it return dataframes
api = idigbio.pandas()

# iDigBio's Python API documentation:
#      https://github.com/iDigBio/idigbio-python-client
# iDigBio's General API documentation:
#      https://github.com/idigbio/idigbio-search-api/wiki#records

#### iDigBio has a pretty complicated query format.
see: https://github.com/iDigBio/idigbio-search-api/wiki/Query-Format

For example, you can query for the existence of data by using a nested dict after a field name:

>  "scientificname": {
    "type": "exists"
      }

Or query nested data within records by delineating the hierarchy with a period.
For example, the 'data' col contains nested record data, such as reproductive condition, so one could query for its existence:

> data.dwc:reproductiveCondition : {
        "type":"exists"}

The 'flags' col indicates data quality issues, for example any changes which were performed upon ingestion, such as a change of specificepithet in order to align with GBIF taxonomy. Flags are provided as a list of terms. Since the default field query is a partial match, you can query it by just checking for your flag.

Flags documentation: https://github.com/iDigBio/idigbio-search-api/wiki/Data-Quality-Flags

> "flags":"dwc_specificepithet_replaced"

Begin by setting a few variables

In [11]:
# set variables for the query
#sciName = 'Tsuga canadensis'
sciName = 'Iris setosa'
basisOfRecord = 'PreservedSpecimen' # PreservedSpecimen indicates a vouchered record

# for testing purposes restrict the query to a handful of results (7 here)
limit = 7

Build and submit the query

*Note: It is probably necessary to modify this query somewhat depending on the use case.*

In [12]:
# assemble the search terms as a dictionary; it is handed to the api as the `rq` argument
my_query = {
    'scientificname': sciName,
    'hasImage': True,
    'basisofrecord': basisOfRecord,
    # could be useful but only here as an example
    'indexData.dwc:reproductiveCondition': {'type': 'exists'},
    # only really here as an example
    'flags': 'dwc_parentnameusageid_added',
}

# submit the query through iDigBio's api. The matching records come back as a dataframe.
df = api.search_records(rq=my_query, limit=limit)

# spot check what was returned:
# first the columns available, then the total quantity of results as (rows, columns)
display(df.columns, df.shape)

Index(['startdayofyear', 'continent', 'verbatimeventdate', 'country',
       'collectioncode', 'dqs', 'countrycode', 'datecollected', 'flags',
       'recordset', 'hasImage', 'indexData', 'hasMedia', 'taxonid',
       'catalognumber', 'collector', 'basisofrecord', 'datemodified',
       'taxonrank', 'class', 'order', 'mediarecords', 'locality',
       'specificepithet', 'scientificname', 'occurrenceid', 'stateprovince',
       'recordnumber', 'kingdom', 'taxonomicstatus', 'eventdate',
       'canonicalname', 'phylum', 'genus', 'minelevation', 'datasetid', 'etag',
       'institutioncode', 'family', 'collectionid', 'recordids', 'highertaxon',
       'geopoint', 'coordinateuncertainty', 'county'],
      dtype='object')

(7, 45)

# Expand nested data

This example requested data containing "reproductiveCondition" information nested in the indexData col. It will be easier to work with if that nested data is expanded to a top-level column.

In [13]:
# small helper so the column expansion stays readable
def expand_reproductiveCondition(indexData):
    """Return the 'dwc:reproductiveCondition' entry of `indexData`, or "" when it is absent."""
    return indexData.get('dwc:reproductiveCondition', "")

# apply the expansion function to every entry of the parent col (i.e., 'indexData')
# and store the extracted values as a new top level column on the same dataframe
df['reproductiveCondition'] = df['indexData'].apply(expand_reproductiveCondition)
# check out the results (could be fun); sample() draws 4 rows at random,
# so the rows shown differ from run to run
df['reproductiveCondition'].sample(4)

uuid
b17ffd30-6207-4a13-9e96-b22fbf97bd51    flowers
54fb847b-3be3-497a-b523-739071113c2e    flowers
734b35b0-7355-4030-8d8a-0fc9e82b2d94        Veg
501039be-cae2-4ac3-9f12-46ac44e07b3d    flowers
Name: reproductiveCondition, dtype: object

#### Retrieve media from query results

Modified from: https://github.com/iDigBio/idigbio-python-client/blob/master/examples/fetch_media/fetch_media.py

In [14]:

def get_media_with_naming(output_dir, media_url, uuid, SIZE, timeout=60, pause=0.7):
    """
    Download a media file to a directory and name it based on the input parameters.

     'output_dir' controls where the download is placed (created when missing).
     'media_url' is the url / link to the media that will be downloaded.
     'uuid' is used to uniquely identify the output filename.
     'SIZE' is the class of image derivative, useful in the output filename.
     'timeout' is how many seconds to wait on the server before giving up.
     'pause' is how many seconds to sleep after a successful download.

    Returns True when the file was downloaded and False when the request failed.
    Errors raised while writing to disk are not caught.
    """
    # Output filenames will be of the form: {mediarecord_uuid}_{SIZE}.jpg
    local_filepath = os.path.join(output_dir,  uuid + '_' + SIZE + '.jpg')
    # Stream into a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_filepath = local_filepath + '.part'

    # an empty output_dir means "current directory", which needs no creation
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        # without a timeout a stalled server would hang the notebook indefinitely
        with requests.get(media_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)
    # RequestException is the base class of HTTPError and ConnectionError and
    # also covers timeouts and transfers that break off part way through
    except requests.exceptions.RequestException as e:
        print('*** HTTP ERROR: {0}'.format(e))
        # discard whatever was written before the failure
        if os.path.isfile(partial_filepath):
            os.remove(partial_filepath)
        return False

    # the file is closed and complete at this point: publish it under its final name
    os.replace(partial_filepath, local_filepath)

    # after a success, wait a moment before the next request to avoid an ip ban
    time.sleep(pause)

    return True

def retrieve_media(mediarecords, output_directory='./output', SIZE='fullsize'):
    """
    Retrieve every media file listed in one entry of the 'mediarecords' column.

     'mediarecords' is an iterable of media uuids. A single uuid string is
         also accepted, and a non-iterable entry (e.g. None or NaN) is skipped.
     'output_directory' is the folder the downloads are placed in.
     'SIZE' is the image derivative to request.

    Returns None; the outcome of each download is printed.
    """
    # SIZES = ["thumbnail", "webview", "fullsize"]

    if isinstance(mediarecords, str):
        # a lone uuid string would otherwise be iterated character by character
        mediarecords = [mediarecords]
    elif not hasattr(mediarecords, '__iter__'):
        # nothing that can be iterated (e.g. None or NaN): nothing to download
        return

    for media_uuid in mediarecords:
        # construct media URL
        media_url = 'https://api.idigbio.org/v2/media/' + media_uuid + '?size=' + SIZE
        # The retrieval function returns bool based on success
        # could modify this to store report somewhere... for now just print success / fail
        if get_media_with_naming(output_directory, media_url, media_uuid, SIZE):
            print("Downloaded: '{0}'".format(media_url))
        else:
            print("Failed to retrieve: '{0}'".format(media_url))
            print(media_uuid)

#### Extract the media from the identified records 

In [15]:
# run the downloader over each record's media list; retrieve_media returns nothing,
# so the result displayed for this cell is just a column of None
df['mediarecords'].apply(retrieve_media)

Downloaded: 'https://api.idigbio.org/v2/media/5c7a95f9-0fa3-4ffd-8f7f-73fa58713b40?size=fullsize'
Downloaded: 'https://api.idigbio.org/v2/media/6ba1e45c-3a38-4adc-ae4f-160be0ee47b5?size=fullsize'
Downloaded: 'https://api.idigbio.org/v2/media/b0abd802-3452-499f-b553-fdfbd4fcd1f5?size=fullsize'
Downloaded: 'https://api.idigbio.org/v2/media/c60bd571-5016-4dda-be84-712ffbb6e526?size=fullsize'
Downloaded: 'https://api.idigbio.org/v2/media/4a224def-2f37-4954-be58-8045d27c4913?size=fullsize'
Downloaded: 'https://api.idigbio.org/v2/media/352c9203-3a08-44c2-9a8b-c02e944e89c2?size=fullsize'
Downloaded: 'https://api.idigbio.org/v2/media/0d1cc33a-4a7b-4135-a777-2993d629be45?size=fullsize'


uuid
734b35b0-7355-4030-8d8a-0fc9e82b2d94    None
b17ffd30-6207-4a13-9e96-b22fbf97bd51    None
79f9b071-f2cc-4b59-b22c-99345ab2bf8c    None
501039be-cae2-4ac3-9f12-46ac44e07b3d    None
545f2097-915f-45c0-b2a6-f3d35ab05f7f    None
54fb847b-3be3-497a-b523-739071113c2e    None
8b807b8a-96ac-41fc-b4ac-be5380e79db1    None
Name: mediarecords, dtype: object

In [16]:
# look at a single record (the fourth row, selected by position) to spot check its fields
df.iloc[3:4]

Unnamed: 0_level_0,startdayofyear,continent,verbatimeventdate,country,collectioncode,dqs,countrycode,datecollected,flags,recordset,...,etag,institutioncode,family,collectionid,recordids,highertaxon,geopoint,coordinateuncertainty,county,reproductiveCondition
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
501039be-cae2-4ac3-9f12-46ac44e07b3d,196,north america,,united states,plants,0.188406,usa,1945-07-15T00:00:00+00:00,"[geopoint_datum_missing, rev_geocode_eez, dwc_...",e2def7e2-1455-4856-9823-6d3738417d24,...,9e392dea834ca8008102cd9f05b09b080ff4aa1a,min,iridaceae,f292c83d-305c-4b22-97cc-cd9c72e2931e,[e2def7e2-1455-4856-9823-6d3738417d24\93de8e88...,,"{'lon': -173.213793, 'lat': 52.884659}",,,flowers


#### Store record data
Just in case it is useful later

In [22]:
# save the record table to 'output.csv' (a relative path)
df.to_csv('output.csv')

#### Cleanup the output directory
__Note:__ the `output_directory` set in this cell should match the `output_directory` that was passed to `retrieve_media`

__Note:__ this does not remove records from the record data if all the associated images are removed.

In [17]:
# minimum acceptable px size of an image's largest dimension; modify to taste
# for reference, "full size" is typically ~5,760
image_height_thres = 3000

# folder to clean up, and the list of every path found directly inside it
output_directory = './output'
output_files = glob(f"{output_directory}/*")

def remove_file(fp):
    """Delete the file at path `fp`; a path that is not an existing file is left alone."""
    if not os.path.isfile(fp):
        return
    os.remove(fp)

# iterate over all identified files in the glob
# remove those that fail the size / aspect ratio conditions below
for fn in output_files:
    # anything that is not a regular file (e.g. a sub-directory) is left alone
    if not os.path.isfile(fn):
        continue
    try:
        # Image.open keeps the file open; the context manager closes it again
        # so the handle is not leaked and the file can be removed afterwards
        with Image.open(fn) as im:
            w, h = im.size
    # This exception means the file is probably corrupted or incomplete
    except UnidentifiedImageError:
        remove_file(fn)
        continue
    # assign height to the larger variable (robust to rotated images)
    if w > h:
        w, h = h, w
    # establish 2 disqualifying conditions
    # make sure the height meets a pixel threshold
    height_condition = h < image_height_thres
    # verify the aspect ratio is in the ball park of expected values
    aspect_condition = not (1.2 < (h / w) < 1.95)
    # if the image in question meets either of those conditions, remove it from the drive.
    if height_condition or aspect_condition:
        remove_file(fn)

['./output/4a224def-2f37-4954-be58-8045d27c4913_fullsize.jpg', './output/b0abd802-3452-499f-b553-fdfbd4fcd1f5_fullsize.jpg', './output/6ba1e45c-3a38-4adc-ae4f-160be0ee47b5_fullsize.jpg', './output/431f2252-b713-4a6b-89c3-a16456ccded8_fullsize.jpg', './output/0d1cc33a-4a7b-4135-a777-2993d629be45_fullsize.jpg', './output/c60bd571-5016-4dda-be84-712ffbb6e526_fullsize.jpg', './output/5c7a95f9-0fa3-4ffd-8f7f-73fa58713b40_fullsize.jpg', './output/352c9203-3a08-44c2-9a8b-c02e944e89c2_fullsize.jpg', './output/f03e708a-0485-4eb7-9e67-4bc7fca62392_fullsize.jpg']
