In [1]:
import argparse
import json
import os
import shutil
import urllib
from concurrent.futures import ThreadPoolExecutor
import time

import pandas as pd
from dwca.read import DwCAReader

In [3]:
# Define the arguments
home_dir = os.getcwd()

data_dir = "/bask/homes/r/rybf4168/vjgo8416-amber/data/gbif-species-trainer-AMI-fork"

species_checklist_path = os.path.join(home_dir,"species_checklists","uksi-moths-keys-nodup-small.csv")

dwca_occurrence_df_path = os.path.join(data_dir,"occurrence_dataframes")

dwca_multimedia_df_path = os.path.join(data_dir,"dwca_files","multimedia_lepidoptera.csv")

output_location_path = os.path.join(data_dir,"gbif_images","sandbox")

In [4]:
# Read the multimedia file
media_df = pd.read_csv(dwca_multimedia_df_path)

In [5]:
# read species list
moth_data = pd.read_csv(species_checklist_path)

taxon_keys = list(moth_data["accepted_taxon_key"])
taxon_keys = [int(taxon) for taxon in taxon_keys]

In [6]:
def fetch_meta_data(data: pd.DataFrame):
    """returns the relevant metadata for a GBIF observation"""

    fields = [
        "decimalLatitude",
        "decimalLongitude",
        "order",
        "family",
        "genus",
        "species",
        "acceptedScientificName",
        "year",
        "month",
        "day",
        "datasetName",
        "taxonID",
        "acceptedTaxonKey",
        "lifeStage",
        "basisOfRecord",
    ]

    meta_data = {}

    for field in fields:
        if pd.isna(data[field]):
            meta_data[field] = "NA"
        else:
            meta_data[field] = data[field]

    return meta_data

In [2]:
for count, i_taxon_key in enumerate(taxon_keys):
    
    fetch_image_data(i_taxon_key)
    
print("Finished downloading for the given list!", flush=True)


NameError: name 'taxon_keys' is not defined

In [13]:
def fetch_image_data(i_taxon_key: int):
    species_meta_data = {}

    # get taxa information specific to the species
    taxon_data = moth_data[moth_data["accepted_taxon_key"] == i_taxon_key]

    family_name         = taxon_data["family_name"].item()
    genus_name          = taxon_data["genus_name"].item()
    species_name        = taxon_data["gbif_species_name"].item()
    write_location      = os.path.join(output_location_path,family_name,genus_name,species_name)

    # Read the occurrence dataframe
    if os.path.isfile(os.path.join(dwca_occurrence_df_path,
                                    str(i_taxon_key) + ".csv")): 
        i_occ_df = pd.read_csv(os.path.join(dwca_occurrence_df_path,
                                            str(i_taxon_key) + ".csv"))
        total_occ = len(i_occ_df)
        print(f"Downloading for {species_name}", flush=True) 
    else:
        print(f"No occurrence csv file found for {species_name}, taxon key {i_taxon_key}")
        return

    # creating hierarchical folder structure for image storage
    if not os.path.isdir(write_location):
        try:
            os.makedirs(write_location)
        except:
            print(f"Could not create the directory for {write_location}", flush=True)
            return
        
    image_count = 0

    if total_occ != 0:
        # print(f"{species_name} has some occurrences")
        
        for idx, row in i_occ_df.iterrows():
            obs_id = row["id"]

            # check occurrence entry in media dataframe
            try:
                media_entry = media_df.loc[media_df["coreid"] == obs_id]

                if not media_entry.empty:
                    # print(f"{species_name} has some media")
                          
                    if len(media_entry) > 1:  # multiple images for an observation
                        media_entry = media_entry.iloc[0, :]
                        image_url = media_entry["identifier"]
                    else:
                        image_url = media_entry["identifier"].item()
                else:
                    
                    # print(f"{species_name} has NO media")
                    continue
                    
            except Exception as e:
                print(e, flush=True)
                continue

            # download image
            if os.path.isfile(write_location + "/" + str(obs_id) + ".jpg"):
                image_count += 1
            else:
                try:
                    urllib.request.urlretrieve(
                        image_url, write_location + "/" + str(obs_id) + ".jpg"
                    )
                    image_count += 1
                    # m_data = fetch_meta_data(row)
                    # meta_data[str(obs_id) + ".jpg"] = m_data
                except:
                    print(f"Error downloading URL: '{image_url}'")
                    continue
                
            # Get meta data for this occurrence
            occ_meta_data = fetch_meta_data(row)
            species_meta_data[str(obs_id) + ".jpg"] = occ_meta_data
            
            if image_count >= max_data_sp:
                break
            
        # Dump metadata
        with open(write_location + "/" + "meta_data.json", "w") as outfile:
            json.dump(species_meta_data, outfile)            
            
    print(f"Downloading complete for {species_name} with {image_count} images.",
            flush=True)
    
    return
    

In [15]:
# Then, define a function that manages the parallel execution:
def download_images_concurrently(taxon_keys,use_parallel,n_workers):
    
    begin = time.time()

    
    if use_parallel:
        with ThreadPoolExecutor(max_workers=n_workers) as executor:

            # You can use the executor to parallelize your function call:
            results = list(executor.map(fetch_image_data, taxon_keys))
    
    else:
       
        for i_taxon_key in taxon_keys:
            print(f"Calling for {i_taxon_key}")
            fetch_image_data(i_taxon_key)
   

    end = time.time()
            
    print("Finished downloading for the given list! Time taken:", 
          round(end - begin), 
          "seconds",
          flush=True)
    
    
n_workers = 100    
use_parallel = True
max_data_sp = 100

# Lastly, call the function with your taxon keys:
download_images_concurrently(taxon_keys,use_parallel,n_workers)


No occurrence csv file found for Pyropteron muscaeformis, taxon key 4525870
Downloading for Pyropteron chrysidiformis
Downloading for Sesia bembeciformis
Downloading for Paranthrene tabaniformis
Downloading for Bembecia ichneumoniformis
Downloading for Synanthedon culiciformis
Downloading for Synanthedon flaviventris
Downloading for Synanthedon spheciformis
Downloading for Synanthedon andrenaeformis
Downloading for Pennisetia hylaeiformis
Downloading for Synanthedon scoliaeformis
Downloading for Synanthedon vespiformisDownloading for Synanthedon formicaeformis

Downloading for Synanthedon tipuliformis
Downloading for Sesia apiformis
Downloading for Synanthedon myopaeformis
Error downloading URL: 'nan'
Error downloading URL: 'nan'
Error downloading URL: 'http://ww2.bgbm.org/specimentool/Animals/Insects/Synanthedon_cf_vespiformis_Dürbye_Aserbaidschan_2010_1.jpg'
Error downloading URL: 'nan'
Error downloading URL: 'nan'
Error downloading URL: 'nan'
Error downloading URL: 'https://static.i

KeyboardInterrupt: 

Downloading complete for Synanthedon scoliaeformis with 100 images.


In [10]:
i_taxon_key = 1940838

i_occ_df = pd.read_csv(os.path.join(dwca_occurrence_df_path,
                                    str(i_taxon_key) + ".csv"))

In [11]:
i_occ_df

Unnamed: 0,id,datasetName,basisOfRecord,lifeStage,year,month,day,decimalLatitude,decimalLongitude,taxonID,order,family,genus,acceptedTaxonKey,species,acceptedScientificName,geodeticDatum
0,3899164541,Artportalen,HUMAN_OBSERVATION,Imago,2022.0,7.0,28.0,57.245480,17.043780,urn:lsid:dyntaxa.se:Taxon:100464,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
1,1884690922,Artportalen,HUMAN_OBSERVATION,Imago,2018.0,7.0,20.0,56.548420,13.083560,urn:lsid:dyntaxa.se:Taxon:100464,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
2,1428784258,Artportalen,HUMAN_OBSERVATION,Imago,1995.0,7.0,2.0,58.353530,11.323280,urn:lsid:dyntaxa.se:Taxon:100464,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
3,3350992143,,PRESERVED_SPECIMEN,,2007.0,7.0,10.0,59.419998,10.590000,ASV:e1b9611ada72f254fbd8d4e78660ab35,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
4,3880383834,Artportalen,HUMAN_OBSERVATION,Imago,2022.0,7.0,22.0,57.675460,15.001410,urn:lsid:dyntaxa.se:Taxon:100464,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4957,3717121326,,HUMAN_OBSERVATION,Unknown,2020.0,6.0,16.0,51.650000,5.250000,,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
4958,2836468819,,HUMAN_OBSERVATION,Imago,2017.0,7.0,29.0,50.800000,5.900000,,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
4959,2834611808,,HUMAN_OBSERVATION,Imago,2019.0,7.0,29.0,51.218922,6.959952,,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
4960,3906431714,,HUMAN_OBSERVATION,Imago,2021.0,8.0,21.0,50.944732,6.865700,,Lepidoptera,Sesiidae,Bembecia,1940838,Bembecia ichneumoniformis,Bembecia ichneumoniformis (Denis & Schiffermül...,WGS84
