In [None]:
import argparse
import json
import os
import shutil
import urllib
import urllib.request
from multiprocessing import Pool

import pandas as pd
from dwca.read import DwCAReader

In [None]:
# Define the arguments
#
# Every data location below hangs off a single root directory so the notebook
# can be pointed at another machine's data without editing each path.
home_dir = os.getcwd()

# Root of the GBIF trainer data. Set the GBIF_TRAINER_DATA_DIR environment
# variable to override; the default keeps the original author's location.
data_root = os.environ.get(
    "GBIF_TRAINER_DATA_DIR",
    "/Users/lbokeria/Documents/projects/gbif-species-trainer-data",
)

# Species checklist csv, looked up relative to the current working directory.
species_checklist_path = os.path.join(
    home_dir, "species_checklists", "uksi-moths-keys-nodup-small.csv"
)

# Directory holding one "<taxon key>.csv" occurrence table per species.
# The trailing "" keeps the trailing path separator of the original value.
dwca_occurrence_df_path = os.path.join(data_root, "occurrence_dataframes", "")

# Multimedia csv that is matched against occurrence ids to find image URLs.
dwca_multimedia_df_path = os.path.join(
    data_root, "dwca_files", "multimedia_lepidoptera.csv"
)

# Root directory under which images are written as family/genus/species.
output_location_path = os.path.join(data_root, "gbif_images", "sandbox", "")

# Stop collecting images for a species once this many are on disk.
max_data_sp = 10

In [None]:
# Load the multimedia table into memory; the download loop matches its
# "coreid" column against occurrence ids and reads URLs from "identifier".
media_df = pd.read_csv(filepath_or_buffer=dwca_multimedia_df_path)

In [None]:
# Load the species checklist and collect its accepted taxon keys as plain ints
moth_data = pd.read_csv(species_checklist_path)

taxon_keys = [int(raw_key) for raw_key in moth_data["accepted_taxon_key"]]

In [None]:
# Download up to `max_data_sp` images per species in the checklist.
#
# For every taxon key: read its occurrence csv, look up each occurrence id in
# the multimedia table, and save the first listed image as
# <output>/<family>/<genus>/<species>/<occurrence id>.jpg.
# Images already on disk are counted, not downloaded again.


def _download_image(image_url, image_path):
    """Download `image_url` to `image_path` and report whether it succeeded.

    The data is written to a temporary ".part" file and only moved to
    `image_path` once the transfer has finished. An interrupted download
    therefore never leaves a truncated ".jpg" behind that a later run would
    mistake for a completed image.

    Returns:
        True if the image was saved, False if the download failed (the reason
        is printed).
    """
    partial_path = image_path + ".part"
    try:
        urllib.request.urlretrieve(image_url, partial_path)
        os.replace(partial_path, image_path)
        return True
    except Exception as e:
        # Best effort: one bad URL must not abort the whole run, but
        # KeyboardInterrupt/SystemExit are deliberately not caught here.
        print(f"Error downloading {image_url}: {e}", flush=True)
        return False
    finally:
        # Only present if the transfer did not complete (including Ctrl-C).
        if os.path.exists(partial_path):
            os.remove(partial_path)


for i_taxon_key in taxon_keys:

    # get taxa information specific to the species
    taxon_data = moth_data[moth_data["accepted_taxon_key"] == i_taxon_key]

    family_name = taxon_data["family_name"].item()
    genus_name = taxon_data["genus_name"].item()
    species_name = taxon_data["gbif_species_name"].item()
    write_location = os.path.join(
        output_location_path, family_name, genus_name, species_name
    )

    # Read the occurrence dataframe
    occurrence_csv_path = os.path.join(
        dwca_occurrence_df_path, str(i_taxon_key) + ".csv"
    )
    if not os.path.isfile(occurrence_csv_path):
        print(f"No occurrence csv file found for {species_name}, taxon key {i_taxon_key}")
        continue

    i_occ_df = pd.read_csv(occurrence_csv_path)
    total_occ = len(i_occ_df)
    print(f"Downloading for {species_name}", flush=True)

    # creating hierarchical folder structure for image storage
    try:
        os.makedirs(write_location, exist_ok=True)
    except OSError as e:
        print(f"Could not create the directory for {write_location}: {e}", flush=True)
        continue

    image_count = 0
    meta_data = {}  # only used by the commented-out metadata export below

    if total_occ == 0:
        # Nothing to download for this species.
        continue

    for _, row in i_occ_df.iterrows():
        obs_id = row["id"]

        # check occurrence entry in media dataframe
        # NOTE(review): this scans the whole media table for every occurrence;
        # consider indexing media_df by "coreid" once if this becomes slow.
        try:
            image_urls = media_df.loc[media_df["coreid"] == obs_id, "identifier"]
        except Exception as e:
            print(e, flush=True)
            continue

        if image_urls.empty:
            continue

        # An observation may list several images; only the first is used.
        image_url = image_urls.iloc[0]

        # download image
        image_path = os.path.join(write_location, str(obs_id) + ".jpg")
        if os.path.isfile(image_path):
            # Already downloaded by a previous run.
            image_count += 1
        elif _download_image(image_url, image_path):
            image_count += 1
            # m_data = fetch_meta_data(row)
            # meta_data[str(obs_id) + ".jpg"] = m_data
        else:
            continue

        if image_count >= max_data_sp:
            break

    # with open(write_location + "/" + "meta_data.json", "w") as outfile:
    #     json.dump(meta_data, outfile)
    print(f"Downloading complete for {species_name} with {image_count} images.",
          flush=True)