In [None]:
import datetime
import json
import logging
import os
import sys
import time
import urllib
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd


In [None]:
# Define the arguments
# Everything below is derived from two roots: the notebook's working directory
# (species checklists) and a machine-specific data directory (GBIF exports and
# downloaded images).
home_dir = os.getcwd()

# The data directory differs between the Linux and macOS machines.
if sys.platform.startswith("linux"):
    data_dir = "/bask/projects/v/vjgo8416-amber/data/gbif-species-trainer-AMI-fork/"
elif sys.platform == "darwin":
    data_dir = "/Users/lbokeria/Documents/projects/gbif-species-trainer-data/"
else:
    # Fail here with a clear message: without data_dir every path below would
    # otherwise die with an unrelated-looking NameError.
    raise RuntimeError(
        f"Not linux or mac! No data_dir is configured for platform '{sys.platform}'."
    )

# CSV listing the species (and their GBIF taxon keys) to download
species_checklist_path = os.path.join(home_dir,"species_checklists","uksi-moths-keys-nodup-small.csv")

# Folder holding one occurrence CSV per taxon key ("<taxon_key>.csv")
dwca_occurrence_df_path = os.path.join(data_dir,"occurrence_dataframes")

# Multimedia table of the Darwin Core Archive export
dwca_multimedia_df_path = os.path.join(data_dir,"dwca_files","multimedia_lepidoptera.csv")

# Root folder under which images are written as family/genus/species
output_location_path = os.path.join(data_dir,"gbif_images","sandbox")

In [None]:
# Read the multimedia file
# Loaded once at notebook level; fetch_image_data looks up image URLs in it
# through the "coreid" and "identifier" columns.
media_df = pd.read_csv(dwca_multimedia_df_path)

In [None]:
# Load the species checklist; each row describes one species to download
moth_data = pd.read_csv(species_checklist_path)

# Accepted GBIF taxon keys as plain Python ints, in checklist order
taxon_keys = [int(raw_key) for raw_key in list(moth_data["accepted_taxon_key"])]

In [None]:
def fetch_meta_data(data: pd.Series):
    """Return the relevant metadata for a single GBIF observation.

    Args:
        data: one occurrence record, e.g. a row yielded by
            ``DataFrame.iterrows()``. Any mapping-like object with a ``.get``
            method (such as a plain ``dict``) works as well.

    Returns:
        dict mapping each field name listed below to its value. Fields that are
        missing from ``data`` or hold a null value (NaN/None/NaT) are stored as
        the string ``"NA"``. NumPy scalars are converted to native Python
        values so the result can be passed straight to ``json.dump``.
    """

    fields = [
        "decimalLatitude",
        "decimalLongitude",
        "order",
        "family",
        "genus",
        "species",
        "acceptedScientificName",
        "year",
        "month",
        "day",
        "datasetName",
        "taxonID",
        "acceptedTaxonKey",
        "lifeStage",
        "basisOfRecord",
    ]

    meta_data = {}

    for field in fields:
        # .get() yields None for an absent column, which pd.isna treats as null
        value = data.get(field)

        if pd.isna(value):
            meta_data[field] = "NA"
            continue

        # json cannot serialise NumPy scalars such as np.int64
        if isinstance(value, np.generic):
            value = value.item()

        meta_data[field] = value

    return meta_data

In [None]:
# Then, define a function that manages the parallel execution:
def download_images_concurrently(taxon_keys,use_parallel,n_workers):
    """Run ``fetch_image_data`` for every taxon key and report the elapsed time.

    Args:
        taxon_keys: iterable of taxon keys, each passed to ``fetch_image_data``.
        use_parallel: when truthy, the keys are processed by a thread pool;
            otherwise they are processed one after another in this thread.
        n_workers: ``max_workers`` of the thread pool (ignored when
            ``use_parallel`` is falsy).
    """
    started_at = time.time()

    if not use_parallel:
        # Sequential mode announces each key before handling it
        for key in taxon_keys:
            print(f"Calling for {key}")
            fetch_image_data(key)
    else:
        with ThreadPoolExecutor(max_workers=n_workers) as pool:
            # Draining the iterator waits for every task and re-raises the
            # first exception raised by a worker, in input order.
            for _ in pool.map(fetch_image_data, taxon_keys):
                pass

    elapsed_seconds = round(time.time() - started_at)

    print("Finished downloading for the given list! Time taken:",
          elapsed_seconds,
          "seconds",
          flush=True)

In [None]:
def setup_logger():
    """Send root-logger output to a new timestamped file under ``log_files``.

    The ``log_files`` directory is created (relative to the current working
    directory) when it does not exist. Every handler currently attached to the
    root logger is closed and detached, then the root logger is configured at
    INFO level to write to ``download_log_<YYYYmmdd_HHMMSS>.log``.
    """
    log_dir = "log_files"

    # Make sure the target directory is there before the log file is opened
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # A per-call timestamp keeps log file names unique
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    log_filename = os.path.join(log_dir, f'download_log_{stamp}.log')

    root = logging.getLogger()

    # basicConfig does nothing while the root logger still has handlers,
    # so detach (and close) whatever is attached first.
    while root.handlers:
        stale_handler = root.handlers[0]
        stale_handler.close()
        root.removeHandler(stale_handler)

    logging.basicConfig(
        filename=log_filename,
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )


In [None]:
def fetch_image_data(i_taxon_key: int):
    """Download images for one species and keep its ``meta_data.json`` up to date.

    Relies on notebook-level state rather than arguments: ``moth_data``,
    ``media_df``, ``output_location_path``, ``dwca_occurrence_df_path``,
    ``skip_non_adults`` and ``max_data_sp`` must all be defined before the call.

    Images are written to ``<output_location_path>/<family>/<genus>/<species>/``
    as ``<occurrence id>.jpg``. ``meta_data.json`` in the same folder holds one
    entry per attempted image; entries from earlier runs are honoured, so images
    already downloaded are only counted and URLs already known to be broken,
    corrupted or thumbnails are not retried. Processing stops once
    ``max_data_sp`` images are available for the species.

    Args:
        i_taxon_key: value of ``accepted_taxon_key`` identifying exactly one
            row of ``moth_data``.

    Returns:
        None. Progress is printed; problems are also sent to the root logger.

    Raises:
        ValueError: if ``i_taxon_key`` does not match exactly one checklist row
            (raised by ``Series.item()``).
    """
    # Root logger, i.e. the one configured by setup_logger(); fetched here so
    # the function does not depend on a `logger` global from another cell.
    root_logger = logging.getLogger()

    # get taxa information specific to the species
    taxon_data = moth_data[moth_data["accepted_taxon_key"] == i_taxon_key]

    family_name = taxon_data["family_name"].item()
    genus_name = taxon_data["genus_name"].item()
    species_name = taxon_data["gbif_species_name"].item()
    write_location = os.path.join(output_location_path, family_name, genus_name, species_name)
    meta_data_path = os.path.join(write_location, "meta_data.json")

    # Start from the metadata of a previous run when there is one
    if os.path.isfile(meta_data_path):
        with open(meta_data_path) as file:
            species_meta_data = json.load(file)
    else:
        species_meta_data = {}

    # Read the occurrence dataframe
    occurrence_path = os.path.join(dwca_occurrence_df_path, str(i_taxon_key) + ".csv")
    if not os.path.isfile(occurrence_path):
        root_logger.warning(
            f"No occurrence csv file found for {species_name}, taxon key {i_taxon_key}"
            )
        return

    i_occ_df = pd.read_csv(occurrence_path)
    print(f"Downloading for {species_name}", flush=True)

    # creating hierarchical folder structure for image storage
    try:
        os.makedirs(write_location, exist_ok=True)
    except OSError as err:
        print(f"Could not create the directory for {write_location}", flush=True)
        root_logger.error(f"Could not create the directory for {write_location}: {err}")
        return

    image_count = 0

    if len(i_occ_df) != 0:
        occurrence_ids = i_occ_df["id"]

        # Resolve the image URL of every occurrence with a single pass over the
        # multimedia table instead of scanning the whole table once per row.
        # When an occurrence has several media rows, the first one is used.
        try:
            species_media = media_df.loc[
                media_df["coreid"].isin(occurrence_ids), ["coreid", "identifier"]
            ].dropna(subset=["coreid"])
            first_url_by_obs = (
                species_media.drop_duplicates(subset="coreid")
                .set_index("coreid")["identifier"]
                .to_dict()
            )
        except Exception as err:
            print(err, flush=True)
            root_logger.error(f"Could not look up media for {species_name}: {err}")
            return

        try:
            for _, row in i_occ_df.iterrows():

                # Optionally keep only adults; a missing life stage is kept
                if (
                    skip_non_adults
                    and not pd.isna(row["lifeStage"])
                    and row["lifeStage"] != "Adult"
                ):
                    continue

                obs_id = row["id"]
                image_name = str(obs_id) + ".jpg"

                # Is there already an image, or is it corrupt, a thumbnail, or a broken URL?
                previous = species_meta_data.get(image_name)
                if previous is not None:

                    if previous["image_is_downloaded"]:
                        image_count += 1

                        if image_count >= max_data_sp:
                            break
                        continue

                    if (
                        (not previous["image_url_works"])
                        or previous["image_is_corrupted"]
                        or previous["image_is_thumbnail"]
                    ):
                        continue

                # Occurrences without any media entry are skipped silently
                if obs_id not in first_url_by_obs:
                    continue
                image_url = first_url_by_obs[obs_id]

                # Download to a side file and rename on success, so a failed or
                # interrupted transfer never leaves a truncated .jpg behind.
                image_path = os.path.join(write_location, image_name)
                partial_path = image_path + ".part"
                try:
                    urllib.request.urlretrieve(image_url, partial_path)
                    os.replace(partial_path, image_path)
                except Exception as err:
                    # Exception (not a bare except) lets KeyboardInterrupt and
                    # SystemExit stop the run instead of being recorded as a
                    # broken URL.
                    print(f"Error downloading URL: '{image_url}'")
                    root_logger.warning(
                        f"Error downloading URL '{image_url}' for {species_name}: {err}"
                    )
                    try:
                        if os.path.exists(partial_path):
                            os.remove(partial_path)
                    except OSError:
                        pass
                    url_works = False
                    image_downloaded = False
                else:
                    image_count += 1
                    url_works = True
                    image_downloaded = True

                # Get meta data for this occurrence
                occ_meta_data = fetch_meta_data(row)
                occ_meta_data["image_is_downloaded"] = image_downloaded
                occ_meta_data["image_url_works"] = url_works
                # Left empty here; not determined by this function
                occ_meta_data["image_is_corrupted"] = ""
                occ_meta_data["image_is_thumbnail"] = ""

                species_meta_data[image_name] = occ_meta_data

                if image_count >= max_data_sp:
                    break
        finally:
            # Dump metadata even when the loop is cut short by an error, so
            # images already on disk stay accounted for. Writing to a side file
            # first keeps the previous meta_data.json intact if the dump fails.
            tmp_meta_path = meta_data_path + ".tmp"
            with open(tmp_meta_path, "w") as outfile:
                json.dump(species_meta_data, outfile)
            os.replace(tmp_meta_path, meta_data_path)

    print(f"Downloading complete for {species_name} with {image_count} images.",
            flush=True)

    return
    

In [None]:
# i_taxon_key = 1940807

# i_occ_df = pd.read_csv(os.path.join(dwca_occurrence_df_path,
#                                     str(i_taxon_key) + ".csv"))

# i_occ_df

In [None]:
# dwca_occurrence_df_path

In [None]:
# Run settings (read as globals by the download functions)
max_data_sp = 300        # stop once a species has this many images
skip_non_adults = True   # skip occurrences whose life stage is known and not "Adult"
use_parallel = True      # process species in a thread pool
n_workers = 10           # size of the thread pool

# Route log messages to a fresh timestamped file
setup_logger()
logger = logging.getLogger()

# Start the run over every taxon key of the checklist
download_images_concurrently(taxon_keys, use_parallel=use_parallel, n_workers=n_workers)