In [1]:
import pandas as pd
import os
import sys
import json
import logging
import datetime

In [2]:
# Root directory of the downloaded GBIF data; the location differs between
# the Linux and macOS machines this notebook is run on.
if sys.platform.startswith("linux"):
    data_dir = "/bask/projects/v/vjgo8416-amber/data/gbif_download_standalone/"
elif sys.platform == "darwin":
    data_dir = "/Users/lbokeria/Documents/projects/gbif-species-trainer-data/"
else:
    # Stop here with a clear message: carrying on would only fail a few lines
    # later with a NameError on the undefined data_dir.
    raise RuntimeError(f"Not linux or mac! Unsupported platform: {sys.platform!r}")

# File stem of the species checklist CSV to compute statistics for.
checklist_name = "costarica-moths-keys-nodup"

# Directory that holds the downloaded images.
gbif_img_loc = os.path.join(data_dir, "gbif_images", "tests2")

# Species checklist that the image counts will be added to.
df = pd.read_csv(os.path.join("../species_checklists", checklist_name + ".csv"))

In [3]:
def setup_logger(logger_name, log_suffix):
    """Configure a named logger that writes INFO-and-above records to a new file.

    The log file is created as
    ``data_stats_log_files/<log_suffix>_<YYYYmmdd_HHMMSS>.log`` relative to the
    current working directory and is written as UTF-8. Handlers already
    attached to the logger are closed and removed first, so calling this again
    for the same name (e.g. when re-running a notebook cell) does not duplicate
    log lines.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_suffix: Leading part of the log file name (the timestamp follows it).

    Returns:
        The configured ``logging.Logger``. It is the same object that
        ``logging.getLogger(logger_name)`` returns, so existing callers that
        ignore the return value keep working.
    """
    # Directory where the log files are saved
    log_dir = "data_stats_log_files"

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(log_dir, exist_ok=True)

    # The timestamp gives each setup call (to one-second resolution) its own file
    timestamp    = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_filename = os.path.join(log_dir, f"{log_suffix}_{timestamp}.log")

    # Named logger (not the root logger), so each report gets its own file
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Drop handlers left over from a previous call; iterate over a copy because
    # removeHandler mutates logger.handlers
    for handler in logger.handlers[:]:
        handler.close()
        logger.removeHandler(handler)

    formatter    = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    # Explicit UTF-8 so log messages containing non-ASCII text are written the
    # same way on every platform
    file_handler = logging.FileHandler(log_filename, encoding="utf-8")
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)

    return logger

In [4]:
# One timestamped log file per report type.

# Disagreements between the on-disk image count and the metadata count
setup_logger('mismatch_logger', 'mismatch_log')
mismatch_logger = logging.getLogger('mismatch_logger')

# Species directories whose metadata file could not be read
setup_logger('metadata_logger', 'metadata_log')
metadata_logger = logging.getLogger('metadata_logger')

# Currently disabled:
# setup_logger('image_logger', 'image_log')

In [5]:
# For every species in the checklist, count the .jpg files in its image
# directory and cross-check that count against the species' meta_data.json.
# The on-disk count is stored in df["n_imgs"]; problems are printed and logged.

# Integer default, used as-is for species that have no image directory
df["n_imgs"] = 0

for idx, row in df.iterrows():

    family  = row["family_name"]
    genus   = row["genus_name"]
    species = row["gbif_species_name"]

    # Images are looked up under <gbif_img_loc>/<family>/<genus>/<species>/
    species_dir = os.path.join(gbif_img_loc, family, genus, species)

    if not os.path.isdir(species_dir):
        # No directory for this species: keep the default count of 0
        continue

    n_images_on_disk = sum(
        1 for f in os.listdir(species_dir) if f.lower().endswith('.jpg')
    )

    # Record the on-disk count regardless of what the metadata check finds
    df.loc[idx, "n_imgs"] = n_images_on_disk

    meta_path = os.path.join(species_dir, "meta_data.json")

    # Confirm the metadata file exists and is valid JSON. The parsed value is
    # not needed; the counting below is done through pandas. UTF-8 is given
    # explicitly so this read does not depend on the platform's locale
    # encoding (pd.read_json below already defaults to UTF-8).
    try:
        with open(meta_path, encoding="utf-8") as file:
            json.load(file)
    except Exception as e:
        msg = f"No metadata for {species_dir}. Error {e}"
        print(msg)
        metadata_logger.warning(msg)
        continue

    # Count the images the metadata marks as downloaded. If counting fails the
    # value stays 0, so a non-empty directory is still reported as a mismatch.
    md2_n_imgs_downloaded = 0
    try:
        md2 = pd.read_json(meta_path, orient='index')
        if not md2.empty:
            md2_n_imgs_downloaded = md2["image_is_downloaded"].sum()
    except Exception as e:
        msg = f"{species} error counting dataframe way: {e}"
        print(msg)
        # Also keep a record in the log file, not just in the cell output
        metadata_logger.warning(msg)

    # Do n images match?
    if n_images_on_disk != md2_n_imgs_downloaded:
        msg = (
            f"Mismatch! File count {n_images_on_disk}, "
            f"metadata has {md2_n_imgs_downloaded}, {species_dir}"
        )
        print(msg)
        mismatch_logger.warning(msg)
    
    

In [6]:
# Display the checklist together with its n_imgs column
df

Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,confidence,status,match_type,rank,source,n_imgs
0,1860989,Lepidoptera,Adelidae,Adela,Adela ridingsella,,Adela ridingsella,Adela ridingsella,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
1,1751932,Lepidoptera,Aididae,Aidos,Aidos amanda,,Aidos amanda,Aidos amanda,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
2,1751930,Lepidoptera,Aididae,Aidos,Aidos perfusa,,Aidos perfusa,Aidos perfusa,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
3,1992397,Lepidoptera,Alucitidae,Alucita,Alucita mulciber,,Alucita mulciber,Alucita mulciber,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
4,1868687,Lepidoptera,Apatelodidae,Drepatelodes,Drepatelodes tanais,,Drepatelodes tanais,Drepatelodes tanais,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7341,5105686,Lepidoptera,Zygaenidae,Harrisina,Harrisina chalestra,,Harrisina chalestra,Harrisina chalestra,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
7342,10567305,Lepidoptera,Zygaenidae,Neoilliberis,Neoilliberis thyesta,,Neoilliberis thyesta,Neoilliberis thyesta,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
7343,10606202,Lepidoptera,Zygaenidae,Neoprocris,Neoprocris flora,,Neoprocris flora,Neoprocris flora,99,ACCEPTED,EXACT,SPECIES,London19Oct2023,0
7344,9883601,Lepidoptera,Zygaenidae,Malthaca,Pyromorpha (Euclimaciopsis) radialis,,Pyromorpha (Euclimaciopsis) radialis,Pyromorpha radialis,98,DOUBTFUL,EXACT,SPECIES,London19Oct2023,0


In [None]:
# Save the df
# to_csv does not create missing directories, so make sure the output folder
# exists first rather than failing after the counting has already been done.
stats_dir = "../data_stats_files/"
os.makedirs(stats_dir, exist_ok=True)

df.to_csv(os.path.join(stats_dir, "data_stats_" + checklist_name + ".csv"),
          index=False)