# Rmeove duplicated species and those not on GBIF

Short script that takes the species checklist and removed any duplicate species entries.

If also removes entries not found on GBIF.

In [1]:
import pandas as pd
import os

In [2]:
# Load the species checklist
checklist_name = "costarica-moths"

df = pd.read_csv(os.path.join("../species_checklists",checklist_name+"-keys.csv"))

In [3]:
# Remove not available species
mask = df["accepted_taxon_key"] != -1
df = df[mask]

In [5]:
# in most cases the duplication arises due to synonyms so lets order by that
df = df.sort_values(by=['status'])

# Find duplicates
duplicates = df[df.duplicated(subset='accepted_taxon_key', keep=False)]

# Printing duplicate rows
print("Duplicate Rows:")
duplicates

Duplicate Rows:
      accepted_taxon_key   order_name   family_name    genus_name  \
4804             1760317  Lepidoptera     Noctuidae        Cropia   
4832             1791862  Lepidoptera     Noctuidae      Neogalea   
4713             1952948  Lepidoptera  Mimallonidae      Zaphanta   
4750             1765526  Lepidoptera     Noctuidae     Eusceptis   
4593             1753604  Lepidoptera   Limacodidae        Semyra   
...                  ...          ...           ...           ...   
2412             1786220  Lepidoptera      Erebidae  Hemeroblemma   
3237             1991472  Lepidoptera   Geometridae   Neopaniasis   
5434             1824179  Lepidoptera  Notodontidae      Crinodes   
5653             1826353  Lepidoptera  Notodontidae     Malocampa   
2402             1786250  Lepidoptera      Erebidae  Hemeroblemma   

         species_name_provided  authority_name_provided  \
4804             Cropia cedica                      NaN   
4832            Neogalea sunia       

In [10]:
# Lets inspect cases where duplication is not caused by synonyms
duplicated_no_syn = duplicates.loc[duplicates['status']!='SYNONYM', ]
duplicated_no_syn[duplicated_no_syn.duplicated(subset='accepted_taxon_key', keep=False)]

Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,confidence,status,match_type,rank,source
6284,1866120,Lepidoptera,Saturniidae,Automeris,Automeris banus,,Automeris banus,Automeris banus,99,ACCEPTED,EXACT,SPECIES,London03Nov2023
6283,1866120,Lepidoptera,Saturniidae,Automeris,Automeris banus,,Automeris banus,Automeris banus,99,ACCEPTED,EXACT,SPECIES,London03Nov2023
6259,1865107,Lepidoptera,Saturniidae,Eacles,Eacles imperialis,,Eacles imperialis,Eacles imperialis,99,ACCEPTED,EXACT,SPECIES,London03Nov2023
6258,1865107,Lepidoptera,Saturniidae,Eacles,Eacles imperialis,,Eacles imperialis,Eacles imperialis,99,ACCEPTED,EXACT,SPECIES,London03Nov2023
2518,1765109,Lepidoptera,Erebidae,Dyomyx,Dyomyx jonesi,,Dyomyx jonesi,Dyomyx jonesi,99,ACCEPTED,EXACT,SPECIES,London03Nov2023
2519,1765109,Lepidoptera,Erebidae,Dyomyx,Dyomyx jonesi,,Dyomyx jonesi,Dyomyx jonesi,99,ACCEPTED,EXACT,SPECIES,London03Nov2023


These are flagged as subspecies in the input file so fine to remove

In [11]:
# Removing duplicate rows
df_unique = df.drop_duplicates(subset='accepted_taxon_key', keep='first')

In [12]:
# Save the csv file
df_unique.to_csv(os.path.join("../species_checklists/",checklist_name+"-keys-nodup.csv"), index=False)