# Rmeove duplicated species and those not on GBIF

Short script that takes the species checklist and removed any duplicate species entries.

If also removes entries not found on GBIF.

In [1]:
import pandas as pd
import os

In [2]:
# Load the species checklist
checklist_name = "costarica-moths"

df = pd.read_csv(os.path.join("../species_checklists",checklist_name+"-keys2.csv"))

In [3]:
# Remove not available species
mask = df["accepted_taxon_key"] != -1
df = df[mask]

In [8]:
df['subgenus_name'].value_counts()

subgenus_name
NotAvail    7579
Name: count, dtype: int64

In [4]:
# in most cases the duplication arises due to synonyms so lets order by that
df = df.sort_values(by=['status'])

# Find duplicates
duplicates = df[df.duplicated(subset='accepted_taxon_key', keep=False)]

# Printing duplicate rows
print("Duplicate Rows:")
duplicates

Duplicate Rows:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,subfamily_name,superfamily_name,tribe_name,confidence,status,match_type,rank,source
5752,1791862,Lepidoptera,Noctuidae,Neogalea,Neogalea sunia,,Neogalea sunia,Neogalea sunia,NotAvail,NotAvail,NotAvail,NotAvail,99,ACCEPTED,EXACT,SPECIES,Leeds23092024
5724,1760317,Lepidoptera,Noctuidae,Cropia,Cropia cedica,,Cropia cedica,Cropia cedica,NotAvail,NotAvail,NotAvail,NotAvail,99,ACCEPTED,EXACT,SPECIES,Leeds23092024
5816,10099989,Lepidoptera,Noctuidae,Condica,Condica selenosa,,Condica selenosa,Condica selenosa,NotAvail,NotAvail,NotAvail,NotAvail,99,ACCEPTED,EXACT,SPECIES,Leeds23092024
5670,1765526,Lepidoptera,Noctuidae,Eusceptis,Eusceptis effusa,,Eusceptis effusa,Eusceptis effusa,NotAvail,NotAvail,NotAvail,NotAvail,99,ACCEPTED,EXACT,SPECIES,Leeds23092024
5444,9838836,Lepidoptera,Lycaenidae,Calycopis,Calycopis thama,,Calycopis thama,Calycopis thama,NotAvail,NotAvail,NotAvail,NotAvail,99,ACCEPTED,EXACT,SPECIES,Leeds23092024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8014,1936026,Lepidoptera,Lycaenidae,Mesene,Mesene mygdon,,Mesene mygdon,Mesene phareus,NotAvail,NotAvail,NotAvail,NotAvail,99,SYNONYM,EXACT,SPECIES,Leeds23092024
813,5117551,Lepidoptera,Erebidae,Dysschema,Dysschema fortis,,Dysschema fortis,Dysschema magdala,NotAvail,NotAvail,NotAvail,NotAvail,99,SYNONYM,EXACT,SPECIES,Leeds23092024
819,5117551,Lepidoptera,Erebidae,Dysschema,Dysschema lygdamis,,Dysschema lygdamis,Dysschema magdala,NotAvail,NotAvail,NotAvail,NotAvail,99,SYNONYM,EXACT,SPECIES,Leeds23092024
5669,1765526,Lepidoptera,Noctuidae,Eusceptis,Eusceptis atriora,,Eusceptis atriora,Eusceptis effusa,NotAvail,NotAvail,NotAvail,NotAvail,99,SYNONYM,EXACT,SPECIES,Leeds23092024


In [None]:
# Lets inspect cases where duplication is not caused by synonyms
duplicated_no_syn = duplicates.loc[duplicates['status']!='SYNONYM', ]
duplicated_no_syn[duplicated_no_syn.duplicated(subset='accepted_taxon_key', keep=False)]

These are flagged as subspecies in the input file so fine to remove

In [None]:
# Removing duplicate rows
df_unique = df.drop_duplicates(subset='accepted_taxon_key', keep='first')

In [None]:
df_unique.shape

In [None]:
# Save the csv file
df_unique.to_csv(os.path.join("../species_checklists/",checklist_name+"-keys-nodup.csv"), index=False)