# Remove duplicated species and those not on GBIF

Short script that takes the species checklist and removes any duplicate species entries.

It also removes entries not found on GBIF.

In [1]:
import pandas as pd
import os

In [2]:
# Read the species checklist (with GBIF keys) for the chosen checklist name
checklist_name = "uksi-macro-moths"

keys_filename = checklist_name+"-keys.csv"
keys_path = os.path.join("../species_checklists", keys_filename)
df = pd.read_csv(keys_path)

In [3]:
# Drop species that are not available: these rows carry an
# accepted_taxon_key of -1
mask = df["accepted_taxon_key"].ne(-1)
df = df.loc[mask]

In [4]:
# In most cases the duplication arises from synonyms that share an accepted
# taxon key, so let's order by status. Ascending (alphabetical) order places
# "ACCEPTED" rows ahead of "SYNONYM" rows, which is what the keep-first
# de-duplication further down relies on to retain the accepted name.
# kind='stable' keeps rows with an identical status in their original order;
# the default quicksort gives no such guarantee, so the row that survives
# de-duplication would otherwise be arbitrary whenever statuses tie.
df = df.sort_values(by=['status'], kind='stable')

# Find duplicates: keep=False flags every member of a duplicate group,
# not only the second and later occurrences
duplicates = df[df.duplicated(subset='accepted_taxon_key', keep=False)]

# Show the duplicate rows for manual inspection
print("Duplicate Rows:")
duplicates

Duplicate Rows:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,confidence,status,match_type,rank,source
542,8007222,Lepidoptera,Noctuidae,Apamea,Apamea anceps,,Apamea anceps,Apamea anceps,99,ACCEPTED,EXACT,SPECIES,Leeds22Nov2023
508,4533720,Lepidoptera,Noctuidae,Agrotis,Agrotis catalaunensis,,Agrotis catalaunensis,Agrotis catalaunensis,99,ACCEPTED,EXACT,SPECIES,Leeds22Nov2023
756,1788425,Lepidoptera,Noctuidae,Mesapamea,Mesapamea secalis,,Mesapamea secalis,Mesapamea secalis,99,ACCEPTED,EXACT,SPECIES,Leeds22Nov2023
58,4532253,Lepidoptera,Noctuidae,Eublemma,Eublemma minutata,,Eublemma minutata,Eublemma minutata,99,ACCEPTED,EXACT,SPECIES,Leeds22Nov2023
126,1803218,Lepidoptera,Erebidae,Utetheisa,Utetheisa ornatrix,,Utetheisa ornatrix,Utetheisa ornatrix,99,ACCEPTED,EXACT,SPECIES,Leeds22Nov2023
520,4533720,Lepidoptera,Noctuidae,Agrotis,Agrotis syricola,,Agrotis syricola,Agrotis catalaunensis,98,SYNONYM,EXACT,SPECIES,Leeds22Nov2023
61,4532253,Lepidoptera,Noctuidae,Eublemma,Eublemma paula,,Eublemma paula,Eublemma minutata,99,SYNONYM,EXACT,SPECIES,Leeds22Nov2023
547,8007222,Lepidoptera,Noctuidae,Apamea,Apamea infesta,,Apamea infesta,Apamea anceps,99,SYNONYM,EXACT,SPECIES,Leeds22Nov2023
755,1788425,Lepidoptera,Noctuidae,Mesapamea,Mesapamea didyma,,Mesapamea didyma,Mesapamea secalis,99,SYNONYM,EXACT,SPECIES,Leeds22Nov2023
125,1803218,Lepidoptera,Erebidae,Utetheisa,Utetheisa bella,,Utetheisa bella,Utetheisa ornatrix,99,SYNONYM,EXACT,SPECIES,Leeds22Nov2023


In [5]:
# Inspect duplication that synonyms cannot explain: discard the SYNONYM rows
# first, then show only the keys that still occur more than once
non_synonym_rows = duplicates['status'].ne('SYNONYM')
duplicated_no_syn = duplicates[non_synonym_rows]
still_repeated = duplicated_no_syn.duplicated(subset='accepted_taxon_key', keep=False)
duplicated_no_syn[still_repeated]

Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,confidence,status,match_type,rank,source


These are flagged as subspecies in the input file, so they are fine to remove.

In [6]:
# Keep only the first row seen for each accepted taxon key
repeated_key = df.duplicated(subset='accepted_taxon_key', keep='first')
df_unique = df[~repeated_key]

In [7]:
# Write the de-duplicated checklist to disk without the DataFrame index
output_name = checklist_name+"-keys-nodup.csv"
output_path = os.path.join("../species_checklists/", output_name)
df_unique.to_csv(output_path, index=False)