# Rmeove duplicated species and those not on GBIF

Short script that takes the species checklist and removed any duplicate species entries.

If also removes entries not found on GBIF.

In [1]:
import pandas as pd
import os

In [2]:
# Load the species checklist
checklist_name = "thailand-moths"

df = pd.read_csv(os.path.join("../species_checklists",checklist_name+"-keys.csv"))

In [3]:
# Remove not available species
mask = df["accepted_taxon_key"] != -1
df = df[mask]

In [4]:
# in most cases the duplication arises due to synonyms so lets order by that
df = df.sort_values(by=['status'])

# Find duplicates
duplicates = df[df.duplicated(subset='accepted_taxon_key', keep=False)]

# Printing duplicate rows
print("Duplicate Rows:")
duplicates

Duplicate Rows:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,confidence,status,match_type,rank,source
3177,12208252,Lepidoptera,Erebidae,Spilarctia,Spilarctia obliqua,,Spilarctia obliqua,Spilarctia obliqua,99,ACCEPTED,EXACT,SPECIES,Leeds06062024
3184,5110447,Lepidoptera,Erebidae,Plecoptera,Plecoptera flava,,Plecoptera flava,Plecoptera flava,99,ACCEPTED,EXACT,SPECIES,Leeds06062024
3280,1881827,Lepidoptera,Crambidae,Paratalanta,Paratalanta aureolalis,,Paratalanta aureolalis,Paratalanta aureolalis,99,ACCEPTED,EXACT,SPECIES,Leeds06062024
3065,4301310,Lepidoptera,Erebidae,Hypena,Hypena strigatus,,Hypena strigatus,Hypena strigatus,99,ACCEPTED,EXACT,SPECIES,Leeds06062024
3147,1831458,Lepidoptera,Stathmopodidae,Stathmopoda,Stathmopoda auriferella,,Stathmopoda auriferella,Stathmopoda auriferella,99,ACCEPTED,EXACT,SPECIES,Leeds06062024
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3725,1884844,Lepidoptera,Crambidae,Pagyda,Pagyda arbiter,,Pagyda arbiter,Pagyda salvalis,99,SYNONYM,EXACT,SPECIES,Leeds06062024
2923,1863907,Lepidoptera,Sphingidae,Eupanacra,Eupanacra dohertyi,,Eupanacra dohertyi,Eupanacra regularis,99,SYNONYM,EXACT,SPECIES,Leeds06062024
502,5117779,Lepidoptera,Erebidae,Euchromia,Euchromia elegantissima,,Euchromia elegantissima,Euchromia polymena,99,SYNONYM,EXACT,SPECIES,Leeds06062024
744,1819362,Lepidoptera,Erebidae,Artaxa,Laspeyria olivata,,Laspeyria olivata,Artaxa distracta,99,SYNONYM,EXACT,SPECIES,Leeds06062024


In [5]:
# Lets inspect cases where duplication is not caused by synonyms
duplicated_no_syn = duplicates.loc[duplicates['status']!='SYNONYM', ]
duplicated_no_syn[duplicated_no_syn.duplicated(subset='accepted_taxon_key', keep=False)]

Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,confidence,status,match_type,rank,source
3065,4301310,Lepidoptera,Erebidae,Hypena,Hypena strigatus,,Hypena strigatus,Hypena strigatus,99,ACCEPTED,EXACT,SPECIES,Leeds06062024
4302,4301310,Lepidoptera,Erebidae,Hypena,Hypena strigata,,Hypena strigata,Hypena strigatus,97,ACCEPTED,FUZZY,SPECIES,Leeds06062024


These are flagged as subspecies in the input file so fine to remove

In [6]:
# Removing duplicate rows
df_unique = df.drop_duplicates(subset='accepted_taxon_key', keep='first')

In [7]:
# Save the csv file
df_unique.to_csv(os.path.join("../species_checklists/",checklist_name+"-keys-nodup.csv"), index=False)