# Rmeove duplicated species and those not on GBIF

Short script that takes the species checklist and removed any duplicate species entries.

If also removes entries not found on GBIF.

In [13]:
import pandas as pd
import os

In [14]:
def inspect_duplicates(duplicates):
    # Lets inspect cases where duplication is not caused by synonyms
    duplicated_no_syn = duplicates.loc[duplicates['status']!='SYNONYM', ]
    duplicated_no_syn[duplicated_no_syn.duplicated(subset='accepted_taxon_key', keep=False)]

In [15]:
def clean_and_remove_duplicated(checklist_name):
    df = pd.read_csv(os.path.join("../species_checklists",checklist_name+"-keys.csv"))

    # save not found entries to a new file
    error_df = df[df["accepted_taxon_key"] == -1]
    error_df.to_csv(os.path.join("../species_checklists/failed_searches/failed-"+checklist_name+"-searches.csv"), index=False)

    
    # Remove not available species
    mask = df["accepted_taxon_key"] != -1
    df = df[mask]

    df['authority_name_provided'] = df['authority_name_provided'].str.strip("()")

    # in most cases the duplication arises due to synonyms so lets order by that
    df = df.sort_values(by=['status', 'authority_name_provided'])
    
    # Find duplicates
    duplicates = df[df.duplicated(subset='accepted_taxon_key', keep=False)]
    df_unique = df.drop_duplicates(subset='accepted_taxon_key', keep='first')

    df_unique.to_csv(os.path.join("../species_checklists/",checklist_name+"-keys-nodup.csv"), index=False)

    print("Duplicate Rows:")
    display(duplicates)

    print("duplicates not caused by synonyms:")
    inspect_df = df[df['accepted_taxon_key'].isin(duplicates['accepted_taxon_key'])]
    display(inspect_df.sort_values(by=['accepted_taxon_key']))

    return [duplicates, df_unique, df]

In [16]:
outputs = clean_and_remove_duplicated("japan-moths")

FileNotFoundError: [Errno 2] No such file or directory: '../species_checklists/japan-moths-keys.csv'

In [9]:
outputs = clean_and_remove_duplicated("costarica-moths")

AttributeError: Can only use .str accessor with string values!

In [10]:
outputs = clean_and_remove_duplicated("madagascar-moths")

Duplicate Rows:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,tribe_name,phylum_name,kingdom_name,class_name,confidence,status,match_type,rank,source
95,1762894,Lepidoptera,Noctuidae,Hypobleta,Hypobleta Hypobleta viettei,"Berio, 1954",Hypobleta Hypobleta viettei,Hypobleta viettei,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
2369,1762894,Lepidoptera,Noctuidae,Hypobleta,Hypobleta viettei,"Berio, 1954",Hypobleta viettei,Hypobleta viettei,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
297,6116071,Lepidoptera,Erebidae,Acantholipes,Acantholipes Acantholipes transiens,"Berio, 1955",Acantholipes Acantholipes transiens,Acantholipes transiens,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1953,6116071,Lepidoptera,Erebidae,Acantholipes,Acantholipes transiens,"Berio, 1956",Acantholipes transiens,Acantholipes transiens,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
2859,5114718,Lepidoptera,Nolidae,Ophiosema,Ophiosema viettei,"Berio, 1956",Ophiosema viettei,Ophiosema viettei,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2483,1797863,Lepidoptera,Noctuidae,Eublemma,Metachrostis decora,"Walker, 1869",Metachrostis decora,Eublemma decora,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1983,5109345,Lepidoptera,Noctuidae,Acontia,Acontia imitatrix,"Wallengren, 1856",Acontia imitatrix,Acontia zelleri,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1994,5109345,Lepidoptera,Noctuidae,Acontia,Acontia transfigurata,"Wallengren, 1856",Acontia transfigurata,Acontia zelleri,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1440,6118557,Lepidoptera,Geometridae,Oaracta,Scardamia maculata,"Warren, 1897",Scardamia maculata,Oaracta maculata,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024


duplicates not caused by synonyms:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,tribe_name,phylum_name,kingdom_name,class_name,confidence,status,match_type,rank,source
1573,1732280,Lepidoptera,Lasiocampidae,Borocera,Borocera madagascariensis,"Boisduval, 1833",Borocera madagascariensis,Borocera madagascariensis,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
349,1732280,Lepidoptera,Lasiocampidae,Borocera,Borocera Borocera madagascariensis,"Boisduval, 1833",Borocera Borocera madagascariensis,Borocera madagascariensis,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
351,1732284,Lepidoptera,Lasiocampidae,Borocera,Borocera Borocera cajani,"Vinson, 1863",Borocera Borocera cajani,Borocera cajani,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1572,1732284,Lepidoptera,Lasiocampidae,Borocera,Borocera cajani,"Vinson, 1863",Borocera cajani,Borocera cajani,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1570,1732786,Lepidoptera,Lasiocampidae,Apatelopteryx,Apatelopteryx phenax,"De Lajonquière, 1968",Apatelopteryx phenax,Apatelopteryx phenax,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,11384070,Lepidoptera,Momphidae,Adelomompha,Lecithocera ranavaloella,"Viette, 1968",Lecithocera ranavaloella,Adelomompha andrianella,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1678,11393089,Lepidoptera,Lecithoceridae,Torodora,Lecithocera decaryella,"Viette, 1955",Lecithocera decaryella,Torodora decaryella,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1691,11393089,Lepidoptera,Lecithoceridae,Torodora,Thubdora decavella,"Viette, 1955",Thubdora decavella,Torodora decaryella,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
2594,11841387,Lepidoptera,Erebidae,Agamana,Parafodina inscripta,"Pagenstecher, 1907",Parafodina inscripta,Agamana inscripta,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024


In [11]:
outputs = clean_and_remove_duplicated("kenya-uganda-moths")

Duplicate Rows:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,tribe_name,phylum_name,kingdom_name,class_name,confidence,status,match_type,rank,source
2436,6117705,Lepidoptera,Tortricidae,Multiquaestia,Multiquaestia Multiquaestia agassizi,"Aarvik & Karisch, 2009",Multiquaestia Multiquaestia agassizi,Multiquaestia agassizi,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
4330,6117704,Lepidoptera,Tortricidae,Multiquaestia,Multiquaestia Multiquaestia dallastai,"Aarvik & Karisch, 2009",Multiquaestia Multiquaestia dallastai,Multiquaestia dallastai,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
8931,6117705,Lepidoptera,Tortricidae,Multiquaestia,Multiquaestia agassizi,"Aarvik & Karisch, 2009",Multiquaestia agassizi,Multiquaestia agassizi,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
8932,6117704,Lepidoptera,Tortricidae,Multiquaestia,Multiquaestia dallastai,"Aarvik & Karisch, 2009",Multiquaestia dallastai,Multiquaestia dallastai,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
748,1744575,Lepidoptera,Tortricidae,Megalota,Megalota Megalota purpurana,"Aarvik, 2004",Megalota Megalota purpurana,Megalota purpurana,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7532,1961250,Lepidoptera,Geometridae,Myrteta,Pigiopsis parallelaria,"Warren, 1902",Pigiopsis parallelaria,Myrteta parallelaria,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
7624,11519621,Lepidoptera,Geometridae,Synthalia,Synpelurga innocens,"Warren, 1902",Synpelurga innocens,Synthalia innocens,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
8960,1953727,Lepidoptera,Uraniidae,Epiplema,Leucoplema triumbrata,"Warren, 1902",Leucoplema triumbrata,Epiplema triumbrata,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
7463,5146741,Lepidoptera,Geometridae,Idaea,Idaea minimaria,"Warren, 1904",Idaea minimaria,Idaea macrostyla,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024


duplicates not caused by synonyms:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,tribe_name,phylum_name,kingdom_name,class_name,confidence,status,match_type,rank,source
1909,1730753,Lepidoptera,Cossidae,Azygophleps,Azygophleps Azygophleps nubilosa,"Hampson, 1910",Azygophleps Azygophleps nubilosa,Azygophleps nubilosa,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
9069,1730753,Lepidoptera,Cossidae,Azygophleps,Azygophleps nubilosa,"Hampson, 1910",Azygophleps nubilosa,Azygophleps nubilosa,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
7993,1730762,Lepidoptera,Metarbelidae,Aethiopina,Aethiopina argentifera,"Gaede, 1929",Aethiopina argentifera,Aethiopina argentifera,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1058,1730762,Lepidoptera,Metarbelidae,Aethiopina,Aethiopina Aethiopina argentifera,"Gaede, 1929",Aethiopina Aethiopina argentifera,Aethiopina argentifera,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
7153,1731019,Lepidoptera,Cossidae,Theatrista,Nomima chloroptera,"Meyrick, 1920",Nomima chloroptera,Theatrista chloroptera,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9753,12250872,Lepidoptera,Tortricidae,Argyroploce,Olethreutes phyllodoxa,"Meyrick, 1932",Olethreutes phyllodoxa,Argyroploce phyllodoxa,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
8894,12250924,Lepidoptera,Tortricidae,Laspeyresia,Cydia chrysocosma,"Meyrick, 1920",Cydia chrysocosma,Laspeyresia chrysocosma,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
6355,12250924,Lepidoptera,Tortricidae,Laspeyresia,Laspeyresia Laspeyresia chrysocosma,"Meyrick, 1920",Laspeyresia Laspeyresia chrysocosma,Laspeyresia chrysocosma,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
9732,12253655,Lepidoptera,Tortricidae,Laspeyresia,Cydia stelosema,"Meyrick, 1931",Cydia stelosema,Laspeyresia stelosema,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024


In [12]:
outputs = clean_and_remove_duplicated("nigeria-moths")

Duplicate Rows:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,tribe_name,phylum_name,kingdom_name,class_name,confidence,status,match_type,rank,source
1674,1812079,Lepidoptera,Erebidae,Spilosoma,Spilosoma togoensis,"Bartel, 1903",Spilosoma togoensis,Spilosoma togoensis,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
851,6116680,Lepidoptera,Erebidae,Oraesia,Oraesia Oraesia politzari,"Behounek, Hacker & Speidel, 2010",Oraesia Oraesia politzari,Oraesia politzari,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
2201,6116680,Lepidoptera,Erebidae,Oraesia,Oraesia politzari,"Behounek, Hacker & Speidel, 2010",Oraesia politzari,Oraesia politzari,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
25,1820654,Lepidoptera,Erebidae,Terphothrix,Terphothrix Terphothrix lanaria,"Bethune-Baker, 1911",Terphothrix Terphothrix lanaria,Terphothrix lanaria,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
72,1819286,Lepidoptera,Erebidae,Conigephyra,Conigephyra Conigephyra flava,"Bethune-Baker, 1911",Conigephyra Conigephyra flava,Conigephyra flava,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1747,1963132,Lepidoptera,Geometridae,Geodena,Geodena suffusa,"Swinhoe, 1904",Geodena suffusa,Geodena dama,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1976,5109345,Lepidoptera,Noctuidae,Acontia,Acontia imitatrix,"Wallengren, 1856",Acontia imitatrix,Acontia zelleri,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1979,5109345,Lepidoptera,Noctuidae,Acontia,Acontia transfigurata,"Wallengren, 1856",Acontia transfigurata,Acontia zelleri,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1744,11481581,Lepidoptera,Geometridae,Morabia,Ectropis nigripunctata,"Warren, 1897",Ectropis nigripunctata,Morabia nigripunctata,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024


duplicates not caused by synonyms:


Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,species_name_provided,authority_name_provided,search_species_name,gbif_species_name,subgenus_name,tribe_name,phylum_name,kingdom_name,class_name,confidence,status,match_type,rank,source
61,1732232,Lepidoptera,Lasiocampidae,Pachypasa,Pachypasa Pachypasa ondulosa,"Conte, 1909",Pachypasa Pachypasa ondulosa,Pachypasa ondulosa,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1840,1732232,Lepidoptera,Lasiocampidae,Pachypasa,Euphorea ondulosa,"Conte, 1909",Euphorea ondulosa,Pachypasa ondulosa,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
1023,1751902,Lepidoptera,Himantopteridae,Staphylinochrous,Staphylinochrous Staphylinochrous pygmaea,"Bethune-Baker, 1911",Staphylinochrous Staphylinochrous pygmaea,Staphylinochrous pygmaea,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1532,1751902,Lepidoptera,Himantopteridae,Staphylinochrous,Staphylinochrous pygmaea,"Bethune-Baker, 1911",Staphylinochrous pygmaea,Staphylinochrous pygmaea,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1073,1753547,Lepidoptera,Limacodidae,Narosana,Narosana Narosana agbaja,"Bethune-Baker, 1915",Narosana Narosana agbaja,Narosana agbaja,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,11839618,Lepidoptera,Nolidae,Garella,Garella Garella submediana,"Wiltshire, 1986",Garella Garella submediana,Garella submediana,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
682,12214478,Lepidoptera,Erebidae,Siccasura,Siccasura Siccasura numida,"Holland, 1893",Siccasura Siccasura numida,Siccasura numida,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
1543,12214478,Lepidoptera,Erebidae,Siccasura,Afrasura numida,"Holland, 1893",Afrasura numida,Siccasura numida,,,,,Insecta,99,SYNONYM,EXACT,SPECIES,Leeds13Nov2024
487,12229765,Lepidoptera,Sesiidae,Synanthedon,Synanthedon Synanthedon cingulata,"Gaede, 1929",Synanthedon Synanthedon cingulata,Synanthedon cingulata,,,,,Insecta,99,ACCEPTED,EXACT,SPECIES,Leeds13Nov2024
