# 3. Remove duplicated species and those not on GBIF

Short script that takes the species checklist and removes any duplicate species entries.

It also removes entries not found on GBIF.

In [None]:
import pandas as pd
import os


In [None]:
def inspect_duplicates(duplicates):
    """Return the duplicated rows whose duplication is not explained by synonyms.

    Rows with ``status == 'SYNONYM'`` are dropped first; among the remaining
    rows, every row whose ``accepted_taxon_key`` still occurs more than once
    is returned (all occurrences, not just the repeats).

    Args:
        duplicates: DataFrame with at least the columns ``status`` and
            ``accepted_taxon_key``.

    Returns:
        DataFrame containing the non-synonym rows that share an
        ``accepted_taxon_key`` with at least one other non-synonym row.
        Empty if there are none.
    """
    # Let's inspect cases where duplication is not caused by synonyms
    duplicated_no_syn = duplicates[duplicates['status'] != 'SYNONYM']
    still_duplicated = duplicated_no_syn.duplicated(subset='accepted_taxon_key', keep=False)
    # Return the frame so it is usable outside the last line of a notebook cell.
    return duplicated_no_syn[still_duplicated]

In [None]:
def clean_and_remove_duplicated(checklist_name, base_dir='/home/users/katriona/gbif_download_standalone'):
    """Drop unmatched and duplicated species from a checklist and save the results.

    Reads ``<base_dir>/species_checklists/<checklist_name>-keys.csv`` and then:

    1. Writes the rows whose ``accepted_taxon_key`` is ``-1`` (not found) to
       ``species_checklists/failed_searches/failed-<checklist_name>-searches.csv``
       and removes them from the working frame.
    2. Normalises ``authority_name_provided`` (created as empty strings if the
       column is absent, NaN replaced by ``''``, surrounding parentheses
       stripped).
    3. Sorts by ``status`` then ``authority_name_provided`` and keeps the first
       row for each ``accepted_taxon_key``; the result is written to
       ``species_checklists/<checklist_name>-keys-nodup.csv``.
    4. Displays all duplicated rows, followed by the duplicated rows that
       remain once ``SYNONYM`` rows are excluded.

    Args:
        checklist_name: Checklist prefix, e.g. ``"japan-moths"``.
        base_dir: Directory containing the ``species_checklists`` folder.

    Returns:
        A list ``[duplicates, df_unique, df]``: all rows sharing an
        ``accepted_taxon_key`` with another row, the de-duplicated frame, and
        the full sorted frame with not-found rows removed.
    """
    # `display` is only injected as a builtin inside IPython/Jupyter; resolve it
    # explicitly and fall back to print so the function also runs as a script.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    checklist_dir = os.path.join(base_dir, "species_checklists")
    failed_dir = os.path.join(checklist_dir, "failed_searches")
    os.makedirs(checklist_dir, exist_ok=True)
    os.makedirs(failed_dir, exist_ok=True)

    df = pd.read_csv(os.path.join(checklist_dir, checklist_name + "-keys.csv"))

    found = df["accepted_taxon_key"] != -1

    # save not found entries to a new file
    df[~found].to_csv(
        os.path.join(failed_dir, "failed-" + checklist_name + "-searches.csv"),
        index=False,
    )

    # Remove not available species. Take an explicit copy so the column
    # assignments below modify an independent frame rather than a slice of the
    # original (avoids pandas' SettingWithCopyWarning).
    df = df[found].copy()

    if 'authority_name_provided' not in df.columns:
        df['authority_name_provided'] = ''
    else:
        df['authority_name_provided'] = df['authority_name_provided'].fillna('')

    df['authority_name_provided'] = df['authority_name_provided'].str.strip("()")

    # In most cases the duplication arises due to synonyms, so order by status:
    # any status that sorts alphabetically before 'SYNONYM' comes first and is
    # therefore the row kept by drop_duplicates(keep='first').
    df = df.sort_values(by=['status', 'authority_name_provided'])

    # Find duplicates
    duplicates = df[df.duplicated(subset='accepted_taxon_key', keep=False)]
    df_unique = df.drop_duplicates(subset='accepted_taxon_key', keep='first')

    df_unique.to_csv(os.path.join(checklist_dir, checklist_name + "-keys-nodup.csv"), index=False)

    print("Duplicate Rows:")
    display(duplicates)

    # Exclude synonyms before looking for repeated keys, so this section only
    # shows duplication that synonyms cannot explain.
    print("duplicates not caused by synonyms:")
    non_synonyms = df[df['status'] != 'SYNONYM']
    inspect_df = non_synonyms[non_synonyms.duplicated(subset='accepted_taxon_key', keep=False)]
    display(inspect_df.sort_values(by=['accepted_taxon_key']))

    return [duplicates, df_unique, df]

In [None]:
# Clean the "japan-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="japan-moths")

In [None]:
# Clean the "costarica-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="costarica-moths")

In [None]:
# Clean the "madagascar-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="madagascar-moths")

In [None]:
# Clean the "kenya-uganda-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="kenya-uganda-moths")

In [None]:
# Clean the "nigeria-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="nigeria-moths")

In [None]:
# Clean the "thailand-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="thailand-moths")

In [None]:
# Clean the "namibia-moths" checklist; `outputs` holds [duplicates, df_unique, df].
outputs = clean_and_remove_duplicated(checklist_name="namibia-moths")

In [None]:
# Show the [duplicates, df_unique, df] list from the most recent call above.
outputs