# Preprocess the species lists

Short script to preprocess the species checklist.
Will be modified for each incoming species checklist, depending on what must be done. 

The aim is to transform the column names of the checklist, so that: 
- The column with species name is called "species_name_provided"
- The column with the authority is called "authority_name_provided". If such column doesn't exist, it should be created and left blank.
- The authority column is formatted as "Lastname, year" 

In [2]:
import pandas as pd
import os

## Costa Rica

In [5]:
# Read the species checklist
checklist_name = "costarica-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')



In [6]:
df.head()

Unnamed: 0,Superfamily,Family,Genus,Species
0,Bombycoidea,Saturniidae,Attacus,Attacus atlas
1,Pyraloidea,Crambidae,Nausinoe,Nausinoe perspectata
2,Geometroidea,Geometridae,Chiasmia,Chiasmia emersaria
3,Noctuoidea,Erebidae,Caeneressa,Caeneressa fouqueti
4,Bombycoidea,Sphingidae,Psilogramma,Psilogramma increta


In [7]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')

df["authority_name_provided"] = ""

In [8]:
# Transform column names
# provided_species_column_name = "Scientific Name"
# provided_authority_column_name = "Authority"

# df = df.rename(columns={
#     provided_species_column_name: "species_name_provided",
#     provided_authority_column_name: "authority_name_provided"
# })

In [9]:
# Edit the column names to remove [] and ()
# df['authority_name_provided'] = df['authority_name_provided'].replace('[\(\)\[\]]', '', regex=True)

In [None]:
# Costa Rica list was too long for the API call. So had to split in 3 parts:

# Determine the split indices
split1 = len(df) // 3
split2 = 2 * split1

# Split the DataFrame into three parts
df1 = df.iloc[:split1]
df2 = df.iloc[split1:split2]
df3 = df.iloc[split2:]

In [None]:
# Save the three parts to separate CSV files
df1.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed-part1.csv"),
          index=False)
df2.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed-part2.csv"),
          index=False)
df3.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed-part3.csv"),
          index=False)

In [None]:
# Save the csv file
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## For UK macro moths 

This file is in a different format

In [None]:
# Read the species checklist
checklist_name = "uksi-macro-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')

df["Genus"] = ''

In [None]:
df.head()

In [None]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["taxon"].fillna('')

df["authority_name_provided"] = ""

In [None]:
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## Thailand

In [18]:
# Read the species checklist
checklist_name = "thailand-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')

df.columns=['Superfamily', 'Family', 'Genus', 'Species']


In [19]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')

df["authority_name_provided"] = ""


df.head()

Unnamed: 0,Superfamily,Family,Genus,Species,species_name_provided,authority_name_provided
0,Bombycoidea,Saturniidae,Attacus,Attacus atlas,Attacus Attacus atlas,
1,Pyraloidea,Crambidae,Nausinoe,Nausinoe perspectata,Nausinoe Nausinoe perspectata,
2,Geometroidea,Geometridae,Chiasmia,Chiasmia emersaria,Chiasmia Chiasmia emersaria,
3,Noctuoidea,Erebidae,Caeneressa,Caeneressa fouqueti,Caeneressa Caeneressa fouqueti,
4,Bombycoidea,Sphingidae,Psilogramma,Psilogramma increta,Psilogramma Psilogramma increta,


In [20]:
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## Madagascar

In [14]:
# Read the species checklist
checklist_name = "madagascar-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')

df.head()

Unnamed: 0,Family,Species,Authority
0,Adelidae,Adela gymnota,"(Meyrick, 1912)"
1,Adelidae,Adela janineae,"(Viette, 1954)"
2,Adelidae,Adela tsaratanana,"(Viette, 1954)"
3,Alucitidae,Alucita decaryella,"(Viette, 1956)"
4,Alucitidae,Alucita euscripta,"Minet, 1976"


In [18]:
df["Genus"] = ""

# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = df['Authority']

df.head()

Unnamed: 0,Family,Species,Authority,Genus,species_name_provided,authority_name_provided
0,Adelidae,Adela gymnota,"(Meyrick, 1912)",,Adela gymnota,"(Meyrick, 1912)"
1,Adelidae,Adela janineae,"(Viette, 1954)",,Adela janineae,"(Viette, 1954)"
2,Adelidae,Adela tsaratanana,"(Viette, 1954)",,Adela tsaratanana,"(Viette, 1954)"
3,Alucitidae,Alucita decaryella,"(Viette, 1956)",,Alucita decaryella,"(Viette, 1956)"
4,Alucitidae,Alucita euscripta,"Minet, 1976",,Alucita euscripta,"Minet, 1976"


In [19]:
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## Anguilla

In [3]:
# Read the species checklist
checklist_name = "anguilla-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')

df.head()


Unnamed: 0,ï»¿Superfamily,Family,Subfamily,Species,GBIF accepted name,Authority (GBIF),Common name,Comments
0,Bombycoidea,Sphingidae,Magroglossinae,Aellopos tantalus,Aellopos tantalus,"Linnaeus, 1758",Tantalus Sphinx,
1,Bombycoidea,Sphingidae,Magroglossinae,Enyo lugubris,Enyo lugubris,"Linnaeus, 1771",Mournful Sphinx,
2,Bombycoidea,Sphingidae,Magroglossinae,Erinnyis ello,Erinnyis ello,"Linnaeus, 1758",Ello Sphinx,
3,Bombycoidea,Sphingidae,Magroglossinae,Hyles lineata,Hyles lineata,"Fabricius, 1775",White-lined Sphinx,
4,Bombycoidea,Sphingidae,Magroglossinae,Pseudosphinx tetrio,Pseudosphinx tetrio,"Linnaeus, 1771","Tetrio sphinx, Frangipani Sphinx",


In [6]:
df["Genus"] = ""

# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = ""

df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]
df.head()

Unnamed: 0,Family,Subfamily,Species,Genus,GBIF accepted name,species_name_provided,authority_name_provided
0,Sphingidae,Magroglossinae,Aellopos tantalus,,Aellopos tantalus,Aellopos tantalus,
1,Sphingidae,Magroglossinae,Enyo lugubris,,Enyo lugubris,Enyo lugubris,
2,Sphingidae,Magroglossinae,Erinnyis ello,,Erinnyis ello,Erinnyis ello,
3,Sphingidae,Magroglossinae,Hyles lineata,,Hyles lineata,Hyles lineata,
4,Sphingidae,Magroglossinae,Pseudosphinx tetrio,,Pseudosphinx tetrio,Pseudosphinx tetrio,


In [7]:
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)