# Preprocess the species lists

Short script to preprocess a species checklist.
It must be adapted for each incoming species checklist, depending on what needs to be done.

The aim is to standardize the columns of the checklist, so that:
- The column with the species name is called "species_name_provided".
- The column with the authority is called "authority_name_provided". If no such column exists, it should be created and left blank.
- The values in the authority column are formatted as "Lastname, year".

In [None]:
import pandas as pd
import os

In [None]:
# Load the raw species checklist from the species_checklists directory
checklist_name = "costarica-moths-updated"

checklist_path = os.path.join("../species_checklists", checklist_name + ".csv")
# The file is read as comma-separated text decoded as latin-1
df = pd.read_csv(checklist_path, sep=",", encoding="latin-1")

In [None]:
# Preview the first five rows of the loaded checklist
df.head(n=5)

In [None]:
# Build the standard columns for the Costa Rica checklist, which provides the
# genus and the species epithet in two separate columns ("Genus", "Species").
# Missing values are replaced by empty strings before joining, and the joined
# name is stripped so that a row lacking either part does not end up with a
# stray leading/trailing space (e.g. "Genus " or " ").
df["species_name_provided"] = (
    df["Genus"].fillna("") + " " + df["Species"].fillna("")
).str.strip()

# No authority is taken from this checklist: create the column and leave it blank
df["authority_name_provided"] = ""

In [None]:
# Transform column names
# provided_species_column_name = "Scientific Name"
# provided_authority_column_name = "Authority"

# df = df.rename(columns={
#     provided_species_column_name: "species_name_provided",
#     provided_authority_column_name: "authority_name_provided"
# })

In [None]:
# Edit the authority values to remove [] and ()
# df['authority_name_provided'] = df['authority_name_provided'].replace('[\(\)\[\]]', '', regex=True)

In [None]:
# The full Costa Rica list was too long for a single API call,
# so it is cut into three consecutive parts.

# Row positions at which the second and third parts begin
split1 = df.shape[0] // 3
split2 = split1 * 2

# Slice by position; the third part takes whatever rows remain
df1, df2, df3 = df.iloc[:split1], df.iloc[split1:split2], df.iloc[split2:]

In [None]:
# Write each of the three parts to its own CSV file in the species_checklists directory
for part, suffix in (
    (df1, "-preprocessed-part1.csv"),
    (df2, "-preprocessed-part2.csv"),
    (df3, "-preprocessed-part3.csv"),
):
    part_path = os.path.join("../species_checklists/", checklist_name + suffix)
    # index=False keeps the DataFrame index out of the output file
    part.to_csv(part_path, index=False)

In [None]:
# Write the complete preprocessed checklist as a single CSV file
output_path = os.path.join(
    "../species_checklists/", checklist_name + "-preprocessed.csv"
)
# index=False keeps the DataFrame index out of the output file
df.to_csv(output_path, index=False)