# Preprocess the species lists

Short script to preprocess species checklists.
It is adapted for each incoming checklist, depending on what needs to be done.

The aim is to transform the column names of the checklist, so that: 
- The column with species name is called "species_name_provided"
- The column with the authority is called "authority_name_provided". If no such column exists, it should be created and left blank.
- The authority column is formatted as "Lastname, year" 

In [None]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

In [None]:
def split_dataframe(df, n, output_dir, list_name):
    """Write ``df`` to ``n`` consecutive CSV part files.

    Part ``k`` (counting from 1) is saved in ``output_dir`` as
    ``<list_name>-preprocessed-part<k>.csv`` without the index column.
    Every part holds ``len(df) // n`` rows, except the last one, which
    also takes whatever rows remain.

    Args:
        df: DataFrame to split; rows are taken by position, in order.
        n: Number of part files to write. Must be at least 1.
        output_dir: Directory the part files are written to.
        list_name: Checklist name used as the file-name prefix.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Fail loudly on a bad part count: n == 0 used to end in a
    # ZeroDivisionError and a negative n silently wrote nothing.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    split_size = len(df) // n
    for i in range(n):
        start_idx = i * split_size
        # The last part runs to the end of the frame so no rows are lost
        end_idx = (i + 1) * split_size if i < n - 1 else len(df)
        df_part = df.iloc[start_idx:end_idx]
        file_path = os.path.join(output_dir, f"{list_name}-preprocessed-part{i + 1}.csv")
        df_part.to_csv(file_path, index=False)
        print(f"Saved part {i + 1} to {file_path}")

## Costa Rica

In [None]:
# Load the raw Costa Rica checklist (comma-separated, latin-1 encoded)
checklist_name = "costarica-moths"

df = pd.read_csv(
    os.path.join("../species_checklists", checklist_name + ".csv"),
    sep=",",
    encoding="latin-1",
)

# Overwrite the header with the column names used in the rest of this section
df.columns = ["Family", "Genus", "Species", "Subspecies"]

In [None]:
# Preview the first five rows of the loaded checklist
df.head(n=5)

In [None]:
# Build the two standard output columns for Costa Rica:
# the species name as "Genus Species" and an empty authority.
df["species_name_provided"] = (
    df["Genus"].fillna("") + " " + df["Species"].fillna("")
)
df["authority_name_provided"] = ""

In [None]:
# Strip round and square brackets from the authority values.
# The pattern is a raw string so that the regex escapes are not read as
# (invalid) string escape sequences by the Python parser.
df['authority_name_provided'] = df['authority_name_provided'].replace(r'[\(\)\[\]]', '', regex=True)

In [None]:
# The Costa Rica list was too long for a single API call,
# so it is written out in 3 parts.
split_dataframe(df, 3, "../species_checklists", checklist_name)
# Earlier manual version of the same split, kept for reference:
# # Determine the split indices
# split1 = len(df) // 3
# split2 = 2 * split1

# # Split the DataFrame into three parts
# df1 = df.iloc[:split1]
# df2 = df.iloc[split1:split2]
# df3 = df.iloc[split2:]

In [None]:
# NOTE: superseded by the split_dataframe() call in the previous cell; kept for reference only.
# # Save the three parts to separate CSV files
# df1.to_csv(os.path.join("../species_checklists/", 
#                        checklist_name+"-preprocessed-part1.csv"),
#           index=False)
# df2.to_csv(os.path.join("../species_checklists/", 
#                        checklist_name+"-preprocessed-part2.csv"),
#           index=False)
# df3.to_csv(os.path.join("../species_checklists/", 
#                        checklist_name+"-preprocessed-part3.csv"),
#           index=False)

In [None]:
# Write the complete (unsplit) preprocessed checklist, without the index column
df.to_csv(
    os.path.join("../species_checklists/", checklist_name + "-preprocessed.csv"),
    index=False,
)

## For UK moths 

This file is in a different format

In [None]:
# Load the raw UKSI checklist (comma-separated, latin-1 encoded)
checklist_name = "uksi-moths"

df = pd.read_csv(
    os.path.join("../species_checklists", checklist_name + ".csv"),
    sep=",",
    encoding="latin-1",
)

# Add an empty 'Genus' column
df["Genus"] = ""

In [None]:
# Preview the first five rows of the loaded checklist
df.head(n=5)

In [None]:
# Build the standard output columns for the UK list.
# 'Genus' is an empty column for this list, so joining it with a space would
# leave a leading blank on every name; strip it off.
df["species_name_provided"] = (
    df["Genus"].fillna('') + " " + df["taxon"].fillna('')
).str.strip()

# Authority with round and square brackets removed (raw string so the regex
# escapes are not read as invalid string escape sequences)
df["authority_name_provided"] = df['preferred_authority'].replace(r'[\(\)\[\]]', '', regex=True)

In [None]:
# Preview the first five rows after adding the output columns
df.head(n=5)

In [None]:
# Write the preprocessed UK checklist, without the index column
df.to_csv(
    os.path.join("../species_checklists/", checklist_name + "-preprocessed.csv"),
    index=False,
)

## Thailand

In [None]:
# Load the raw Thailand checklist (comma-separated, latin-1 encoded)
checklist_name = "thailand-moths"

df = pd.read_csv(
    os.path.join("../species_checklists", checklist_name + ".csv"),
    sep=",",
    encoding="latin-1",
)

#df.columns=['Superfamily', 'Family', 'Genus', 'Species']


In [None]:
# Show the scientific names that consist of more than two words
df.loc[
    df["scientific_name"].str.split().str.len().gt(2),
    "scientific_name",
]

In [None]:
# Reduce every scientific name to its first two words, and keep only the rows
# whose original name had at least two words.
name_words = df['scientific_name'].str.split()
df['scientific_name'] = name_words.str[:2].str.join(' ')
df = df[name_words.str.len() > 1]

# Remove duplicates only AFTER truncating: names that differ only beyond the
# second word collapse to the same value and would otherwise stay duplicated.
# .copy() makes the result an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df.drop_duplicates(subset='scientific_name').copy()

#df = df[['scientific_name', 'taxon_id']]
df['Species'] = df['scientific_name']
df['Genus'] = df['taxon_genus_name']

# NOTE(review): 'Species' holds two words at this point; if the first of them
# is already the genus, prefixing 'Genus' repeats it in the provided name.
# Confirm against the data before changing.
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = ""

In [None]:
# Write the preprocessed Thailand checklist, without the index column
df.to_csv(
    os.path.join("../species_checklists/", checklist_name + "-preprocessed.csv"),
    index=False,
)

## Madagascar

This comes from two sources: 
1. Moths from GBIF using the filter: 
    ```json
    {
    "and" : [
        "BasisOfRecord is one of (Human Observation, Specimen)",
        "Country is Madagascar",
        "OccurrenceStatus is Present",
        "TaxonKey is Lepidoptera"
    ]
    }
    ```
2. From Wikipedia: https://en.wikipedia.org/wiki/List_of_moths_of_Madagascar

### 1. From GBIF

In [None]:
# Load the first Madagascar source file (tab-separated, latin-1 encoded)
checklist_name = "madagascar-moths"

mad_df1 = pd.read_csv(
    os.path.join("../species_checklists", checklist_name + "1.csv"),
    sep="\t",
    encoding="latin-1",
)

mad_df1.head(n=5)

In [None]:
# Keep only the Lepidoptera records. .copy() makes the filtered frame
# independent, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
mad_df1 = mad_df1.loc[mad_df1['order'] == 'Lepidoptera'].copy()

# Authority = everything from the third word of 'scientificName' onwards.
# NOTE(review): this assumes the first two words are the species name; for
# names with a different number of leading words the authority will be off —
# confirm against the data.
mad_df1['Authority'] = mad_df1['scientificName'].str.split().str[2:].str.join(' ')

mad_df1["species_name_provided"] = mad_df1["genus"].fillna('') + " " + mad_df1["species"].fillna('')
mad_df1["authority_name_provided"] = mad_df1['Authority']

mad_df1.head()

In [None]:
def _species_rows(container, family_name):
    """Return one {'Family', 'Species'} record per non-empty <li> under ``container``."""
    rows = []
    for li in container.find_all('li'):
        # Keep only the first two words of the list item text
        species_name = ' '.join(li.get_text().split()[:2])
        if species_name:
            rows.append({'Family': family_name, 'Species': species_name})
    return rows


def scrape_wikipedia_to_csv(wikipedia_url, timeout=30):
    """Scrape a Wikipedia species list into a DataFrame.

    Despite its name, this function does not write a CSV file; it returns
    the scraped table and leaves saving to the caller.

    For every <h2> that contains a ``span.mw-headline``, the headline text is
    used as the family name. List items are then collected from the following
    sibling elements up to the next <h2>: from <ul> elements, and from the
    direct child <div>s of any <div> whose class list contains 'columns'.
    Each list item is reduced to its first two words.

    Args:
        wikipedia_url: URL of the page to scrape.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        DataFrame with the columns 'Family' and 'Species'. The columns are
        present even when no species were found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(wikipedia_url, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    data = []

    # Loop through each <h2> section to extract species
    for family_section in soup.find_all('h2'):
        # NOTE(review): sections are only recognised through their
        # 'mw-headline' span; verify that the page markup still provides it.
        family_name_tag = family_section.find('span', class_='mw-headline')
        if not family_name_tag:
            continue
        family_name = family_name_tag.get_text()

        # Walk the following siblings until the next <h2>; anything that is
        # not a <ul> or a 'columns' <div> (e.g. sub-headings) is skipped.
        next_node = family_section.find_next_sibling()
        while next_node and next_node.name != 'h2':
            if next_node.name == 'div' and 'columns' in next_node.get('class', []):
                for column in next_node.find_all('div', recursive=False):
                    data.extend(_species_rows(column, family_name))
            elif next_node.name == 'ul':
                data.extend(_species_rows(next_node, family_name))
            next_node = next_node.find_next_sibling()

    # Fix the column names explicitly so an empty scrape still yields a frame
    # the caller can select 'Family' and 'Species' from.
    return pd.DataFrame(data, columns=['Family', 'Species'])

In [None]:
# Scrape the Wikipedia list of moths of Madagascar into a DataFrame
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_moths_of_Madagascar"

mad_df2 = scrape_wikipedia_to_csv(wikipedia_url=wikipedia_url)

In [None]:
# Add empty 'Genus' and 'Authority' columns to the scraped table
mad_df2["Genus"] = ""
mad_df2["Authority"] = ""

mad_df2.head(n=5)

In [None]:
# Bring both Madagascar tables to the same four columns and stack them,
# mad_df1 rows first, with a fresh index.
mad_df2 = mad_df2[["Family", "Genus", "Species", "Authority"]]

mad_df1 = mad_df1[["family", "genus", "species", "Authority"]].set_axis(
    mad_df2.columns, axis=1
)

df = pd.concat([mad_df1, mad_df2], ignore_index=True)
df.head(n=5)

In [None]:
print(df.shape)

# Remove rows with missing species names
df = df.dropna(subset=['Species'])

# De-duplicate on the species name alone. The Wikipedia rows carry an empty
# 'Genus', so a key that includes 'Genus' cannot match them against rows that
# have the genus filled in, and a species listed in both sources was kept
# twice. drop_duplicates keeps the first occurrence, i.e. the row that comes
# first in the concatenated frame.
df = df.drop_duplicates(subset=['Species'])

df.shape

In [None]:
# Save the combined Madagascar list in five parts, then as one complete file
split_dataframe(df, 5, "../species_checklists/", checklist_name)

df.to_csv(
    os.path.join("../species_checklists/", checklist_name + "-preprocessed.csv"),
    index=False,
)

## Anguilla

List provided by David Roy on 21/5/24.

In [None]:
# Load the raw Anguilla checklist (comma-separated, latin-1 encoded)
checklist_name = "anguilla-moths"

df = pd.read_csv(
    os.path.join("../species_checklists", checklist_name + ".csv"),
    sep=",",
    encoding="latin-1",
)

df.head(n=5)


In [None]:
# This list gets an empty 'Genus' column
df["Genus"] = ""

# Build the standard output columns for Anguilla. Because 'Genus' is empty,
# joining it with a space would leave a leading blank on every name; strip it.
df["species_name_provided"] = (
    df["Genus"].fillna('') + " " + df["Species"].fillna('')
).str.strip()
df["authority_name_provided"] = ""

# Keep only the columns needed in the output file, in this order
df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]
df.head()

In [None]:
# Write the preprocessed Anguilla checklist, without the index column
df.to_csv(
    os.path.join("../species_checklists/", checklist_name + "-preprocessed.csv"),
    index=False,
)