# Preprocess the species lists

Short script to preprocess the species checklists.
It is modified for each incoming species checklist, depending on what must be done.

The aim is to transform the column names of the checklist, so that: 
- The column with species name is called "species_name_provided"
- The column with the authority is called "authority_name_provided". If such a column doesn't exist, it should be created and left blank.
- The authority column is formatted as "Lastname, year" 

In [None]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

In [None]:
def split_dataframe(df, n, output_dir, list_name):
    """Write ``df`` to ``n`` consecutive CSV part files in ``output_dir``.

    Every part holds ``len(df) // n`` rows, except the last one, which also
    takes the remaining rows. Files are named
    ``{list_name}-preprocessed-part{k}.csv`` with ``k`` starting at 1 and are
    written without the index.

    Args:
        df: DataFrame to split; rows are taken in their current order.
        n: Number of part files to write; must be at least 1.
        output_dir: Directory the part files are written to.
        list_name: Checklist name used as the file-name prefix.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    split_size = len(df) // n
    for part in range(1, n + 1):
        start_idx = (part - 1) * split_size
        # The last part runs to the end so the remainder rows are not lost
        end_idx = part * split_size if part < n else len(df)
        file_path = os.path.join(output_dir, f"{list_name}-preprocessed-part{part}.csv")
        df.iloc[start_idx:end_idx].to_csv(file_path, index=False)
        print(f"Saved part {part} to {file_path}")

In [None]:
def scrape_wikipedia_to_csv(url):
    """Scrape a Wikipedia species-list page into a DataFrame.

    Every level-2 heading (``div.mw-heading2``) is treated as a family and the
    first ``<ul>`` found after it as that family's species list. For each
    ``<li>`` the text of its first link is the species name. The authority is
    the rest of the item's text after the first em dash (or all of it when
    there is no em dash), with surrounding parentheses stripped. List items
    without a link are skipped.

    Despite the function name nothing is written to disk; the table is
    printed and returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        DataFrame with the columns ``Family``, ``Genus`` (always an empty
        string), ``Species`` and ``Authority``. Each family keeps its own
        0-based index, so index labels repeat across families.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    columns = ['Family', 'Genus', 'Species', 'Authority']
    skipped_headings = {'Contents', 'References', 'See also'}

    # A timeout keeps the notebook from hanging if the server never answers
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    family_frames = []
    for family in soup.find_all("div", class_="mw-heading2"):
        family_text = family.get_text(strip=True).replace('[edit]', '')
        if family_text in skipped_headings:
            continue

        # Start from empty lists for every family so that a heading without a
        # list cannot re-use the rows collected for the previous family.
        species = []
        auth = []

        # NOTE: find_next looks through the rest of the document, so a heading
        # with no list of its own picks up the next list on the page.
        bullet_list = family.find_next("ul")
        if bullet_list:
            for li in bullet_list.find_all("li"):
                links = li.find_all("a")
                if not links:
                    # No link means no recognisable species name
                    continue
                name = links[0].get_text(strip=True).strip()
                remainder = li.get_text(strip=True).replace(name, ' ')
                authority = remainder.split('—', 1)[-1].strip().strip("()")
                species.append(name)
                auth.append(authority)

        if species:
            family_frames.append(pd.DataFrame({
                'Family': [family_text] * len(species),
                'Genus': [''] * len(species),
                'Species': species,
                'Authority': auth,
            }, columns=columns))

    # Concatenate once at the end; an empty page still yields the four columns
    if family_frames:
        data = pd.concat(family_frames)
    else:
        data = pd.DataFrame(columns=columns)

    print(data)
    return data

In [None]:
# We want to remove all butterflies from the lists. These are the butterfly families:
butterfly_families = ['Papilionidae', 'Nymphalidae', 'Pieridae', 'Lycaenidae', 'Riodinidae', 'Hesperiidae']


def remove_butterflies(df, family_column='Family'):
    """Return a copy of ``df`` without the rows of the butterfly families.

    Rows whose ``family_column`` value is listed in ``butterfly_families`` are
    dropped; all other rows (including those with a missing family) are kept
    with their original index labels. The row counts before and after are
    printed.

    Args:
        df: Checklist DataFrame.
        family_column: Name of the column holding the family name.

    Returns:
        A new, independent DataFrame, so callers can add columns to it
        without pandas warning about assignment on a slice.
    """
    og_count = df.shape[0]
    print(f'Currently {og_count} species')
    is_butterfly = df[family_column].isin(butterfly_families)
    moths = df[~is_butterfly].copy()
    print(f'Now {moths.shape[0]} species, {og_count - moths.shape[0]} butterfly species removed.')
    return moths

## Costa Rica

In [None]:
# Load the raw Costa Rica checklist
checklist_name = "costarica-moths"

raw_csv = os.path.join("../species_checklists", checklist_name+".csv")
df = pd.read_csv(raw_csv, sep=',', encoding='latin-1')

# Rename the columns by position (the file must have exactly four)
df.columns = ['Family', 'Genus', 'Species', 'Subspecies']

In [None]:
# Preview the first rows of the loaded checklist
df.head()

In [None]:
# Build the standard columns: "<Genus> <Species>" (missing parts become empty
# strings) and a blank authority, because this checklist has no authority column.
genus_part = df["Genus"].fillna('')
species_part = df["Species"].fillna('')
df["species_name_provided"] = genus_part + " " + species_part
df["authority_name_provided"] = ""

In [None]:
# Remove round and square brackets from the authority values.
# The pattern is a raw string so that the regex escapes are not read as
# (invalid) string escape sequences.
df['authority_name_provided'] = df['authority_name_provided'].replace(r'[\(\)\[\]]', '', regex=True)

In [None]:
# Drop the butterfly families (family names are in the default 'Family' column)
df = remove_butterflies(df)

In [None]:
# The Costa Rica list was too long for a single API call, so it is also
# written out as three part files.
split_dataframe(
    df,
    n=3,
    output_dir="../species_checklists",
    list_name=checklist_name,
)


In [None]:
# Write the complete preprocessed checklist, without the index column
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

## For UK moths 

This file is in a different format

In [None]:
# Load the raw UK checklist
checklist_name = "uksi-moths"

raw_csv = os.path.join("../species_checklists", checklist_name+".csv")
df = pd.read_csv(raw_csv, sep=',', encoding='latin-1')

# Add a blank Genus column
df["Genus"] = ''

In [None]:
# Preview the first rows of the UK checklist
df.head()

In [None]:
# Build the standard columns for the UK list. "Genus" is blank for this
# checklist, so strip the separator space that would otherwise lead the name.
df["species_name_provided"] = (df["Genus"].fillna('') + " " + df["taxon"].fillna('')).str.strip()

# Remove round and square brackets from the authority (raw string so the
# regex escapes are not read as invalid string escape sequences).
df["authority_name_provided"] = df['preferred_authority'].replace(r'[\(\)\[\]]', '', regex=True)

In [None]:
# Drop the butterfly families; this list keeps the family name in 'family_taxon'
df = remove_butterflies(df, 'family_taxon')

In [None]:
# Preview the checklist after removing the butterflies
df.head()

In [None]:
# Write the preprocessed UK checklist, without the index column
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

## Thailand

In [None]:
# Load the raw Thailand checklist
checklist_name = "thailand-moths"

raw_csv = os.path.join("../species_checklists", checklist_name+".csv")
df = pd.read_csv(raw_csv, sep=',', encoding='latin-1')

# Positional renaming is currently disabled:
#df.columns=['Superfamily', 'Family', 'Genus', 'Species']


In [None]:
# Show the scientific names made up of more than two words
df.loc[df['scientific_name'].str.split().str.len() > 2, 'scientific_name']

In [None]:
# Keep one row per scientific name
df = df.drop_duplicates(subset='scientific_name')

# Keep only names of at least two words and truncate them to the first two.
# The filtered rows are copied so that the column assignments below act on an
# independent frame rather than on a slice (avoids SettingWithCopyWarning).
name_words = df['scientific_name'].str.split()
has_two_words = name_words.str.len() > 1
df = df[has_two_words].copy()
df['scientific_name'] = name_words[has_two_words].str[:2].str.join(' ')

df['Species'] = df['scientific_name']
df['Genus'] = df['taxon_genus_name']

# NOTE(review): 'Species' holds the two-word name built above, so prefixing
# 'Genus' gives a three-word name; if taxon_genus_name equals the first word
# the genus is duplicated -- confirm this is what the name matching expects.
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = ""

In [None]:
# Preview the Thailand checklist with the standard columns added
df.head()

In [None]:
# Drop the butterfly families; this list keeps the family name in 'taxon_family_name'
df = remove_butterflies(df, 'taxon_family_name')

In [None]:
# Write the preprocessed Thailand checklist, without the index column
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

## Madagascar

This comes from two sources: 
1. Moths from GBIF using the filter: 
    ```json
    {
    "and" : [
        "BasisOfRecord is one of (Human Observation, Specimen)",
        "Country is Madagascar",
        "OccurrenceStatus is Present",
        "TaxonKey is Lepidoptera"
    ]
    }
    ```
2. From Wikipedia: https://en.wikipedia.org/wiki/List_of_moths_of_Madagascar

### 1. From GBIF

In [None]:
# Load the GBIF download for Madagascar (tab-separated)
checklist_name = "madagascar-moths"

gbif_path = os.path.join("../species_checklists", checklist_name+"1.csv")
mad_df1 = pd.read_csv(gbif_path, sep='\t', encoding='latin-1')

mad_df1.head()

In [None]:
# Keep only the Lepidoptera records. The selection is copied so the new
# columns below are added to an independent frame rather than to a slice of
# the raw table (avoids SettingWithCopyWarning).
mad_df1 = mad_df1.loc[mad_df1['order'] == 'Lepidoptera'].copy()

# Authority = everything from the third word of scientificName onwards
mad_df1['Authority'] = mad_df1['scientificName'].str.split().str[2:].str.join(' ')

mad_df1["species_name_provided"] = mad_df1["genus"].fillna('') + " " + mad_df1["species"].fillna('')
mad_df1["authority_name_provided"] = mad_df1['Authority']

mad_df1.head()

### 2. From Wikipedia

In [None]:
# Wikipedia page listing the moth species of Madagascar by family
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Madagascar'

# Scrape the page into a Family / Genus / Species / Authority table
mad_df2 = scrape_wikipedia_to_csv(wikipedia_url)

In [None]:
# Display the scraped Wikipedia table
mad_df2

### 3. Combine

In [None]:
# Stack the GBIF table (mad_df1) on top of the Wikipedia table (mad_df2),
# using the Wikipedia column names for both.
shared_columns = ['Family', 'Genus', 'Species', 'Authority']
mad_df2 = mad_df2[shared_columns]

mad_df1 = mad_df1[['family', 'genus', 'species', 'Authority']].rename(
    columns={'family': 'Family', 'genus': 'Genus', 'species': 'Species'}
)

df = pd.concat([mad_df1, mad_df2], ignore_index=True)
df.head()

In [None]:
print(df.shape)

# Keep one row per Family/Genus/Species combination, then drop the rows
# that have no species name.
df = (
    df.drop_duplicates(subset=['Family', 'Genus', 'Species'])
      .dropna(subset=['Species'])
)

df.shape

In [None]:
# Drop the butterfly families (family names are in the default 'Family' column)
df = remove_butterflies(df)

In [None]:
# Write the checklist as five part files ...
split_dataframe(
    df=df,
    n=5,
    output_dir="../species_checklists/",
    list_name=checklist_name,
)

# ... and as one complete file, without the index column
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

## Anguilla

List provided by David Roy on 21/5/24.
Updated list on 24/10/24

In [None]:
# Load the original Anguilla checklist
checklist_name = "anguilla-moths"

raw_csv = os.path.join("../species_checklists", checklist_name+".csv")
df = pd.read_csv(raw_csv, sep=',', encoding='latin-1')

df.head()

In [None]:
# Load the updated Anguilla list
df_update = pd.read_csv(os.path.join("../species_checklists",
                            checklist_name+"_update.csv"),
                sep=',', encoding='latin-1')

# Set the remaining checklist columns to empty strings so this frame offers
# the same columns as the original list.
df_update["Genus"] = ""
df_update["Family"] = ""
df_update["Subfamily"] = ""
df_update["GBIF accepted name"] = ""

# "Genus" is blank, so strip the separator space that would otherwise lead the name
df_update["species_name_provided"] = (df_update["Genus"].fillna('') + " " + df_update["Species"].fillna('')).str.strip()
df_update["authority_name_provided"] = ""

df_update.head()

In [None]:
df["Genus"] = ""

# Build the standard columns. "Genus" is blank, so strip the separator space
# that would otherwise lead the name.
df["species_name_provided"] = (df["Genus"].fillna('') + " " + df["Species"].fillna('')).str.strip()
df["authority_name_provided"] = ""

# Keep only the columns needed downstream, in a fixed order
df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]
df.head()

In [None]:
# Combine the original and the updated list (same columns, same order)
df_combined = pd.concat([df, df_update[list(df.columns)]])

# Remove the literal " sp." marker from the provided names.
# regex=False keeps "." literal on every pandas version (before pandas 2.0 the
# default treated the pattern as a regular expression, where "." matches any
# character). Names without " sp." are left unchanged, so no mask is needed.
df_combined['species_name_provided'] = df_combined['species_name_provided'].str.replace(' sp.', '', regex=False)


df_combined.head()

In [None]:
# Shape before de-duplication
print(df_combined.shape)

# Keep the first row for every 'Species' value; rows of the original list come
# first in df_combined, so they win over rows from the update.
df_combined = df_combined.drop_duplicates(subset='Species', keep="first")
# Shape after de-duplication
print(df_combined.shape)

In [None]:
# Drop the butterfly families (family names are in the default 'Family' column)
df_combined = remove_butterflies(df_combined)

In [None]:
# Write the combined, preprocessed Anguilla checklist, without the index column
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df_combined.to_csv(preprocessed_csv, index=False)

## Kenya and Uganda

#### GBIF download

From: https://www.gbif.org/occurrence/download?continent=AFRICA&country=KE&country=UG&taxon_key=797&advanced=1

In [None]:
# Load the GBIF download for Kenya and Uganda (tab-separated)
ku_df1 = pd.read_csv(os.path.join("../species_checklists", "kenya-uganda-gbif-moths.tsv"),
                sep='\t', encoding='latin-1')

# Authority = everything from the third word of scientificName onwards
ku_df1['Authority'] = ku_df1['scientificName'].str.split().str[2:].str.join(' ')
ku_df1['Source'] = 'gbif'

# Only these columns are kept; the standard name/authority columns are built
# after the sources have been combined, so they are not computed here.
ku_df1 = ku_df1[['family', 'genus', 'species', 'Authority', 'Source']]

ku_df1.head()

#### From Wikipedia

In [None]:
# Wikipedia species lists for the two countries
kenya_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Kenya'
uganda_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Uganda'

kenya_df_wiki = scrape_wikipedia_to_csv(kenya_url)
uganda_df_wiki = scrape_wikipedia_to_csv(uganda_url)

# Stack both tables and tag every row with its source
ku_df2 = pd.concat([kenya_df_wiki, uganda_df_wiki], ignore_index=True).assign(Source='wiki')
ku_df2.head()

In [None]:
# Give the GBIF table the same column names as the Wikipedia table, then stack them
ku_df1 = ku_df1.rename(columns={'family': 'Family', 'genus': 'Genus', 'species': 'Species'})
df = pd.concat([ku_df1, ku_df2], ignore_index=True)

# Wikipedia rows have an empty Genus, so strip the separator space that would
# otherwise lead their names.
# NOTE(review): if 'Species' already contains the genus (as its re-use for
# "GBIF accepted name" suggests), GBIF rows repeat the genus -- confirm.
df["species_name_provided"] = (df["Genus"].fillna('') + " " + df["Species"].fillna('')).str.strip()
df["authority_name_provided"] = df['Authority']
df['Subfamily'] = ""
df["GBIF accepted name"] = df["Species"].fillna('')

# NOTE(review): unlike the Madagascar, Japan and Nigeria sections, duplicates
# and rows without a species name are not removed here -- confirm this is intended.
df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]

df = remove_butterflies(df)

In [None]:
# Write the preprocessed Kenya/Uganda checklist, without the index column
checklist_name = 'kenya_uganda-gbif-moths'
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

df.head()

## Japan

#### GBIF download

From: https://www.gbif.org/occurrence/download?continent=ASIA&country=JA&taxon_key=797&advanced=1

In [None]:
# Load the GBIF download for Japan (tab-separated)
gbif_path = os.path.join("../species_checklists", "japan-gbif-moths.tsv")
jp_df1 = pd.read_csv(gbif_path, sep='\t', encoding='latin-1')

# Authority = everything from the third word of scientificName onwards
name_words = jp_df1['scientificName'].str.split()
jp_df1['Authority'] = name_words.str[2:].str.join(' ')

jp_df1["species_name_provided"] = jp_df1["genus"].fillna('') + " " + jp_df1["species"].fillna('')
jp_df1["authority_name_provided"] = jp_df1['Authority']
jp_df1['Source'] = 'gbif'
jp_df1.head()

#### From Wikipedia

In [None]:
# The Japanese list is spread over several Wikipedia pages, one per group of
# superfamilies; the parent page
# 'https://en.wikipedia.org/wiki/List_of_moths_of_Japan' itself is left out.
japan_urls = [
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Noctuoidea)',
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Bombycoidea-Geometroidea)',
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Pyraloidea-Drepanoidea)',
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Choreutoidea-Thyridoidea)',
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Zygaenoidea-Tortricoidea)',
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Gelechioidea)',
    'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Micropterigoidea-Yponomeutoidea)',
]

# Scrape every page, then stack the tables with a fresh running index
wiki_frames = []
for url in japan_urls:
    print(url)
    wiki_frames.append(scrape_wikipedia_to_csv(url))

jp_df2 = pd.concat(wiki_frames, ignore_index=True)

jp_df2['Source'] = 'wiki'
jp_df2.head()

#### From Jenna's list

In [None]:
url = 'http://listmj.mothprog.com/list.html'

# Fetch the page; the timeout keeps the cell from hanging if the server
# never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Stop here if the request was not successful

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# One entry per species found on the page
families, genera, species_list = [], [], []

# Most recently seen family and genus; they apply to the species that follow
current_family = None
current_genus = None

# Walk through every tag in document order and react to its CSS class
for tag in soup.find_all(True):
    tag_classes = tag.get("class", [])
    if 'family' in tag_classes:
        # A family without a name span is recorded as 'undefined', in the same
        # way as a genus without one, instead of raising AttributeError.
        family_span = tag.find("span", class_="highername")
        current_family = family_span.get_text(strip=True) if family_span is not None else 'undefined'
    elif 'genus' in tag_classes:
        genus_span = tag.find("span", class_="genusname")
        current_genus = genus_span.get_text(strip=True) if genus_span is not None else 'undefined'
    elif 'species' in tag_classes:
        # Use the scientific-name span when present, otherwise keep the whole
        # text of the tag, marked as undefined.
        name_span = tag.find("span", class_="sciname")
        if name_span is not None:
            species_name = name_span.get_text(strip=True)
        else:
            species_name = 'Undefined: ' + tag.get_text(strip=True)

        families.append(current_family)
        genera.append(current_genus)
        species_list.append(species_name)

# Create DataFrame
jp_df3 = pd.DataFrame({
    "Family": families,
    "Genus": genera,
    "Species": species_list
})

# This source provides no authority
jp_df3['Authority'] = ''
jp_df3['Source'] = 'List-MJ'

jp_df3

#### combine df1, df2 and df3

In [None]:
# Reduce the three sources to the same five columns; the GBIF table gets the
# capitalised column names used by the other two.
shared_columns = ['Family', 'Genus', 'Species', 'Authority', 'Source']
jp_df1 = jp_df1[['family', 'genus', 'species', 'Authority', 'Source']].rename(
    columns={'family': 'Family', 'genus': 'Genus', 'species': 'Species'}
)
jp_df2 = jp_df2[shared_columns]
jp_df3 = jp_df3[shared_columns]

In [None]:
# Stack the three sources (GBIF, Wikipedia, List-MJ) and show the rows per source
df = pd.concat([jp_df1, jp_df2, jp_df3], ignore_index=True)
print(df['Source'].value_counts())

# Remove duplicated rows based on family, genus and species; keep='last' keeps
# the row that comes latest in the stacking order above.
df = df.drop_duplicates(subset=['Family', 'Genus', 'Species'], keep='last')

# Remove rows with missing species names
df = df.dropna(subset=['Species'])

# Drop the butterfly families (default 'Family' column)
df = remove_butterflies(df)

In [None]:
# Preview the combined Japan checklist
df.head()

In [None]:
# Build the standard columns. assign() returns a new frame, so nothing is
# written to a slice of the filtered table. Rows with an empty Genus (the
# Wikipedia rows) would get a leading separator space, hence the strip.
# NOTE(review): if 'Species' already contains the genus (as its re-use for
# "GBIF accepted name" suggests), rows with a Genus repeat it -- confirm.
df = df.assign(**{
    "species_name_provided": (df["Genus"].fillna('') + " " + df["Species"].fillna('')).str.strip(),
    "authority_name_provided": df['Authority'],
    "Subfamily": "",
    "GBIF accepted name": df["Species"].fillna(''),
})

df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]

In [None]:
# Shape before filtering
print(df.shape)
# Keep only rows that have a species name
df = df[df['Species'].notnull()]
print(df.shape)
# Drop the rows whose genus was recorded as "undefined"
df = df[df['Genus'] != "undefined"]
print(df.shape)

In [None]:
# Write the preprocessed Japan checklist, without the index column
checklist_name = "japan-moths"
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

In [None]:
# Preview the final Japan checklist
df.head()

## Nigeria

#### GBIF download

From: https://www.gbif.org/occurrence/download?continent=AFRICA&country=NG&taxon_key=797&advanced=1

In [None]:
# Load the GBIF download for Nigeria (tab-separated)
gbif_path = os.path.join("../species_checklists", "nigeria-gbif-moths.tsv")
ng_df1 = pd.read_csv(gbif_path, sep='\t', encoding='latin-1')

# Authority = everything from the third word of scientificName onwards
name_words = ng_df1['scientificName'].str.split()
ng_df1['Authority'] = name_words.str[2:].str.join(' ')

ng_df1["species_name_provided"] = ng_df1["genus"].fillna('') + " " + ng_df1["species"].fillna('')
ng_df1["authority_name_provided"] = ng_df1['Authority']
ng_df1['Source'] = 'gbif'
ng_df1.head()

#### From Wikipedia

In [None]:
# Wikipedia page listing the moth species of Nigeria
nigeria_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Nigeria'

# Scrape the page into a Family / Genus / Species / Authority table
ng_df2 = scrape_wikipedia_to_csv(nigeria_url)

# Tag every row with its source
ng_df2['Source'] = 'wiki'
ng_df2.head()

#### combine df1 and df2

In [None]:
# Reduce both sources to the same five columns; the GBIF table gets the
# capitalised column names used by the Wikipedia table.
ng_df2 = ng_df2[['Family', 'Genus', 'Species', 'Authority', 'Source']]
ng_df1 = ng_df1[['family', 'genus', 'species', 'Authority', 'Source']].rename(
    columns={'family': 'Family', 'genus': 'Genus', 'species': 'Species'}
)

In [None]:
# Stack the two sources (GBIF first, then Wikipedia) and show the rows per source
df = pd.concat([ng_df1, ng_df2], ignore_index=True)
print(df['Source'].value_counts())

# Remove duplicated rows based on family, genus and species; keep='first'
# keeps the row that comes earliest in the stacking order above.
df = df.drop_duplicates(subset=['Family', 'Genus', 'Species'], keep='first')

# Remove rows with missing species names
df = df.dropna(subset=['Species'])

# Drop the butterfly families (default 'Family' column)
df = remove_butterflies(df)

# Rows per source after the clean-up
print(df['Source'].value_counts())

In [None]:
# Preview the combined Nigeria checklist
df.head()

In [None]:
# Build the standard columns for the Nigeria list. assign() returns a new
# frame, so nothing is written to a slice of the filtered table. Rows with an
# empty Genus (the Wikipedia rows) would get a leading separator space, hence
# the strip.
# NOTE(review): if 'Species' already contains the genus (as its re-use for
# "GBIF accepted name" suggests), rows with a Genus repeat it -- confirm.
df = df.assign(**{
    "species_name_provided": (df["Genus"].fillna('') + " " + df["Species"].fillna('')).str.strip(),
    "authority_name_provided": df['Authority'],
    "Subfamily": "",
    "GBIF accepted name": df["Species"].fillna(''),
})

df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]

In [None]:
# Write the preprocessed Nigeria checklist, without the index column
checklist_name = "nigeria-moths"
preprocessed_csv = os.path.join("../species_checklists/", checklist_name+"-preprocessed.csv")
df.to_csv(preprocessed_csv, index=False)

In [None]:
# Preview the final Nigeria checklist
df.head()