# Preprocess the species lists

Short script to preprocess the species checklist.
Will be modified for each incoming species checklist, depending on what must be done. 

The aim is to transform the column names of the checklist, so that: 
- The column with species name is called "species_name_provided"
- The column with the authority is called "authority_name_provided". If such column doesn't exist, it should be created and left blank.
- The authority column is formatted as "Lastname, year" 

In [1]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

In [2]:
def split_dataframe(df, n, output_dir, list_name):
    split_size = len(df) // n
    for i in range(n):
        start_idx = i * split_size
        # Ensure the last part includes any remaining rows
        end_idx = (i + 1) * split_size if i < n - 1 else len(df)
        df_part = df.iloc[start_idx:end_idx]
        file_path = os.path.join(output_dir, f"{list_name}-preprocessed-part{i + 1}.csv")
        df_part.to_csv(file_path, index=False)
        print(f"Saved part {i + 1} to {file_path}")

In [65]:
def scrape_wikipedia_to_csv(url):
    # Send an HTTP GET request to fetch the content
    response = requests.get(url)
    response.raise_for_status()  # Check for request errors

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all <div> elements with class "mw-heading mw-heading2"
    families = soup.find_all("div", class_="mw-heading2")

    # Dictionary to store each heading and its corresponding list items
    data = pd.DataFrame()

    # Loop through each heading and capture the associated bullet points
    for family in families:
        # Get the heading text
        family_text = family.get_text(strip=True).replace('[edit]', '')
        
        if family_text in ['Contents', 'References', 'See also']:
            continue

        # Find the next <ul> element (the bullet list after the heading)
        bullet_list = family.find_next("ul")

        # Collect list items if a <ul> is found
        items = []
        if bullet_list:
            auth = []
            species = []
            for li in bullet_list.find_all("li"):

                spec = [x.get_text(strip=True).strip() for x in li.find_all("a")]
                
                if spec == []: 
                    spec = ['formatting error']
                    spec_auth = ['formatting error']
                else:
                    spec = [spec[0]]
                    spec_auth = [li.get_text(strip=True).replace(str(spec[0]), ' ').split('—', 1)[-1].strip().strip("()")]
                    species = species + spec

                    if spec_auth == []: 
                        spec_auth = ['formatting error']
                    
                    auth = auth + spec_auth
                    
                    if len(auth) != len(species):
                        print(spec)
                    


        df_dict = {'Family': [family_text] * len(auth), 'Genus': [''] * len(auth), 'Species': species, 'Authority': auth}
        # print(len(df_dict['Family']), len(df_dict['Genus']), len(df_dict['Species']), len(df_dict['Authority']))
        fam_df = pd.DataFrame(df_dict)
        data = pd.concat([data, fam_df])

    data = data.loc[data['Species'] != "formatting error", ]
    
    return data

## Costa Rica

In [None]:
# Read the species checklist
checklist_name = "costarica-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')

df.columns = ['Family', 'Genus', 'Species', 'Subspecies']

In [None]:
df.head()

In [None]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')

df["authority_name_provided"] = ""

In [None]:
# Edit the column names to remove [] and ()
df['authority_name_provided'] = df['authority_name_provided'].replace('[\(\)\[\]]', '', regex=True)

In [None]:
# Costa Rica list was too long for the API call. So had to split in 3 parts:
split_dataframe(df, n=3, 
                output_dir="../species_checklists", 
                list_name=checklist_name)
# # Determine the split indices
# split1 = len(df) // 3
# split2 = 2 * split1

# # Split the DataFrame into three parts
# df1 = df.iloc[:split1]
# df2 = df.iloc[split1:split2]
# df3 = df.iloc[split2:]

In [None]:
# # Save the three parts to separate CSV files
# df1.to_csv(os.path.join("../species_checklists/", 
#                        checklist_name+"-preprocessed-part1.csv"),
#           index=False)
# df2.to_csv(os.path.join("../species_checklists/", 
#                        checklist_name+"-preprocessed-part2.csv"),
#           index=False)
# df3.to_csv(os.path.join("../species_checklists/", 
#                        checklist_name+"-preprocessed-part3.csv"),
#           index=False)

In [None]:
# Save the csv file
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## For UK moths 

This file is in a different format

In [None]:
# Read the species checklist
checklist_name = "uksi-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                 sep=',', encoding='latin-1')

df["Genus"] = ''

In [None]:
df.head()

In [None]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["taxon"].fillna('')

df["authority_name_provided"] = df['preferred_authority'].replace('[\(\)\[\]]', '', regex=True)

In [None]:
df.head()

In [None]:
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## Thailand

In [None]:
# Read the species checklist
checklist_name = "thailand-moths"

df = pd.read_csv(os.path.join("../species_checklists", checklist_name+".csv"),
                sep=',', encoding='latin-1')

#df.columns=['Superfamily', 'Family', 'Genus', 'Species']


In [None]:
df.loc[df['scientific_name'].str.split().str.len() > 2, 'scientific_name']

In [None]:
# remove duplicates in the scientific_name column
df = df.drop_duplicates(subset='scientific_name')

# only keep rows where the scientific_name is two words or more and keep the first two
df = df[df['scientific_name'].str.split().str.len() > 1]
df['scientific_name'] = df['scientific_name'].str.split().str[:2].str.join(' ')

#df = df[['scientific_name', 'taxon_id']]
df['Species']  = df['scientific_name']
df['Genus']  = df['taxon_genus_name']

df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = ""

In [None]:
df.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

## Madagascar

This comes from two sources: 
1. Moths from GBIF using the filter: 
    ```json
    {
    "and" : [
        "BasisOfRecord is one of (Human Observation, Specimen)",
        "Country is Madagascar",
        "OccurrenceStatus is Present",
        "TaxonKey is Lepidoptera"
    ]
    }
    ```
2. From Wikipedia: https://en.wikipedia.org/wiki/List_of_moths_of_Madagascar

### 1. From GBIF

In [6]:
# Read the species checklist
checklist_name = "madagascar-moths"

mad_df1 = pd.read_csv(os.path.join("../species_checklists",
                            checklist_name+"1.csv"),
                sep='\t', encoding='latin-1')

mad_df1.head()

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,classKey,order,orderKey,family,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory
0,1763874,"Acrapex sogai Viette, 1968",1763874,"Acrapex sogai Viette, 1968",4,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,216,Lepidoptera,797,Noctuidae,7015.0,Acrapex,1763850.0,Acrapex sogai,1763874.0,NE
1,1772579,"Ametropalpis nasuta Mabille, 1884",1772579,"Ametropalpis nasuta Mabille, 1884",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,216,Lepidoptera,797,Erebidae,4532185.0,Ametropalpis,1772578.0,Ametropalpis nasuta,1772579.0,NE
2,1851527,"Hypatima perinetella Viette, 1957",1851527,"Hypatima perinetella Viette, 1957",3,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,216,Lepidoptera,797,Gelechiidae,3553.0,Hypatima,1851496.0,Hypatima perinetella,1851527.0,NE
3,1886069,"Heliothela GuenÃ©e, 1854",1886069,"Heliothela GuenÃ©e, 1854",1,GENUS,ACCEPTED,Animalia,1,Arthropoda,...,216,Lepidoptera,797,Crambidae,8841.0,Heliothela,1886069.0,,,
4,1941060,"Malgassesia Le Cerf, 1922",1941060,"Malgassesia Le Cerf, 1922",20,GENUS,ACCEPTED,Animalia,1,Arthropoda,...,216,Lepidoptera,797,Sesiidae,5340.0,Malgassesia,1941060.0,,,


In [7]:
mad_df1 = mad_df1.loc[mad_df1['order'] == 'Lepidoptera', ]

# create a column made up from the third word in column onwards
mad_df1['Authority'] = mad_df1['scientificName'].str.split().str[2:].str.join(' ')

mad_df1["species_name_provided"] = mad_df1["genus"].fillna('') + " " + mad_df1["species"].fillna('')
mad_df1["authority_name_provided"] = mad_df1['Authority']

mad_df1.head()

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,family,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory,Authority,species_name_provided,authority_name_provided
0,1763874,"Acrapex sogai Viette, 1968",1763874,"Acrapex sogai Viette, 1968",4,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,Noctuidae,7015.0,Acrapex,1763850.0,Acrapex sogai,1763874.0,NE,"Viette, 1968",Acrapex Acrapex sogai,"Viette, 1968"
1,1772579,"Ametropalpis nasuta Mabille, 1884",1772579,"Ametropalpis nasuta Mabille, 1884",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,Erebidae,4532185.0,Ametropalpis,1772578.0,Ametropalpis nasuta,1772579.0,NE,"Mabille, 1884",Ametropalpis Ametropalpis nasuta,"Mabille, 1884"
2,1851527,"Hypatima perinetella Viette, 1957",1851527,"Hypatima perinetella Viette, 1957",3,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,Gelechiidae,3553.0,Hypatima,1851496.0,Hypatima perinetella,1851527.0,NE,"Viette, 1957",Hypatima Hypatima perinetella,"Viette, 1957"
3,1886069,"Heliothela GuenÃ©e, 1854",1886069,"Heliothela GuenÃ©e, 1854",1,GENUS,ACCEPTED,Animalia,1,Arthropoda,...,Crambidae,8841.0,Heliothela,1886069.0,,,,1854,Heliothela,1854
4,1941060,"Malgassesia Le Cerf, 1922",1941060,"Malgassesia Le Cerf, 1922",20,GENUS,ACCEPTED,Animalia,1,Arthropoda,...,Sesiidae,5340.0,Malgassesia,1941060.0,,,,"Cerf, 1922",Malgassesia,"Cerf, 1922"


### 2. From Wikipedia

In [136]:
# Wikipedia URL for the moth species by family
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Madagascar'

# mad_df2 = scrape_wikipedia_to_csv(wikipedia_url)
mad_df2 = scrape_wikipedia_to_csv(wikipedia_url)

In [137]:
mad_df2

Unnamed: 0,Family,Genus,Species,Authority
0,Adelidae,,Adela gymnota,"(Meyrick, 1912)"
1,Adelidae,,Adela janineae,"(Viette, 1954)"
2,Adelidae,,Adela tsaratanana,"(Viette, 1954)"
0,Alucitidae,,Alucita decaryella,"(Viette, 1956)"
1,Alucitidae,,Alucita euscripta,"Minet, 1976"
...,...,...,...,...
0,Zygaenidae,,Ankasocris striatus,"Viette, 1965"
1,Zygaenidae,,Ischnusia culiculina,"(Mabille, 1878)"
2,Zygaenidae,,Madaprocris minetorum,"Viette, 1978"
3,Zygaenidae,,Sthenoprocris brondeli,"Viette, 1978"


#### 3. Combine

In [None]:
# combine df and mad_df2
mad_df2 = mad_df2[['Family', 'Genus', 'Species', 'Authority']]

mad_df1 = mad_df1[['family', 'genus', 'species', 'Authority']]
mad_df1.columns = mad_df2.columns

df = pd.concat([mad_df1, mad_df2], ignore_index=True)
df.head()

In [None]:
print(df.shape)

# remove duplicated rows based on family, genus and species
df = df.drop_duplicates(subset=['Family', 'Genus', 'Species'])

# remove rows with missing species names
df = df.dropna(subset=['Species'])

df.shape

In [None]:
split_dataframe(df=df, n=5, output_dir="../species_checklists/", list_name=checklist_name)

df.to_csv(os.path.join("../species_checklists/",
                    checklist_name+"-preprocessed.csv"),
        index=False)

## Anguilla

List provided by David Roy on 21/5/24.
Updated list on 24/10/24

In [4]:
# Read the species checklist
checklist_name = "anguilla-moths"

df = pd.read_csv(os.path.join("../species_checklists",
                            checklist_name+".csv"),
                sep=',', encoding='latin-1')

df.head()

Unnamed: 0,ï»¿Superfamily,Family,Subfamily,Species,GBIF accepted name,Authority (GBIF),Common name,Comments
0,Bombycoidea,Sphingidae,Magroglossinae,Aellopos tantalus,Aellopos tantalus,"Linnaeus, 1758",Tantalus Sphinx,
1,Bombycoidea,Sphingidae,Magroglossinae,Enyo lugubris,Enyo lugubris,"Linnaeus, 1771",Mournful Sphinx,
2,Bombycoidea,Sphingidae,Magroglossinae,Erinnyis ello,Erinnyis ello,"Linnaeus, 1758",Ello Sphinx,
3,Bombycoidea,Sphingidae,Magroglossinae,Hyles lineata,Hyles lineata,"Fabricius, 1775",White-lined Sphinx,
4,Bombycoidea,Sphingidae,Magroglossinae,Pseudosphinx tetrio,Pseudosphinx tetrio,"Linnaeus, 1771","Tetrio sphinx, Frangipani Sphinx",


In [5]:
df_update = pd.read_csv(os.path.join("../species_checklists",
                            checklist_name+"_update.csv"),
                sep=',', encoding='latin-1')

df_update["Genus"] = ""
df_update["Family"] = ""
df_update["Subfamily"] = ""
df_update["GBIF accepted name"] = ""

df_update["species_name_provided"] = df_update["Genus"].fillna('') + " " + df_update["Species"].fillna('')
df_update["authority_name_provided"] = ""

df_update.head()

Unnamed: 0,Species,Genus,Family,Subfamily,GBIF accepted name,species_name_provided,authority_name_provided
0,Acrolophus walsinghami,,,,,Acrolophus walsinghami,
1,Aellopos tantalus,,,,,Aellopos tantalus,
2,Agrius cingulatus,,,,,Agrius cingulatus,
3,Amyna stricta,,,,,Amyna stricta,
4,Anicla inflecta,,,,,Anicla inflecta,


In [6]:
df["Genus"] = ""

# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = ""

df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]
df.head()

Unnamed: 0,Family,Subfamily,Species,Genus,GBIF accepted name,species_name_provided,authority_name_provided
0,Sphingidae,Magroglossinae,Aellopos tantalus,,Aellopos tantalus,Aellopos tantalus,
1,Sphingidae,Magroglossinae,Enyo lugubris,,Enyo lugubris,Enyo lugubris,
2,Sphingidae,Magroglossinae,Erinnyis ello,,Erinnyis ello,Erinnyis ello,
3,Sphingidae,Magroglossinae,Hyles lineata,,Hyles lineata,Hyles lineata,
4,Sphingidae,Magroglossinae,Pseudosphinx tetrio,,Pseudosphinx tetrio,Pseudosphinx tetrio,


In [7]:
# combine two lists
df_combined = pd.concat([df, df_update[list(df.columns)]])

df_combined.loc[df_combined['species_name_provided'].str.contains('sp\\.'), 'species_name_provided'] = df_combined.loc[df_combined['species_name_provided'].str.contains('sp\\.'), 'species_name_provided'].str.replace(' sp.', '')


df_combined.head()

Unnamed: 0,Family,Subfamily,Species,Genus,GBIF accepted name,species_name_provided,authority_name_provided
0,Sphingidae,Magroglossinae,Aellopos tantalus,,Aellopos tantalus,Aellopos tantalus,
1,Sphingidae,Magroglossinae,Enyo lugubris,,Enyo lugubris,Enyo lugubris,
2,Sphingidae,Magroglossinae,Erinnyis ello,,Erinnyis ello,Erinnyis ello,
3,Sphingidae,Magroglossinae,Hyles lineata,,Hyles lineata,Hyles lineata,
4,Sphingidae,Magroglossinae,Pseudosphinx tetrio,,Pseudosphinx tetrio,Pseudosphinx tetrio,


In [8]:
print(df_combined.shape)

df_combined = df_combined.drop_duplicates(subset='Species', keep="first")
print(df_combined.shape)

(277, 7)
(188, 7)


In [9]:
df_combined.to_csv(os.path.join("../species_checklists/", 
                       checklist_name+"-preprocessed.csv"),
          index=False)

# Kenya and Uganda

#### GBIF download

From: https://www.gbif.org/occurrence/download?continent=AFRICA&country=KE&country=UG&taxon_key=797&advanced=1

In [52]:
# Read the species checklist from GBIF
ku_df1 = pd.read_csv(os.path.join("../species_checklists", "kenya-uganda-gbif-moths.tsv"),
                sep='\t', encoding='latin-1')

# create a column made up from the third word in column onwards
ku_df1['Authority'] = ku_df1['scientificName'].str.split().str[2:].str.join(' ')

ku_df1["species_name_provided"] = ku_df1["genus"].fillna('') + " " + ku_df1["species"].fillna('')
ku_df1["authority_name_provided"] = ku_df1['Authority']
ku_df1['Source'] = 'gbif'
ku_df1.head()

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory,Authority,species_name_provided,authority_name_provided,Source
0,1751906,"Staphylinochrous flavida Hampson, 1919",1751906,"Staphylinochrous flavida Hampson, 1919",9,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,3562.0,Staphylinochrous,1751898.0,Staphylinochrous flavida,1751906.0,NE,"Hampson, 1919",Staphylinochrous Staphylinochrous flavida,"Hampson, 1919",gbif
1,1768047,Heraclia doenitzi GrÃ¼nberg,1767954,"Heraclia longipennis Walker, 1854",1,SPECIES,SYNONYM,Animalia,1,Arthropoda,...,7015.0,Heraclia,1767949.0,Heraclia longipennis,1767954.0,NE,GrÃ¼nberg,Heraclia Heraclia longipennis,GrÃ¼nberg,gbif
2,1797802,"Eublemma therma Hampson, 1910",1797802,"Eublemma therma Hampson, 1910",3,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,7015.0,Eublemma,9118632.0,Eublemma therma,1797802.0,NE,"Hampson, 1910",Eublemma Eublemma therma,"Hampson, 1910",gbif
3,1824415,"Psalisodes discalis Hampson, 1910",1824415,"Psalisodes discalis Hampson, 1910",3,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,7016.0,Psalisodes,1824413.0,Psalisodes discalis,1824415.0,NE,"Hampson, 1910",Psalisodes Psalisodes discalis,"Hampson, 1910",gbif
4,1937855,"Papilio zalmoxis Hewitson, 1864",1937855,"Papilio zalmoxis Hewitson, 1864",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,9417.0,Papilio,10175914.0,Papilio zalmoxis,1937855.0,NE,"Hewitson, 1864",Papilio Papilio zalmoxis,"Hewitson, 1864",gbif


#### From Wikipedia

In [53]:
kenya_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Kenya'
uganda_url = 'https://en.wikipedia.org/wiki/List_of_moths_of_Uganda'

kenya_df_wiki = scrape_wikipedia_to_csv(kenya_url)
uganda_df_wiki = scrape_wikipedia_to_csv(uganda_url)

ku_df2 = pd.concat([kenya_df_wiki, uganda_df_wiki], ignore_index=True)
ku_df2['Source'] = 'wiki'
ku_df2.head()

Unnamed: 0,Family,Genus,Species,Authority,Source
0,Alucitidae,,Alucita dohertyi,"Walsingham, 1909",wiki
1,Anomoeotidae,,Anomoeotes elegans,"Pagenstecher, 1903",wiki
2,Anomoeotidae,,Staphylinochrous holotherma,"Hampson, 1920",wiki
3,Arctiidae,,Acantharctia atriramosa,"Hampson, 1907",wiki
4,Arctiidae,,Acantharctia bivittata,"Butler, 1898",wiki


In [None]:
ku_df1.columns = ku_df2.columns
df = pd.concat([ku_df1, ku_df2], ignore_index=True)

# Japan

#### GBIF download

From: https://www.gbif.org/occurrence/download?continent=ASIA&country=JA&taxon_key=797&advanced=1

In [4]:
# Read the species checklist from GBIF
jp_df1 = pd.read_csv(os.path.join("../species_checklists", "japan-gbif-moths.tsv"),
                sep='\t', encoding='latin-1')

# create a column made up from the third word in column onwards
jp_df1['Authority'] = jp_df1['scientificName'].str.split().str[2:].str.join(' ')

jp_df1["species_name_provided"] = jp_df1["genus"].fillna('') + " " + jp_df1["species"].fillna('')
jp_df1["authority_name_provided"] = jp_df1['Authority']
jp_df1['Source'] = 'gbif'
jp_df1.head()

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory,Authority,species_name_provided,authority_name_provided,Source
0,1736947,"Gypsonoma attrita Falkovitsh, 1965",1736947,"Gypsonoma attrita Falkovitsh, 1965",8,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,5343.0,Gypsonoma,1736895.0,Gypsonoma attrita,1736947.0,NE,"Falkovitsh, 1965",Gypsonoma Gypsonoma attrita,"Falkovitsh, 1965",gbif
1,1736956,"Cryptophlebia yasudai Kawabe, 1972",1736956,"Cryptophlebia yasudai Kawabe, 1972",12,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,5343.0,Cryptophlebia,1736951.0,Cryptophlebia yasudai,1736956.0,NE,"Kawabe, 1972",Cryptophlebia Cryptophlebia yasudai,"Kawabe, 1972",gbif
2,1741364,"Notocelia longispina Nasu, 1980",8345820,"Notocelia incarnatana (HÃ¼bner, 1796-1799)",4,SPECIES,SYNONYM,Animalia,1,Arthropoda,...,5343.0,Notocelia,1741338.0,Notocelia incarnatana,8345820.0,NE,"Nasu, 1980",Notocelia Notocelia incarnatana,"Nasu, 1980",gbif
3,1745969,"Homona menciana (Walker, 1863)",1745947,"Homona coffearia (Nietner, 1861)",2,SPECIES,SYNONYM,Animalia,1,Arthropoda,...,5343.0,Homona,1745927.0,Homona coffearia,1745947.0,NE,"(Walker, 1863)",Homona Homona coffearia,"(Walker, 1863)",gbif
4,1767869,"Neachrostia Hampson, 1907",1767869,"Neachrostia Hampson, 1907",88,GENUS,ACCEPTED,Animalia,1,Arthropoda,...,4532185.0,Neachrostia,1767869.0,,,,1907,Neachrostia,1907,gbif


#### From Wikipedia

In [66]:
japan_urls = ['https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Noctuoidea)', 
             'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Bombycoidea-Geometroidea)', 
             'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Pyraloidea-Drepanoidea)', 
             'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Choreutoidea-Thyridoidea)', 
             'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Zygaenoidea-Tortricoidea)', 
             'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Gelechioidea)', 
             'https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Micropterigoidea-Yponomeutoidea)'] #'https://en.wikipedia.org/wiki/List_of_moths_of_Japan'

jp_df2 = pd.DataFrame()

for url in japan_urls: 
    print(url)


    japan_df_wiki = scrape_wikipedia_to_csv(url)

    jp_df2 = pd.concat([jp_df2, japan_df_wiki], ignore_index=True)

jp_df2['Source'] = 'wiki'
jp_df2.head()

https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Noctuoidea)
https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Bombycoidea-Geometroidea)
https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Pyraloidea-Drepanoidea)
https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Choreutoidea-Thyridoidea)
https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Zygaenoidea-Tortricoidea)
https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Gelechioidea)
https://en.wikipedia.org/wiki/List_of_moths_of_Japan_(Micropterigoidea-Yponomeutoidea)


Unnamed: 0,Family,Genus,Species,Authority,Source
0,Notodontidae,,Dudusa sphingiformis,"Moore, 1872",wiki
1,Notodontidae,,Tarsolepis japonica japonica,"Wileman & South, 1917",wiki
2,Notodontidae,,Phycidopsis albovittata,"Hampson, 1893",wiki
3,Notodontidae,,Stauropus alternus,"Walker, 1855",wiki
4,Notodontidae,,Stauropus basalis basalis,"Moore, 1877",wiki


#### combine df1 and df2

In [67]:
jp_df2 = jp_df2[['Family', 'Genus', 'Species', 'Authority', 'Source']]
jp_df1 = jp_df1[['family', 'genus', 'species', 'Authority', 'Source']]
jp_df1.columns = jp_df2.columns

In [68]:
df = pd.concat([jp_df1, jp_df2], ignore_index=True)
print(df['Source'].value_counts())

# remove duplicated rows based on family, genus and species
df = df.drop_duplicates(subset=['Family', 'Genus', 'Species'], keep='first')

# remove rows with missing species names
# df = df.dropna(subset=['Species'])

print(df['Source'].value_counts())

Source
gbif    7158
wiki    5887
Name: count, dtype: int64
Source
wiki    5887
gbif    5291
Name: count, dtype: int64


In [69]:
df.head()

Unnamed: 0,Family,Genus,Species,Authority,Source
0,Tortricidae,Gypsonoma,Gypsonoma attrita,"Falkovitsh, 1965",gbif
1,Tortricidae,Cryptophlebia,Cryptophlebia yasudai,"Kawabe, 1972",gbif
2,Tortricidae,Notocelia,Notocelia incarnatana,"Nasu, 1980",gbif
3,Tortricidae,Homona,Homona coffearia,"(Walker, 1863)",gbif
4,Erebidae,Neachrostia,,1907,gbif


In [70]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = df['Authority']
df['Subfamily'] = ""
df["GBIF accepted name"] = df["Species"].fillna('')


df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]

In [71]:
checklist_name = "japan-moths"
df.to_csv(os.path.join("../species_checklists/",
                    checklist_name+"-preprocessed.csv"),
        index=False)

In [72]:
df.head()

Unnamed: 0,Family,Subfamily,Species,Genus,GBIF accepted name,species_name_provided,authority_name_provided
0,Tortricidae,,Gypsonoma attrita,Gypsonoma,Gypsonoma attrita,Gypsonoma Gypsonoma attrita,"Falkovitsh, 1965"
1,Tortricidae,,Cryptophlebia yasudai,Cryptophlebia,Cryptophlebia yasudai,Cryptophlebia Cryptophlebia yasudai,"Kawabe, 1972"
2,Tortricidae,,Notocelia incarnatana,Notocelia,Notocelia incarnatana,Notocelia Notocelia incarnatana,"Nasu, 1980"
3,Tortricidae,,Homona coffearia,Homona,Homona coffearia,Homona Homona coffearia,"(Walker, 1863)"
4,Erebidae,,,Neachrostia,,Neachrostia,1907


# Nigeria

#### GBIF download

From: https://www.gbif.org/occurrence/download?continent=AFRICA&country=NG&taxon_key=797&advanced=1

In [75]:
# Read the species checklist from GBIF
ng_df1 = pd.read_csv(os.path.join("../species_checklists", "nigeria-gbif-moths.tsv"),
                sep='\t', encoding='latin-1')

# create a column made up from the third word in column onwards
ng_df1['Authority'] = ng_df1['scientificName'].str.split().str[2:].str.join(' ')

ng_df1["species_name_provided"] = ng_df1["genus"].fillna('') + " " + ng_df1["species"].fillna('')
ng_df1["authority_name_provided"] = ng_df1['Authority']
ng_df1['Source'] = 'gbif'
ng_df1.head()

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory,Authority,species_name_provided,authority_name_provided,Source
0,1781456,"Egnasia microtype Hampson, 1926",1781456,"Egnasia microtype Hampson, 1926",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,4532185.0,Egnasia,9123839.0,Egnasia microtype,1781456.0,NE,"Hampson, 1926",Egnasia Egnasia microtype,"Hampson, 1926",gbif
1,1789033,"Ethiopica polyastra Hampson, 1909",1789033,"Ethiopica polyastra Hampson, 1909",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,7015.0,Ethiopica,1789026.0,Ethiopica polyastra,1789033.0,NE,"Hampson, 1909",Ethiopica Ethiopica polyastra,"Hampson, 1909",gbif
2,1883743,"Dichocrocis biplagialis Hampson, 1918",1883743,"Dichocrocis biplagialis Hampson, 1918",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,8841.0,Dichocrocis,1883698.0,Dichocrocis biplagialis,1883743.0,NE,"Hampson, 1918",Dichocrocis Dichocrocis biplagialis,"Hampson, 1918",gbif
3,1928122,"Liptena tricolora (Bethune-Baker, 1915)",1928122,"Liptena tricolora (Bethune-Baker, 1915)",1,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,5473.0,Liptena,1928101.0,Liptena tricolora,1928122.0,NE,"(Bethune-Baker, 1915)",Liptena Liptena tricolora,"(Bethune-Baker, 1915)",gbif
4,1937855,"Papilio zalmoxis Hewitson, 1864",1937855,"Papilio zalmoxis Hewitson, 1864",24,SPECIES,ACCEPTED,Animalia,1,Arthropoda,...,9417.0,Papilio,10175914.0,Papilio zalmoxis,1937855.0,NE,"Hewitson, 1864",Papilio Papilio zalmoxis,"Hewitson, 1864",gbif


#### From Wikipedia

In [77]:
nigeria_url = ['https://en.wikipedia.org/wiki/List_of_moths_of_Nigeria']

ng_df2 = scrape_wikipedia_to_csv(url)

ng_df2['Source'] = 'wiki'
ng_df2.head()

Unnamed: 0,Family,Genus,Species,Authority,Source
0,Micropterigidae,,Micropterix aureatella,"Scopoli, 1763",wiki
1,Micropterigidae,,Paramartyria immaculatella,"Issiki, 1931",wiki
2,Micropterigidae,,Paramartyria semifasciella,"Issiki, 1931",wiki
3,Micropterigidae,,Issikiomartyria akemiae,"Hashimoto, 2006",wiki
4,Micropterigidae,,Issikiomartyria bisegmentata,"Hashimoto, 2006",wiki


#### combine df1 and df2

In [78]:
ng_df2 = ng_df2[['Family', 'Genus', 'Species', 'Authority', 'Source']]
ng_df1 = ng_df1[['family', 'genus', 'species', 'Authority', 'Source']]
ng_df1.columns = ng_df2.columns

In [79]:
df = pd.concat([ng_df1, ng_df2], ignore_index=True)
print(df['Source'].value_counts())

# remove duplicated rows based on family, genus and species
df = df.drop_duplicates(subset=['Family', 'Genus', 'Species'], keep='first')

# remove rows with missing species names
# df = df.dropna(subset=['Species'])

print(df['Source'].value_counts())

Source
gbif    3367
wiki     676
Name: count, dtype: int64
Source
gbif    2652
wiki     676
Name: count, dtype: int64


In [80]:
df.head()

Unnamed: 0,Family,Genus,Species,Authority,Source
0,Erebidae,Egnasia,Egnasia microtype,"Hampson, 1926",gbif
1,Noctuidae,Ethiopica,Ethiopica polyastra,"Hampson, 1909",gbif
2,Crambidae,Dichocrocis,Dichocrocis biplagialis,"Hampson, 1918",gbif
3,Lycaenidae,Liptena,Liptena tricolora,"(Bethune-Baker, 1915)",gbif
4,Papilionidae,Papilio,Papilio zalmoxis,"Hewitson, 1864",gbif


In [81]:
# Combine costa rica column names
df["species_name_provided"] = df["Genus"].fillna('') + " " + df["Species"].fillna('')
df["authority_name_provided"] = df['Authority']
df['Subfamily'] = ""
df["GBIF accepted name"] = df["Species"].fillna('')


df = df[['Family', 'Subfamily', 'Species', 'Genus', 'GBIF accepted name', 'species_name_provided', 'authority_name_provided']]

In [82]:
checklist_name = "nigeria-moths"
df.to_csv(os.path.join("../species_checklists/",
                    checklist_name+"-preprocessed.csv"),
        index=False)

In [83]:
df.head()

Unnamed: 0,Family,Subfamily,Species,Genus,GBIF accepted name,species_name_provided,authority_name_provided
0,Erebidae,,Egnasia microtype,Egnasia,Egnasia microtype,Egnasia Egnasia microtype,"Hampson, 1926"
1,Noctuidae,,Ethiopica polyastra,Ethiopica,Ethiopica polyastra,Ethiopica Ethiopica polyastra,"Hampson, 1909"
2,Crambidae,,Dichocrocis biplagialis,Dichocrocis,Dichocrocis biplagialis,Dichocrocis Dichocrocis biplagialis,"Hampson, 1918"
3,Lycaenidae,,Liptena tricolora,Liptena,Liptena tricolora,Liptena Liptena tricolora,"(Bethune-Baker, 1915)"
4,Papilionidae,,Papilio zalmoxis,Papilio,Papilio zalmoxis,Papilio Papilio zalmoxis,"Hewitson, 1864"
