<a href="https://colab.research.google.com/github/FFI-Vietnam/camtrap-tools/blob/main/Wildlife%20Insights/bulk-upload/04_collect-WI-taxon-database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This script is used for cleaning the metadata spreadsheet generated by ExifTool
After running this script, the 'data cleaning' folder is created

data cleaning
    |__ 4.1.  csv
    
"""

In [1]:
import pandas as pd
import numpy as np
import os
import json
import requests

# Mount Google Drive at /content/drive so that the helper functions below can
# read input csv files from it and save results back to it.
from google.colab import drive
drive.mount('/content/drive')

# Locations used by this notebook.
# `root` is the Drive mount point; the folder names below are relative to it.
root = '/content/drive/'

# both working folders live under the same project folder on Drive
_project_folder = 'My Drive/FFI/Wildlife Insights Bulk Upload Test/bulk-upload_template-autofill'
dataset_folder = _project_folder + '/dataset'
contain_folder = _project_folder + '/data cleaning'

# name of the csv file produced by this notebook
WIdatabase_file_name = "4.1_collect_WI-taxon-database.csv"

# useful functions
def read_csv_Google_drive(root, contain_folder, file_name):
  """
  Load a csv file stored on Google Drive into a pandas DataFrame.

  The file location is built by joining the three arguments, e.g.
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'

  Returns the DataFrame produced by pd.read_csv with default options.
  """
  path_parts = (root, contain_folder, file_name)
  csv_location = os.path.join(*path_parts)
  table = pd.read_csv(csv_location)
  return table

def save_csv_Google_drive(df, root, contain_folder, file_name):
  """
  Save a DataFrame as a csv file (without the index column) to Google Drive.

  param examples:
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'

  The target folder is created if it does not exist yet. A copy named
  'dataframe.csv' is also written to the current working directory
  (Colab runtime storage), as before.

  Raises OSError if the target folder cannot be created or the file cannot
  be written.
  """
  folder_path = os.path.join(root, contain_folder)
  file_path = os.path.join(folder_path, file_name)

  # exist_ok=True tolerates an already existing folder but still surfaces real
  # problems (permissions, a file in the way) instead of silently ignoring them
  os.makedirs(folder_path, exist_ok=True)

  # save file to Colab runtime storage (will be deleted when this notebook is closed)
  df.to_csv('dataframe.csv', index=False)

  # save file to Google Drive for permanent storage; written directly by pandas
  # rather than by re-reading the runtime copy and copying its text
  df.to_csv(file_path, index=False)

  print(f'File is saved to {file_name} in Google Drive at {file_path}')

Mounted at /content/drive


In [2]:
# 4.1) download the Wildlife Insights (WI) taxonomy database into taxonomy_df

# download WI taxon id database
URL = "https://api.wildlifeinsights.org/api/v1/taxonomy?fields=class,order,family,genus,species,authority,taxonomyType,uniqueIdentifier,commonNameEnglish&page[size]=30000"
# (connect, read) timeouts so that a stalled connection cannot hang the notebook forever
response = requests.get(URL, timeout=(10, 300))
# stop here with a clear HTTP error instead of failing later on a missing "data" key
response.raise_for_status()

# temporarily save the database to a json file
with open('taxonomy.json', 'w') as outfile:
    json.dump(response.json()["data"], outfile)

# load the records back as a DataFrame (one row per taxonomy record)
taxonomy_df = pd.read_json(r'taxonomy.json')

In [3]:
# find species who is not in WI taxon db

df = read_csv_Google_drive(root, contain_folder, "1.4_clean_metadata_remove-no-animal-image.csv")

# list of species; rows without a species name are skipped, as in the matching
# cell further down (a missing value would also break the string concatenation
# in the template cell below)
species_list = df['species_common_name'].dropna().unique()

# build the lookup of WI common names once instead of once per species
known_common_names = set(taxonomy_df['commonNameEnglish'].dropna())

# species names with no identical WI common name, in order of first appearance
unfound = [s for s in species_list if s not in known_common_names]

print("There are",len(unfound),"unfound species")
unfound

There are 54 unfound species


['Yellow-breasted Magpie',
 'Emerald Dove',
 'Racket-tailed Treepie',
 'Ochraceous Bulbul',
 'Murid',
 'Ferret-badger',
 'Hill Blue Flycatcher',
 'Moth',
 'White-throated Fantail',
 'Asian Red-cheeked Squirrel',
 'Leopoldamys',
 'Asian Palm Civet',
 'Indochinese Wren-babbler',
 'Squirrel',
 'Banded Kingfisher',
 'Rhesus Macaque',
 'Wild Pig',
 'Crested Serpent Eagle',
 'Annamite Dark Muntjac',
 'Flying Insect',
 'Snake',
 'Black-throated Laughingthrush',
 'Asian Black Bear',
 'White-Crowned Forktail',
 'Macaque',
 'Animal',
 'Scaly Thrush',
 'Impressed Tortoise',
 'Maxomys',
 'Banded Krait',
 'Small Asian Mongoose',
 'Oriental Magpie Robin',
 'Muntjac',
 'Asian Water Monitor',
 'Japanese Robin',
 'Particolored Flying Squirrel',
 'Shrew',
 'Butterfly',
 'Brownish-flanked Bush Warbler',
 'White-tailed Robin',
 'Flying squirrel',
 'Streaked Wren Babbler',
 'Lesser Mouse-deer',
 'Domestic Buffalo',
 'Green magpie',
 'Pale-footed Bush Warbler',
 'Greater Racket-tailed Drongo',
 'Eastern Str

In [24]:
# (optional) print one empty match_unfound_species assignment per unfound
# species, to be copied into the matching cell below and filled in by hand
for name in unfound:
  key_expr = "".join(('match_unfound_species["', name, '"]'))
  # pad the left-hand side to 60 characters so the '=' signs line up
  print(key_expr.ljust(60) + '= {"commonNameEnglish":""}')

match_unfound_species["Yellow-breasted Magpie"]             = {"commonNameEnglish":""}
match_unfound_species["Emerald Dove"]                       = {"commonNameEnglish":""}
match_unfound_species["Racket-tailed Treepie"]              = {"commonNameEnglish":""}
match_unfound_species["Ochraceous Bulbul"]                  = {"commonNameEnglish":""}
match_unfound_species["Murid"]                              = {"commonNameEnglish":""}
match_unfound_species["Ferret-badger"]                      = {"commonNameEnglish":""}
match_unfound_species["Hill Blue Flycatcher"]               = {"commonNameEnglish":""}
match_unfound_species["Moth"]                               = {"commonNameEnglish":""}
match_unfound_species["White-throated Fantail"]             = {"commonNameEnglish":""}
match_unfound_species["Asian Red-cheeked Squirrel"]         = {"commonNameEnglish":""}
match_unfound_species["Leopoldamys"]                        = {"commonNameEnglish":""}
match_unfound_species["Asian Palm Civet"]  

Use these queries to look up a species' common name in the WI taxonomy database

In [None]:
# look up one species by genus and species name
taxonomy_df.loc[taxonomy_df['genus'].eq('Homo') & taxonomy_df['species'].eq('sapiens')]

In [None]:
# look up rows by species name only
taxonomy_df.loc[taxonomy_df['species'].eq('owstoni')]

In [None]:
# list every row of one genus
taxonomy_df.loc[taxonomy_df['genus'].eq('Homo')]

In [None]:
# list every row of one family
taxonomy_df.loc[taxonomy_df['family'].eq('Viverridae')]

In [None]:
# list every row of one order
taxonomy_df.loc[taxonomy_df['order'].eq('Carnivora')]

In [None]:
# list every row of one class
taxonomy_df.loc[taxonomy_df['class'].eq('Mammalia')]

In [9]:
# match unfound species name with corresponding name in WI taxon id
#
# Keys are species names as they appear in the metadata spreadsheet. Each value
# says how the matching cell below should find the WI taxonomy row: either by
# WI common name ("commonNameEnglish") or by scientific name ("genus" + "species").
#
# Not matched yet (notes only, these are not active entries):
#   "Hill Blue Flycatcher"
#   "Indochinese Wren-babbler"       {"Taxonomy":"Gypsophila brevicaudata"}
#   "Black-throated Laughingthrush"  {"Taxonomy":"Pterorhinus chinensis"}
#   "Banded Krait"                   {"Taxonomy":"Bungarus fasciatus"}
#   "Impressed Tortoise"             {"Taxonomy":"Manouria impressa"}
#   "Small Asian Mongoose"           {"genus":"Urva", "species":"javanica"}
#   "Natalia's Agama"                {"commonNameEnglish":""}
#   "Flying Insect"                  {"commonNameEnglish":""}
#   "Animal"                         {"commonNameEnglish":""}
match_unfound_species = {
  # ---- matched by WI common name ----
  "Streaked Wren Babbler":         {"commonNameEnglish": "Streaked Wren-babbler"},
  "Asian Water Monitor":           {"commonNameEnglish": "Common Water Monitor"},
  "Asian Red-cheeked Squirrel":    {"commonNameEnglish": "Red-cheeked Squirrel"},
  "Muntjac":                       {"commonNameEnglish": "Muntiacus Species"},
  "Asian Black Bear":              {"commonNameEnglish": "Asiatic Black Bear"},
  "Ferret-badger":                 {"commonNameEnglish": "Melogale Species"},
  "Rhesus Macaque":                {"commonNameEnglish": "Rhesus Monkey"},
  "Murid":                         {"commonNameEnglish": "Muridae Family"},
  "Maxomys":                       {"commonNameEnglish": "Muridae Family"},
  "Leopoldamys":                   {"commonNameEnglish": "Muridae Family"},
  "Wild Pig":                      {"commonNameEnglish": "Wild Boar"},
  "Crested Serpent Eagle":         {"commonNameEnglish": "Crested Serpent-Eagle"},
  "Yellow-breasted Magpie":        {"commonNameEnglish": "Indochinese Green Magpie"},
  "Macaque":                       {"commonNameEnglish": "Macaca Species"},
  "Snake":                         {"commonNameEnglish": "Lizards and Snakes"},
  "Annamite Dark Muntjac":         {"commonNameEnglish": "Annamite Muntjac"},
  "White-Crowned Forktail":        {"commonNameEnglish": "White-crowned Forktail"},
  # NOTE(review): a flying squirrel mapped to "African Linsang" looks like a
  # mistake -- TODO confirm the intended WI entry for this species
  "Particolored Flying Squirrel":  {"commonNameEnglish": "African Linsang"},
  "Oriental Magpie Robin":         {"commonNameEnglish": "Oriental Magpie-Robin"},
  "Squirrel":                      {"commonNameEnglish": "Sciuridae Family"},
  "Butterfly":                     {"commonNameEnglish": "Butterflies and Moths"},
  "Moth":                          {"commonNameEnglish": "Butterflies and Moths"},
  "Asian Palm Civet":              {"commonNameEnglish": "Common Palm Civet"},
  "Shrew":                         {"commonNameEnglish": "Soricidae Family"},
  "Racket-tailed Treepie":         {"commonNameEnglish": "Racquet-tailed Treepie"},
  "Emerald Dove":                  {"commonNameEnglish": "Common Emerald Dove"},
  "Domestic Buffalo":              {"commonNameEnglish": "Bubalus Species"},
  "Malayan Night Heron":           {"commonNameEnglish": "Malay Night-heron"},
  "Lesser Mouse-deer":             {"commonNameEnglish": "Lesser Oriental Chevrotain"},
  "Eastern Striped Squirrel":      {"commonNameEnglish": "Maritime Striped Squirrel"},
  "Green magpie":                  {"commonNameEnglish": "Common Green Magpie"},
  "Flying squirrel":               {"commonNameEnglish": "Sciuridae Family"},
  "White-tailed Robin":            {"commonNameEnglish": "White-tailed Blue Robin"},
  "Tickell's Blue Flycatcher":     {"commonNameEnglish": "Tickell's Blue-Flycatcher"},
  "Pale-footed Bush Warbler":      {"commonNameEnglish": "Pale-footed Bush-warbler"},
  # ---- matched by scientific name ----
  "Banded Kingfisher":             {"genus": "Lacedo",     "species": "pulchella"},
  "Japanese Robin":                {"genus": "Larvivora",  "species": "akahige"},
  "Ochraceous Bulbul":             {"genus": "Alophoixus", "species": "ochraceus"},
  "Scaly Thrush":                  {"genus": "Zoothera",   "species": "dauma"},
  "White-throated Fantail":        {"genus": "Rhipidura",  "species": "albicollis"},
  "Brownish-flanked Bush Warbler": {"genus": "Horornis",   "species": "fortipes"},
  "Greater Racquet-tailed Drongo": {"genus": "Dicrurus",   "species": "paradiseus"},
  "Greater Racket-tailed Drongo":  {"genus": "Dicrurus",   "species": "paradiseus"},
  "Snowy-browed Flycatcher":       {"genus": "Ficedula",   "species": "hyperythra"},
  "Puff-throated Bulbul":          {"genus": "Alophoixus", "species": "pallidus"},
}

In [10]:
# rename some species: add a manual mapping to a different WI common name
match_unfound_species.update({"Chinese Serow": {"commonNameEnglish": "Sumatran Serow"}})

In [11]:
# match species name with corresponding name in WI taxon id
#
# For every species name in the metadata, look up the WI taxonomy row(s):
#   - through the manual mapping in match_unfound_species when one exists
#     (by WI common name, or by genus + species), otherwise
#   - by an identical WI common name.
# Every matched row is tagged with the original name in 'FFI_species_name'.
# Species without a match are reported and left out of the table.

species_list = df['species_common_name'].dropna().unique()

matched_rows = []  # one small DataFrame per matched species name

for s in species_list:
  if s in match_unfound_species:
    target = match_unfound_species[s]
    if 'commonNameEnglish' in target:
      # manual mapping given as a WI common name
      hits = taxonomy_df[taxonomy_df['commonNameEnglish'] == target['commonNameEnglish']]
    else:
      # manual mapping given as a scientific name
      hits = taxonomy_df[(taxonomy_df['genus'] == target['genus']) &
                         (taxonomy_df['species'] == target['species'])]
    if hits.empty:
      print('Cannot find', s)
      continue
  else:
    hits = taxonomy_df[taxonomy_df['commonNameEnglish'] == s]
    if hits.empty:
      print(f"{s} not found")
      continue

  if len(hits) > 1:
    # the same name occurs several times in the WI database; keep all rows but
    # make the ambiguity visible so it can be resolved by hand
    print(f"Warning: {s} matches {len(hits)} WI taxonomy rows")

  # attach the name to the rows here, so names and rows can never get out of
  # step when a lookup fails or returns more than one row
  matched_rows.append(hits.assign(FFI_species_name=s))

# build the table in one go (DataFrame.append no longer exists in pandas 2.x)
if matched_rows:
  match_table = pd.concat(matched_rows)
else:
  match_table = taxonomy_df.iloc[0:0].assign(FFI_species_name=[])

# kept as a plain list as well, in the same order as the rows of match_table
FFI_species_name = match_table['FFI_species_name'].to_list()

# save match_table to Google Drive
save_csv_Google_drive(match_table, root, contain_folder, WIdatabase_file_name)
# show up to 7 random rows as a quick visual check
match_table.sample(min(7, len(match_table)))

Hill Blue Flycatcher not found
Indochinese Wren-babbler not found
Flying Insect not found
Black-throated Laughingthrush not found
Animal not found
Impressed Tortoise not found
Banded Krait not found
Small Asian Mongoose not found
Natalia's Agama not found
File is saved to 4.1_collect_WI-taxon-database.csv in Google Drive at /content/drive/My Drive/FFI/Wildlife Insights Bulk Upload Test/bulk-upload_template-autofill/data cleaning/4.1_collect_WI-taxon-database.csv


Unnamed: 0,id,class,order,family,genus,species,authority,commonNameEnglish,taxonomyType,uniqueIdentifier,FFI_species_name
9577,2012886,Aves,Passeriformes,Corvidae,Crypsirina,temia,"Daudin, 1800",Racquet-tailed Treepie,biological,029c8784-8b87-4adf-aab4-e1d93cd29b10,Racket-tailed Treepie
14842,2007513,Aves,Gruiformes,Rallidae,Rallina,eurizonoides,"Lafresnaye, 1845",Slaty-legged Crake,biological,b602ff8a-1870-4633-90f3-51846594950a,Slaty-legged Crake
11448,2015400,Aves,Passeriformes,Muscicapidae,Ficedula,hyperythra,"Blyth, 1843",,biological,fd99749f-e5fd-4a8b-8787-823cfedb8c48,Snowy-browed Flycatcher
20537,2020989,Mammalia,Primates,Cercopithecidae,Macaca,,,Macaca Species,biological,bc0178de-d331-4a6e-84d7-f6e8d55b966d,Macaque
807,2021551,Reptilia,Squamata,,,,,Lizards and Snakes,biological,0af344ad-6657-42c4-85d8-03fd2106a22a,Snake
21546,2021544,Mammalia,Artiodactyla,Suidae,Sus,scrofa scrofa,,Domestic Pig,biological,c150a21e-952d-4665-8a62-a319841c5a56,Domestic Pig
3470,2004596,Mammalia,Rodentia,Sciuridae,Ratufa,bicolor,"Sparrman, 1778",Black Giant Squirrel,biological,185c2482-9e60-47a2-ac40-81b5d78ffbbe,Black Giant Squirrel
