<a href="https://colab.research.google.com/github/FFI-Vietnam/camtrap-tools/blob/main/Wildlife%20Insights/bulk-upload/03_explore-exiftool-dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This script explores the exiftool metadata spreadsheet.
After running this script, the following new file is added to the 'data cleaning' folder:

data cleaning
    |__ 3.1_explore_summary.csv
    
"""

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Mount Google Drive at '/content/drive' so that files can be read from and
# saved to it. The `root` path configured in a later cell points at this
# same mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Colab directories and file names used throughout this notebook.

# Mount point of Google Drive inside the Colab runtime.
root = '/content/drive/'

# Project folder (relative to `root`) shared by the input and output folders.
project_folder = 'My Drive/FFI/Wildlife Insights Bulk Upload Test/bulk-upload_template-autofill'

# Folder holding the dataset, and folder holding the cleaned metadata / summaries.
dataset_folder = project_folder + '/dataset'
contain_folder = project_folder + '/data cleaning'

# Name of the summary file written by this notebook into `contain_folder`.
explore_metadata_summary_file_name = '3.1_explore_summary.csv'


In [4]:
# useful functions
def read_csv_Google_drive(root, contain_folder, file_name):
  """
  function to read a csv file from Google Drive into a DataFrame
  param examples:
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'
  returns:
    the pandas DataFrame parsed from root/contain_folder/file_name
  """
  return pd.read_csv(os.path.join(root, contain_folder, file_name))

def save_csv_Google_drive(df, root, contain_folder, file_name):
  """
  function to save a csv file to Google Drive
  param examples:
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'

  side effects:
    - writes a scratch copy named 'dataframe.csv' in the current working directory
    - creates root/contain_folder if it does not exist yet
    - writes the csv (without the index) to root/contain_folder/file_name
    - prints a confirmation message
  raises:
    OSError if the destination folder cannot be created or the file cannot be written
  """
  # save file to Colab runtime storage (will be deleted when this notebook is closed)
  df.to_csv('dataframe.csv', index=False)

  # save file back to Google Drive for permanent storage
  folder_path = os.path.join(root, contain_folder)
  file_path = os.path.join(folder_path, file_name)

  # exist_ok=True tolerates an already-existing folder but still surfaces real
  # failures (e.g. permission errors) instead of silently swallowing them
  os.makedirs(folder_path, exist_ok=True)

  # write directly from the DataFrame so the content is not re-read and
  # re-encoded through the platform's default text encoding
  df.to_csv(file_path, index=False)

  print(f'File is saved to {file_name} in Google Drive at {file_path}')

In [10]:
# 01) summarize # of species, # of images for each species, and # of images uploaded
# to Wildlife Insights

metadata = read_csv_Google_drive(root, contain_folder, "1.4_clean_metadata_remove-no-animal-image.csv")

# total # of images per species (sorted from most to least frequent)
species_count = metadata['species_common_name'].value_counts()

# # of images per species among the rows marked as uploaded ('x')
uploaded_species_counts = metadata.loc[
    metadata['uploaded_to_WI'] == 'x', 'species_common_name'
].value_counts()

# one row per species with columns ['species_common_name', 'total'];
# naming the index and the values explicitly keeps the column names stable
# regardless of how the installed pandas version names value_counts() output
df = species_count.rename_axis('species_common_name').reset_index(name='total')

# look up the uploaded count by species NAME (not by row position);
# species with no uploaded image are absent from uploaded_species_counts -> 0
df['uploaded_to_WI'] = (
    df['species_common_name']
    .map(uploaded_species_counts)
    .fillna(0)
    .astype('int32')
)

# percentage of uploaded images
df['%'] = (df['uploaded_to_WI'] * 100 / df['total']).round(1)

# save file to Google Drive
save_csv_Google_drive(df, root, contain_folder, explore_metadata_summary_file_name)
df

File is saved to 3.1_explore_summary.csv in Google Drive at /content/drive/My Drive/FFI/Wildlife Insights Bulk Upload Test/bulk-upload_template-autofill/data cleaning/3.1_explore_summary.csv


Unnamed: 0,species_common_name,total,uploaded_to_WI,%
0,Murid,8411,0,0.0
1,Human,3266,0,0.0
2,Ferret-badger,2161,0,0.0
3,Orange-headed Thrush,1694,0,0.0
4,Bird,985,0,0.0
...,...,...,...,...
136,Grey-shanked Douc Langur,1,0,0.0
137,Japanese Robin,1,0,0.0
138,Pale-footed Bush Warbler,1,0,0.0
139,Indochinese Blue-flycatcher,1,0,0.0
