<a href="https://colab.research.google.com/github/FFI-Vietnam/camtrap-tools/blob/main/MegaDetector/confusion_matrix/01_ground-truth-from-metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This script walks through the process of extracting ground-truth data from exiftool metadata file 
to compare with MegaDetector predictions later.
It also groups bursts of up to 3 consecutive images (taken within a few seconds of each other) into a batch.

After running this script, a 'data cleaning' folder is created:

data cleaning
    |__ 01_ground-truth-table_Kon-Plong.csv
    
"""

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# mount Google Drive at /content/drive so that the cells below can read from
# and write to it through ordinary file paths
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Colab directories and file names used by this notebook
# (folders are joined onto `root` by the read/save helper functions)

# mount point of Google Drive inside the Colab runtime
root = '/content/drive/'

# ---- input ----
# the ground-truth table is generated by the
# Wildlife Insights/bulk-upload/01_clean-exiftool-dataset.ipynb script,
# which saves it as 1.3_clean_metadata_single-species-row.csv
WI_folder = 'My Drive/FFI/Wildlife Insights Bulk Upload Test/bulk-upload_template-autofill/data cleaning'
WI_ground_truth_file_name = '1.3_clean_metadata_single-species-row.csv'

# ---- output ----
# folder and file name of the ground-truth table written by this notebook
contain_folder = 'My Drive/FFI/MegaDetector Test/confusion-matrix/data cleaning'
detection_ground_truth_file_name = '01_ground-truth-table_Kon-Plong.csv'

# dataset folder of the confusion-matrix test
dataset_folder = 'My Drive/FFI/MegaDetector Test/confusion-matrix/dataset'

In [4]:
# read and save file functions
def read_csv_Google_drive(root, contain_folder, file_name):
  """
  function to read a csv file from Google Drive into a pandas DataFrame
  param examples:
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'
  returns:
    the DataFrame produced by pd.read_csv with its default options
  """
  csv_location = os.path.join(root, contain_folder, file_name)
  table = pd.read_csv(csv_location)
  return table

def save_csv_Google_drive(df, root, contain_folder, file_name):
  """
  function to save a DataFrame as a csv file to Google Drive

  The destination folder is created when it does not exist yet and the csv
  (without the index column) is written straight to its final location.
  No intermediate 'dataframe.csv' copy is left in the working directory.

  param examples:
    df = any pandas DataFrame
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'

  raises:
    OSError if the folder cannot be created or the file cannot be written
  """
  folder_path = os.path.join(root, contain_folder)
  file_path = os.path.join(folder_path, file_name)

  # exist_ok only tolerates a folder that is already there; any other failure
  # (e.g. missing permission) is raised instead of being silently ignored
  if folder_path:
    os.makedirs(folder_path, exist_ok=True)

  # let pandas write the destination file itself, so the content is not
  # re-read and re-written with the platform's default text encoding
  df.to_csv(file_path, index=False)

  print(f'File is saved to {file_name} in Google Drive at {file_path}')

In [5]:
# load the ground-truth table stored in WI_folder
ground_truth_table = read_csv_Google_drive(root, WI_folder, WI_ground_truth_file_name)

# keep only the file name, species label, capture time and flash columns
wanted_columns = ['FileName', 'species_common_name', 'datetime_created', 'Flash']
ground_truth_table = ground_truth_table[wanted_columns]

# fraction of images that carry a species label
labelled_rows = ground_truth_table[ground_truth_table['species_common_name'].notnull()]
animal_image_ratio = len(labelled_rows)/len(ground_truth_table)
print(f"#animal-images / #images = {animal_image_ratio}")
ground_truth_table.head(5)

#animal-images / #images = 0.24887111691420488


Unnamed: 0,FileName,species_common_name,datetime_created,Flash
0,197__68974__2019-04-27__10-42-20(1).JPG,Human,2019-04-27 10:42:20,No Flash
1,197__68974__2019-04-27__10-42-21(2).JPG,Human,2019-04-27 10:42:21,No Flash
2,197__68974__2019-04-27__10-42-22(3).JPG,Human,2019-04-27 10:42:22,No Flash
3,197__68974__2019-04-27__10-42-44(4).JPG,Human,2019-04-27 10:42:44,No Flash
4,197__68974__2019-04-27__10-42-45(5).JPG,Human,2019-04-27 10:42:45,No Flash


In [6]:
# the above dataset shows that the images of a batch are usually taken 1-2s apart
# for example 41-43-45 or 21-22-24
# so, to determine a batch, every image taken less than
# BATCH_INTERVAL * NUM_CONSECTIVE_IMAGES (= 6) seconds after the first image
# of a batch is put into that batch

def _assign_batch_groups(timestamps, window_seconds):
  """
  Give every timestamp the number of the batch it belongs to.

  A timestamp stays in the current batch when it lies at most
  `window_seconds` (exclusive) after the first timestamp of that batch.
  Anything else (later, earlier, or a missing timestamp) starts a new batch.

  params:
    timestamps: iterable of pandas Timestamps, in table order
    window_seconds: length of a batch in seconds
  returns:
    list of int batch numbers (starting at 0), one per timestamp
  """
  batch_ids = []
  first_of_batch = None
  current_batch = -1
  for timestamp in timestamps:
    in_batch = False
    if first_of_batch is not None:
      # total_seconds() is the whole difference; .seconds would drop the days
      # part, so images exactly N days apart would look like the same batch
      elapsed = (timestamp - first_of_batch).total_seconds()
      in_batch = 0 <= elapsed < window_seconds
    if not in_batch:
      first_of_batch = timestamp
      current_batch += 1
    batch_ids.append(current_batch)
  return batch_ids

# re-read the table so that this cell starts from the file on Google Drive
ground_truth_table = read_csv_Google_drive(root, WI_folder, WI_ground_truth_file_name)
# .copy() so that the columns added below are written to an independent table
ground_truth_table = ground_truth_table[['FileName', 'species_common_name', 'datetime_created', 'Flash']].copy()

# define CONSTANT
BATCH_INTERVAL = 2 # 2 seconds
NUM_CONSECTIVE_IMAGES = 3 # captures max 3 images in a row

# convert datetime_created column to pandas datetime type
ground_truth_table['datetime_created'] = pd.to_datetime(ground_truth_table['datetime_created'])

# NOTE(review): only the capture time is compared, in table order - images of
# different cameras taken within the same window end up in one batch; confirm
# that the table is ordered per camera and by time
batch_group = _assign_batch_groups(
    ground_truth_table['datetime_created'].tolist(),
    BATCH_INTERVAL*NUM_CONSECTIVE_IMAGES
)
ground_truth_table['batch_group'] = batch_group

# use as index column in replacement for FileName (has duplication due to explode)
ground_truth_table['id'] = list(range(len(ground_truth_table)))

# the id column is deliberately left as the last column
# (original author's note: bringing it to the front was dropped
#  "bc pandas will exclude it when reading csv")

# save to file
save_csv_Google_drive(ground_truth_table, root, contain_folder, detection_ground_truth_file_name)

ground_truth_table

File is saved to 01_ground-truth-table_Kon-Plong.csv in Google Drive at /content/drive/My Drive/FFI/MegaDetector Test/confusion-matrix/data cleaning/01_ground-truth-table_Kon-Plong.csv


Unnamed: 0,FileName,species_common_name,datetime_created,Flash,batch_group,id
0,197__68974__2019-04-27__10-42-20(1).JPG,Human,2019-04-27 10:42:20,No Flash,0,0
1,197__68974__2019-04-27__10-42-21(2).JPG,Human,2019-04-27 10:42:21,No Flash,0,1
2,197__68974__2019-04-27__10-42-22(3).JPG,Human,2019-04-27 10:42:22,No Flash,0,2
3,197__68974__2019-04-27__10-42-44(4).JPG,Human,2019-04-27 10:42:44,No Flash,1,3
4,197__68974__2019-04-27__10-42-45(5).JPG,Human,2019-04-27 10:42:45,No Flash,1,4
...,...,...,...,...,...,...
110281,196__68890__2019-08-05__17-58-54(2).JPG,,2019-08-05 17:58:54,Fired,85333,110281
110282,196__68890__2019-08-05__17-59-22(1).JPG,,2019-08-05 17:59:22,Fired,85334,110282
110283,196__68890__2019-08-06__06-31-24(1).JPG,Rufous-throated Partridge,2019-08-06 06:31:24,Fired,85335,110283
110284,196__68890__2019-08-06__06-31-54(2).JPG,,2019-08-06 06:31:54,Fired,85336,110284
