<a href="https://colab.research.google.com/github/FFI-Vietnam/camtrap-tools/blob/main/MegaDetector/confusion_matrix/01_ground-truth-from-metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This script creates a groundtruth table which specifies what species inside an 
image and if three consecutive images are in the same batch.

After runnning this script, a 'data cleaning' folder is created

data cleaning
    |__ 01_ground-truth-table_Kon-Plong.csv
    
"""

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# specifies Colab directories and file names
root = '/content/drive/'

dataset_folder = 'My Drive/FFI/MegaDetector Test/confusion-matrix/dataset'
WI_folder = 'My Drive/FFI/Wildlife Insights Bulk Upload Test/bulk-upload_template-autofill/data cleaning'
contain_folder = 'My Drive/FFI/MegaDetector Test/confusion-matrix/ground-truths'

ground_truth_dataset_file_name = 'image_metadata(2020-06-26)_full.csv'
# ground-truth table is generated using Wildlife Insights/bulk-upload/01_clean-exiftool-dataset.ipynb script
# it is saved as 1.3_clean_metadata_single-species-row.csv
WI_ground_truth_file_name = '1.3_clean_metadata_single-species-row.csv'

detection_ground_truth_file_name = "01_ground-truth-table_Kon-Plong.csv"

In [4]:
# read and save file functions
def read_csv_Google_drive(root, contain_folder, file_name):
  file_path = os.path.join(root, contain_folder, file_name)
  return pd.read_csv(file_path)

def save_csv_Google_drive(df, root, contain_folder, file_name):
  """
  function to save a csv file to Google Drive
  param examples:
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'
  """
  # save file to Colab runtime storage (will be deleted when this notebook is closed)
  df.to_csv('dataframe.csv', index=False)

  # save file back to Google Drive for permanent storage
  folder_path = os.path.join(root, contain_folder)
  file_path = os.path.join(root, contain_folder, file_name)
  try:
    os.makedirs(folder_path)
  except:
    pass

  with open('dataframe.csv', 'r') as f:
    df_file = f.read()

  with open(file_path, 'w') as f:
    f.write(df_file)

  print(f'File is saved to {file_name} in Google Drive at {file_path}')

In [8]:
# read ground-truth table
ground_truth_table = read_csv_Google_drive(root, WI_folder, WI_ground_truth_file_name)
# keep only filename and species_common_name column
ground_truth_table = ground_truth_table[['FileName', 'species_common_name', 'datetime_created']]
print(f"#animal-images / #images = {len(ground_truth_table[ground_truth_table['species_common_name'].notnull()])/len(ground_truth_table)}")
ground_truth_table.head(5)

#animal-images / #images = 0.24818252687684694


Unnamed: 0,FileName,species_common_name,datetime_created
0,100__67096__2019-04-22__15-09-11(1).JPG,Human,2019-04-22 15:09:11
1,100__67096__2019-04-22__15-09-42(2).JPG,Human,2019-04-22 15:09:42
2,100__67096__2019-04-22__15-12-32(1).JPG,Human,2019-04-22 15:12:32
3,100__67096__2019-04-22__15-12-55(2).JPG,Human,2019-04-22 15:12:55
4,100__67096__2019-04-22__15-13-18(1).JPG,Human,2019-04-22 15:13:18


In [10]:
# the above dataset shows that a batch of three is usually 2s-interval
# for example 41-43-45 or 21-22-24
# so, to determine a batch, they should be grouped into 5s interval

ground_truth_table = read_csv_Google_drive(root, WI_folder, WI_ground_truth_file_name)
ground_truth_table = ground_truth_table[['FileName', 'species_common_name', 'datetime_created']]

# define CONSTANT
BATCH_INTERVAL = 2 # 2 seconds
NUM_CONSECTIVE_IMAGES = 3 # captures max 3 images in a row

batch_group = []

# convert datetime_created column to pandas datetime type
ground_truth_table['datetime_created'] = pd.to_datetime(ground_truth_table['datetime_created'])

first_of_batch = ground_truth_table['datetime_created'][0]
current_batch = 0
batch_group.append(current_batch)
for i in range(1, len(ground_truth_table)):
  # in a batch
  if (ground_truth_table['datetime_created'][i]-first_of_batch).seconds < BATCH_INTERVAL*NUM_CONSECTIVE_IMAGES:
    batch_group.append(current_batch)
  else:
    first_of_batch = ground_truth_table['datetime_created'][i]
    current_batch += 1
    batch_group.append(current_batch)

ground_truth_table['batch_group'] = batch_group

# use as index column in replacement for FileName (has duplication due to explode)
id = [i for i in range(len(ground_truth_table))]
ground_truth_table['id'] = id

# bring id column to first
ground_truth_table = ground_truth_table[['id', 'FileName', 'datetime_created', 'datetime_created', 'batch_group']]

# save to file
save_csv_Google_drive(ground_truth_table, root, contain_folder, detection_ground_truth_file_name)

ground_truth_table.head(20)

File is saved to 01_ground-truth-table_Kon-Plong.csv in Google Drive at /content/drive/My Drive/FFI/MegaDetector Test/confusion-matrix/ground-truths/01_ground-truth-table_Kon-Plong.csv


Unnamed: 0,id,FileName,datetime_created,datetime_created.1,batch_group
0,0,100__67096__2019-04-22__15-09-11(1).JPG,2019-04-22 15:09:11,2019-04-22 15:09:11,0
1,1,100__67096__2019-04-22__15-09-42(2).JPG,2019-04-22 15:09:42,2019-04-22 15:09:42,1
2,2,100__67096__2019-04-22__15-12-32(1).JPG,2019-04-22 15:12:32,2019-04-22 15:12:32,2
3,3,100__67096__2019-04-22__15-12-55(2).JPG,2019-04-22 15:12:55,2019-04-22 15:12:55,3
4,4,100__67096__2019-04-22__15-13-18(1).JPG,2019-04-22 15:13:18,2019-04-22 15:13:18,4
5,5,100__67096__2019-04-22__15-14-06(1).JPG,2019-04-22 15:14:06,2019-04-22 15:14:06,5
6,6,100__67096__2019-04-22__15-14-47(2).JPG,2019-04-22 15:14:47,2019-04-22 15:14:47,6
7,7,100__67096__2019-04-22__15-15-23(1).JPG,2019-04-22 15:15:23,2019-04-22 15:15:23,7
8,8,100__67096__2019-04-24__09-55-44(1).JPG,2019-04-24 09:55:44,2019-04-24 09:55:44,8
9,9,100__67096__2019-04-24__09-55-46(2).JPG,2019-04-24 09:55:46,2019-04-24 09:55:46,8
