<a href="https://colab.research.google.com/github/FFI-Vietnam/camtrap-tools/blob/main/MegaDetector/confusion_matrix/01_ground-truth-from-metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This script creates a groundtruth table which specifies what species inside an 
image from the exiftool metadata file. The species information is extracted 
from **CatalogSets** column, and image names are extracted from **FileName** column

After runnning this script, a 'ground-truths' folder is created

ground-truths
    |__ 01_ground-truth-table_Kon-Plong.csv
    
"""

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# specifies Colab directories and file names
root = '/content/drive/'

dataset_folder = 'My Drive/FFI/MegaDetector Test/confusion-matrix/dataset'
contain_folder = 'My Drive/FFI/MegaDetector Test/confusion-matrix/ground-truths'

ground_truth_dataset_file_name = 'image_metadata(2020-06-26)_full.csv'
ground_truth_table_file_name = "01_ground-truth-table_Kon-Plong.csv"

In [4]:
# read and save file functions
def read_csv_Google_drive(root, contain_folder, file_name):
  file_path = os.path.join(root, contain_folder, file_name)
  return pd.read_csv(file_path)

def save_csv_Google_drive(df, root, contain_folder, file_name):
  """
  function to save a csv file to Google Drive
  param examples:
    root = '/content/drive/'
    contain_folder = 'My Drive/FFI/dataset'
    file_name = 'image_metadata(2020-06-26)_full.csv'
  """
  # save file to Colab runtime storage (will be deleted when this notebook is closed)
  df.to_csv('dataframe.csv', index=False)

  # save file back to Google Drive for permanent storage
  folder_path = os.path.join(root, contain_folder)
  file_path = os.path.join(root, contain_folder, file_name)
  try:
    os.makedirs(folder_path)
  except:
    pass

  with open('dataframe.csv', 'r') as f:
    df_file = f.read()

  with open(file_path, 'w') as f:
    f.write(df_file)

  print(f'File is saved to {file_name} in Google Drive at {file_path}')

In [5]:
# read ExifTool spreadsheet
full_metadata = read_csv_Google_drive(root, dataset_folder, ground_truth_dataset_file_name)
full_metadata.head()

  if self.run_code(code, result):


Unnamed: 0,SourceFile,BitsPerSample,Caption-Abstract,CatalogSets,Categories,CodedCharacterSet,ColorComponents,ColorLabel,Comment,CreateDate,CreatorTool,CurrentIPTCDigest,DateCreated,DateTime,DateTimeCreated,DateTimeOriginal,DateUTC,Description,Directory,EncodingProcess,Error,ExifByteOrder,ExifImageHeight,ExifImageWidth,ExifToolVersion,ExifVersion,FileAccessDate,FileCreateDate,FileModifyDate,FileName,FilePermissions,FileSize,FileType,FileTypeExtension,Flash,HierarchicalSubject,ImageDescription,ImageHeight,ImageNumber,ImageSize,ImageWidth,InstanceID,IPTCDigest,JFIFVersion,Keywords,LastKeywordIPTC,LastKeywordXMP,Make,Megapixels,MetadataDate,MIMEType,Model,ModificationDate,ModifyDate,Notes,OffsetSchema,Orientation,OriginatingProgram,Padding,PickLabel,ProcessingSoftware,ProgramVersion,Rating,RatingPercent,RegionInfoRegions,RegionList,ResolutionUnit,Software,Subject,TagsList,TimeCreated,Urgency,UserComment,Warning,XMPToolkit,XPKeywords,XResolution,YCbCrSubSampling,YResolution
0,D:/Fauna & Flora International/VietnamCameraSt...,8.0,,"People|Setup, Observer|Tam","<Categories><Category Assigned=""0"">People<Cate...",UTF8,3.0,,,2019:04:22 15:09:11,,0dfadea9315c091a67a43eb6881ea5df,,,,2019:04:22 15:09:11,,,D:/Fauna & Flora International/VietnamCameraSt...,"Baseline DCT, Huffman coding",,"Big-endian (Motorola, MM)",1536.0,2048.0,11.63,220.0,2020:05:26 10:18:20+07:00,2020:03:20 13:23:52+07:00,2020:05:07 10:40:30+07:00,100__67096__2019-04-22__15-09-11(1).JPG,rw-rw-rw-,458 kB,JPEG,jpg,Fired,"People|Setup, Observer|Tam",integtime=1535 A=88 D=128\r\n\rflashCrossover...,1536.0,1154.0,2048x1536,2048.0,,,1.02,"Setup, Tam",,"People/Setup, Observer/Tam",Panthera V4,3.1,,image/jpeg,CAM67096,,2019:04:22 15:09:11,,,,,,,,,,,,,,102717 V6-Bld1.15,"Setup, Tam","People/Setup, Observer/Tam",,,,[minor] Unrecognized MakerNotes,XMP Core 4.4.0-Exiv2,,1.0,YCbCr4:2:2 (2 1),1.0
1,D:/Fauna & Flora International/VietnamCameraSt...,8.0,,"People|Setup, Observer|Tam","<Categories><Category Assigned=""0"">People<Cate...",UTF8,3.0,,,2019:04:22 15:09:42,,0dfadea9315c091a67a43eb6881ea5df,,,,2019:04:22 15:09:42,,,D:/Fauna & Flora International/VietnamCameraSt...,"Baseline DCT, Huffman coding",,"Big-endian (Motorola, MM)",1536.0,2048.0,11.63,220.0,2020:06:24 14:00:28+07:00,2020:03:20 13:23:52+07:00,2020:05:07 10:40:30+07:00,100__67096__2019-04-22__15-09-42(2).JPG,rw-rw-rw-,471 kB,JPEG,jpg,Fired,"People|Setup, Observer|Tam",integtime=1535 A=88 D=128\r\n\rflashCrossover...,1536.0,1155.0,2048x1536,2048.0,,,1.02,"Setup, Tam",,"People/Setup, Observer/Tam",Panthera V4,3.1,,image/jpeg,CAM67096,,2019:04:22 15:09:42,,,,,,,,,,,,,,102717 V6-Bld1.15,"Setup, Tam","People/Setup, Observer/Tam",,,,[minor] Unrecognized MakerNotes,XMP Core 4.4.0-Exiv2,,1.0,YCbCr4:2:2 (2 1),1.0
2,D:/Fauna & Flora International/VietnamCameraSt...,8.0,,"People|Setup, Observer|Tam","<Categories><Category Assigned=""0"">People<Cate...",UTF8,3.0,,,2019:04:22 15:12:32,,0dfadea9315c091a67a43eb6881ea5df,,,,2019:04:22 15:12:32,,,D:/Fauna & Flora International/VietnamCameraSt...,"Baseline DCT, Huffman coding",,"Big-endian (Motorola, MM)",1536.0,2048.0,11.63,220.0,2020:06:24 14:00:29+07:00,2020:03:20 13:23:52+07:00,2020:05:07 10:40:30+07:00,100__67096__2019-04-22__15-12-32(1).JPG,rw-rw-rw-,485 kB,JPEG,jpg,Fired,"People|Setup, Observer|Tam",integtime=1535 A=88 D=128\r\n\rflashCrossover...,1536.0,1155.0,2048x1536,2048.0,,,1.02,"Setup, Tam",,"People/Setup, Observer/Tam",Panthera V4,3.1,,image/jpeg,CAM67096,,2019:04:22 15:12:32,,,,,,,,,,,,,,102717 V6-Bld1.15,"Setup, Tam","People/Setup, Observer/Tam",,,,[minor] Unrecognized MakerNotes,XMP Core 4.4.0-Exiv2,,1.0,YCbCr4:2:2 (2 1),1.0
3,D:/Fauna & Flora International/VietnamCameraSt...,8.0,,"People|Setup, Observer|Tam","<Categories><Category Assigned=""0"">People<Cate...",UTF8,3.0,,,2019:04:22 15:12:55,,0dfadea9315c091a67a43eb6881ea5df,,,,2019:04:22 15:12:55,,,D:/Fauna & Flora International/VietnamCameraSt...,"Baseline DCT, Huffman coding",,"Big-endian (Motorola, MM)",1536.0,2048.0,11.63,220.0,2020:06:24 14:00:29+07:00,2020:03:20 13:23:52+07:00,2020:05:07 10:40:30+07:00,100__67096__2019-04-22__15-12-55(2).JPG,rw-rw-rw-,479 kB,JPEG,jpg,Fired,"People|Setup, Observer|Tam",integtime=1535 A=88 D=128\r\n\rflashCrossover...,1536.0,1156.0,2048x1536,2048.0,,,1.02,"Setup, Tam",,"People/Setup, Observer/Tam",Panthera V4,3.1,,image/jpeg,CAM67096,,2019:04:22 15:12:55,,,,,,,,,,,,,,102717 V6-Bld1.15,"Setup, Tam","People/Setup, Observer/Tam",,,,[minor] Unrecognized MakerNotes,XMP Core 4.4.0-Exiv2,,1.0,YCbCr4:2:2 (2 1),1.0
4,D:/Fauna & Flora International/VietnamCameraSt...,8.0,,"People|Setup, Observer|Tam","<Categories><Category Assigned=""0"">People<Cate...",UTF8,3.0,,,2019:04:22 15:13:18,,0dfadea9315c091a67a43eb6881ea5df,,,,2019:04:22 15:13:18,,,D:/Fauna & Flora International/VietnamCameraSt...,"Baseline DCT, Huffman coding",,"Big-endian (Motorola, MM)",1536.0,2048.0,11.63,220.0,2020:06:24 14:00:29+07:00,2020:03:20 13:23:52+07:00,2020:05:07 10:40:30+07:00,100__67096__2019-04-22__15-13-18(1).JPG,rw-rw-rw-,494 kB,JPEG,jpg,Fired,"People|Setup, Observer|Tam",integtime=1535 A=88 D=128\r\n\rflashCrossover...,1536.0,1157.0,2048x1536,2048.0,,,1.02,"Setup, Tam",,"People/Setup, Observer/Tam",Panthera V4,3.1,,image/jpeg,CAM67096,,2019:04:22 15:13:18,,,,,,,,,,,,,,102717 V6-Bld1.15,"Setup, Tam","People/Setup, Observer/Tam",,,,[minor] Unrecognized MakerNotes,XMP Core 4.4.0-Exiv2,,1.0,YCbCr4:2:2 (2 1),1.0


In [6]:
# 01) create ground truths table

# keep 'FileName' and 'CatalogSets' columns
metadata = read_csv_Google_drive(root, dataset_folder, ground_truth_dataset_file_name)
metadata = metadata[['FileName', 'CatalogSets']]

# split to species_common_name column, each row represents at most one species 
# of each image. If one image has more than one species, it will be exploded into
# multiple entries

species_common_name = [] 

# prefix in CatalogSets column
SPECIES_PREFIX = 'Species'
PEOPLE_PREFIX = 'People'

catalog_sets = metadata['CatalogSets']
for i in range(len(catalog_sets)):
  if catalog_sets[i] is not np.NaN:
    catalog = str(catalog_sets[i]).split(', ')
    spe = ''
    for i in range(len(catalog)):
      if SPECIES_PREFIX in catalog[i]:
        if catalog[i].split('|')[1] != 'Human':
          if '(unknown)' in catalog[i].split('|')[1]: # remove 'unknown' mark
            spe = spe + ', ' + catalog[i].split('|')[1].split('(unknown)')[0]
          else:
            spe = spe + ', ' + catalog[i].split('|')[1]
      # Human is a bit tricky, it either has 'People' or 'Human' prefix, or both, 
      # so have to check to avoid duplication
      if PEOPLE_PREFIX in catalog[i]:
        if not 'Human' in spe:
          spe = spe + ', ' + 'Human'
    species_common_name.append(spe[2:])

  else:
    species_common_name.append('')

metadata['species_common_name'] = species_common_name

col = metadata.columns.to_list()
col.remove('species_common_name')

# explode an image which has multiple species into multiple rows
metadata = metadata.set_index(col).apply(lambda x: x.str.split(',').explode()).reset_index()
metadata['species_common_name'] = metadata['species_common_name'].str.strip()

# save to Google Drive
save_csv_Google_drive(metadata, root, contain_folder, ground_truth_table_file_name)

metadata.sample(20)

  if self.run_code(code, result):


File is saved to 01_ground-truth-table_Kon-Plong.csv in Google Drive at /content/drive/My Drive/FFI/MegaDetector Test/confusion-matrix/ground-truths/01_ground-truth-table_Kon-Plong.csv


Unnamed: 0,FileName,CatalogSets,species_common_name
94291,412__66962__2019-10-20__18-41-31(2).JPG,,
81639,373__67119__2019-11-05__23-44-39(2).JPG,Observer|Tam,
69378,kpc349__ct68841__2019-10-08__12-33-14(3)__Blan...,Observer|An Nguyen,
82859,374__66941__2019-12-09__11-54-32(4).JPG,"Observer|Tam, Species|Northern Treeshrew",Northern Treeshrew
13316,197__68974__2019-06-28__09-07-05(3).JPG,,
38574,256__68919__2019-06-12__13-33-49(3).JPG,,
64695,330__68912__2019-06-30__20-40-45(2).JPG,,
87811,kpc386__ct67170__2019-11-18__02-15-53(1)__Blan...,Observer|An Nguyen,
96710,415__68950__2019-12-07__03-29-36(2).JPG,,
28417,238__69217__2019-07-02__15-25-04(1).JPG,,


In [15]:
print(f"#animal-images / #images = {len(metadata[metadata['species_common_name'] != ''])/len(metadata)}")

#animal-images / #images = 0.24818252687684694
