in this notebook we:
* summarize data structure and decide on validation split strategy - for now skip exif and divide on location, preserving species stats
* construct a validation set
 * splitting data on locations trying to preserve valid/train split
 * undersample most frequent (with numbers >35000) categories on train and valid separately to ensure 1:6 valid/train ratio
 * remove Mammal_Other entirely as it is confusing (bats, ant-eaters, etc. in one class)
 * we drop the Blank category and want to use sigmoid not softmax at the final layer (with binary cross entropy as loss)

In [47]:
import pandas as pd
from pathlib import Path
import os

from fastai.vision import *

import random

from IPython.display import Image, display

# Directory the image files are resolved against (see the disabled EXIF cell below,
# which builds DATA_PATH / <uniqueName>). Absolute path: adjust when running elsewhere.
DATA_PATH = Path("/data/Gabon_trainingData")

In [48]:
# NOTE: disabled scratch cell, kept for reference only. It reads the EXIF tags of the
# first labelled image with the third-party `exifread` package. It relies on `labels`,
# which is only loaded in a later cell.
# import exifread

# img_file = labels.uniqueName[0]
# # Open image file for reading (binary mode)
# path_name = DATA_PATH / img_file
# f = open(path_name, 'rb')

# # Return Exif tags
# tags = exifread.process_file(f)
# # tags["EXIF DateTimeOriginal"]
# tags

### Data summary

Compressed Camera Trap Images
- 1 185 264
- (>1mln) are Blank, rest from 6 species
- camera location and datetime in name of file

allData
- 344 304
- 29 species, Blank on 7th place (OK balance)
- 4 sources given in folder names
- only some (?) have EXIF data

Camera trap Nki National Park
- 120 740
- 24 species, Blank first, (OK balance)
- location in folder
- datetime in EXIF
- single study

We have datetime for comp_df, half of all_df (from EXIF), and nki_df (from EXIF). For now, however, we split the data by source or location while trying to preserve sensible per-species statistics.

In [49]:
# Load the cleaned label table and derive, for every image, the top-level folder
# (first path component after the "D:/" prefix of fullPath).
labels = pd.read_csv("/data/Labels/labels_clean_w_path.csv")
top_level_folder = labels["fullPath"].str.extract('D:/([^/]*)', expand=True)
labels["main_folder"] = top_level_folder
labels.head()

Unnamed: 0,fullPath,species,n,uniqueName,main_folder
0,D:/Compressed Camera Trap Images//Field season...,Human,1.0,0000001.jpg,Compressed Camera Trap Images
1,D:/Compressed Camera Trap Images//Between fiel...,Blank,0.0,0000002.jpg,Compressed Camera Trap Images
2,D:/Compressed Camera Trap Images//Field season...,Human,1.0,0000003.jpg,Compressed Camera Trap Images
3,D:/Compressed Camera Trap Images//Between fiel...,Blank,0.0,0000004.jpg,Compressed Camera Trap Images
4,D:/Compressed Camera Trap Images//Field season...,Human,1.0,0000005.jpg,Compressed Camera Trap Images


In [50]:
# One independent working copy of the label table per top-level folder.
sources = labels["main_folder"].unique()
folder_of_row = labels["main_folder"]
comp_df = labels.loc[folder_of_row == "Compressed Camera Trap Images"].copy()
all_df = labels.loc[folder_of_row == "allData"].copy()
nki_df = labels.loc[folder_of_row == "Camera trap Nki National Park"].copy()

### Splitting comp_df

In [51]:
# Camera location = the run of non-whitespace characters that follows the first
# sub-folder below "Compressed Camera Trap Images//".
# The pattern is a raw string: "\s" inside a normal string literal is an invalid
# escape sequence (DeprecationWarning on older Pythons, SyntaxWarning from 3.12).
# The regex itself is unchanged.
comp_df["location"] = comp_df["fullPath"].str.extract(
    r'D:/Compressed Camera Trap Images//[^/]*/([^\s]*)', expand=True
)
comp_df.head()

Unnamed: 0,fullPath,species,n,uniqueName,main_folder,location
0,D:/Compressed Camera Trap Images//Field season...,Human,1.0,0000001.jpg,Compressed Camera Trap Images,T33
1,D:/Compressed Camera Trap Images//Between fiel...,Blank,0.0,0000002.jpg,Compressed Camera Trap Images,T36
2,D:/Compressed Camera Trap Images//Field season...,Human,1.0,0000003.jpg,Compressed Camera Trap Images,T33
3,D:/Compressed Camera Trap Images//Between fiel...,Blank,0.0,0000004.jpg,Compressed Camera Trap Images,T36
4,D:/Compressed Camera Trap Images//Field season...,Human,1.0,0000005.jpg,Compressed Camera Trap Images,T33


In [52]:
# Number of images per camera location; value_counts orders them most frequent first.
comp_df_locations_count = comp_df["location"].value_counts()
comp_locs = comp_df_locations_count.index.tolist()

In [53]:
# Hold out every seventh location of the frequency-ranked list, i.e. the 7th, 14th,
# 21st, ... entries (0-based positions 6, 13, 20, ...). This is exactly what the
# former `(i + 1) % 7 == 0` loop selected; its comment ("starting from second")
# did not match the code.
# NOTE: `locs_for_valid` is re-assigned further down for the other sources, so the
# cells must be executed in notebook order.
locs_for_valid = comp_locs[6::7]

In [54]:
# Every location that was not held out stays in train (order of comp_locs is kept).
print(locs_for_valid)
held_out_locs = set(locs_for_valid)
locs_for_train = [loc for loc in comp_locs if loc not in held_out_locs]
print(locs_for_train)

['T1', 'T28', 'T11', 'T41', 'T18']
['T27', 'T20', 'T35', 'T39', 'T5', 'T26', 'T29', 'T4', 'T36', 'T37', 'T38', 'T17', 'T15', 'T40', 'T33', 'T10', 'T32', 'T9', 'T14', 'T19', 'T34', 'T31', 'T3', 'T30', 'T23', 'T8', 'T7', 'T6', 'T24', 'T12', 'T16', 'T2', 'T25', 'T21', 'T42']


In [55]:
# Count the images that sit in held-out locations and report their share of comp_df.
n_comp_photos = comp_df_locations_count.sum()
for_valid = sum(comp_df_locations_count[loc] for loc in locs_for_valid)
comp_valid_pct = round(for_valid*100/n_comp_photos,2)
print(f"Taken {for_valid} from {n_comp_photos} photos from comp_df, that is {comp_valid_pct}%")

Taken 123338 from 1185264 photos from comp_df, that is 10.41%


In [56]:
# Flag every image taken at a held-out location. Series.isin is the vectorized
# equivalent of applying `x in locs_for_valid` row by row (this frame has over a
# million rows) and yields the same boolean column.
comp_df["is_valid"] = comp_df["location"].isin(locs_for_valid)
comp_df["is_valid"].sum()

123338

In [57]:
# Images per (is_valid, species), valid rows first and largest classes on top.
comp_species_split = (
    comp_df.groupby(["is_valid", "species"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["is_valid", "counts"], ascending=False)
)
# Temporarily lift the row/column display limits so the whole table is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(comp_species_split)

Unnamed: 0,is_valid,species,counts
7,True,Blank,99006
9,True,Elephant_African,18202
11,True,Human,3964
10,True,Hog_Red_River,1446
8,True,Buffalo_African,353
12,True,Leopard_African,351
13,True,Monkey,16
0,False,Blank,958232
2,False,Elephant_African,65162
4,False,Human,28990


In [58]:
# Reduce comp_df to the columns shared by all sources; the camera location becomes
# the generic "sub_folder" column.
comp_df_w_valid = comp_df.loc[
    :, ["uniqueName", "species", "main_folder", "is_valid"]
].assign(sub_folder=comp_df["location"])
comp_df_w_valid.head()

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
0,0000001.jpg,Human,Compressed Camera Trap Images,False,T33
1,0000002.jpg,Blank,Compressed Camera Trap Images,False,T36
2,0000003.jpg,Human,Compressed Camera Trap Images,False,T33
3,0000004.jpg,Blank,Compressed Camera Trap Images,False,T36
4,0000005.jpg,Human,Compressed Camera Trap Images,False,T33


### Splitting nki_df

In [59]:
# Camera location = the folder directly below "Camera trap Nki National Park/".
nki_location_pattern = 'D:/Camera trap Nki National Park/([^/]*)'
nki_df["location"] = nki_df["fullPath"].str.extract(nki_location_pattern, expand=True)
nki_df.head()

Unnamed: 0,fullPath,species,n,uniqueName,main_folder,location
1529568,D:/Camera trap Nki National Park/C01/100EK113/...,Human,0.0,1529577.jpg,Camera trap Nki National Park,C01
1529569,D:/Camera trap Nki National Park/C01/100EK113/...,Human,0.0,1529578.jpg,Camera trap Nki National Park,C01
1529570,D:/Camera trap Nki National Park/C01/100EK113/...,Human,0.0,1529579.jpg,Camera trap Nki National Park,C01
1529571,D:/Camera trap Nki National Park/C01/100EK113/...,Human,0.0,1529580.jpg,Camera trap Nki National Park,C01
1529572,D:/Camera trap Nki National Park/C01/100EK113/...,Human,0.0,1529581.jpg,Camera trap Nki National Park,C01


In [60]:
# Number of images per Nki camera location, most frequent first.
nki_df_locations_count = nki_df["location"].value_counts()
nki_locs = nki_df_locations_count.index.tolist()
nki_df_locations_count

C10    15642
C25    13634
C13     6321
C02     5489
C15     5243
C06     4794
C03     4612
C05     4260
C09     4095
C08     4065
C17     4011
C26     3830
C16     3696
C07     3660
C04     3519
C11     3498
C24     3411
C29     3357
C27     3101
C01     2978
C23     2960
C19     2897
C20     2845
C28     2510
C14     2430
C30     2354
C12     1528
Name: location, dtype: int64

In [61]:
# Hold out every seventh location of the frequency-ranked list starting with the
# sixth one (0-based positions 5, 12, 19, 26, ...). This is exactly what the former
# `(i+2) % 7 == 0` loop selected; its comment ("starting from third") did not match
# the code.
# NOTE: this re-assigns the `locs_for_valid` name already used for comp_df, so the
# cells must be executed in notebook order.
locs_for_valid = nki_locs[5::7]

In [62]:
# Every Nki location that was not held out stays in train (order of nki_locs is kept).
print(locs_for_valid)
held_out_locs = set(locs_for_valid)
locs_for_train = [loc for loc in nki_locs if loc not in held_out_locs]
print(locs_for_train)

['C06', 'C16', 'C01', 'C12']
['C10', 'C25', 'C13', 'C02', 'C15', 'C03', 'C05', 'C09', 'C08', 'C17', 'C26', 'C07', 'C04', 'C11', 'C24', 'C29', 'C27', 'C23', 'C19', 'C20', 'C28', 'C14', 'C30']


In [63]:
# Count the images that sit in held-out locations and report their share of nki_df.
n_nki_photos = nki_df_locations_count.sum()
for_valid = sum(nki_df_locations_count[loc] for loc in locs_for_valid)
nki_valid_pct = round(for_valid*100/n_nki_photos,2)
print(f"Taken {for_valid} from {n_nki_photos} photos from nki_df, that is {nki_valid_pct}%")

Taken 12996 from 120740 photos from nki_df, that is 10.76%


In [64]:
# Flag every image taken at a held-out location. Series.isin is the vectorized
# equivalent of applying `x in locs_for_valid` row by row and yields the same
# boolean column.
nki_df["is_valid"] = nki_df["location"].isin(locs_for_valid)
nki_df["is_valid"].sum()

12996

In [65]:
# Images per (is_valid, species), valid rows first and largest classes on top.
nki_species_split = (
    nki_df.groupby(["is_valid", "species"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["is_valid", "counts"], ascending=False)
)
# Temporarily lift the row/column display limits so the whole table is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(nki_species_split)

Unnamed: 0,is_valid,species,counts
25,True,Blank,5998
30,True,Duiker_Blue,2906
31,True,Duiker_Red,1295
37,True,Human,793
24,True,Bird,471
27,True,Chevrotain_Water,227
45,True,Squirrel,190
32,True,Duiker_Yellow_Backed,181
36,True,Hog_Red_River,181
44,True,Rodent,170


In [66]:
# Reduce nki_df to the columns shared by all sources; the camera location becomes
# the generic "sub_folder" column.
nki_df_w_valid = nki_df.loc[
    :, ["uniqueName", "species", "main_folder", "is_valid"]
].assign(sub_folder=nki_df["location"])
nki_df_w_valid.head()

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
1529568,1529577.jpg,Human,Camera trap Nki National Park,True,C01
1529569,1529578.jpg,Human,Camera trap Nki National Park,True,C01
1529570,1529579.jpg,Human,Camera trap Nki National Park,True,C01
1529571,1529580.jpg,Human,Camera trap Nki National Park,True,C01
1529572,1529581.jpg,Human,Camera trap Nki National Park,True,C01


### Splitting all_df

In [67]:
# Source = the part of the file name between "D:/allData/" and the first underscore.
all_source_pattern = 'D:/allData/([^_]*)'
all_df["source"] = all_df["fullPath"].str.extract(all_source_pattern, expand=True)
all_df.head()

Unnamed: 0,fullPath,species,n,uniqueName,main_folder,source
1185264,D:/allData/StephBrittainZSL_1.jpg,Duiker_Blue,1.0,1185273.jpg,allData,StephBrittainZSL
1185265,D:/allData/StephBrittainZSL_2.jpg,Duiker_Blue,1.0,1185274.jpg,allData,StephBrittainZSL
1185266,D:/allData/StephBrittainZSL_3.jpg,Duiker_Blue,1.0,1185275.jpg,allData,StephBrittainZSL
1185267,D:/allData/StephBrittainZSL_4.jpg,Duiker_Red,1.0,1185276.jpg,allData,StephBrittainZSL
1185268,D:/allData/StephBrittainZSL_5.jpg,Duiker_Red,1.0,1185277.jpg,allData,StephBrittainZSL


In [68]:
# Number of images per source, most frequent first.
all_df_sources_count = all_df["source"].value_counts()
all_locs = all_df_sources_count.index.tolist()
print(all_df_sources_count)

KorupTEAM                  172569
NouabaleTEAM               120352
LailaBahaaelDinPanthera     38368
StephBrittainZSL            13015
Name: source, dtype: int64


In [69]:
# Hold out the third and fourth sources of the size-ranked list (0-based positions
# 2 and 3). With the four sources in the recorded output these are the two smallest.
# The former loop comment ("every seventh starting from second") was copied from
# the location splits and did not describe this selection.
# NOTE: this re-assigns the `locs_for_valid` name already used for comp_df and
# nki_df, so the cells must be executed in notebook order.
locs_for_valid = all_locs[2:4]

In [70]:
# Every source that was not held out stays in train (order of all_locs is kept).
print(locs_for_valid)
held_out_sources = set(locs_for_valid)
locs_for_train = [loc for loc in all_locs if loc not in held_out_sources]
print(locs_for_train)

['LailaBahaaelDinPanthera', 'StephBrittainZSL']
['KorupTEAM', 'NouabaleTEAM']


In [71]:
# Count the images that belong to held-out sources and report their share of all_df.
n_all_photos = all_df_sources_count.sum()
for_valid = sum(all_df_sources_count[loc] for loc in locs_for_valid)
all_valid_pct = round(for_valid*100/n_all_photos,2)
print(f"Taken {for_valid} from {n_all_photos} photos from all_df, that is {all_valid_pct}%")

Taken 51383 from 344304 photos from all_df, that is 14.92%


In [72]:
# Flag every image that comes from a held-out source. Series.isin is the vectorized
# equivalent of applying `x in locs_for_valid` row by row and yields the same
# boolean column.
all_df["is_valid"] = all_df["source"].isin(locs_for_valid)
all_df["is_valid"].sum()

51383

In [73]:
# Images per (is_valid, species), valid rows first and largest classes on top.
all_species_split = (
    all_df.groupby(["is_valid", "species"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["is_valid", "counts"], ascending=False)
)
# Temporarily lift the row/column display limits so the whole table is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(all_species_split)

Unnamed: 0,is_valid,species,counts
27,True,Blank,13764
34,True,Duiker_Red,11825
36,True,Elephant_African,7078
33,True,Duiker_Blue,3009
52,True,Rat_Giant,2868
45,True,Mandrillus,2626
41,True,Hog_Red_River,1912
42,True,Human,1084
31,True,Chimpanzee,1046
35,True,Duiker_Yellow_Backed,909


In [74]:
# Reduce all_df to the columns shared by all sources; the source name becomes the
# generic "sub_folder" column.
all_df_w_valid = all_df.loc[
    :, ["uniqueName", "species", "main_folder", "is_valid"]
].assign(sub_folder=all_df["source"])
all_df_w_valid.head()

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
1185264,1185273.jpg,Duiker_Blue,allData,True,StephBrittainZSL
1185265,1185274.jpg,Duiker_Blue,allData,True,StephBrittainZSL
1185266,1185275.jpg,Duiker_Blue,allData,True,StephBrittainZSL
1185267,1185276.jpg,Duiker_Red,allData,True,StephBrittainZSL
1185268,1185277.jpg,Duiker_Red,allData,True,StephBrittainZSL


Remarks:
- comp_df
 - most Leopard_African are in valid
- nki_df
 - few Elephant_African are in valid
- all_df
 - all Blank are in valid
 - few Rail_Nkulengu in valid
 - most Human are in valid

# Combine the dataframes (main_folders) together

In [142]:
# Stack the three per-source frames; each keeps its original row labels.
per_source_frames = [comp_df_w_valid, nki_df_w_valid, all_df_w_valid]
df = pd.concat(per_source_frames, sort=False)
len(df)

1650308

In [143]:
# Peek at the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
0,0000001.jpg,Human,Compressed Camera Trap Images,False,T33
1,0000002.jpg,Blank,Compressed Camera Trap Images,False,T36
2,0000003.jpg,Human,Compressed Camera Trap Images,False,T33
3,0000004.jpg,Blank,Compressed Camera Trap Images,False,T36
4,0000005.jpg,Human,Compressed Camera Trap Images,False,T33


In [79]:
# Persist the full split. The row labels are written too (index=True is the to_csv
# default, spelled out here), so the file starts with an unnamed index column.
df.to_csv("df_w_valid.csv", index=True)

In [80]:
# Overall share of rows flagged as validation, in percent.
overall_valid_pct = round(df.is_valid.sum() / len(df) *100,2)
print(f"The validation dataset is now {overall_valid_pct}% of the data")

The validation dataset is now 11.37% of the data


#### Validation percentage per class

In [81]:
# For each species (in order of first appearance) print and store the percentage of
# its images that ended up in the validation set.
print("Percentage of data put in validation per species:")
valid_perc = {}
for species in df["species"].unique():
    species_flags = df.loc[df["species"] == species, "is_valid"]
    perc = species_flags.sum() / len(species_flags) * 100
    valid_perc[species] = perc
    print(f"{round(perc, 2)}%   {species}")

Percentage of data put in validation per species:
14.99%   Human
10.43%   Blank
24.39%   Elephant_African
13.58%   Hog_Red_River
7.48%   Buffalo_African
30.21%   Leopard_African
20.35%   Monkey
13.44%   Duiker_Red
6.52%   Civet_African_Palm
6.73%   Squirrel
5.99%   Duiker_Blue
10.42%   Bird
11.12%   Mongoose_Black_Footed
24.89%   Rodent
7.06%   Duiker_Yellow_Backed
9.84%   Genet
20.42%   Chimpanzee
66.85%   Gorilla
5.87%   Mongoose
6.6%   Porcupine_Brush_Tailed
15.14%   Pangolin
28.22%   Mandrillus
8.4%   Chevrotain_Water
25.09%   Mammal_Other
18.47%   Cat_Golden
8.38%   Rat_Giant
4.36%   Guineafowl_Crested
2.57%   Guineafowl_Black
0.18%   Rail_Nkulengu


In [82]:
# (percentage, species) pairs, highest validation share first.
sorted(zip(valid_perc.values(), valid_perc.keys()), reverse=True)

[(66.85288640595903, 'Gorilla'),
 (30.205415499533146, 'Leopard_African'),
 (28.221386351423966, 'Mandrillus'),
 (25.09025270758123, 'Mammal_Other'),
 (24.889543446244478, 'Rodent'),
 (24.39132530120482, 'Elephant_African'),
 (20.418250950570343, 'Chimpanzee'),
 (20.345375148868598, 'Monkey'),
 (18.472906403940886, 'Cat_Golden'),
 (15.137614678899084, 'Pangolin'),
 (14.98806805060173, 'Human'),
 (13.57551114350378, 'Hog_Red_River'),
 (13.440144235694238, 'Duiker_Red'),
 (11.118644067796609, 'Mongoose_Black_Footed'),
 (10.431541253896413, 'Blank'),
 (10.422619942848549, 'Bird'),
 (9.839539812291855, 'Genet'),
 (8.402291534054743, 'Chevrotain_Water'),
 (8.379349636252082, 'Rat_Giant'),
 (7.4779377612633535, 'Buffalo_African'),
 (7.061414874319771, 'Duiker_Yellow_Backed'),
 (6.731870649396423, 'Squirrel'),
 (6.601431283773916, 'Porcupine_Brush_Tailed'),
 (6.517094017094018, 'Civet_African_Palm'),
 (5.989994734070563, 'Duiker_Blue'),
 (5.871692642261689, 'Mongoose'),
 (4.362194315928618, '

#### For Gorillas, check how many valids are from which source

In [83]:
# Gorilla images per (main_folder, is_valid).
gorilla_rows = df[df["species"] == "Gorilla"]
(
    gorilla_rows.groupby(["main_folder", "is_valid"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["main_folder", "counts"], ascending=False)
)

Unnamed: 0,main_folder,is_valid,counts
2,allData,True,684
0,Camera trap Nki National Park,False,356
1,Camera trap Nki National Park,True,34


so all Gorillas from allData went into valid...

In [84]:
# Gorilla images from allData per (sub_folder, is_valid).
gorilla_alldata_rows = df[(df["species"] == "Gorilla") & (df["main_folder"] == "allData")]
(
    gorilla_alldata_rows.groupby(["sub_folder", "is_valid"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["sub_folder", "counts"], ascending=False)
)

Unnamed: 0,sub_folder,is_valid,counts
1,StephBrittainZSL,True,2
0,LailaBahaaelDinPanthera,True,682


so almost all Gorillas from allData are in Laila source... hence source split is not good enough for them

#### Rail_Nkulengu - few in valid

In [86]:
# Rail_Nkulengu images per (main_folder, is_valid).
rail_rows = df[df["species"] == "Rail_Nkulengu"]
(
    rail_rows.groupby(["main_folder", "is_valid"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["main_folder", "counts"], ascending=False)
)

Unnamed: 0,main_folder,is_valid,counts
0,allData,False,3263
1,allData,True,6


In [87]:
# Rail_Nkulengu images from allData per (sub_folder, is_valid).
rail_alldata_rows = df[(df["species"] == "Rail_Nkulengu") & (df["main_folder"] == "allData")]
(
    rail_alldata_rows.groupby(["sub_folder", "is_valid"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["sub_folder", "counts"], ascending=False)
)

Unnamed: 0,sub_folder,is_valid,counts
1,StephBrittainZSL,True,6
0,KorupTEAM,False,3263


so almost all Rail_Nkulengus come from KorupTEAM source.

### Valid to train per class compare

In [144]:
# Per species: (train count, valid count, total count, valid percentage).
valid_per_species = {}
for species in df["species"].unique():
    species_flags = df.loc[df["species"] == species, "is_valid"]
    n_valid = species_flags.sum()
    n_species = len(species_flags)
    valid_per_species[species] = (
        n_species - n_valid,
        n_valid,
        n_species,
        n_valid / n_species * 100,
    )

In [145]:
# Turn the per-species tuples into a table, highest validation share first.
# NOTE: this rebinds `valid_per_species` from a dict to a DataFrame, so the cell is
# meant to run once, right after the cell that builds the dict.
valid_per_species = pd.DataFrame.from_dict(
    valid_per_species,
    orient="index",
    columns=["n_train", "n_valid", "n_species", "perc"],
).sort_values("perc", ascending=False)
valid_per_species

Unnamed: 0,n_train,n_valid,n_species,perc
Gorilla,356,718,1074,66.852886
Leopard_African,1495,647,2142,30.205415
Mandrillus,6679,2626,9305,28.221386
Mammal_Other,415,139,554,25.090253
Rodent,1020,338,1358,24.889543
Elephant_African,78444,25306,103750,24.391325
Chimpanzee,4186,1074,5260,20.418251
Monkey,4013,1025,5038,20.345375
Cat_Golden,331,75,406,18.472906
Pangolin,740,132,872,15.137615


# Conclusions so far

df and df.to_csv("df_w_valid.csv") represent current validation split,

some imbalance is present (e.g., too many Gorillas in valid and too few Rail_Nkulengus)

Ideas to improve the split and the data:

- check balance after undersampling (adjust undersampling to how many pics we have in valid set per species)
- possibly move some sources (sub_folders) to and from the valid set to have a better balance per class (on the other hand, this messes up the valid split based on location and is an indirect data leakage).

# undersampling 

In [146]:
# Same table, largest classes first.
valid_per_species.sort_values(by="n_species", ascending=False)

Unnamed: 0,n_train,n_valid,n_species,perc
Blank,1019779,118768,1138547,10.431541
Elephant_African,78444,25306,103750,24.391325
Duiker_Blue,92833,5915,98748,5.989995
Duiker_Red,84498,13120,97618,13.440144
Human,33130,5841,38971,14.988068
Rat_Giant,31359,2868,34227,8.37935
Hog_Red_River,22530,3539,26069,13.575511
Duiker_Yellow_Backed,14346,1090,15436,7.061415
Porcupine_Brush_Tailed,12790,904,13694,6.601431
Guineafowl_Black,11745,310,12055,2.571547


In [147]:
# The five species with the most images overall.
species_by_size = valid_per_species.sort_values("n_species", ascending=False)
to_undersample = species_by_size.index[:5].tolist()
to_undersample

['Blank', 'Elephant_African', 'Duiker_Blue', 'Duiker_Red', 'Human']

**idea**: undersample valid and train separately to limit ['Blank',
 'Elephant_African',
 'Duiker_Blue',
 'Duiker_Red',
 'Human'] such that in total we have ~35000 of them and valid/train is ~1:6 (perc ~15)

In [148]:
# Target size per undersampled species, split 1:6 between valid and train.
desired_n_total = 35000
one_seventh = desired_n_total / 7
desired_n_valid = round(one_seventh)
desired_n_train = round(one_seventh * 6)
for label, value in (
    ("desired_n_valid", desired_n_valid),
    ("desired_n_train", desired_n_train),
):
    print(f"{label}: {value}")
# Printed as a fraction of the total, not multiplied by 100.
print(f"desired valid percentage: {desired_n_valid/(desired_n_total)}")


desired_n_valid: 5000
desired_n_train: 30000
desired valid percentage: 0.14285714285714285


In [153]:
# For every species to undersample, pick the row labels to discard, separately for
# the train and the valid part, so that desired_n_train / desired_n_valid rows
# remain. The fixed random_state makes the selection reproducible.
# The surplus is clamped at 0: if a split already holds fewer rows than desired,
# nothing is dropped. Previously this produced a negative n, which DataFrame.sample
# rejects with a ValueError.
to_drop_train = {}
to_drop_valid = {}
for species in to_undersample:
    is_species = df["species"] == species

    surplus_train = max(0, int(valid_per_species.loc[species, "n_train"]) - desired_n_train)
    to_drop_train[species] = (
        df[is_species & ~df["is_valid"]].sample(n=surplus_train, random_state=271).index
    )

    surplus_valid = max(0, int(valid_per_species.loc[species, "n_valid"]) - desired_n_valid)
    to_drop_valid[species] = (
        df[is_species & df["is_valid"]].sample(n=surplus_valid, random_state=271).index
    )

In [154]:
# Remove the selected surplus rows; labels that are not present are silently
# skipped (errors="ignore").
df_undersampled = df.copy()
for species in to_undersample:
    for surplus_labels in (to_drop_train[species], to_drop_valid[species]):
        df_undersampled = df_undersampled.drop(surplus_labels, errors="ignore")

#### Check if undersampling worked

In [155]:
# Recompute (train count, valid count, total count, valid percentage) per species
# on the undersampled frame.
valid_per_species_u = {}
for species in df_undersampled["species"].unique():
    species_flags = df_undersampled.loc[df_undersampled["species"] == species, "is_valid"]
    n_valid = species_flags.sum()
    n_species = len(species_flags)
    valid_per_species_u[species] = (
        n_species - n_valid,
        n_valid,
        n_species,
        n_valid / n_species * 100,
    )

In [156]:
# Tabulate the per-species tuples, largest classes first.
# NOTE: this rebinds `valid_per_species_u` from a dict to a DataFrame, so the cell is
# meant to run once, right after the cell that builds the dict.
valid_per_species_u = pd.DataFrame.from_dict(
    valid_per_species_u,
    orient="index",
    columns=["n_train", "n_valid", "n_species", "perc"],
).sort_values("n_species", ascending=False)
valid_per_species_u

Unnamed: 0,n_train,n_valid,n_species,perc
Human,30000,5000,35000,14.285714
Duiker_Red,30000,5000,35000,14.285714
Elephant_African,30000,5000,35000,14.285714
Blank,30000,5000,35000,14.285714
Duiker_Blue,30000,5000,35000,14.285714
Rat_Giant,31359,2868,34227,8.37935
Hog_Red_River,22530,3539,26069,13.575511
Duiker_Yellow_Backed,14346,1090,15436,7.061415
Porcupine_Brush_Tailed,12790,904,13694,6.601431
Guineafowl_Black,11745,310,12055,2.571547


In [158]:
# Share of the original rows that survive undersampling.
n_kept, n_before = len(df_undersampled), len(df)
print(f"We keep {n_kept} images from {n_before}, i.e., {round(n_kept/n_before*100,2)}%")

We keep 347674 images from 1650308, i.e., 21.07%


# Mammal_Other is confusing (has very different animals, bats, ant-eaters (?), etc.) so we drop them 

In [162]:
# Keep every row whose species is not "Mammal_Other".
not_mammal_other = df_undersampled["species"] != "Mammal_Other"
df_undersampled_no_mammals = df_undersampled[not_mammal_other]
len(df_undersampled_no_mammals)

347120

In [163]:
# Peek at the first five remaining rows.
df_undersampled_no_mammals.head(n=5)

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
4,0000005.jpg,Human,Compressed Camera Trap Images,False,T33
6,0000007.jpg,Human,Compressed Camera Trap Images,False,T33
8,0000009.jpg,Human,Compressed Camera Trap Images,False,T33
12,0000013.jpg,Human,Compressed Camera Trap Images,False,T33
14,0000015.jpg,Human,Compressed Camera Trap Images,False,T33


# Drop Blank entirely (use sigmoid and binary cross entropy as loss)

In [165]:
# Keep every row whose species is not "Blank".
not_blank = df_undersampled_no_mammals["species"] != "Blank"
df_undersampled_no_mammals_no_blank = df_undersampled_no_mammals[not_blank]
len(df_undersampled_no_mammals_no_blank)

312120

In [166]:
# Share of the original rows left after undersampling and dropping both classes.
n_final, n_before = len(df_undersampled_no_mammals_no_blank), len(df)
print(f"We keep {n_final} images from {n_before}, i.e., {round(n_final/n_before*100,2)}%")

We keep 312120 images from 1650308, i.e., 18.91%


In [167]:
# Write the final train/valid table without the row labels (index=False).
df_undersampled_no_mammals_no_blank.to_csv(path_or_buf="train_valid_df.csv", index=False)