In this notebook we:
* investigate the sizes of the images in the dataset (only those in the selected training and validation sets)

In [1]:
from pathlib import Path
import json
import logging

import numpy as np
import pandas as pd

import os

from PIL import Image

In [2]:
# Folder holding the image files that `uniqueName` refers to.
PATH_TO_IMG = Path("/data/Gabon_trainingData")

# Project root and the sub-folder containing the train/validation split table.
PATH_TO_MAIN = Path("/home/jupyter/")
PATH_TO_TRAIN_DF = PATH_TO_MAIN.joinpath("inspect_data_split_validation")

In [3]:
# Load the table listing the selected training/validation images and preview it.
df = pd.read_csv(PATH_TO_TRAIN_DF.joinpath("train_valid_df.csv"))
df.head()

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder
0,0000005.jpg,Human,Compressed Camera Trap Images,False,T33
1,0000007.jpg,Human,Compressed Camera Trap Images,False,T33
2,0000009.jpg,Human,Compressed Camera Trap Images,False,T33
3,0000013.jpg,Human,Compressed Camera Trap Images,False,T33
4,0000015.jpg,Human,Compressed Camera Trap Images,False,T33


In [13]:
%%time
def _image_size(path):
    """Return the (width, height) of the image at `path` and close the file."""
    # Image.open is lazy and keeps the underlying file open; the context
    # manager releases the handle as soon as the size has been read.
    with Image.open(path) as img:
        return img.size


# Quick sanity check (and timing) on the first 10 images only.
sizes = [_image_size(PATH_TO_IMG / file) for file in df.uniqueName[:10]]
sizes[:4]

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.98 ms


[(1088, 816), (1088, 816), (1088, 816), (1088, 816)]

In [14]:
# %%time
# sizes = [list(Image.open(PATH_TO_IMG / file).size) + [file] for file in df.uniqueName]
# Full pass over every image in `df`; each entry is [width, height, file name].
# Kept commented out because it is slow (the recorded run took ~42 min wall time).
# NOTE(review): Image.open is never closed here; if this is re-enabled, read the
# size inside a `with Image.open(...) as img:` block so file handles are not leaked.

CPU times: user 3min 30s, sys: 20.8 s, total: 3min 51s
Wall time: 42min 2s


In [5]:
# File used to cache the computed image sizes. The path is relative, so it
# resolves against the notebook's current working directory.
filename = "sizes.npy"

In [16]:
# One-off step that persists `sizes` to the cache file (disabled).
# NOTE(review): with the full scan commented out, `sizes` only holds the
# 10-image sample, so re-enabling this line alone would overwrite the cache
# with incomplete data.
# np.save(filename, sizes)

In [6]:
# Read the cached sizes back and turn the array into plain nested Python lists.
sizes_from_file = np.load(file=filename).tolist()

In [7]:
# Number of size records loaded from the cache file.
len(sizes_from_file)

347120

In [19]:
# Tabulate the cached records: one row per image with its x size, y size and
# file name.
df_sizes = pd.DataFrame(
    data=sizes_from_file,
    columns=["x", "y", "uniqueName"],
)
df_sizes.head()

Unnamed: 0,x,y,uniqueName
0,1088,816,0000005.jpg
1,1088,816,0000007.jpg
2,1088,816,0000009.jpg
3,1088,816,0000013.jpg
4,1088,816,0000015.jpg


In [21]:
# How many images there are for each distinct x value.
df_sizes.x.value_counts()

2048    130260
1088    102418
1280     60393
3840     36026
1600      9268
2576      4791
3264      3229
2560       734
1920         1
Name: x, dtype: int64

In [22]:
# How many images there are for each distinct y value.
df_sizes.y.value_counts()

1536    130260
816     102418
1024     60393
2160     36026
1200      9268
1496      4750
1832      2905
1920       734
2448       324
1984        41
1440         1
Name: y, dtype: int64

In [23]:
# Build a human-readable "x x y" label per image and count each combination.
# The explicit cast to str keeps the concatenation valid whether the size
# columns hold strings (as they do after the .npy round-trip) or numbers;
# without it, numeric columns would raise a TypeError on `+ " x "`.
df_sizes["dim"] = df_sizes["x"].astype(str) + " x " + df_sizes["y"].astype(str)
df_sizes.dim.value_counts()

2048 x 1536    130260
1088 x 816     102418
1280 x 1024     60393
3840 x 2160     36026
1600 x 1200      9268
2576 x 1496      4750
3264 x 1832      2905
2560 x 1920       734
3264 x 2448       324
2576 x 1984        41
1920 x 1440         1
Name: dim, dtype: int64

In [25]:
# Attach the size columns to the labelled table, keeping every row of `df`
# (left join on the image file name).
df_desc_sizes = df.merge(df_sizes, how="left", on="uniqueName")
df_desc_sizes

Unnamed: 0,uniqueName,species,main_folder,is_valid,sub_folder,x,y,dim
0,0000005.jpg,Human,Compressed Camera Trap Images,False,T33,1088,816,1088 x 816
1,0000007.jpg,Human,Compressed Camera Trap Images,False,T33,1088,816,1088 x 816
2,0000009.jpg,Human,Compressed Camera Trap Images,False,T33,1088,816,1088 x 816
3,0000013.jpg,Human,Compressed Camera Trap Images,False,T33,1088,816,1088 x 816
4,0000015.jpg,Human,Compressed Camera Trap Images,False,T33,1088,816,1088 x 816
...,...,...,...,...,...,...,...,...
347115,1529566.jpg,Duiker_Red,allData,False,NouabaleTEAM,1280,1024,1280 x 1024
347116,1529571.jpg,Duiker_Red,allData,False,NouabaleTEAM,1280,1024,1280 x 1024
347117,1529572.jpg,Duiker_Red,allData,False,NouabaleTEAM,1280,1024,1280 x 1024
347118,1529575.jpg,Duiker_Red,allData,False,NouabaleTEAM,1280,1024,1280 x 1024


In [31]:
# Image counts per (main folder, dimensions, validation flag), ordered by
# folder and then by count, both descending.
counts_by_source_dim = (
    df_desc_sizes
    .groupby(["main_folder", "dim", "is_valid"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["main_folder", "counts"], ascending=False)
)
counts_by_source_dim

Unnamed: 0,main_folder,dim,is_valid,counts
7,allData,2048 x 1536,False,123954
5,allData,1280 x 1024,False,60393
6,allData,1600 x 1200,True,9268
8,allData,2048 x 1536,True,6306
10,allData,2576 x 1496,True,4750
12,allData,3264 x 1832,True,2905
9,allData,2560 x 1920,True,734
13,allData,3264 x 2448,True,324
11,allData,2576 x 1984,True,41
3,Compressed Camera Trap Images,1088 x 816,False,88990


In [30]:
# Same breakdown restricted to the "allData" main folder, split by sub-folder
# and ordered by sub-folder and then by count, both descending.
counts_by_source_dim_all_data = (
    df_desc_sizes.loc[df_desc_sizes["main_folder"].eq("allData")]
    .groupby(["dim", "sub_folder", "is_valid"])
    .size()
    .reset_index(name="counts")
    .sort_values(by=["sub_folder", "counts"], ascending=False)
)
counts_by_source_dim_all_data

Unnamed: 0,dim,sub_folder,is_valid,counts
5,2576 x 1496,StephBrittainZSL,True,4750
7,3264 x 1832,StephBrittainZSL,True,2905
9,3264 x 2448,StephBrittainZSL,True,74
6,2576 x 1984,StephBrittainZSL,True,41
0,1280 x 1024,NouabaleTEAM,False,60393
1,1600 x 1200,LailaBahaaelDinPanthera,True,9268
3,2048 x 1536,LailaBahaaelDinPanthera,True,6306
4,2560 x 1920,LailaBahaaelDinPanthera,True,734
8,3264 x 2448,LailaBahaaelDinPanthera,True,250
2,2048 x 1536,KorupTEAM,False,123954


### Proportions

In [36]:
# Aspect ratio x / y per image, rounded to two decimals. The size columns are
# converted to float first so the division is numeric.
df_sizes["propor"] = (
    df_sizes["x"].astype(float) / df_sizes["y"].astype(float)
).round(2)
df_sizes["propor"].value_counts()

1.33    243005
1.25     60393
1.78     38931
1.72      4750
1.30        41
Name: propor, dtype: int64

In [39]:
# Which concrete dimensions fall under each rounded aspect ratio, with counts.
df_sizes.groupby(by=["propor", "dim"]).size()

propor  dim        
1.25    1280 x 1024     60393
1.30    2576 x 1984        41
1.33    1088 x 816     102418
        1600 x 1200      9268
        1920 x 1440         1
        2048 x 1536    130260
        2560 x 1920       734
        3264 x 2448       324
1.72    2576 x 1496      4750
1.78    3264 x 1832      2905
        3840 x 2160     36026
dtype: int64

In [37]:
# Display the size table, now including the derived `dim` and `propor` columns.
df_sizes

Unnamed: 0,x,y,uniqueName,dim,propor
0,1088,816,0000005.jpg,1088 x 816,1.33
1,1088,816,0000007.jpg,1088 x 816,1.33
2,1088,816,0000009.jpg,1088 x 816,1.33
3,1088,816,0000013.jpg,1088 x 816,1.33
4,1088,816,0000015.jpg,1088 x 816,1.33
...,...,...,...,...,...
347115,1280,1024,1529566.jpg,1280 x 1024,1.25
347116,1280,1024,1529571.jpg,1280 x 1024,1.25
347117,1280,1024,1529572.jpg,1280 x 1024,1.25
347118,1280,1024,1529575.jpg,1280 x 1024,1.25


In [41]:
# Disabled shell command: copies one sample image from the image folder into
# /home/jupyter (presumably for manual inspection).
# ! cp /data/Gabon_trainingData/0000005.jpg /home/jupyter/0000005.jpg

Hence, when resizing the images, we want to:
* for aspect ratios of ~1.3 (i.e. 1.25, 1.30, 1.33)
 * set y to 384 and scale x accordingly (so x is ~512)
* otherwise (i.e. 1.78, 1.72)
 * set y to 384, so x is ~684 and ~660 respectively