In [1]:
#External imports
import glob
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import re
import sys
import cv2
import pandas as pd
sys.path.append("../")

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib import colors
from PIL import Image
from skimage.io import imread
from matplotlib import pyplot as plt  
from skimage.transform import rotate, rescale, resize

In [2]:
# Collect every .JPG located exactly two directory levels below the S9 folder
# of the unzipped dataset, then report how many were found.
data_set_path = r'E:\ss_data\snapshotserengeti-unzipped\snapshotserengeti-unzipped\S9'
jpg_pattern = os.path.join(data_set_path, "*/*/*.JPG")
image_files = glob.glob(jpg_pattern)
number_of_files = len(image_files)
print("No of files: ", number_of_files)

No of files:  976985


In [2]:
# Locations used by this cell.
#   path_prefix  : folder that the relative image paths stored in the CSVs hang off
#   train_csv    : v6 label file (per the original note: samples 000000 to 700000)
#   train_csv_v2 : v7 label file (per the original note: samples 700000 to end)
path_prefix = r'E:\ss_data\snapshotserengeti-unzipped\snapshotserengeti-unzipped'
train_csv = r'E:\ss_data\train_phase2_v6_fixed.csv'
train_csv_v2 = r'E:\ss_data\train_phase2_v7_fixed.csv'

# Load the first (v6) part of the training labels.
train_df = pd.read_csv(train_csv)
rows_v1 = len(train_df)
print(f"Number of csv files v1: {rows_v1}")

# Load the second (v7) part and give its two columns the names
# 'file_path' / 'species' so it can be stacked onto the v6 frame later.
train_df_v2 = pd.read_csv(train_csv_v2)
train_df_v2.columns = ['file_path', 'species']
rows_v2 = len(train_df_v2)
print(f"Number of csv files v2: {rows_v2}")
print(f'Train df v2: \n{train_df_v2.head()}')
print(f"Total number of samples: {rows_v2 + rows_v1}")

Number of csv files v1: 699931
Number of csv files v2: 286211
Train df v2: 
                              file_path  species
0  S9/M10/M10_R2/S9_M10_R2_IMAG0041.JPG        2
1  S9/M10/M10_R2/S9_M10_R2_IMAG0627.JPG        0
2  S9/M10/M10_R2/S9_M10_R2_IMAG0628.JPG        0
3  S9/M10/M10_R2/S9_M10_R2_IMAG0629.JPG        0
4  S9/M10/M10_R2/S9_M10_R2_IMAG6529.JPG        0
Total number of samples: 986142


In [3]:
# Stack the two label frames row-wise into one frame with a fresh 0..n-1 index.
# NOTE: train_df is reassigned to its own concatenation, so running this cell a
# second time without reloading the CSVs appends train_df_v2 again.
label_frames = [train_df, train_df_v2]
train_df = pd.concat(label_frames, axis=0, ignore_index=True)
total_rows = len(train_df)
print(f"Number of csv files: {total_rows}")

Number of csv files: 986142


In [4]:
# List the distinct numeric labels present in the `species` column.
unique_labels = train_df['species'].unique()
print(f"Unique species labels in our dataset: {unique_labels}")

Unique species labels in our dataset: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52]


In [5]:
# Load the lookup table that pairs each species name with its numeric label
# (columns: species, encoded_species) and preview it.
species_enumeration = r'E:\ss_data\label_to_species_V3.csv'
enum_species_df = pd.read_csv(species_enumeration)
species_count = len(enum_species_df)
print(f"Number of species: {species_count}")
enum_preview = enum_species_df.head()
print(f'{enum_preview}')

Number of species: 53
       species  encoded_species
0     aardvark               33
1     aardwolf               22
2       baboon                4
3  batEaredFox               17
4        blank                0


In [6]:
# Attach the species names to the training rows.
# Show the columns of both frames first so the join keys are visible.
print(f"Train df columns: {train_df.columns}")
print(f"Species enum columns: {enum_species_df.columns}")

# The numeric label lives in `species` on the left frame and in `encoded_species`
# on the right frame. Both frames own a `species` column, so pandas suffixes them
# `_x` (left, numeric) and `_y` (right, name); the numeric left copy is redundant
# with `encoded_species` and is dropped.
train_df = (
    train_df
    .merge(enum_species_df, how='inner', left_on='species', right_on='encoded_species')
    .drop(columns=['species_x'])
)
# Remaining columns, in order: relative file path, species name, numeric label.
train_df.columns = ['file_path_rel', 'species', 'encoded_species']
print(f'Merged train df: \n{train_df.head()}')

Train df columns: Index(['file_path', 'species'], dtype='object')
Species enum columns: Index(['species', 'encoded_species'], dtype='object')
Merged train df: 
                          file_path_rel species  encoded_species
0  S9/B03/B03_R1/S9_B03_R1_IMAG0001.JPG   blank                0
1  S9/B03/B03_R1/S9_B03_R1_IMAG0002.JPG   blank                0
2  S9/B03/B03_R1/S9_B03_R1_IMAG0015.JPG   blank                0
3  S9/B03/B03_R1/S9_B03_R1_IMAG0016.JPG   blank                0
4  S9/B03/B03_R1/S9_B03_R1_IMAG0017.JPG   blank                0


In [7]:
# Keep only rows whose numeric label is greater than 0
# (the enumeration preview shows label 0 mapped to 'blank').
has_species = train_df['encoded_species'] > 0
species_train_df = train_df[has_species]
print(f'Species only train df: \n{species_train_df.head()}')
print(f'Species only size: \n{len(species_train_df)}')

# Persist the relative path and numeric label of the species-only rows.
species_train_path = r'E:\ss_data\train_species_only.csv'
species_train_df.to_csv(
    species_train_path,
    sep=',',
    header=True,
    index=False,
    columns=['file_path_rel', 'encoded_species'],
)

Species only train df: 
                               file_path_rel     species  encoded_species
773884  S9/B03/B03_R1/S9_B03_R1_IMAG0250.JPG  wildebeest                1
773885  S9/B03/B03_R1/S9_B03_R1_IMAG0251.JPG  wildebeest                1
773886  S9/B03/B03_R1/S9_B03_R1_IMAG0252.JPG  wildebeest                1
773887  S9/B03/B03_R1/S9_B03_R1_IMAG0256.JPG  wildebeest                1
773888  S9/B03/B03_R1/S9_B03_R1_IMAG0258.JPG  wildebeest                1
Species only size: 
212258


In [17]:
# Build a frame of images that carry exactly one species label.
# Step 1: rows whose file path already appeared earlier in the frame, i.e. the
# second and later label rows of an image that is listed more than once.
repeat_mask = species_train_df.duplicated(subset=['file_path_rel'])
duplicatedspecies_df = species_train_df[repeat_mask]
print(f'Duplicated files : \n{duplicatedspecies_df.head()}')
print(f'Duplicated files length: \n{len(duplicatedspecies_df)}')
print(f'Unique files: \n{duplicatedspecies_df.file_path_rel.nunique()}')

# The repeated rows can outnumber the unique paths among them: that happens when
# an image has more than two label rows (the recorded run shows 9188 rows versus
# 9092 unique paths).
# Step 2: cond1 is True for EVERY row (first occurrence included) of an image
# that shows up in the repeated set.
cond1 = species_train_df['file_path_rel'].isin(duplicatedspecies_df['file_path_rel'])

# Step 3: drop those rows by index label, leaving single-label images only.
rows_to_drop = cond1[cond1].index
single_species_df = species_train_df.drop(rows_to_drop)
print(f'Train csv df with only one species: \n{single_species_df.head()}')
print(f'Number of files with one species: \n{len(single_species_df)}')

# Persist the relative path and numeric label of the single-species rows.
single_species_train_path = r'E:\ss_data\train_species_only_singles.csv'
single_species_df.to_csv(
    single_species_train_path,
    sep=',',
    header=True,
    index=False,
    columns=['file_path_rel', 'encoded_species'],
)


Duplicated files : 
                               file_path_rel species  encoded_species
840670  S9/B03/B03_R1/S9_B03_R1_IMAG0057.JPG   zebra                2
840671  S9/B03/B03_R1/S9_B03_R1_IMAG0058.JPG   zebra                2
840672  S9/B03/B03_R1/S9_B03_R1_IMAG0059.JPG   zebra                2
840685  S9/B03/B03_R1/S9_B03_R1_IMAG0911.JPG   zebra                2
840727  S9/B03/B03_R2/S9_B03_R2_IMAG0227.JPG   zebra                2
Duplicated files length: 
9188
Unique files: 
9092
Train csv df with only one species: 
                               file_path_rel     species  encoded_species
773884  S9/B03/B03_R1/S9_B03_R1_IMAG0250.JPG  wildebeest                1
773885  S9/B03/B03_R1/S9_B03_R1_IMAG0251.JPG  wildebeest                1
773886  S9/B03/B03_R1/S9_B03_R1_IMAG0252.JPG  wildebeest                1
773887  S9/B03/B03_R1/S9_B03_R1_IMAG0256.JPG  wildebeest                1
773888  S9/B03/B03_R1/S9_B03_R1_IMAG0258.JPG  wildebeest                1
Number of files with one spe

In [3]:
# Add a `file_path_local` column holding the on-disk path of every image,
# built by joining `path_prefix` with the row's dataset-relative path.
# The relative-path column is called `file_path` in the freshly loaded CSVs and
# `file_path_rel` once the species-merge cell has renamed it, so use whichever
# one is present. (The original line was also a syntax error: `train_(df...`.)
rel_path_column = 'file_path' if 'file_path' in train_df.columns else 'file_path_rel'
train_df['file_path_local'] = train_df[rel_path_column].map(
    lambda rel: os.path.join(path_prefix, rel)
)

In [8]:
# Resize the images referenced by rows [start_from, end_at) of train_df and export
# the label CSVs that the model consumes.
#
# Input : train_df with a `file_path_local` column (on-disk path), a relative-path
#         column (`file_path` or `file_path_rel`) and an `encoded_species` column.
# Effect: every image OpenCV can decode is resized to 256x256 and WRITTEN BACK OVER
#         THE ORIGINAL FILE (destructive, not reversible); files that exist but
#         cannot be decoded are deleted from disk and left out of the CSVs.
# Output: train_phase1_v7.csv (rel_path, boolean_species: 0 = label 0, 1 = any other)
#         train_phase2_v7.csv (rel_path, encoded_species)
base_dir = os.path.join("e:/", "ss_data")
rel_path = []
species_found = []
encoded_species = []
start_from = 700000
end_at = 1000000
target_size = (256, 256)

# The relative-path column is `file_path` in the raw CSVs and `file_path_rel` after
# the species-merge cell renames it; accept either.
rel_path_column = 'file_path' if 'file_path' in train_df.columns else 'file_path_rel'

# Select the batch positionally. The original loop compared the already-offset index
# against start_from + end_at, so end_at was never honoured; slicing makes the upper
# bound explicit and removes the mix of positional slicing and label-based lookups.
batch = train_df.iloc[start_from:end_at]
batch_rows = zip(batch['file_path_local'], batch[rel_path_column], batch['encoded_species'])

for index, (file_name, relative_name, label) in enumerate(batch_rows, start=start_from):
    image = cv2.imread(file_name)
    if image is None:
        # cv2.imread returns None for missing or undecodable files. Only files that
        # actually exist are removed; a failed removal is reported instead of being
        # silently swallowed.
        if os.path.isfile(file_name):
            print(f'Removing {file_name}')
            try:
                os.remove(file_name)
            except OSError as exc:
                print(f'Could not remove {file_name}: {exc}')
        continue

    # Overwrite the source image with its resized version.
    image = cv2.resize(image, target_size)
    cv2.imwrite(file_name, image)

    rel_path.append(relative_name)
    species_found.append(0 if label == 0 else 1)
    encoded_species.append(label)

    # Progress marker every 10000 rows (skipped for rows whose image was unreadable).
    if index % 10000 == 0:
        print(index)

if len(train_df) > end_at:
    print(f'Breaking at size {end_at}')

print(f'Lengths: RelativePath  = {len(rel_path)}.  species_found  = {len(species_found)}.  encoded_species  = {len(encoded_species)}')
# Build the frame column by column so numeric labels keep their integer dtype
# (np.column_stack would coerce every column to strings).
train_df_filtered = pd.DataFrame(
    {
        'rel_path': rel_path,
        'boolean_species': species_found,
        'encoded_species': encoded_species,
    },
    columns=['rel_path', 'boolean_species', 'encoded_species'],
)
print(f'Filtered Dataframe size {len(train_df_filtered)}')

# Phase 1: blank-vs-species labels.
output_filepath = os.path.join(base_dir, "train_phase1_v7.csv")
train_df_filtered.to_csv(output_filepath, columns=['rel_path', 'boolean_species'], sep=',', index=False)

# Phase 2: numeric species labels.
output_filepath = os.path.join(base_dir, "train_phase2_v7.csv")
train_df_filtered.to_csv(output_filepath, columns=['rel_path', 'encoded_species'], sep=',', index=False)

700000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
900000
910000
920000
930000
940000
950000
Removing E:\ss_data\snapshotserengeti-unzipped\snapshotserengeti-unzipped\S9/S10/S10_R2/S9_S10_R2_IMAG1045.JPG
Removing E:\ss_data\snapshotserengeti-unzipped\snapshotserengeti-unzipped\S9/S10/S10_R2/S9_S10_R2_IMAG1689.JPG
Removing E:\ss_data\snapshotserengeti-unzipped\snapshotserengeti-unzipped\S9/S10/S10_R2/S9_S10_R2_IMAG0216.JPG
Removing E:\ss_data\snapshotserengeti-unzipped\snapshotserengeti-unzipped\S9/S10/S10_R2/S9_S10_R2_IMAG2383.JPG
960000
970000
980000
Lengths: RelativePath  = 286211.  species_found  = 286211.  encoded_species  = 286211
Filtered Dataframe size 286211
