# Model for Nature Conservancy Fisheries Kaggle Competition

#### Dependencies

In [1]:
import fish_data as fd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import json

#### Helper functions

In [2]:
help(fd)

Help on module fish_data:

NAME
    fish_data

DESCRIPTION
    fish_data module contains the helper functions for the model build of the
    Nature Conservancy Fisheries Kaggle Competition.
    
    Dependencies:
        * numpy as np
        * os
        * scipy.ndimage as ndimage
        * scipy.misc as misc
        * scipy.special as special
        * matplotlib.pyplot as plt
        * tensorflow as tf

FUNCTIONS
    count_nodes(x, y, kernel, stride, conv_depth, pad='SAME')
        Calculates the number of total nodes present in the next layer of a
        convolution OR max_pooling event.
    
    decode_image(image_name, size, num_channels=3, mean_channel_vals=[155.0, 155.0, 155.0], mutate=False, crop='random', crop_size=224)
        Converts a dequeued image read from filename to a single tensor array,
        with modifications:
            * smallest dimension resized to standard height and width supplied in size param
            * each channel centered to mean near zero.  Dev

#### Generate a list of filenames

In [3]:
fish_filenames = fd.generate_filenames_list('data/train/', subfolders = True)
print("There are {} filenames in the master set list".format(len(fish_filenames)))
test_filenames = fd.generate_filenames_list('data/test_stg1/', subfolders = False)
print("There are {} filenames in the test set list".format(len(test_filenames)))

There are 3777 filenames in the master set list
There are 1000 filenames in the test set list


#### Retrieve Dictionary of image dimensions

In [4]:
with open('dimensions_dict.json') as f:
    dim_dict = json.load(f)
    
print("Training/Valid set filename dimensions downloaded correctly: {}".format(
        dim_dict.get(fish_filenames[0]) == [720, 1280, 3]))
print("Training/Valid set filename dimensions downloaded correctly: {}".format(
        dim_dict.get(test_filenames[0]) == [720, 1280, 3]))

Training/Valid set filename dimensions downloaded correctly: True
Training/Valid set filename dimensions downloaded correctly: True


#### Generate the labels for the master set list

In [5]:
fish_label_arr = fd.make_labels(fish_filenames, 'train/', '/img')
fish_label_arr.shape
print("One-hot labels generated correctly: {}".format(all(np.sum(fish_label_arr, 0) == [1719, 200, 117, 67, 465, 299, 176, 734]) ))

One-hot labels generated correctly: True


In [6]:
f_list, f_labels = fd.generate_balanced_filenames_epoch(fish_filenames, fish_label_arr, shuffle = False)

Fish counts: [1719  200  117   67  465  299  176  734]
New fish counts: [1719 1719 1719 1719 1719 1719 1719 1719]


#### Shuffle and split the master set list into training and validation sets

In [7]:
valid_size = 300
files_train, files_val, y_train, y_val = train_test_split(f_list, f_labels, test_size = valid_size)
print("Validation set size: {}".format(y_val.shape[0]))
print("Training set size: {}".format(y_train.shape[0]))

Validation set size: 300
Training set size: 13452


In [8]:
val_data, val_labels = fd.process_batch(files_val, y_val, offset = 0, batch_size = valid_size, 
                        std_size = 256, crop_size = 224, crop_mode = 'centre', 
                        pixel_offsets = [96.482, 107.203,99.974], mutation = False)

In [9]:
print(val_data.shape)
print("Cropped validation set is {} kb".format(val_data.nbytes/1000))
print("Mean pixel value for RGB is {}".format([np.mean(val_data[:,:,x]) for x in range(3) ]))
print("SD pixel value for RGB is {}".format([np.std(val_data[:,:,x]) for x in range(3)]))

(300, 224, 224, 3)
Cropped validation set is 180633.6 kb
Mean pixel value for RGB is [-0.016075572, -0.017150072, -0.019373991]
SD pixel value for RGB is [0.36377558, 0.36338377, 0.36223468]


## Graph and Session Runs

#### Graph parameters

In [10]:
%run -i 'PARAMETERS.py'

Dimensions for each entry: 224x224x3 = 150528
Dimensions after first convolution step (with max pool): 27x27x96 = 69984
Dimensions after second convolution step (with max pool): 13x13x256 = 43264
Dimensions after third convolution step: 13x13x384 = 64896
Dimensions after fourth convolution step: 13x13x384 = 64896
Dimensions after fifth convolution step (with max pool): 6x6x256 = 9216
Dimensions after first connected layer: 4096
Dimensions after second connected layer: 2048
Final dimensions for classification: 8


#### Session parameters

In [11]:
version_ID = 'v2.0.0.5'

In [12]:
%run -i 'GRAPH.py'

In [13]:
%run -i 'SESSION.py'

Initialized!


To view your tensorboard dashboard summary, run the following on the command line:
tensorboard --logdir='/Users/ccthomps/Documents/Python Files/Kaggle Competitions/Nature Conservancy Fisheries/TB_logs/v2.0.0.5'

Batch number: 1
     Training_mean_cross_entropy: 19.157926559448242
     Valid_mean_cross_entropy: 8.442024230957031
[5 2] [[ -1.75796688   7.79659414   4.79857492   4.27434969   2.2399447
   -8.49505711   0.65023082 -16.93010712]
 [ -2.61181045   6.07383013   3.90891409   1.27933574   3.91963458
   -5.2151947   -2.31330442 -18.31378365]]
Batch number: 3
     Training_mean_cross_entropy: 15.992972373962402
     Valid_mean_cross_entropy: 8.355060577392578
[5 2] [[-0.60826963  5.10380268  9.67159939 -4.22150326 -4.61054707 -3.55666924
  -0.82151157 -7.30444336]
 [-0.1117873   4.08006001  7.28828239 -8.23226738 -6.20082331 -2.490165
  -0.0895315  -7.95359135]]
Batch number: 5
     Training_mean_cross_entropy: 7.694501876831055
     Valid_mean_cross_entropy: 3.61543

KeyboardInterrupt: 

#### Notes during run 
