# Model for Nature Conservancy Fisheries Kaggle Competition

#### Dependencies

In [1]:
import fish_data as fd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import json

#### Helper functions

In [2]:
# Show the fish_data helper module's documentation (description and function list) for reference.
help(fd)

Help on module fish_data:

NAME
    fish_data

DESCRIPTION
    fish_data module contains the helper functions for the model build of the
    Nature Conservancy Fisheries Kaggle Competition.
    
    Dependencies:
        * numpy as np
        * os
        * scipy.ndimage as ndimage
        * scipy.misc as misc
        * scipy.special as special
        * matplotlib.pyplot as plt
        * tensorflow as tf

FUNCTIONS
    count_nodes(x, y, kernel, stride, conv_depth, pad='SAME')
        Calculates the number of total nodes present in the next layer of a
        convolution OR max_pooling event.
    
    decode_image(image_name, size, num_channels=3, mean_channel_vals=[155.0, 155.0, 155.0], mutate=False, crop='random', crop_size=224)
        Converts a dequeued image read from filename to a single tensor array,
        with modifications:
            * smallest dimension resized to standard height and width supplied in size param
            * each channel centered to mean near zero.  Dev

#### Generate a list of filenames

In [3]:
# Directories holding the labelled images and the stage-1 test images.
# NOTE(review): subfolders=True presumably makes the helper descend into per-class
# folders under data/train/ -- confirm against fish_data.generate_filenames_list.
train_dir = 'data/train/'
test_dir = 'data/test_stg1/'

# Master list: every labelled image, later split into training and validation sets.
fish_filenames = fd.generate_filenames_list(train_dir, subfolders=True)
n_master = len(fish_filenames)
print("There are {} filenames in the master set list".format(n_master))

# Test list: the images to be scored for submission.
test_filenames = fd.generate_filenames_list(test_dir, subfolders=False)
n_test = len(test_filenames)
print("There are {} filenames in the test set list".format(n_test))

There are 3777 filenames in the master set list
There are 1000 filenames in the test set list


#### Retrieve Dictionary of image dimensions

In [4]:
# Load the pre-computed lookup of image dimensions, keyed by filename.
with open('dimensions_dict.json') as f:
    dim_dict = json.load(f)

# Spot check: the first file of each list should map to the expected dimensions.
# Only one entry per list is checked, so this confirms the lookup is usable, not
# that every file is present.
# NOTE(review): [720, 1280, 3] is presumably height x width x channels -- confirm.
expected_dims = [720, 1280, 3]

print("Training/Valid set filename dimensions downloaded correctly: {}".format(
        dim_dict.get(fish_filenames[0]) == expected_dims))
# Labelled as the test set: this line checks test_filenames, not the training list.
print("Test set filename dimensions downloaded correctly: {}".format(
        dim_dict.get(test_filenames[0]) == expected_dims))

Training/Valid set filename dimensions downloaded correctly: True
Training/Valid set filename dimensions downloaded correctly: True


#### Generate the labels for the master set list

In [5]:
# Build the label array for the master filename list.
# NOTE(review): 'train/' and '/img' are presumably the path fragments surrounding the
# class name in each filename -- confirm against fish_data.make_labels.
fish_label_arr = fd.make_labels(fish_filenames, 'train/', '/img')

# Sanity check: every row must sum to exactly 1 (as expected if each file carries a
# single class label). np.all keeps the reduction vectorised.
rows_sum_to_one = np.all(np.sum(fish_label_arr, axis=1) == 1)
print("One label per row entry: {}".format(rows_sum_to_one))

One label per row entry: True


#### Shuffle and split the master set list into training and validation sets

In [6]:
# Number of files held out for validation (an int test_size is an absolute count).
valid_size = 300
# Fixed seed so Restart & Run All reproduces the same train/validation split;
# without it every run shuffles differently and results cannot be compared.
split_seed = 42

files_train, files_val, y_train, y_val = train_test_split(
    fish_filenames, fish_label_arr,
    test_size=valid_size, random_state=split_seed)

print("Validation set size: {}".format(y_val.shape[0]))
print("Training set size: {}".format(y_train.shape[0]))

Validation set size: 300
Training set size: 3477


#### Generate a files_train list that represents each class of fish equally

In [7]:
# TODO note written as a bare string, so the notebook echoes it as this cell's output.
"""Need to refactor generate_balanced_filenames to work from this list, not from scratch."""

'Need to refactor generate_balanced_filenames to work from this list, not from scratch.'

In [8]:
# Stored dimensions for every training file, in the same order as files_train.
# dict.get yields None (rather than raising) for a filename missing from dim_dict.
# A comprehension avoids rebinding the notebook-global name `f`.
train_dims_list = [dim_dict.get(fname) for fname in files_train]

## Graph and Session Runs

#### Graph parameters

In [9]:
# Execute PARAMETERS.py inside the notebook's namespace (-i): the script can read
# variables defined in earlier cells and whatever it defines stays available below.
# It prints the layer dimensions shown in this cell's output.
%run -i 'PARAMETERS.py'

Dimensions for each entry: 224x224x3 = 150528
Dimensions after first convolution step (with max pool): 27x27x96 = 69984
Dimensions after second convolution step (with max pool): 13x13x256 = 43264
Dimensions after third convolution step: 13x13x384 = 64896
Dimensions after fourth convolution step: 13x13x384 = 64896
Dimensions after fifth convolution step (with max pool): 6x6x256 = 9216
Dimensions after first connected layer: 4096
Dimensions after second connected layer: 2048
Final dimensions for classification: 8


#### Session parameters

In [10]:
# Identifier for this run. The TensorBoard log directory printed by the session
# cell ends in this value.
# NOTE(review): presumably read by SESSION.py via the shared namespace -- confirm.
version_ID = 'v2.0.0.2'

In [None]:
# Execute GRAPH.py inside the notebook's namespace (-i), so it shares the variables
# defined by the cells above.
# NOTE(review): presumably builds the TensorFlow graph used by SESSION.py -- confirm.
%run -i 'GRAPH.py'

In [None]:
# Execute SESSION.py inside the notebook's namespace (-i). Per the recorded output it
# prints the TensorBoard command for this run and reports training/validation mean
# cross-entropy at intervals of batches.
%run -i 'SESSION.py'

Initialized!


To view your tensorboard dashboard summary, run the following on the command line:
tensorboard --logdir='/Users/ccthomps/Documents/Python Files/Kaggle Competitions/Nature Conservancy Fisheries/TB_logs/v2.0.0.2'

Batch number: 1
     Training_mean_cross_entropy: 2.0615084171295166
     Valid_mean_cross_entropy: 2.016716718673706
[[ 0.14332998 -0.11647757  0.04149004  0.1926288  -0.06375211 -0.08433591
   0.13266997  0.06963497]
 [ 0.14355561 -0.11767981  0.04131452  0.19433841 -0.06332918 -0.08434913
   0.13230242  0.06896063]]
Batch number: 5
     Training_mean_cross_entropy: 1.7985576391220093
     Valid_mean_cross_entropy: 1.8652681112289429
[[ 0.22698489 -0.12943108  0.01835114  0.16600189 -0.07970841 -0.09153789
   0.10572385  0.10160749]
 [ 0.22733074 -0.13064615  0.01812139  0.16750154 -0.07934919 -0.09158377
   0.10531699  0.10091592]]
Batch number: 9
     Training_mean_cross_entropy: 1.7907931804656982
     Valid_mean_cross_entropy: 1.6612229347229004
[[ 0.903792

#### Notes during run 


In [None]:
# NOTE(review): test_df is not defined in any visible cell of this notebook;
# presumably it is created by SESSION.py through the shared (-i) namespace -- confirm,
# otherwise this cell fails on a fresh kernel.
print(test_df)

In [None]:
# NOTE(review): W is not defined in any visible cell of this notebook; presumably it
# is a weight array left in the namespace by SESSION.py -- confirm.
W.shape

In [None]:
# Render each slice of W along its fourth axis as an image, one figure per slice.
# NOTE(review): W presumably holds first-layer convolution filters laid out as
# (height, width, channels, n_filters) -- confirm where W is produced.
n_slices = W.shape[3]
for i in range(n_slices):
    print(i)  # index of the slice shown in the figure that follows
    w_slice = W[:, :, :, i]
    plt.imshow(w_slice)
    plt.show()