# Model for Nature Conservancy Fisheries Kaggle Competition

#### Dependencies

In [1]:
import fish_data as fd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import json

#### Helper functions

In [2]:
help(fd)

Help on module fish_data:

NAME
    fish_data

DESCRIPTION
    fish_data module contains the helper functions for the model build of the
    Nature Conservancy Fisheries Kaggle Competition.
    
    Dependencies:
        * numpy as np
        * os
        * scipy.ndimage as ndimage
        * scipy.misc as misc
        * scipy.special as special
        * matplotlib.pyplot as plt
        * tensorflow as tf

FUNCTIONS
    count_nodes(x, y, kernel, stride, conv_depth, pad='SAME')
        Calculates the number of total nodes present in the next layer of a
        convolution OR max_pooling event.
    
    decode_image(image_name, size, num_channels=3, mean_channel_vals=[155.0, 155.0, 155.0], mutate=False, crop='random', crop_size=224)
        Converts a dequeued image read from filename to a single tensor array,
        with modifications:
            * smallest dimension resized to standard height and width supplied in size param
            * each channel centered to mean near zero.  Dev

#### Generate a list of filenames

In [3]:
fish_filenames = fd.generate_filenames_list('data/train/', subfolders = True)
print("There are {} filenames in the master set list".format(len(fish_filenames)))
test_filenames = fd.generate_filenames_list('data/test_stg1/', subfolders = False)
print("There are {} filenames in the test set list".format(len(test_filenames)))

There are 3777 filenames in the master set list
There are 1000 filenames in the test set list


#### Retrieve Dictionary of image dimensions

In [4]:
with open('dimensions_dict.json') as f:
    dim_dict = json.load(f)
    
print("Training/Valid set filename dimensions downloaded correctly: {}".format(
        dim_dict.get(fish_filenames[0]) == [720, 1280, 3]))
print("Training/Valid set filename dimensions downloaded correctly: {}".format(
        dim_dict.get(test_filenames[0]) == [720, 1280, 3]))

Training/Valid set filename dimensions downloaded correctly: True
Training/Valid set filename dimensions downloaded correctly: True


#### Generate the labels for the master set list

In [5]:
fish_label_arr = fd.make_labels(fish_filenames, 'train/', '/img')
fish_label_arr.shape
print("One-hot labels generated correctly: {}".format(all(np.sum(fish_label_arr, 0) == [1719, 200, 117, 67, 465, 299, 176, 734]) ))

One-hot labels generated correctly: True


In [6]:
f_list, f_labels = fd.generate_balanced_filenames_epoch(fish_filenames, fish_label_arr, shuffle = False)

Fish counts: [1719  200  117   67  465  299  176  734]
New fish counts: [1719 1719 1719 1719 1719 1719 1719 1719]


#### Shuffle and split the master set list into training and validation sets

In [7]:
valid_size = 500
files_train, files_val, y_train, y_val = train_test_split(f_list, f_labels, test_size = valid_size)
print("Validation set size: {}".format(y_val.shape[0]))
print("Training set size: {}".format(y_train.shape[0]))

Validation set size: 500
Training set size: 13252


In [8]:
val_data, val_labels = fd.process_batch(files_val, y_val, offset = 0, batch_size = 500, 
                        std_size = 256, crop_size = 224, crop_mode = 'centre', 
                        pixel_offsets = [96.482, 107.203,99.974], mutation = False)

In [9]:
print(val.shape)
print("Cropped validation set is {} kb".format(val.nbytes/1000))

NameError: name 'val' is not defined

## Graph and Session Runs

#### Graph parameters

In [None]:
%run -i 'PARAMETERS.py'

#### Session parameters

In [None]:
version_ID = 'v2.0.0.4'

In [None]:
%run -i 'GRAPH.py'

In [None]:
%run -i 'SESSION.py'

#### Notes during run 


In [None]:
print(test_df)

In [None]:
for i in range(W.shape[3]) :
    print(i)
    plt.imshow(W[:,:,:,i])
    plt.show()

In [None]:
np.zeros([11,11,3,96]).reshape().shape