# Model for Nature Conservancy Fisheries Kaggle Competition

#### Dependencies

In [None]:
import fish_data as fd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import json

#### Helper functions

In [None]:
# Print the built-in documentation of the project-local fish_data module
# (imported as fd) so the helper functions used below can be looked up here.
help(fd)

#### Generate a list of filenames

In [None]:
# Master list of labelled training images under data/train/.
# subfolders = True: presumably the images sit in one sub-folder per class --
# confirm against fish_data.generate_filenames_list.
fish_filenames = fd.generate_filenames_list('data/train/', subfolders = True)
print("There are {} filenames in the master set list".format(len(fish_filenames)))
# Stage-1 test images under data/test_stg1/, listed with subfolders = False
# (presumably a flat folder -- confirm).
test_filenames = fd.generate_filenames_list('data/test_stg1/', subfolders = False)
print("There are {} filenames in the test set list".format(len(test_filenames)))

#### Generate the labels for the master set list

In [None]:
# Build the label array for the master filename list. The two string
# arguments are passed straight to fd.make_labels; presumably they delimit the
# class name inside each path -- confirm in fish_data.
fish_label_arr = fd.make_labels(fish_filenames, 'train/', '/img')
# A bare expression in the middle of a cell is not displayed, so print the
# shape explicitly.
print("Label array shape: {}".format(fish_label_arr.shape))
# Sanity check: the per-column sums must equal the expected number of images
# per class. np.array_equal reports False on a shape mismatch instead of
# failing inside the element-wise comparison.
print("One-hot labels generated correctly: {}".format(
    np.array_equal(np.sum(fish_label_arr, 0),
                   [1719, 200, 117, 67, 465, 299, 176, 734])))

In [None]:
# Number of samples held out for validation (an integer test_size is an
# absolute sample count, not a fraction).
valid_size = 250
# Shuffle and split the master list. A fixed random_state keeps the validation
# set identical from run to run, so validation results of different model
# versions can be compared.
files_train, files_val, y_train, y_val = train_test_split(
    fish_filenames, fish_label_arr, test_size=valid_size, random_state=42)
print("Validation set size: {}".format(y_val.shape[0]))
print("Training set size: {}".format(y_train.shape[0]))

In [None]:
# Replace the training filenames/labels with the output of fish_data's
# balancing helper (presumably re-samples so that classes are evenly
# represented within an epoch -- confirm in fish_data). Only the training split
# is passed in; the validation split is left as produced by the split above.
files_train, y_train = fd.generate_balanced_filenames_epoch(files_train, y_train, shuffle = True)

#### Process the validation set and preview an example training batch (the master list was shuffled and split into training and validation sets above)

In [None]:
# Process the whole validation split in one batch (batch_size = valid_size,
# starting at offset 0). Settings used here: 'centre' crop, 'custom'
# normalisation and mutation = False.
# NOTE(review): presumably images are resized to std_size, cropped to
# crop_size, and normalised with pixel_offset / pixel_factor -- confirm in
# fish_data.process_batch.
val_data, val_labels = fd.process_batch(files_val, y_val, offset = 0, batch_size = valid_size, 
                        std_size = 256, crop_size = 224, crop_mode = 'centre', normalize = 'custom', 
                        pixel_offset = 98.0, pixel_factor = 120.0,
                        mutation = False, verbose = True)

In [None]:
# Process a small batch of 5 training images for visual inspection. Unlike
# the validation call, this uses a 'random' crop, mutation = True and
# normalize = None (presumably so the images stay in displayable pixel
# values -- confirm in fish_data.process_batch).
example_batch, example_labels = fd.process_batch(files_train, y_train, offset = 0, batch_size = 5, 
                        std_size = 256, crop_size = 224, crop_mode = 'random', normalize = None, 
                        pixel_offset = 98.0, pixel_factor = 120.0,
                        mutation = True, verbose = True)

In [None]:
# Display every image of the example batch together with its class index.
# Iterating over the arrays themselves (instead of a hard-coded range(5))
# keeps this cell correct when the example batch size changes and avoids an
# IndexError if fewer images were returned.
for example_image, example_label in zip(example_batch, example_labels):
    # The position of the largest entry in the label row is the class index.
    print("Fish Label: {}".format(np.argmax(example_label)))
    fd.show_panel(example_image)

## Graph and Session Runs

#### Graph parameters

In [None]:
# Execute PARAMETERS.py inside this notebook's namespace (-i), so the names it
# defines become available to the cells and scripts that follow.
# NOTE(review): presumably holds the graph hyper-parameters -- confirm in PARAMETERS.py.
%run -i 'PARAMETERS.py'

#### Session parameters

In [None]:
# Identifier of this model/run configuration.
# NOTE(review): presumably read by the scripts executed below with %run -i
# (e.g. to name summaries or checkpoints) -- confirm in GRAPH.py / SESSION.py.
version_ID = 'v2.2.1.0'

**Note on version_ID:** The first digit refers to the overall architecture. The second digit refers to runs whose outputs are comparable (the summaries are the same, etc.). The third digit refers to significant parameter or activation changes (e.g. depths of layers, types of activation). The fourth digit refers to minor parameter changes (e.g. learning rate). (All in theory.)


In [None]:
# Execute GRAPH.py inside this notebook's namespace (-i), so it can use the
# variables defined in the cells above.
# NOTE(review): presumably builds the TensorFlow graph -- confirm in GRAPH.py.
%run -i 'GRAPH.py'

In [None]:
# Execute SESSION.py inside this notebook's namespace (-i), so it can use the
# data, parameters and graph objects created above.
# NOTE(review): presumably runs the training session -- confirm in SESSION.py.
%run -i 'SESSION.py'