# Model for Nature Conservancy Fisheries Kaggle Competition

#### Dependencies

In [1]:
import fish_data as fd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd

#### Helper functions

In [2]:
help(fd)

Help on module fish_data:

NAME
    fish_data

DESCRIPTION
    fish_data module contains the helper functions for the model build of the
    Nature Conservancy Fisheries Kaggle Competition.
    
    Dependencies:
        * numpy as np
        * os
        * scipy.ndimage as ndimage
        * scipy.misc as misc
        * scipy.special as special
        * matplotlib.pyplot as plt
        * tensorflow as tf

FUNCTIONS
    count_nodes(y_in, x_in, conv_depths, conv_strides, pool_strides)
        Calculates the number of total nodes present in the last layer of a
        convolution plus max_pooling architecture.  Calculations assume that
        convolution is 'SAME' padded, and pooling is 'VALID' padded.
    
    decode_image(image_read, size, num_channels=3, mutate=False, brightness_delta=None, contrast_limits=None, hue_delta=None, saturation_limits=None)
        Converts a dequeued image read from filename to a single tensor array,
        with modifications:
            * resized to st

#### Generate a list of filenames

In [3]:
fish_filenames = fd.generate_filenames_list('data/train/', subfolders = True)
print("There are {} filenames in the master set list".format(len(fish_filenames)))
test_filenames = fd.generate_filenames_list('data/test_stg1/', subfolders = False)
print("There are {} filenames in the test set list".format(len(test_filenames)))

There are 3777 filenames in the master set list
There are 1000 filenames in the test set list


#### Generate the labels for the master set list

In [4]:
fish_label_arr = fd.make_labels(fish_filenames, 'train/', '/img')
fish_label_arr.shape
print("One label per row entry: {}".format(all(np.sum(fish_label_arr, 1) == 1) ))

One label per row entry: True


#### Shuffle and split the master set list into training and validation sets

In [5]:
valid_size = 200
files_train, files_val, y_train, y_val = train_test_split(fish_filenames, fish_label_arr, test_size = valid_size)
print("Validation set size: {}".format(y_val.shape[0]))
print("Training set size: {}".format(y_train.shape[0]))

Validation set size: 200
Training set size: 3577


## Graph and Session Runs

#### Graph parameters

In [6]:
num_epochs = 15
#Preprocessing
std_y = 300
std_x = 500

# General
num_channels = 3
num_labels = 8
batch_size = 25
stddev = 0.2

# convolution
kernel_sizes = [5, 3, 3, 3, 3, 3]
conv_depths = [64, 64, 128, 256, 512, 256] # NOTE : first 64 currently not used with dilation2d
conv_strides = [4, 1, 1, 1, 1, 1]

pool_strides = [2, 2, 2, 2]

final_depth = conv_depths[-1]

#dropout
kp = 0.75

# fully connected
fc1_depth = 256
fc2_depth = 64

#regularization
beta = 1e-1 

# Learning rate
init_rate = 5e-2
per_steps = len(files_train)
decay_rate = 0.9

# Report rate
validate_interval = 20


#### Session parameters

In [7]:
version_ID = 'v1.3.3.3'
logs_path = os.getcwd()+'/TB_logs/'+version_ID

In [8]:
%run -i 'GRAPH.py'

In [9]:
%run -i 'SESSION.py'

Initialized!


To view your tensorboard dashboard summary, run the following on the command line:
tensorboard --logdir='/Users/ccthomps/Documents/Python Files/Kaggle Competitions/Nature Conservancy Fisheries/TB_logs/v1.3.3.3'

Batch number: 1
     Training_mean_cross_entropy: 13749.26171875
     Valid_mean_cross_entropy: 12367665.0
Batch number: 5
     Training_mean_cross_entropy: 712026.875
     Valid_mean_cross_entropy: 3113979.0
Batch number: 9
     Training_mean_cross_entropy: 12108.599609375
     Valid_mean_cross_entropy: 5236.61767578125
Batch number: 13
     Training_mean_cross_entropy: 230.14035034179688
     Valid_mean_cross_entropy: 920.41015625
Batch number: 17
     Training_mean_cross_entropy: 19.871185302734375
     Valid_mean_cross_entropy: 2.032050371170044
Batch number: 21
     Training_mean_cross_entropy: 29.451492309570312
     Valid_mean_cross_entropy: 2.0072824954986572
Batch number: 25
     Training_mean_cross_entropy: 1.8928322792053223
     Valid_mean_cross_entro

In [47]:
preds = pd.read_csv('Test_predictions/v1.3.3.3.csv').ix[:, 1:]

In [51]:
def softmax(arr) :
    probs = []
    for i in range(arr.shape[0]) :
        l = arr.ix[i,:]
        lm = np.max(l)
        f = l - lm
        sm = np.exp(f) / np.sum(np.exp(f))
        probs.append(sm)
    return probs

In [52]:
softmax(preds)

[ALB      0.0
 BET      0.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      1.0
 Name: 0, dtype: float64, ALB      0.0
 BET      0.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      1.0
 Name: 1, dtype: float64, ALB      0.0
 BET      1.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      0.0
 Name: 2, dtype: float64, ALB      0.0
 BET      0.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      1.0
 Name: 3, dtype: float64, ALB      0.0
 BET      0.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      1.0
 Name: 4, dtype: float64, ALB      0.0
 BET      0.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      1.0
 Name: 5, dtype: float64, ALB      0.0
 BET      0.0
 DOL      0.0
 LAG      0.0
 NoF      0.0
 OTHER    0.0
 SHARK    0.0
 YFT      1.0
 Name: 6, dtype: float64, ALB       0.000000e+00
 BET      3.68641

In [38]:
s2 = np.exp(s1) / np.sum(np.exp(s1), axis = 0)

In [39]:
s2

Unnamed: 0,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0
