In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import os
import gc

import keras as k
from keras import optimizers
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import fbeta_score, precision_score, recall_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import cv2
from tqdm import tqdm

from datetime import datetime
import time
import configparser
import json
import sys

from utils.file import makedirs
from utils.recorder import record_model_medata, record_model_scores
from utils.loader import *
from utils.f2thresholdfinder import *
from utils.imagegen import *
from utils.models import *
from utils.custommetrics import *
from utils.samplesduplicator import duplicate_train_samples
from utils.training import *
from utils.predictor import *
from utils.augmentation import *
from utils.generator import *


Using Theano backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
c:\users\me\appdata\local\temp\try_flags_apl7hd.c:4:19: fatal error: cudnn.h: No such file or directory
compilation terminated.

Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:01:00.0)


In [2]:
timestr = time.strftime("%Y%m%d-%H%M%S")
start_time = datetime.now()

In [3]:
config_file = 'cfg/default.cfg'
#config_file = 'cfg/11_rgb.cfg'

# command line args processing "python aggregate_model.py cfg/3.cfg"
if len(sys.argv) > 1 and '.cfg' in sys.argv[1]:
    config_file = sys.argv[1]

print('reading configurations from config file: {}'.format(config_file))

settings = configparser.ConfigParser()
settings.read(config_file)
data_dir = settings.get('data', 'data_dir')

df_train = pd.read_csv(data_dir + 'train_v2.csv')
model_filename = 'aggregate_model_'+ timestr +'.h5'
model_filepath = data_dir + 'models/' + model_filename
sample_submission_filepath = data_dir + 'sample_submission_v2.csv'
number_of_samples = len(df_train.index)
print('total number of samples: {}'.format(number_of_samples))

# WARNING: keras allow either 1, 3, or 4 channels per pixel. Other numbers not allowed.
data_mask_label = np.array(['R', 'G', 'B', 'NDVI', 'NDWI', 'NIR'])
#print(settings.get('data', 'data_mask'))
data_mask_list = json.loads(settings.get('data', 'data_mask'))

data_mask = ma.make_mask(data_mask_list)
print(data_mask)

num_channels = np.sum(data_mask)
need_norm_stats = False

model_id = settings.get('model', 'model_id')
print('model: {}'.format(model_id))

# default to 64
rescaled_dim = 64
if settings.has_option('data', 'rescaled_dim'):
    rescaled_dim = settings.getint('data', 'rescaled_dim')
print('rescaled dimension: {}'.format(rescaled_dim))

# one epoch is an arbitrary cutoff : one pass over the entire training set
number_epoch = settings.getint('model', 'number_epoch')

# a batch results in exactly one update to the model.
# batch_size is limited by model size and GPU memory
batch_size = settings.getint('model', 'batch_size') 
print('batch size: {}'.format(batch_size))

classifier_threshold = 0.2 # used for end of epoch f2 approximation only

split = int(number_of_samples * 0.80)  # TODO we may want to increase to 0.90 eventually
number_validations = number_of_samples - split

has_augmentation_config = settings.has_section('augmentation')
if has_augmentation_config:
    rotation_range = settings.getint('augmentation', 'rotation_range')
    horizontal_flip = settings.getboolean('augmentation', 'horizontal_flip')
    vertical_flip = settings.getboolean('augmentation', 'vertical_flip')
    print('rotation_range:{} horizontal_flip:{} vertical_flip:{}'.format(rotation_range,horizontal_flip,vertical_flip))


reading configurations from config file: cfg/default.cfg




total number of samples: 40479
[ True  True  True False False False]
model: JAGG_2
rescaled dimension: 64
batch size: 512
rotation_range:20 horizontal_flip:True vertical_flip:True


In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

print(labels)
print(len(labels))

['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']
17


In [5]:
x_train, y_train = load_training_set(df_train, rescaled_dim)
print(x_train.shape)
print(y_train.shape)

(40479L, 64L, 64L, 6L)
(40479L, 17L)


In [6]:
x_train = x_train[:, :, :, data_mask]

x_train = x_train.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_train.shape)

(40479L, 3L, 64L, 64L)


In [7]:
# TODO save the shuffling order to hdf5 so we can recreate the training and validation sets post execution.

# shuffle the samples because 
# 1) the original samples may not be randomized & 
# 2) to avoid the possiblility of overfitting the validation data while we tune the model
from sklearn.utils import shuffle
x_train, y_train = shuffle(x_train, y_train, random_state=0)

x_train, x_valid, y_train, y_valid = x_train[:split], x_train[split:], y_train[:split], y_train[split:]

In [8]:
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)

(32383L, 3L, 64L, 64L)
(32383L, 17L)
(8096L, 3L, 64L, 64L)
(8096L, 17L)


In [9]:
# experimental hack to get more samples for augmentations for a specific low-frequency tag in unbalanced dataset. e.g. habitation
# selecting the optimal multiplier is sensitive

augmentation_hack_config = settings.has_section('augmentation_hack')
if augmentation_hack_config:
    dup_multiplier = settings.getint('augmentation_hack', 'multiplier')
    hack_label_target = settings.get('augmentation_hack', 'label_target')

if augmentation_hack_config:
    x_train, y_train = duplicate_train_samples(x_train, y_train, labels.index(hack_label_target), multiplier=dup_multiplier)
    print(x_train.shape)
    print(y_train.shape)


In [10]:
# dynamicly set num_samples_per_epoch
# TODO understand the implications of num_samples_per_epoch.
# +0.002 to +0.01??? F2 score improvement when number_of_samples * 3
# num_samples_per_epoch = 1000
num_samples_per_epoch = x_train.shape[0]

In [11]:
single_taget_model = False
# warning: experimental. 
# shuffling won't let you put things back together
if settings.has_option('data', 'single_target'):
    single_taget_model = True
    single_target_label = settings.get('data', 'single_target')
    single_target_label_index = labels.index(single_target_label)
    y_train = y_train[:,single_target_label_index]
    y_valid = y_valid[:,single_target_label_index]
    
score_averaging_method = 'binary' if single_taget_model else 'samples'
print('score_averaging_method', score_averaging_method)

('score_averaging_method', 'samples')


In [12]:
def get_img_generator():
    if has_augmentation_config:
        return GeneralImgGen(rotation_range = rotation_range, 
                             horizontal_flip = horizontal_flip, 
                             vertical_flip = vertical_flip)
    else:
        return ScaledDown() # default
    
image_generator = get_img_generator()
print('image generator', image_generator)

('image generator', rotation_range:20 horizontal_flip:True vertical_flip:True)


In [13]:
# this is the augmentation configuration we will use for training
train_datagen = image_generator.getTrainGenenerator()

In [14]:
if (need_norm_stats):
    # need to compute internal stats like featurewise std and zca whitening
    train_datagen.fit(x_train)

In [15]:
train_generator = train_datagen.flow(
        x_train,
        y_train,
        batch_size=batch_size,
        shuffle=True) 

In [16]:
validation_datagen = image_generator.getValidationGenenerator()

In [17]:
# workaround to provide your own stats: 
# http://stackoverflow.com/questions/41855512/how-does-data-normalization-work-in-keras-during-prediction/43069409#43069409
if (need_norm_stats):
    # need to compute internal stats like featurewise std and zca whitening
    validation_datagen.fit(x_valid)

In [18]:
validation_generator = validation_datagen.flow(
        x_valid,
        y_valid,
        batch_size=batch_size,
        shuffle=False)

In [19]:
if single_taget_model:
    set_model_output_layer_size(1)
    
model = get_model(model_id, num_channels, rescaled_dim, rescaled_dim)


In [20]:
# BUG when resuming training, the learning rate need to be decreased.
# let's load an existing trained model and continue training more epoch gives 0.01 improvement in LB score.
# model = load_model(data_dir + 'models/aggregate_model_20170507-124128.h5') # 0.86
# model = load_model(data_dir + 'models/aggregate_model_20170507-184232.h5') # 0.87
# model = load_model(data_dir + 'models/aggregate_model_20170511-133235.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170515-062741.h5')
#number_epoch = 2

In [21]:
# Ran into MemoryError when training DAGG_2 with 4 channels at epoch 50.
# To try to get reduce memory usage, limit the number of samples and batch_size

validation_num_samples = min(1280, number_of_samples - split)
x_valid_f2 = x_valid[:validation_num_samples]
y_valid_f2 = y_valid[:validation_num_samples]

# Note: threshold is fixed (not optimized per label)
def compute_f2_measure(l_model):    
    val_generator_f2 = validation_datagen.flow(
        x_valid_f2,
        y_valid_f2,
        batch_size=64,
        shuffle=False)
    raw_pred = l_model.predict_generator(val_generator_f2, validation_num_samples)
    thresholded_pred = (np.array(raw_pred) > classifier_threshold).astype(int)
    l_f2_score = fbeta_score(y_valid_f2, thresholded_pred, beta=2, average=score_averaging_method)
    return l_f2_score
    
class F2_Validation(k.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.f2_measures = []
    def on_epoch_end(self, epoch, logs={}):
        self.f2_measures.append(compute_f2_measure(self.model))

f2_score_val = F2_Validation()

In [22]:
# Record performance metrics at the end of each epoch
class PerformanceHistory(k.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.train_losses = []
        self.val_losses = []
        self.train_accuracy = []
        self.val_accuracy = []

    def on_epoch_end(self, epoch, logs={}):
        self.train_losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.train_accuracy.append(logs.get('acc'))
        self.val_accuracy.append(logs.get('val_acc'))
        
perf_history = PerformanceHistory()

In [23]:
# early stopping prevents overfitting on training data
early_stop = EarlyStopping(monitor='val_loss',patience=3, min_delta=0, verbose=0, mode='auto')  # TODO tune min_delta?

# save only the best model, not the latest epoch model.
checkpoint = ModelCheckpoint(model_filepath, monitor='val_loss', verbose=1, save_best_only=True)

In [42]:
class CustomImgGenerator2(object):
	""" Generate images in batches.  Usage: pass to Keras fit_generator. 
	Augmentations: flip horizon. 
	these generators will loop indefinitely as required by Keras fit_generator """
	def trainGen(self, x_train, y_train, batch_size):
		i = 0
		limit = x_train.shape[0]
		print('limit', limit)
		while True:
			x_result = x_train[i: i + batch_size]
			x_result = x_result.transpose(0,2,3,1) # imgaug expects channels last
			x_result = fliplr(x_result)
			print(x_result.dtype)
			x_result = x_result / float(255)
			x_result = x_result.transpose(0,3,1,2)
			print(x_result.shape)

			yield x_result, y_train[i: i + batch_size]
			if i + batch_size > limit:
				i = 0
			else:
				i += batch_size

	def validationGen(self, x_valid, y_valid, batch_size):
		i = 0
		limit = x_valid.shape[0]
		while True:
			yield x_valid[i: i + batch_size] / float(255), y_valid[i: i + batch_size]
			if i + batch_size > limit:
				i = 0
			else:
				i += batch_size

	def testGen(self, x_test, y_test, batch_size):
		i = 0
		limit = x_test.shape[0]
		while True:
			yield x_test[i: i + batch_size] / float(255), y_test[i: i + batch_size]
			if i + batch_size > limit:
				i = 0
			else:
				i += batch_size

In [43]:
training_start_time = datetime.now()

history = {}
epochs_arr = [100, 100, 100]
learn_rates = [0.001, 0.0001, 0.00001]

custom_gen = CustomImgGenerator2()

for learn_rate, epochs in zip(learn_rates, epochs_arr):
    adam = optimizers.Adam(lr=learn_rate)

    # https://github.com/fchollet/keras/issues/369
    # https://github.com/fchollet/keras/blob/master/keras/losses.py
    model.compile(loss='binary_crossentropy',
              optimizer=adam,
              metrics=['accuracy', 'recall', 'precision'])

    # Remember to scale down the x values from 0-255 to 0-1
#     tmp_history = model.fit(x_train / float(255),
#                         y_train,
#                         batch_size=batch_size,
#                         nb_epoch=epochs,
#                         verbose=1,
#                         validation_data=(x_valid / float(255), y_valid),
#                         callbacks=[f2_score_val, early_stop, checkpoint])

    # use our custom generators so we can perform augmentation
    # TODO split x_train into smaller batches for larger models
    train_gen = custom_gen.trainGen(x_train, y_train, batch_size)
    valid_gen = custom_gen.validationGen(x_valid, y_valid, batch_size)
    
    tmp_history = model.fit_generator(train_gen,
                        samples_per_epoch=num_samples_per_epoch,
                        nb_epoch=epochs,
                        validation_data=valid_gen,
                        nb_val_samples=number_validations,              
                        verbose=1,
                        callbacks=[f2_score_val, early_stop, checkpoint])
    
    for k, v in tmp_history.history.iteritems():
        history.setdefault(k, []).extend(v)

# Using Keras Image Generator with image augmentation
# fits the model on batches with real-time data augmentation:
# history_obj = model.fit_generator(train_generator,
#                     samples_per_epoch=num_samples_per_epoch,
#                     nb_epoch=number_epoch,
#                     validation_data=validation_generator,
#                     nb_val_samples=number_validations,
#                     callbacks=[f2_score_val, early_stop, checkpoint])
# history = history_obj.history

time_spent_trianing = datetime.now() - training_start_time
print('model training complete')

('limit', 32383L)Epoch 1/100

uint8
(512L, 3L, 64L, 64L)
uint8
(512L, 3L, 64L, 64L)
uint8
(512L, 3L, 64L, 64L)
  512/32383 [..............................] - ETA: 89s - loss: 0.6940 - acc: 0.4662 - recall: 0.5868 - precision: 0.1765uint8
(512L, 3L, 64L, 64L)
uint8
(512L, 3L, 64L, 64L)
 1024/32383 [..............................] - ETA: 61s - loss: 0.6825 - acc: 0.6038 - recall: 0.6440 - precision: 0.2721uint8
(512L, 3L, 64L, 64L)
uint8
(512L, 3L, 64L, 64L)
 1536/32383 [>.............................] - ETA: 51s - loss: 0.6632 - acc: 0.6612 - recall: 0.6702 - precision: 0.3164uint8
(512L, 3L, 64L, 64L)
uint8
(512L, 3L, 64L, 64L)
 

In [44]:
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution2d_1 (Convolution2D)  (None, 64, 62, 62)    1792        convolution2d_input_1[0][0]      
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 64, 62, 62)    0           convolution2d_1[0][0]            
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 64, 31, 31)    0           activation_1[0][0]               
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 64, 29, 29)    36928       maxpooling2d_1[0][0]             
___________________________________________________________________________________________

In [45]:
# model = load_model(data_dir + 'models/aggregate_model_20170517-062305.h5')
print(y_valid.shape)
print(y_valid.ndim)

(8096L, 17L)
2


In [46]:
# use the validation data to compute some stats which tell us how the model is performing on the validation data set.
val_generator_score_board = validation_datagen.flow(
    x_valid,
    y_valid,
    batch_size=batch_size,
    shuffle=False)
p_valid = model.predict_generator(val_generator_score_board, number_validations)

In [47]:
optimized_thresholds = f2_optimized_thresholds(y_valid, np.array(p_valid))

y_predictions = (np.array(p_valid) > optimized_thresholds).astype(int)

precision_s = precision_score(y_valid, y_predictions, average=score_averaging_method)
print('>>>> Overall precision score over validation set ' , precision_s)

recall_s = recall_score(y_valid, y_predictions, average=score_averaging_method)
print('>>>> Overall recall score over validation set ' , recall_s)

# F2 score, which gives twice the weight to recall
# 'samples' is what the evaluation criteria is for the contest
f2_score = fbeta_score(y_valid, y_predictions, beta=2, average=score_averaging_method)
print('>>>> Overall F2 score over validation set ' , f2_score)

  'precision', 'predicted', average, warn_for)


label:0 threshold:0.12 score:0.827716434383
label:1 threshold:0.22 score:0.831975183105
label:2 threshold:0.15 score:0.831975183105
label:3 threshold:0.24 score:0.833255191037
label:4 threshold:0.09 score:0.839936276543
label:5 threshold:0.08 score:0.839954322915
label:6 threshold:0.19 score:0.854887138463
label:7 threshold:0.25 score:0.858361086558
label:8 threshold:0.21 score:0.866693015002
label:9 threshold:0.18 score:0.869730174112
label:10 threshold:0.14 score:0.870310875842
label:11 threshold:0.22 score:0.873580856356
label:12 threshold:0.15 score:0.874346338759
label:13 threshold:0.07 score:0.874346338759
label:14 threshold:0.24 score:0.881415093827
label:15 threshold:0.25 score:0.888059664231
label:16 threshold:0.13 score:0.888153378296
('>>>> Overall precision score over validation set ', 0.80072618388144567)
('>>>> Overall recall score over validation set ', 0.93472942742643839)
('>>>> Overall F2 score over validation set ', 0.88815337829555618)


In [48]:
threshold_df = pd.DataFrame({'label':labels, 
                             'optimized_threshold':optimized_thresholds})
print(threshold_df)

                label  optimized_threshold
0          slash_burn                 0.12
1               clear                 0.22
2            blooming                 0.15
3             primary                 0.24
4              cloudy                 0.09
5   conventional_mine                 0.08
6               water                 0.19
7                haze                 0.25
8         cultivation                 0.21
9       partly_cloudy                 0.18
10     artisinal_mine                 0.14
11         habitation                 0.22
12        bare_ground                 0.15
13          blow_down                 0.07
14        agriculture                 0.24
15               road                 0.25
16  selective_logging                 0.13


In [49]:
precision_l, recall_l, f2_score_l = calculate_stats_for_prediction(y_valid, y_predictions)

prediction_stats_df = pd.DataFrame({
    'label': labels, 
    'true_sum': np.sum(y_valid, axis=0),
    'predict_sum': np.sum(y_predictions, axis=0),
    'f2': f2_score_l,
    'recall': recall_l,
    'precision': precision_l
})

# reordering the columns for easier reading
prediction_stats_df = prediction_stats_df[['label', 'f2', 'recall', 'precision', 'true_sum', 'predict_sum']]
print(prediction_stats_df)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


                label    f2  recall  precision  true_sum  predict_sum
0          slash_burn  0.00    0.00       0.00        44            0
1               clear  0.98    0.99       0.92      5684         6117
2            blooming  0.00    0.00       0.00        66            0
3             primary  0.99    1.00       0.95      7451         7809
4              cloudy  0.85    0.96       0.60       452          723
5   conventional_mine  0.05    0.05       0.11        22            9
6               water  0.65    0.80       0.38      1513         3199
7                haze  0.75    0.80       0.59       527          711
8         cultivation  0.57    0.68       0.33       874         1789
9       partly_cloudy  0.91    0.93       0.83      1433         1618
10     artisinal_mine  0.62    0.70       0.42        70          118
11         habitation  0.61    0.69       0.42       732         1216
12        bare_ground  0.24    0.24       0.29       170          139
13          blow_dow

In [50]:
print(history['val_acc'])

[0.90452074768986157, 0.91031155378922168, 0.91749012376008765, 0.91742472992584168, 0.92140636448803626, 0.92311380526765063, 0.92838148401659937, 0.92928243319507642, 0.93174552304942615, 0.9328789727489939, 0.93369275095905713, 0.93743460004037549, 0.93742735183286097, 0.93834282404820435, 0.93906214585888526, 0.93867705061501661, 0.94055161056782421, 0.94069693423071399, 0.94021012307155749, 0.94148162725885864, 0.94272407641995093, 0.94214281380883313, 0.94215009900421021, 0.94351604333508154, 0.94326173882239417, 0.94387206117155054, 0.940755054177974, 0.94456958346687292, 0.94547053288094141, 0.94552866743487329, 0.94350151275928784, 0.94268048351461242, 0.9419902362842334, 0.94633515031912585, 0.94620437042515271, 0.94626976520176465, 0.94594280356946203, 0.94653859298690979, 0.94597186401427502, 0.94656765578763757, 0.94640055145670776, 0.94604452749486023, 0.94686555273448059, 0.94686555673953576, 0.94713438829414454, 0.94727242982434656, 0.9471852449560354, 0.947461341209562

In [51]:

filtered_data_mask_label = data_mask_label[data_mask]

data_set_name = os.path.basename(get_training_set_file_path(rescaled_dim))

record_model_scores(model_filename, 
                    model_id, 
                    history, 
                    f2_score, 
                    time_spent_trianing, 
                    num_channels,
                    config_file,
                    np.array_str(filtered_data_mask_label),
                    data_set_name)

In [52]:
figures_dir = 'figures/' + model_id
makedirs(figures_dir)

# list all data in history
print('training history stats:')
print(history.keys())

# summarize history for f2 score
fig = plt.figure(figsize=(15, 10))
subplot0 = fig.add_subplot(231)
if hasattr(f2_score_val, 'f2_measures'):
    subplot0.plot(f2_score_val.f2_measures)
subplot0.set_title('f2 score')
subplot0.set_ylabel('f2 score')
subplot0.set_xlabel('epoch')
subplot0.legend(['val'], loc='upper left')

# summarize history for recall
subplot3 = fig.add_subplot(232)
subplot3.plot(history['recall'])
subplot3.plot(history['val_recall'])
subplot3.set_title('recall')
subplot3.set_ylabel('recall')
subplot3.set_xlabel('epoch')
subplot3.legend(['train', 'val'], loc='upper left')

# summarize history for precision
subplot2 = fig.add_subplot(233)
subplot2.plot(history['precision'])
subplot2.plot(history['val_precision'])
subplot2.set_title('precision')
subplot2.set_ylabel('precision')
subplot2.set_xlabel('epoch')
subplot2.legend(['train', 'val'], loc='upper left')

# summarize history for accuracy
subplot1 = fig.add_subplot(234)
subplot1.plot(history['acc'])
subplot1.plot(history['val_acc'])
subplot1.set_title('accuracy')
subplot1.set_ylabel('accuracy')
subplot1.set_xlabel('epoch')
subplot1.legend(['train', 'val'], loc='upper left')

# summarize history for loss
subplot4 = fig.add_subplot(235)
subplot4.plot(history['loss'])
subplot4.plot(history['val_loss'])
subplot4.set_title('model loss')
subplot4.set_ylabel('loss')
subplot4.set_xlabel('epoch')
subplot4.legend(['train', 'val'], loc='upper left')

# precision and recall for each label
subplot5 = fig.add_subplot(236)
colors = cm.rainbow(np.linspace(0, 1, len(prediction_stats_df['label'])))
subplot5.scatter(prediction_stats_df['precision'], prediction_stats_df['recall'], c=colors)
subplot5.set_title('precision & recall')
subplot5.set_xlabel('precision')
subplot5.set_ylabel('recall')
for i, txt in enumerate(prediction_stats_df['label']):
    subplot5.annotate(txt, (prediction_stats_df['precision'][i], prediction_stats_df['recall'][i]))

fig.savefig(figures_dir + '/stats_' + timestr + '.png')
#plt.show()

training history stats:
['acc', 'loss', 'recall', 'precision', 'val_acc', 'val_recall', 'val_precision', 'val_loss']


In [53]:
# load pre-trained model
#model = load_model(data_dir + 'models/aggregate_model_20170521-141533.h5')
#print(model.summary())

# copied manually from stout
#recorded_thresholds = [0.13, 0.13, 0.1, 0.14, 0.05, 0.23, 0.17, 0.15, 0.18, 0.23, 0.09, 0.21, 0.17, 0.10, 0.20, 0.23, 0.1]

#image_generator = get_img_generator()




In [54]:
# this is the configuration we will use for testing:
testset_datagen = image_generator.getTestGenenerator()

In [55]:
if not is_test_set_in_cache(rescaled_dim):
    # populate the test dataset cache
    df_test = pd.read_csv(sample_submission_filepath)
    load_test_set(df_test, rescaled_dim)

real_submission_filepath = data_dir + 'my_submissions/submission_' + timestr + '.csv'
#prediction_filepath = data_dir + 'predictions/prediction_' + timestr + '.csv'

make_submission(model,
                optimized_thresholds,
                data_mask,
                testset_datagen, 
                rescaled_dim, 
                labels, 
                sample_submission_filepath,
                real_submission_filepath,
                need_norm_stats)

number of test samples:61191
full batches:61 reminder:191


100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [01:33<00:00,  1.18s/it]
100%|██████████████████████████████████████████████████████████████████████████| 61191/61191 [00:56<00:00, 1082.82it/s]


submission file generated: D:/Downloads/amazon/my_submissions/submission_20170618-200851.csv


In [56]:
total_exec_time = datetime.now() - start_time
print ('time spent to complete execution: {}'.format(total_exec_time))

time spent to complete execution: 2:10:57.968000
