In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.models import load_model

from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm

from utils.file import makedirs
from utils.recorder import record_model_medata, record_model_scores
from utils.loader import load_training_set, load_test_set

from datetime import datetime
import time

Using Theano backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
c:\users\me\appdata\local\temp\try_flags_gmc1pq.c:4:19: fatal error: cudnn.h: No such file or directory
compilation terminated.

Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:01:00.0)


In [2]:
timestr = time.strftime("%Y%m%d-%H%M%S")
start_time = datetime.now()

In [3]:
data_dir = 'D:/Downloads/amazon/'
df_train = pd.read_csv(data_dir + 'train_v2.csv')
model_filename = 'aggregate_model_'+ timestr +'.h5'
model_filepath = data_dir + 'models/' + model_filename
sample_submission_filepath = data_dir + 'sample_submission_v2.csv'
number_of_samples = len(df_train.index)
print('total number of samples', number_of_samples)
split = int(number_of_samples * 0.75)
print('test/validation split index', split)

use_nir = False # use Near infrared channel
num_channels = 4 if use_nir else 3
num_samples_per_epoch = number_of_samples * 3 #  increase training samples at every epoch.
rescaled_dim = 64
number_epoch = 30  # TODO Keep increasing above 20 for better performance
batch_size = 128
classifier_threshold = 0.2

('total number of samples', 40479)
('test/validation split index', 30359)


In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

print(labels)
print(len(labels))

['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']
17


In [5]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [6]:
x_train, y_train = load_training_set(df_train, rescaled_dim)
print(x_train.shape)
print(y_train.shape)

(40479L, 64L, 64L, 4L)
(40479L, 17L)


In [7]:
if not use_nir:
    x_train = x_train[:, :, :, :3]  # stripe out the NIR channel data

x_train = x_train.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_train.shape)

(40479L, 4L, 64L, 64L)


In [8]:
# shuffle the samples because 
# 1) the original samples may not be randomized & 
# 2) to avoid the possiblility of overfitting the validation data while we tune the model
from sklearn.utils import shuffle
x_train, y_train = shuffle(x_train, y_train, random_state=0)

x_train, x_valid, y_train, y_valid = x_train[:split], x_train[split:], y_train[:split], y_train[split:]

In [9]:
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)

(30359L, 4L, 64L, 64L)
(30359L, 17L)
(10120L, 4L, 64L, 64L)
(10120L, 17L)


In [10]:
# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
        rescale=1./255,  # TODO does it make sense to use same scale for NIR channel
        shear_range=0.0,
        zoom_range=0.0,
        horizontal_flip=True,
        vertical_flip=True)

In [11]:
train_generator = train_datagen.flow(
        x_train, 
        y_train, 
        batch_size=batch_size,
        shuffle=True) 

In [12]:
validation_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.0,
        zoom_range=0.0,
        horizontal_flip=False,
        vertical_flip=False)

validation_generator = validation_datagen.flow(
        x_valid,
        y_valid,
        batch_size=batch_size,
        shuffle=False)

In [14]:
model = Sequential()

model.add(Convolution2D(32, 3, 3, input_shape=(num_channels, rescaled_dim, rescaled_dim)))  
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(128, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
# dropout of 0.2 - 0.5 is recommended :
# http://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/
# Keep in mind dropouts overuse will hurt model performance
model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

model.compile(loss='binary_crossentropy', # Is this the best loss function?
              optimizer='adam',
              metrics=['accuracy', 'recall', 'precision'])

In [13]:
# let's load the old model and continue training 2 more epoch gives 0.01 improvement in LB score.
# model = load_model(data_dir + 'models/aggregate_model_20170507-124128.h5') # 0.86
# model = load_model(data_dir + 'models/aggregate_model_20170507-184232.h5') # 0.87
# model = load_model(data_dir + 'models/aggregate_model_20170511-133235.h5')
# number_epoch = 2

In [14]:
training_start_time = datetime.now()
# fits the model on batches with real-time data augmentation:
history = model.fit_generator(train_generator,
                    samples_per_epoch=num_samples_per_epoch,
                    nb_epoch=number_epoch)

model.save(model_filepath)  # always save your model and weights after training or during training
time_spent_trianing = datetime.now() - training_start_time

print('model training complete')

Epoch 1/2



Epoch 2/2
model training complete


In [15]:
# use the validation data to compute some stats which tell us how the model is performing on the validation data set.
#np.set_printoptions(threshold='nan')

# validation images has not been normalized!
#p_valid = model.predict(x_valid / float(255), batch_size=128)
p_valid = model.predict_generator(validation_generator, number_of_samples - split)

print(y_valid)
print(p_valid)

y_predictions = (np.array(p_valid) > classifier_threshold).astype(int)  # should this threshold be unique per label?
print(y_predictions)

# see how many positives samples per label for truth vs prediction
print(np.sum(y_valid, axis=0))
print(np.sum(y_predictions, axis=0))

# F2 score, which gives twice the weight to recall emphasising recall higher than precision
# 'samples' is what the evaluation criteria is for the contest
f2_score = fbeta_score(y_valid, y_predictions, beta=2, average='samples')
print('f2 score over validation set using samples averaging ' , f2_score)

record_model_scores(model_filepath, history, f2_score, time_spent_trianing, num_channels)

[[0 1 0 ..., 1 0 0]
 [0 1 0 ..., 1 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 1 1 0]
 [0 0 0 ..., 1 0 0]]
[[  3.54605205e-02   9.96083260e-01   1.31009649e-02 ...,   6.40822053e-01
    2.26660430e-01   2.10418738e-02]
 [  2.26536430e-02   9.99276340e-01   7.24694971e-03 ...,   6.37592614e-01
    2.76897281e-01   2.08612680e-02]
 [  6.82247259e-10   9.99946952e-01   2.30761729e-02 ...,   1.30778830e-03
    2.26672878e-03   1.00781873e-03]
 ..., 
 [  7.17320768e-21   9.08736038e-05   8.22560011e-13 ...,   2.35115411e-04
    8.85157060e-05   4.12335422e-13]
 [  2.69908789e-08   2.47885031e-03   4.91998508e-04 ...,   2.37999950e-02
    5.91139123e-03   4.00240933e-05]
 [  4.66035120e-03   6.09104335e-01   2.18283063e-07 ...,   8.29611063e-01
    7.87771165e-01   4.36809147e-03]]
[[0 1 0 ..., 1 1 0]
 [0 1 0 ..., 1 1 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 1 0 ..., 1 1 0]]
[  57 7103   84 9337  547   27 1886  672 1089 1798   87  915  21

In [23]:
figures_dir = 'figures/aggregate'
makedirs(figures_dir)

# list all data in history
print(history.history.keys())
# summarize history for accuracy
fig = plt.figure()
subplot1 = fig.add_subplot(221)
subplot1.plot(history.history['acc'])
subplot1.set_title('model accuracy')
subplot1.set_ylabel('accuracy')
subplot1.set_xlabel('epoch')
subplot1.legend(['train'], loc='upper left')

# summarize history for precision
subplot2 = fig.add_subplot(222)
subplot2.plot(history.history['precision'])
subplot2.set_title('model precision')
subplot2.set_ylabel('precision')
subplot2.set_xlabel('epoch')
subplot2.legend(['train'], loc='upper left')

# summarize history for recall
subplot3 = fig.add_subplot(223)
subplot3.plot(history.history['recall'])
subplot3.set_title('model recall')
subplot3.set_ylabel('recall')
subplot3.set_xlabel('epoch')
subplot3.legend(['train'], loc='upper left')

# summarize history for loss
subplot4 = fig.add_subplot(224)
subplot4.plot(history.history['loss'])
subplot4.set_title('model loss')
subplot4.set_ylabel('loss')
subplot4.set_xlabel('epoch')
subplot4.legend(['train'], loc='upper left')

fig.savefig(figures_dir + '/stats_' + timestr + '.png')
#plt.show()

['acc', 'loss', 'precision', 'recall']


In [24]:
#model = load_model(model_filepath)

In [25]:
def f2score(truth, predict, label_index):
    return fbeta_score(truth[:, label_index], predict[:, label_index], beta=2, average='macro')
    
def precision_for_label_index2(truth, predict, label_index):
    return precision_score(truth[:, label_index], predict[:, label_index], average='macro')

def recall_for_label_index(truth, predict, label_index):
    return recall_score(truth[:, label_index], predict[:, label_index], average='macro')

In [26]:
# calculate f2 score for each label. find out which labels model is performing badly.
# print stats for each label
for x in range(0, len(labels)):
    f2_score = f2score(y_valid, y_predictions, x)
    precision_s = precision_for_label_index2(y_valid, y_predictions, x)
    recall_s = recall_for_label_index(y_valid, y_predictions, x)
    label = labels[x]
    print(label)
    print('    f2 score : ' , f2_score)
    print('    precision: ' , precision_s)
    print('    recall   : ' , recall_s)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


slash_burn
('    f2 score : ', 0.49943420948145795)
('    precision: ', 0.49718379446640315)
('    recall   : ', 0.5)
clear
('    f2 score : ', 0.89058166149727191)
('    precision: ', 0.94165020979315539)
('    recall   : ', 0.88261389504712406)
blooming
('    f2 score : ', 0.49916441190514083)
('    precision: ', 0.49584980237154153)
('    recall   : ', 0.5)
primary
('    f2 score : ', 0.66625229419854792)
('    precision: ', 0.94961373490296064)
('    recall   : ', 0.64819766345213858)
cloudy
('    f2 score : ', 0.90854580136674468)
('    precision: ', 0.84381433780254622)
('    recall   : ', 0.92951220019895231)
conventional_mine
('    f2 score : ', 0.49973263091182751)
('    precision: ', 0.49866600790513832)
('    recall   : ', 0.5)
water
('    f2 score : ', 0.75324329468097273)
('    precision: ', 0.70465515441593329)
('    recall   : ', 0.78241467561627287)
haze
('    f2 score : ', 0.84241305608135963)
('    precision: ', 0.76544154561207467)
('    recall   : ', 0.8717510130639

In [27]:
# https://www.kaggle.com/paulorzp/find-best-f2-score-threshold
# TODO should this threshold be unique per label?
def estimate_f2score_threshold(p_valid, y_valid, try_all=False, verbose=False):
    best = 0
    best_score = -1
    totry = np.arange(0,1,0.005) if try_all is False else np.unique(p_valid)
    for t in totry:
        score = f2_score(y_valid, p_valid > t)
        if score > best_score:
            best_score = score
            best = t
    if verbose is True: 
        print('Best score: ', round(best_score, 5), ' @ threshold =', best)
    return best

In [31]:
# this is the configuration we will use for testing:
# only rescaling
testset_datagen = ImageDataGenerator(rescale=1./255)

testset_dir = data_dir + 'test'

df_test_list = pd.read_csv(sample_submission_filepath)

x_test = load_test_set(df_test_list, rescaled_dim)

if not use_nir:
    x_test = x_test[:, :, :, :3]  # stripe out the NIR channel data

100%|██████████████████████████████████████████████████████████████████████████| 61191/61191 [1:32:58<00:00, 10.97it/s]


In [32]:
#x_test = np.array(x_test, np.uint8)
print(x_test.shape)
x_test = x_test.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_test.shape)

(61191L, 64L, 64L, 4L)
(61191L, 4L, 64L, 64L)


In [33]:
testset_generator = testset_datagen.flow(
    x_test,
    y=None,
    batch_size=batch_size,
    shuffle=False)
    
# ??? There may be a bug below that casues LB score to be 0.5-0.6
# testset_generator = testset_datagen.flow_from_directory(
#         testset_dir,
#         target_size=(rescaled_dim, rescaled_dim),
#         batch_size=batch_size,
#         class_mode=None,
#         shuffle=False)

In [36]:
#from keras.models import load_model
# model = load_model(data_dir + 'models/aggregate_model_20170507-184232.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170509-215809.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170511-001322.h5')
# model = load_model(data_dir + 'models/aggregate_model_20170511-150149.h5')

In [37]:
# run predictions on test set
testset_predict = model.predict_generator(testset_generator, x_test.shape[0]) # number of test samples

y_testset_predictions = (np.array(testset_predict) > classifier_threshold).astype(int)

result = pd.DataFrame(y_testset_predictions, columns = labels)

preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.transpose()
    a = a.loc[a[i] == 1]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

df_test = pd.read_csv(sample_submission_filepath)
df_test['tags'] = preds
df_test
print('done')

100%|██████████████████████████████████████████████████████████████████████████| 61191/61191 [00:57<00:00, 1063.91it/s]


done


In [None]:
#test code
# nums_ones = np.ones((1, 17))
# nums_zeros = np.zeros((1, 17))
# haha = np.array([[1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0]])

# y_testset_predictions = haha
# result = pd.DataFrame(y_testset_predictions, columns = labels)

# preds = []
# for i in tqdm(range(result.shape[0]), miniters=1000):
#     a = result.ix[[i]]
#     #print(a)
#     a = a.transpose()
#     print(a)
#     a = a.loc[a[i] == 1]
#     print(a)
#     ' '.join(list(a.index))
#     preds.append(' '.join(list(a.index)))
    
# print(preds)

In [38]:
df_test.to_csv(data_dir + 'my_submissions/submission_' + timestr + '.csv', index=False)

In [None]:
print ('time spent to complete execution:' , datetime.now() - start_time)