In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Convolution2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt
import seaborn as sns

import cv2
from tqdm import tqdm

from utils.loader import load_training_set, load_test_set 

Using Theano backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
c:\users\me\appdata\local\temp\try_flags_p4cet3.c:4:19: fatal error: cudnn.h: No such file or directory
compilation terminated.

Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:01:00.0)


In [2]:
from datetime import datetime
import time

# Reference instant for the elapsed-time report printed in the last cell.
startTime = datetime.now()
# Run identifier (local time, YYYYMMDD-HHMMSS) embedded in the output filenames.
timestr = time.strftime("%Y%m%d-%H%M%S", time.localtime())

In [3]:
# --- paths and run identity ---------------------------------------------
data_dir = 'D:/Downloads/amazon/'
model_filename = 'aggregate_model_' + timestr + '.h5'

# --- tunable hyper-parameters -------------------------------------------
rescaled_dim = 64            # side length passed to the image loaders
number_epoch = 6
batch_size = 128
classifier_threshold = 0.2   # score above which a label counts as predicted

# --- sample index and train/validation split ----------------------------
df_train = pd.read_csv(data_dir + 'train.csv')
number_of_samples = len(df_train.index)
print('total number of samples', number_of_samples)
# First 75% of the samples are used for training, the remainder for validation.
split = int(number_of_samples * 0.75)
print('test/validation split index', split)

# increase training samples at every epoch.
# NOTE(review): this is derived from the full sample count, not from the
# training part of the split -- confirm that is intended.
num_samples_per_epoch = number_of_samples * 3

('total number of samples', 40479)
('test/validation split index', 30359)


In [4]:
def flatten(l):
    """Return one flat list holding the items of every sub-sequence in `l`, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Every row of the 'tags' column is a space-separated tag string; collect the
# distinct tags across all rows.
# NOTE(review): the order of `labels` is whatever set iteration yields (it is
# not sorted). Presumably it has to match the column order of the label matrix
# built by load_training_set -- confirm against utils.loader.
tag_lists = [tags.split(' ') for tags in df_train['tags'].values]
labels = list(set(flatten(tag_lists)))

print(labels)
print(len(labels))

['slash_burn', 'clear', 'blooming', 'primary', 'cloudy', 'conventional_mine', 'water', 'haze', 'cultivation', 'partly_cloudy', 'artisinal_mine', 'habitation', 'bare_ground', 'blow_down', 'agriculture', 'road', 'selective_logging']
17


In [5]:
# Tag name -> position in `labels`, and the reverse lookup.
label_map = dict((tag, position) for position, tag in enumerate(labels))
inv_label_map = dict((position, tag) for tag, position in label_map.items())

In [7]:
# Load the image array and the label matrix via the project loader, then
# report both shapes.
x_train, y_train = load_training_set(df_train, rescaled_dim)
for loaded_array in (x_train, y_train):
    print(loaded_array.shape)



(40479L, 64L, 64L, 3L)
(40479L, 17L)


In [9]:
# Move the channel axis in front of the two spatial axes, which is the layout
# the model's input_shape=(3, rescaled_dim, rescaled_dim) expects.
# The transpose is skipped when the array already has that layout, so that
# re-running this cell does not permute the axes a second time.
channels_first_shape = (3, rescaled_dim, rescaled_dim)
if tuple(x_train.shape[1:]) != channels_first_shape:
    x_train = x_train.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_train.shape)

(40479L, 3L, 64L, 64L)


In [10]:
# TODO serialize array to HDF5

In [10]:
# Shuffle the samples before splitting because
# 1) the original samples may not be randomized, and
# 2) it avoids the possibility of overfitting the validation data while we tune the model.
from sklearn.utils import shuffle
x_train, y_train = shuffle(x_train, y_train, random_state=0)

# Everything from index `split` onwards becomes the validation set. The
# validation slices are taken first, because the training names are
# overwritten by their own truncated slices on the next line.
x_valid, y_valid = x_train[split:], y_train[split:]
x_train, y_train = x_train[:split], y_train[:split]

In [11]:
# Report the shapes of the training and validation arrays, in that order.
for split_array in (x_train, y_train, x_valid, y_valid):
    print(split_array.shape)

(30359L, 3L, 64L, 64L)
(30359L, 17L)
(10120L, 3L, 64L, 64L)
(10120L, 17L)


In [12]:
# Augmentation used for training: pixel values are multiplied by 1/255 and
# images may be flipped along either axis; shear and zoom are set to zero.
train_augmentation = dict(
    rescale=1./255,
    shear_range=0.0,
    zoom_range=0.0,
    horizontal_flip=True,
    vertical_flip=True,
)
train_datagen = ImageDataGenerator(**train_augmentation)

In [13]:
# Batch iterator over the training split; sample order is shuffled.
train_flow_options = dict(batch_size=batch_size, shuffle=True)
train_generator = train_datagen.flow(x_train, y_train, **train_flow_options)

In [14]:
# Validation images only get the 1/255 rescale: flips, shear and zoom are all
# switched off.
validation_options = dict(
    rescale=1./255,
    shear_range=0.0,
    zoom_range=0.0,
    horizontal_flip=False,
    vertical_flip=False,
)
validation_datagen = ImageDataGenerator(**validation_options)

# shuffle=False keeps the batches in the same order as x_valid / y_valid.
validation_generator = validation_datagen.flow(
    x_valid, y_valid, batch_size=batch_size, shuffle=False)

In [16]:
# Small CNN: three (3x3 convolution -> relu -> 2x2 max-pool) stages followed by
# a dense head with one sigmoid output per label.
model = Sequential()

conv_stage_filters = (32, 32, 64)  # append 128 for a fourth (currently disabled) stage
for stage_index, filter_count in enumerate(conv_stage_filters):
    if stage_index == 0:
        # Only the first layer declares the (channels, height, width) input shape.
        model.add(Convolution2D(filter_count, 3, 3,
                                input_shape=(3, rescaled_dim, rescaled_dim)))
    else:
        model.add(Convolution2D(filter_count, 3, 3))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

for head_layer in (Flatten(),
                   Dense(64, activation='relu'),
                   Dropout(0.5),
                   Dense(17, activation='sigmoid')):
    model.add(head_layer)

model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Is this the best loss function?
              metrics=['accuracy', 'recall', 'precision'])

In [15]:
# Let's load the old model and continue training for 2 more epochs, to see whether that improves results.
# from keras.models import load_model
# model = load_model(data_dir + 'models/aggregate_model_20170507-124128.h5')
# number_epoch = 2

In [None]:
# Make sure the output directory exists *before* training starts, so a missing
# folder cannot make model.save() fail after all epochs have already run.
models_dir = data_dir + 'models/'
if not os.path.isdir(models_dir):
    os.makedirs(models_dir)

# fits the model on batches with real-time data augmentation:
history = model.fit_generator(train_generator,
                    samples_per_epoch=num_samples_per_epoch,
                    nb_epoch=number_epoch)

model.save(models_dir + model_filename)  # always save your model and weights after training or during training
print('model training complete')

Epoch 1/2
model training complete


In [None]:
figures_dir = 'figures/aggregate'
# `makedirs` on its own is not an imported name; go through `os`, and skip the
# call when the directory is already there so the cell can be re-run.
if not os.path.isdir(figures_dir):
    os.makedirs(figures_dir)

# list all data in history
print(history.history.keys())


def plot_training_metric(ax, metric_key, metric_label):
    """Plot one metric from `history.history` against the epoch index on `ax`.

    The training curve stored under `metric_key` is always drawn. The curve
    stored under 'val_' + metric_key is drawn only when that key exists:
    fit_generator is called without validation data in this notebook, so the
    'val_*' entries are absent and indexing them directly raises KeyError.
    """
    legend_entries = ['train']
    ax.plot(history.history[metric_key])
    validation_key = 'val_' + metric_key
    if validation_key in history.history:
        ax.plot(history.history[validation_key])
        legend_entries.append('test')
    ax.set_title('model ' + metric_label)
    ax.set_ylabel(metric_label)
    ax.set_xlabel('epoch')
    ax.legend(legend_entries, loc='upper left')


# All four summaries share one figure, so that the single saved PNG contains
# every panel (saving via pyplot only writes the current figure, which used to
# be the loss plot alone).
fig, axes = plt.subplots(2, 2, figsize=(12, 9))
panels = [('acc', 'accuracy'),
          ('precision', 'precision'),
          ('recall', 'recall'),
          ('loss', 'loss')]
for ax, (metric_key, metric_label) in zip(axes.ravel(), panels):
    plot_training_metric(ax, metric_key, metric_label)

fig.tight_layout()
fig.savefig(figures_dir + '/stats_' + timestr + '.png')
#plt.show()

In [4]:
from keras.models import load_model

# Read back the model file written by the training cell of this run.
saved_model_path = data_dir + 'models/' + model_filename
model = load_model(saved_model_path)

In [None]:
# Score the model on the validation split to see how it performs on data it
# was not trained on.
#np.set_printoptions(threshold='nan')

# The raw validation images are not normalized, so prediction goes through
# validation_generator (which applies the 1/255 rescale) instead of:
#p_valid = model.predict(x_valid / float(255), batch_size=128)
# NOTE(review): validation_generator is a stateful iterator; if this cell is
# re-run without re-creating it, the batches may no longer start at sample 0
# and p_valid would then be misaligned with y_valid -- confirm.
validation_sample_count = number_of_samples - split
p_valid = model.predict_generator(validation_generator, validation_sample_count)

print(y_valid)
print(p_valid)

# should this threshold be unique per label?
above_threshold = np.array(p_valid) > classifier_threshold
y_predictions = above_threshold.astype(int)
print(y_predictions)

# see how many positives samples per label for truth vs prediction
for indicator_matrix in (y_valid, y_predictions):
    print(np.sum(indicator_matrix, axis=0))

print('f2 score over samples in validation set: ')
# 'samples' is what the evaluation criteria is for the contest
validation_f2 = fbeta_score(y_valid, y_predictions, beta=2, average='samples')
print(validation_f2)



In [None]:
def f2score(truth, predict, label_index):
    """F-beta score (beta=2, average='macro') of a single label column.

    Column `label_index` is taken from both `truth` and `predict` and the two
    columns are handed to `fbeta_score`.

    NOTE(review): with average='macro' on one 0/1 column the result is
    presumably the mean over the negative and the positive class rather than
    the positive-class score alone -- confirm this is what is wanted.
    """
    true_column = truth[:, label_index]
    predicted_column = predict[:, label_index]
    return fbeta_score(true_column, predicted_column, beta=2, average='macro')
    
def precision_for_label_index2(truth, predict, label_index):
    """Precision (average='macro') of a single label column.

    Column `label_index` is taken from both `truth` and `predict` and the two
    columns are handed to `precision_score`.
    """
    true_column = truth[:, label_index]
    predicted_column = predict[:, label_index]
    return precision_score(true_column, predicted_column, average='macro')

def recall_for_label_index(truth, predict, label_index):
    """Recall (average='macro') of a single label column.

    Column `label_index` is taken from both `truth` and `predict` and the two
    columns are handed to `recall_score`.
    """
    true_column = truth[:, label_index]
    predicted_column = predict[:, label_index]
    return recall_score(true_column, predicted_column, average='macro')

In [None]:
# Per-label report: F2, precision and recall for every label column, to show
# which labels the model handles badly.
for label_index, label_name in enumerate(labels):
    label_f2 = f2score(y_valid, y_predictions, label_index)
    label_precision = precision_for_label_index2(y_valid, y_predictions, label_index)
    label_recall = recall_for_label_index(y_valid, y_predictions, label_index)
    print(label_name)
    print('    f2 score : ' , label_f2)
    print('    precision: ' , label_precision)
    print('    recall   : ' , label_recall)

In [None]:
# https://www.kaggle.com/paulorzp/find-best-f2-score-threshold
# TODO should this threshold be unique per label?
def estimate_f2score_threshold(p_valid, y_valid, try_all=False, verbose=False, score_fn=None):
    """Search for the single decision threshold that gives the best score.

    Each candidate threshold `t` turns the predicted scores into a boolean
    matrix with `p_valid > t`; that matrix is scored against `y_valid`, and
    the candidate with the highest score is returned (the first one wins ties).

    Parameters
    ----------
    p_valid : array-like of predicted scores.
    y_valid : ground-truth indicator array, passed as the first argument of
        the scoring function.
    try_all : when left at False the candidates are 0, 0.005, ..., 0.995;
        otherwise every distinct value found in `p_valid` is tried.
    verbose : when True, print the best score and its threshold.
    score_fn : optional callable `(y_true, y_pred) -> float`. By default the
        samples-averaged F2 score is used, i.e. the same
        `fbeta_score(..., beta=2, average='samples')` call that scores the
        validation set elsewhere in this notebook.

    Returns
    -------
    The best threshold found.
    """
    if score_fn is None:
        # The body used to call `f2_score`, which is not defined as a function
        # anywhere in the notebook (the per-label loop even binds that name to
        # a float), so every call failed. Score with fbeta_score directly.
        def score_fn(y_true, y_pred):
            return fbeta_score(y_true, y_pred, beta=2, average='samples')

    p_valid = np.asarray(p_valid)
    best = 0
    best_score = -1
    totry = np.arange(0,1,0.005) if try_all is False else np.unique(p_valid)
    for t in totry:
        score = score_fn(y_valid, p_valid > t)
        if score > best_score:
            best_score = score
            best = t
    if verbose is True:
        print('Best score: ', round(best_score, 5), ' @ threshold =', best)
    return best

In [7]:
# from keras.models import load_model
# model = load_model(data_dir + 'models/aggregate_model_20170507-184232.h5')

In [9]:


# Test-time configuration: no augmentation, only the 1/255 rescale.
testset_datagen = ImageDataGenerator(rescale=1.0 / 255)

testset_dir = data_dir + 'test'

# The sample submission file supplies the list of test entries handed to the
# project loader.
sample_submission_path = data_dir + 'sample_submission.csv'
df_test_list = pd.read_csv(sample_submission_path)

x_test = load_test_set(df_test_list, rescaled_dim)


In [10]:
#x_test = np.array(x_test, np.uint8)
print(x_test.shape)
# Move the channel axis in front of the two spatial axes, which is the layout
# the model's input_shape=(3, rescaled_dim, rescaled_dim) expects.
# The transpose is skipped when the array already has that layout, so that
# re-running this cell does not permute the axes a second time.
channels_first_shape = (3, rescaled_dim, rescaled_dim)
if tuple(x_test.shape[1:]) != channels_first_shape:
    x_test = x_test.transpose(0,3,1,2)  # https://github.com/fchollet/keras/issues/2681
print(x_test.shape)

(40669L, 64L, 64L, 3L)
(40669L, 3L, 64L, 64L)


In [11]:
# Unshuffled batch iterator over the test images; there are no labels, so y is None.
testset_flow_options = dict(y=None, batch_size=batch_size, shuffle=False)
testset_generator = testset_datagen.flow(x_test, **testset_flow_options)
    
# ??? There may be a bug below that causes the LB score to be 0.5-0.6
# testset_generator = testset_datagen.flow_from_directory(
#         testset_dir,
#         target_size=(rescaled_dim, rescaled_dim),
#         batch_size=batch_size,
#         class_mode=None,
#         shuffle=False)

In [12]:
# run predictions on test set
# NOTE(review): testset_generator is a stateful iterator; if this cell is
# re-run without re-creating it, the batches may no longer start at sample 0
# and the predictions would be misaligned with the submission rows -- confirm.
testset_predict = model.predict_generator(testset_generator, x_test.shape[0]) # number of test samples = 40669

# 1 where the predicted score exceeds the shared threshold, else 0.
y_testset_predictions = (np.array(testset_predict) > classifier_threshold).astype(int)

result = pd.DataFrame(y_testset_predictions, columns = labels)

# Build each row's space-separated tag string straight from the 0/1 matrix.
# This replaces a per-row `result.ix[[i]]` + transpose (`.ix` is deprecated and
# has been removed from newer pandas) and yields the same strings: the tags
# whose column is 1, in `labels` order.
preds = []
for prediction_row in tqdm(y_testset_predictions, miniters=1000):
    preds.append(' '.join(tag for tag, flag in zip(labels, prediction_row) if flag == 1))

# The sample submission provides the row layout; only its 'tags' column is replaced.
df_test = pd.read_csv(data_dir + 'sample_submission.csv')
df_test['tags'] = preds
print('done')

100%|██████████████████████████████████████████████████████████████████████████| 40669/40669 [00:38<00:00, 1069.06it/s]


done


In [None]:
#test code
# nums_ones = np.ones((1, 17))
# nums_zeros = np.zeros((1, 17))
# haha = np.array([[1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0]])

# y_testset_predictions = haha
# result = pd.DataFrame(y_testset_predictions, columns = labels)

# preds = []
# for i in tqdm(range(result.shape[0]), miniters=1000):
#     a = result.ix[[i]]
#     #print(a)
#     a = a.transpose()
#     print(a)
#     a = a.loc[a[i] == 1]
#     print(a)
#     ' '.join(list(a.index))
#     preds.append(' '.join(list(a.index)))
    
# print(preds)

In [13]:
# Create the submissions folder on first use so the write cannot fail on a
# missing directory after the whole notebook has run.
submissions_dir = data_dir + 'my_submissions/'
if not os.path.isdir(submissions_dir):
    os.makedirs(submissions_dir)
df_test.to_csv(submissions_dir + 'submission_' + timestr + '.csv', index=False)

In [14]:
# Wall-clock time since `startTime` was recorded at the top of the notebook.
elapsed = datetime.now() - startTime
print ('time spent to complete execution:' , elapsed)

('time spent to complete execution:', datetime.timedelta(0, 962, 206000))
