In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import random
print(os.listdir("../input"))
print(os.listdir())
from tqdm import tqdm
from keras.preprocessing import image
from keras.applications.densenet import DenseNet201, preprocess_input
from keras.callbacks import TensorBoard, LearningRateScheduler, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras.models import Model
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder

# Any results you write to the current directory are saved as output.

Load seedling species and extract file names.

In [None]:
# Species names are the sub-directory names of the training folder, kept in sorted order
# so that a species' position in CATEGORIES can serve as its integer class id.
CATEGORIES = sorted(os.listdir("../input/train/"))

# One record per training image: [relative path, class id, species name].
train = pd.DataFrame(
    [
        ['../input/train/{}/{}'.format(species, fname), species_id, species]
        for species_id, species in enumerate(CATEGORIES)
        for fname in os.listdir(os.path.join('../input/train/', species))
    ],
    columns=['file', 'category_id', 'category'],
)
train.shape

Split training and validation data 80/20 stratified.

In [None]:
# Stratified 80/20 split: draw 80% of the files of every species for training
# and keep whatever was not drawn for validation.
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split
trainSample = pd.concat(
    [
        train[train['category'] == c].sample(frac=0.8, random_state=SPLIT_SEED)
        for c in CATEGORIES
    ]
)
# Validation rows are the training-frame rows whose file path was not sampled above.
validate = train[~train['file'].isin(trainSample['file'])]

Load training and validation data.

In [None]:
def read_img(filepath, size):
    """Load one image from disk, resized to ``size``, and return it as an array.

    Args:
        filepath: path of the image file to read.
        size: target size passed to ``image.load_img`` as ``target_size``.

    Returns:
        The result of ``image.img_to_array`` applied to the loaded image.
    """
    # Single-argument join: for a plain string path this is a pass-through.
    path = os.path.join(filepath)
    return image.img_to_array(image.load_img(path, target_size=size))

INPUT_SIZE = 224  # images are resized to INPUT_SIZE x INPUT_SIZE pixels


def _load_images(files, size):
    """Read every image path in ``files`` into a single float32 array.

    Args:
        files: sized iterable of image paths (e.g. a DataFrame column).
        size: edge length in pixels; each image is resized to (size, size).

    Returns:
        A float32 array of shape (len(files), size, size, 3).
    """
    batch = np.zeros((len(files), size, size, 3), dtype='float32')
    # Wrap the sized iterable itself (not the enumerate object) so tqdm can
    # read its length and display a real progress bar with an ETA.
    for i, file in enumerate(tqdm(files)):
        batch[i] = read_img(file, (size, size))
    return batch


trainX = _load_images(trainSample['file'], INPUT_SIZE)
valX = _load_images(validate['file'], INPUT_SIZE)

One-hot encoding for the response variable. Needed for categorical cross-entropy.

In [None]:
# One-hot encode the species names as dense arrays (sparse=False).
# The encoder is fitted on the training split only and then reused, unchanged,
# for the validation split.
# NOTE(review): predictions are later mapped back with CATEGORIES[argmax], which
# assumes the encoder's column order matches the sorted CATEGORIES list - confirm.
ohc = OneHotEncoder(sparse=False)
trainY = ohc.fit_transform(trainSample[['category']])
valY = ohc.transform(validate[['category']])

Data preprocessing and augmentation.

In [None]:
# Random augmentation applied to training images only: rotations, shifts,
# zoom and flips along both axes.
augmentation = dict(
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
)

# Both generators run DenseNet's preprocess_input on every image; the
# validation generator applies no augmentation.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, **augmentation)
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

Load DenseNet201. Start by training only the top layer.

In [None]:
# File that the ModelCheckpoint callbacks write the best weights to.
best_model_file = 'DenseNet201-224x224.h5'

# ImageNet-pretrained DenseNet201 without its classification head; pooling='avg'
# is requested so the base ends in a pooled feature output.
basic_model = DenseNet201(include_top=False, weights='imagenet', pooling='avg')

# Freeze the whole base so that only the new head is trained at first.
for base_layer in basic_model.layers:
    base_layer.trainable = False

# New head: dropout followed by a softmax over the seedling species.
input_tensor = basic_model.input
x = Dropout(.5)(basic_model.output)
x = Dense(len(CATEGORIES), activation='softmax')(x)

model = Model(inputs=input_tensor, outputs=x)
model.compile(
    optimizer=RMSprop(1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Add Callbacks.

In [None]:
# Stop once val_loss has not improved by at least 1e-5 for 5 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    min_delta=1e-5,
)

# Keep only the best weights seen so far in best_model_file (weights only, not the full model).
checkpoint = ModelCheckpoint(
    filepath=best_model_file,
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
)

callbacks = [early_stopping, checkpoint]

Train Model.

In [None]:
# Stage 1: train only the new classification head (the DenseNet base is frozen).
batch_size = 16

# Keras expects integer step counts. Round up so the final, partial batch is
# still visited once per epoch instead of passing a fractional step count.
train_steps = int(np.ceil(trainSample.shape[0] / batch_size))
val_steps = int(np.ceil(validate.shape[0] / batch_size))

model.fit_generator(
    train_datagen.flow(trainX, trainY, batch_size=batch_size),
    steps_per_epoch=train_steps,
    epochs=40,
    validation_data=val_datagen.flow(valX, valY, batch_size=batch_size),
    validation_steps=val_steps,
    callbacks=callbacks,
    verbose=1)

Train on all layers.

In [None]:
# Restore the best weights written to best_model_file, then make every layer trainable.
model.load_weights(best_model_file)
for layer in model.layers:
    # NOTE(review): this only sets an attribute on an already-built layer. It
    # presumably does NOT add an L2 penalty to the training loss (Keras 2 layers
    # take `kernel_regularizer` at construction time) - confirm whether weight
    # regularization was actually intended here.
    layer.W_regularizer = l2(1e-2)
    layer.trainable = True

# Recompile after changing `trainable`, now with a smaller learning rate (1e-4).
model.compile(optimizer=RMSprop(1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

New Callbacks.

In [None]:
# Stop once val_loss has not improved by at least 1e-5 for 10 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    min_delta=1e-5,
)

# Multiply the learning rate by 0.1 after 5 epochs without val_loss improvement,
# never going below 1e-7.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    cooldown=1,
    verbose=1,
    min_lr=1e-7,
)

# Keep only the best weights seen so far in best_model_file (weights only, not the full model).
checkpoint = ModelCheckpoint(
    filepath=best_model_file,
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
)

callbacks = [early_stopping, reduce_lr, checkpoint]

Train Model.

In [None]:
# Stage 2: fine-tune the whole network now that every layer is trainable.
batch_size = 16

# Keras expects integer step counts. Round up so the final, partial batch is
# still visited once per epoch instead of passing a fractional step count.
train_steps = int(np.ceil(trainSample.shape[0] / batch_size))
val_steps = int(np.ceil(validate.shape[0] / batch_size))

model.fit_generator(
    train_datagen.flow(trainX, trainY, batch_size=batch_size),
    steps_per_epoch=train_steps,
    epochs=100,
    validation_data=val_datagen.flow(valX, valY, batch_size=batch_size),
    validation_steps=val_steps,
    callbacks=callbacks,
    verbose=1)

Load Test Data.

In [None]:
# One row per test image: the relative path used for loading, and the bare
# file name used later when writing the prediction files.
test = pd.DataFrame(
    [['../input/test/{}'.format(name), name] for name in os.listdir("../input/test")],
    columns=['filepath', 'file'],
)

testX = np.zeros((len(test), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
# Wrap the column itself (not the enumerate object) so tqdm can read its
# length and display a real progress bar with an ETA.
for i, filepath in enumerate(tqdm(test['filepath'])):
    testX[i] = read_img(filepath, (INPUT_SIZE, INPUT_SIZE))
print('test Images shape: {} size: {:,}'.format(testX.shape, testX.size))

Save predictions, both results and probability scores.

In [None]:
# Predict one image at a time, unshuffled, so that row i of `predictions`
# corresponds to row i of `test`.
predictions = model.predict_generator(
    val_datagen.flow(testX, shuffle=False, batch_size=1),
    steps=testX.shape[0])

# Most probable class per row, mapped back to its species name.
preds = [CATEGORIES[pos] for pos in np.argmax(predictions, axis=1)]

# Use the stored file-name column rather than re-deriving the name by splitting
# the path, which would silently break if the input directory depth changed.
pred_df = pd.DataFrame({'file': test['file'], 'species': preds})
pred_df.to_csv('pred.csv', index=False)

# Raw per-class scores: one column per class index, plus the file name.
predictions_df = pd.DataFrame(predictions)
predictions_df['file'] = test['file']
predictions_df.to_csv('predictions.csv', index=False)