In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import functools
from mpl_toolkits.axes_grid1 import ImageGrid
from os import listdir, makedirs, getcwd, remove
from os.path import isfile, join, abspath, exists, isdir, expanduser

from tqdm import tqdm
from keras.models import Model, Sequential
from keras.layers import Input, Flatten, Dense, Conv2D, MaxPooling2D, Dropout
from keras.utils import layer_utils
from keras import backend as K
from keras.optimizers import RMSprop, SGD, Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, CSVLogger
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16, decode_predictions
from keras.applications.resnet50 import ResNet50
from keras.applications import imagenet_utils, xception, inception_v3
from keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

%matplotlib inline

Using TensorFlow backend.
  return f(*args, **kwds)


In [24]:
INPUT_SIZE = 224
NUM_CLASSES = 2
SEED = 1987

data_dir = 'data'
labels = pd.read_csv(join(data_dir, 'labels.csv'))
sample_submission = pd.read_csv(join(data_dir, 'sample_submission.csv'))
print(len(listdir(join(data_dir, 'train'))), len(labels))
print(len(listdir(join(data_dir, 'test'))), len(sample_submission))

10222 10222
10357 10357


In [23]:
selected_breed_list = list(labels.groupby('breed').count().sort_values(by='id', ascending=False).head(NUM_CLASSES).index)
labels = labels[labels['breed'].isin(selected_breed_list)]
labels['target'] = 1
labels['rank'] = labels.groupby('breed').rank()['id']
labels_pivot = labels.pivot('id', 'breed', 'target').reset_index().fillna(0)
np.random.seed(seed=SEED)
rnd = np.random.random(len(labels))
train_idx = rnd < 0.8
valid_idx = rnd >= 0.8
y_train = labels_pivot[selected_breed_list].values
ytr = y_train[train_idx]
yv = y_train[valid_idx]

In [25]:
ss = StratifiedShuffleSplit(n_splits=1, train_size=0.8, test_size=0.2)
train_index, valid_index = next(ss.split(np.zeros(len(labels)), labels['breed']))

In [27]:
labels = pd.read_csv(join(data_dir, 'labels.csv'))
# Ensure that train and validation set have labels from all classes
num_classes = len(labels.breed.unique())
print(len(train_index), len(valid_index))
len(labels.breed[train_index].unique()), len(labels.breed[valid_index].unique()), len(labels.breed.unique())

8177 2045


(120, 120, 120)

In [28]:
labels['target'] = 1
labels_df = labels.pivot('id', 'breed', 'target').fillna(0)
labels_df.head(3)

breed,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,beagle,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000bec180eb18c7604dcecc8fe0dba07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001513dfcb2ffafc82cccf4d8bbaba97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001cdf01b096e06d78e9e5112d419397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def read_img(train_or_test, size, img_id):
    """Read and resize image.
    # Arguments
        img_id: string
        train_or_test: string 'train' or 'test'.
        size: resize the original image.
    # Returns
        Image as numpy array.
    """
    img = image.load_img(join(data_dir, train_or_test, '{0}.jpg'.format(img_id)), target_size=size)
    img = image.img_to_array(img)
    return img

def read_process(model, size, train_or_test, img_id):
    return model.preprocess_input(read_img(train_or_test, size, img_id))
 
INPUT_SIZE = 299
POOLING = 'avg'

size = (INPUT_SIZE, INPUT_SIZE)
train_image_func = functools.partial(read_process, xception, size, 'train')
test_image_func = functools.partial(read_process, xception, size, 'test')

In [11]:
x_train = np.asarray([train_image_func(img_id) for img_id in labels_df.index])
print('Train Images shape: {} size: {:,}'.format(x_train.shape, x_train.size))

Train Images shape: (10222, 299, 299, 3) size: 2,741,571,066


In [108]:
train_data = x_train[train_index]
valid_data = x_train[valid_index]

print((train_data.shape, valid_data.shape, train_labels.shape, valid_labels.shape))

((8177, 299, 299, 3), (2045, 299, 299, 3), (8177, 120), (2045, 120))


In [28]:
train_labels = labels_df.values[train_index]
valid_labels = labels_df.values[valid_index]
print((train_labels.shape, valid_labels.shape))

((8177, 120), (2045, 120))


In [13]:
inception_bottleneck = inception_v3.InceptionV3(weights='imagenet', include_top=False, pooling=POOLING)
train_i_bf = inception_bottleneck.predict(train_data, batch_size=32, verbose=1)
print('InceptionV3 train bottleneck features shape: {} size: {:,}'.format(train_i_bf.shape, train_i_bf.size))

InceptionV3 train bottleneck features shape: (8177, 2048) size: 16,746,496


In [14]:
valid_i_bf = inception_bottleneck.predict(valid_data, batch_size=32, verbose=1)
print('InceptionV3 valid bottleneck features shape: {} size: {:,}'.format(valid_i_bf.shape, valid_i_bf.size))

InceptionV3 valid bottleneck features shape: (2045, 2048) size: 4,188,160


In [53]:
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=SEED)
logreg.fit(train_i_bf, (train_labels * range(num_classes)).sum(axis=1))
valid_probs = logreg.predict_proba(valid_i_bf)
valid_preds = logreg.predict(valid_i_bf)

print(valid_preds[:5])
print('Validation Inception LogLoss {}'.format(log_loss(valid_labels, valid_probs)))
print('Validation Inception Accuracy {}'.format(accuracy_score((valid_labels * range(num_classes)).sum(axis=1), valid_preds)))

[  21.   80.   59.  112.   10.]
Validation Inception LogLoss 0.34241194057837054
Validation Inception Accuracy 0.8973105134474327


In [55]:
svc = SVC(probability=True)
svc.fit(train_i_bf, (train_labels * range(num_classes)).sum(axis=1))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [59]:
svc_valid_probs = svc.predict_proba(valid_i_bf)
svc_valid_preds = svc.predict(valid_i_bf)
print('Validation Inception LogLoss {}'.format(log_loss(valid_labels, svc_valid_probs)))
print('Validation Inception Accuracy {}'.format(accuracy_score((valid_labels * range(num_classes)).sum(axis=1), svc_valid_preds)))

Validation Inception LogLoss 0.8582123624089703
Validation Inception Accuracy 0.8943765281173595


In [92]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(train_i_bf, (train_labels * range(num_classes)).sum(axis=1))


Validation Inception LogLoss 0.8582123624089703
Validation Inception Accuracy 0.8943765281173595


In [107]:
rf_valid_probs = model.predict_proba(valid_i_bf)
rf_valid_preds = model.predict(valid_i_bf)
print('RF Validation Inception LogLoss {}'.format(log_loss(valid_labels, rf_valid_probs)))
print('RF Validation Inception Accuracy {}'.format(accuracy_score((valid_labels * range(num_classes)).sum(axis=1), rf_valid_preds)))

RF Validation Inception LogLoss 2.5863124078334665
RF Validation Inception Accuracy 0.8019559902200489


In [39]:
import json
with open('validation_inception.json', 'w') as outfile:
    json.dump(valid_i_bf.tolist(), outfile)

In [60]:
test_data = np.asarray([test_image_func(img_id) for img_id in sample_submission['id']])
print('Test Images shape: {} size: {:,}'.format(test_data.shape, test_data.size))

Test Images shape: (10357, 299, 299, 3) size: 2,777,778,471


In [63]:
test_inception = inception_bottleneck.predict(test_data, batch_size=32, verbose=1)
test_probs = logreg.predict_proba(test_inception)



In [91]:
import csv

with open('submission_inception.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(sample_submission.columns)
    
    for i, probs in enumerate(test_probs):
        writer.writerow(np.hstack([test_ids[i], probs]))