In [1]:

%matplotlib inline
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from os import listdir, makedirs
from os.path import join, exists, expanduser
from tqdm import tqdm
from sklearn.metrics import log_loss, accuracy_score
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications import xception
from keras.applications import inception_v3
from keras.applications.vgg16 import preprocess_input, decode_predictions
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
compare_loss={}
compare_accuracy={}

Using TensorFlow backend.


In [2]:

# function to convert image to array
def read_img(address, size):
    """Read and resize image.
    Returns Image as numpy array, by normalizing the values
    """
    img = image.load_img(address, target_size=size)
    img = image.img_to_array(img)
    return img

# function to convert labels to one hot encoding vector
def OneHotEncoded(y_train):
    y_t=np.zeros((len(y_train),Num_Class), dtype=int)
    for i,x in enumerate(y_train):
        y_t[i][int(x)]=1
    return y_t


In [3]:
import os
import csv
from shutil import copy2
INPUT_SIZE = 299
directory_lists=os.listdir("./imgs/train")
labels = {}
data = {}

with open('./labels.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['id','class'])
    for i in range(0, len(directory_lists)):                        #'''directory_lists == files in /train == ['c0', 'c1', ... , 'c9']'''
        t = directory_lists[i][1:]
        imag=os.listdir("./imgs/train/"+str(directory_lists[i]))     #'''imag == files in /train/c# '''
        current_loc = "./imgs/train/"+str(directory_lists[i])+"/"    #'''./imgs/train/c#/'''
        for k in range(0, len(imag)):
            x=[current_loc+str(imag[k])]+[t]
            writer.writerow(x)

In [4]:
data_dir = './'
labels = pd.read_csv(join(data_dir, 'labels.csv'))

In [5]:
INPUT_SIZE = 299
POOLING = 'avg'
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

x_train = np.zeros((len(labels), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
y_train= np.zeros((len(labels),), dtype='float32')

for i, img_id in tqdm(enumerate(labels['id'])):
    img = read_img(img_id, (INPUT_SIZE, INPUT_SIZE))
    x = xception.preprocess_input(np.expand_dims(img.copy(), axis=0))
    x_train[i] = x
    y_train[i]=int(labels['class'][i])
print('Train Images shape: {} size: {:,}'.format(x_train.shape, x_train.size))



22424it [02:33, 146.10it/s]

Train Images shape: (22424, 299, 299, 3) size: 6,014,184,072





In [6]:
Num_Class=10  # number of classes in dataset
# convert labels to one hot encoding
y_train=OneHotEncoded(y_train)
print(y_train)
# split data in to training and validation set
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.33, random_state=42)
print(X_train)
print(y_train)

[[1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]]
[[[[-0.66274512 -0.58431375 -0.65490198]
   [-0.66274512 -0.58431375 -0.67058825]
   [-0.66274512 -0.57647061 -0.68627453]
   ..., 
   [ 0.37254906  1.          0.93725491]
   [ 0.45882356  0.99215686  0.90588236]
   [ 0.60784316  1.          0.94509804]]

  [[-0.65490198 -0.57647061 -0.66274512]
   [-0.65490198 -0.57647061 -0.66274512]
   [-0.65490198 -0.57647061 -0.66274512]
   ..., 
   [ 0.49803925  0.98431373  0.96078432]
   [ 0.63137257  0.98431373  0.96078432]
   [ 0.77254903  1.          0.97647059]]

  [[-0.65490198 -0.57647061 -0.66274512]
   [-0.64705884 -0.56862748 -0.65490198]
   [-0.63921571 -0.56078434 -0.63137257]
   ..., 
   [ 0.68627453  0.99215686  1.        ]
   [ 0.84313726  0.97647059  1.        ]
   [ 0.96862745  0.96862745  1.        ]]

  ..., 
  [[-0.54509807 -0.39607841 -0.34117645]
   [-0.52156866 -0.372549   -0.31764704]
   [-0.5294

In [7]:
# forward passing the training and validation set
xception_bottleneck = xception.Xception(weights='imagenet', include_top=False, pooling=POOLING)
train_x_bf = xception_bottleneck.predict(X_train, batch_size=32, verbose=1)
valid_x_bf = xception_bottleneck.predict(X_val, batch_size=32, verbose=1)
print('Xception train bottleneck features shape: {} size: {:,}'.format(train_x_bf.shape, train_x_bf.size))
print('Xception valid bottleneck features shape: {} size: {:,}'.format(valid_x_bf.shape, valid_x_bf.size))
np.save(open('train_x_bf.npy', 'w'), train_x_bf)
np.save(open('valid_x_bf.npy', 'w'), valid_x_bf)

np.save('train_x_bf_.npy', train_x_bf) 
np.save('valid_x_bf_.npy', valid_x_bf) 

Xception train bottleneck features shape: (15024, 2048) size: 30,769,152
Xception valid bottleneck features shape: (7400, 2048) size: 15,155,200


In [8]:
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=147)
logreg.fit(train_x_bf, (y_train * range(Num_Class)).sum(axis=1))
valid_probs = logreg.predict_proba(valid_x_bf)
valid_preds = logreg.predict(valid_x_bf)
compare_loss['Xception']=log_loss(y_val, valid_probs)
compare_accuracy['Xception']=accuracy_score((y_val * range(Num_Class)).sum(axis=1), valid_preds)
print('Validation Xception LogLoss {}'.format(compare_loss['Xception']))
print('Validation Xception Accuracy {}'.format(compare_accuracy['Xception']))

Validation Xception LogLoss 0.0977869202068
Validation Xception Accuracy 0.977837837838
