In [1]:
label_dict = {0:'daisy', 1:'dandelion', 2:'rose', 3:'sunflower', 4:'tulip'}

train_data_path = './Day_101/ml100marathon-final-exam/image_data/train'

### Refer to https://github.com/lucasdupin/ml-image-scaling/blob/master/1_preprocessing.ipynb

In [2]:
import numpy as np
import matplotlib.pyplot as plt  

from PIL import Image # Make sure you have Pillow! PIL may import but WILL raise exceptions

%matplotlib inline

In [3]:
import os
import sys

SMALL=120

original_size = (80, 80)
small_size = (SMALL, SMALL)

def image_cache_root_path(image_folder):
    return os.path.join(image_folder, '.cache')

def image_cache_folder_path(image_folder, resolution):
    return os.path.join(image_cache_root_path(image_folder), resolution)
    
def image_cache_file_path(image_folder, image_file, resolution):
    return os.path.join(image_cache_folder_path(image_folder, resolution), image_file)
    

def process_image(folder, image_path, image_cache_high, image_cache_low):
    """ Resizes an image, creating a big and a reduced version """
    
    # Skip files that already were generated or directories
    if os.path.exists(image_cache_high) and os.path.exists(image_cache_low):
        return
    
    with Image.open(image_path) as original:
  
        # Ignore images smaller than our target size
        if original.size[0] < original_size[0] or original.size[1] < original_size[1]:
            print("image %s with size %s is too small, skipping..." % (image_path, original.size))
            return

        #print("resizing and cropping: %s" % image_path)
        center = (original.size[0]/2, original.size[1]/2)
        
        size = original.size[0] if original.size[0] < original.size[1] else original.size[1]
        
        '''
        # Original 
        size = original_size[0]
        '''
        cropped = original.crop((center[0] - size/2, 
                                 center[1] - size/2, 
                                 center[0] + size/2,
                                 center[1] + size/2))
        
        
        # High-res version - labels
        cropped.save(image_cache_high)
        # Generate low-res version of the image - train data
        small = cropped.resize(small_size, Image.ANTIALIAS)
        small.save(image_cache_low)


def process_imagefolder_recursive(folder):
    
    print("Processing folder %s" % (folder))
    #print(".", end="")
    
    image_cache_root = image_cache_root_path(folder)
    if not os.path.exists(image_cache_root):
        os.mkdir(image_cache_root)

    image_cache_folder_high = image_cache_folder_path(folder, 'high')
    if not os.path.exists(image_cache_folder_high):
        os.mkdir(image_cache_folder_high)

    image_cache_folder_low = image_cache_folder_path(folder, 'low')
    if not os.path.exists(image_cache_folder_low):
        os.mkdir(image_cache_folder_low)

    # Get list of all image folders
    for d in sorted(os.listdir(folder)):
        
        # Ignore cache itself
        if d.startswith("."):
            print(f'skip hidden file {d}')
            continue
        
        # Recursive to folder
        if os.path.isdir(os.path.join(folder, d)):
            print(f'skip folder {d}')
            process_imagefolder_recursive(os.path.join(folder, d))
            continue

        # Check picture only
        if d.lower().endswith(('.jpg', 'png')):
            image_path = os.path.join(folder, d)
            image_cache_high = image_cache_file_path(folder, d, 'high')
            image_cache_low = image_cache_file_path(folder, d, 'low')
            process_image(folder, image_path, image_cache_high, image_cache_low)
        
process_imagefolder_recursive('.\Day_101\ml100marathon-final-exam\image_data')
        
print("Done!")

Processing folder .\Day_101\ml100marathon-final-exam\image_data
skip hidden file .cache
skip folder test
Processing folder .\Day_101\ml100marathon-final-exam\image_data\test
skip hidden file .cache
skip folder train
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train
skip hidden file .cache
skip folder daisy
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\daisy
skip hidden file .cache
skip folder dandelion
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\dandelion
skip hidden file .cache
skip folder rose
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\rose
skip hidden file .cache
skip folder sunflower
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\sunflower
skip hidden file .cache
skip folder tulip
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\tulip
skip hidden file .cache
Done!


In [4]:
import imageio

In [5]:
# Read image file to memory as RGB array
def load_image_to_rgb(image_path):
    im = imageio.imread(image_path)
    # print(im.shape)
    return im


In [6]:
import numpy as np

In [7]:
def load_image_and_label(folder, showfile=False):
    image_array = np.empty((0, SMALL, SMALL, 3), int)
    file_array = np.empty((0), str)
    count = 0
    print(f'image_array.shape = {image_array.shape}')
    folder_list = os.listdir(folder)
    print(f'folder_list count of {folder} is {len(folder_list)}')
    for d in folder_list:
        if not d.lower().endswith(('.jpg', 'png')):
            print(f'skip non image file: {d}')
            continue
        im = load_image_to_rgb(os.path.join(folder, d))
        if im is None:
            print(f'Error when loading image:{os.path.join(folder, d)}')
            continue
        # img_array[count] = im
        #print(f'im.shape = {im.shape}')
        a = np.empty((1, SMALL, SMALL, 3), int)
        a[0] = im
        image_array = np.append(image_array, a, axis=0)
        file_array = np.append(file_array, [os.path.splitext(d)[0]], axis=0)
        count += 1
        
        #print(f'count = {count}')
        # print(f'd={d}')
        # print(f'image_array.shape = {image_array.shape}')
        
    if showfile:
        return (image_array, count, file_array)
    else:
        return (image_array, count)

In [8]:

x_img_train = np.empty((0, SMALL, SMALL, 3), int)
y_label_train = np.empty((0, 1), int)

for index, flower in label_dict.items():
    print(f'Processing training data, flower={flower}, index={index}')
    image_cache_low = image_cache_folder_path(os.path.join('.\Day_101\ml100marathon-final-exam\image_data', 'train', flower), 'low')
    
    x, count = load_image_and_label(image_cache_low)
    x_img_train = np.append(x_img_train, x, axis = 0)
    
    y = [[ index ] for x in range(0, count)]
    y_label_train = np.append(y_label_train, np.asarray(y), axis = 0)
    
    print(x_img_train.shape, y_label_train.shape)
    
print(x_img_train[0])
print(y_label_train)

Processing training data, flower=daisy, index=0
image_array.shape = (0, 120, 120, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\daisy\.cache\low is 500
(500, 120, 120, 3) (500, 1)
Processing training data, flower=dandelion, index=1
image_array.shape = (0, 120, 120, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\dandelion\.cache\low is 687
(1187, 120, 120, 3) (1187, 1)
Processing training data, flower=rose, index=2
image_array.shape = (0, 120, 120, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\rose\.cache\low is 515
(1702, 120, 120, 3) (1702, 1)
Processing training data, flower=sunflower, index=3
image_array.shape = (0, 120, 120, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\sunflower\.cache\low is 488
(2190, 120, 120, 3) (2190, 1)
Processing training data, flower=tulip, index=4
image_array.shape = (0, 120, 120, 3)
folder_list count of .\Day_101\ml100marathon-final-exam

In [9]:

def plot_images_labels_prediction(images, labels, prediction, idx, num=10):
    # gcf: Get Current Figure
    fig = plt.gcf()
    fig.set_size_inches(10, 14)
    if num>25: num=25 
    for i in range(0, num):
        ax=plt.subplot(5,5, 1+i)
        ax.imshow(images[idx],cmap='binary')
                
        title = str(i) + ',' + label_dict[labels[i][0]]
        if len(prediction)>0:
            title += '=>' + label_dict[prediction[i]]
            
        ax.set_title(title,fontsize=10) 
        ax.set_xticks([]);ax.set_yticks([])        
        idx+=1 
    plt.show()

In [10]:
# plot_images_labels_prediction(x_img_train,y_label_train,[],0)

# Image normalize 

In [11]:
# Show first pixel
x_img_train[0][0][0]

array([ 93, 103, 130])

In [12]:
x_img_train_normalize = x_img_train.astype('float32') / 255.0


In [13]:
# Normalize R,G,B to 0.0 ~ 1.0
x_img_train_normalize[0][0][0]

array([0.3647059 , 0.40392157, 0.50980395], dtype=float32)

# 轉換label 為OneHot Encoding

In [14]:
y_label_train.shape

(2823, 1)

In [15]:
y_label_train[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [16]:
from keras.utils import np_utils
y_label_train_OneHot = np_utils.to_categorical(y_label_train).astype('int')

Using TensorFlow backend.


In [17]:
y_label_train_OneHot.shape

(2823, 5)

In [18]:
y_label_train_OneHot[:5]

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [19]:
del x_img_train, y_label_train

# Keras sequential

In [20]:
import keras
# from keras.datasets import cifar10
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [21]:
batch_size = 128
num_classes = 5
epochs = 5

In [22]:
# build our CNN model, 多加幾層
# Best 0.4x
'''
model = Sequential()

model.add(Conv2D(filters=64, 
                 kernel_size=(3, 3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu',
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

model.add(Dropout(0.5))
model.add(Dense(num_classes))
#model.add(Activation('sigmoid'))
model.add(Activation('softmax'))
'''

"\nmodel = Sequential()\n\nmodel.add(Conv2D(filters=64, \n                 kernel_size=(3, 3),\n                 input_shape=x_img_train_normalize.shape[1:],\n                 activation='relu',\n                 padding='same'))\nmodel.add(MaxPooling2D(pool_size=(2, 2)))\n\nmodel.add(Flatten())\nmodel.add(Dense(512))\nmodel.add(Activation('relu'))\n\nmodel.add(Dropout(0.5))\nmodel.add(Dense(num_classes))\n#model.add(Activation('sigmoid'))\nmodel.add(Activation('softmax'))\n"

In [23]:

# build our CNN model
# Best : 0.51x
'''
model = Sequential()

model.add(Conv2D(filters=64, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('relu'))

model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Dump all model layers
print(model.summary())
'''

"\nmodel = Sequential()\n\nmodel.add(Conv2D(filters=64, \n                 kernel_size=(3,3),\n                 input_shape=x_img_train_normalize.shape[1:],\n                 activation='relu', \n                 padding='same'))\nmodel.add(MaxPooling2D(pool_size=(2, 2)))\n\nmodel.add(Flatten())\nmodel.add(Dense(1024))\nmodel.add(Activation('relu'))\n\nmodel.add(Dense(num_classes))\nmodel.add(Activation('softmax'))\n\n# Dump all model layers\nprint(model.summary())\n"

In [24]:
# build our CNN model, 多加幾層
# Best 0.4x
'''
model = Sequential()

model.add(Conv2D(filters=64, 
                 kernel_size=(3, 3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu',
                 padding='same'))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

model.add(Dense(64))
model.add(Activation('relu'))

model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
'''

"\nmodel = Sequential()\n\nmodel.add(Conv2D(filters=64, \n                 kernel_size=(3, 3),\n                 input_shape=x_img_train_normalize.shape[1:],\n                 activation='relu',\n                 padding='same'))\n\nmodel.add(Flatten())\nmodel.add(Dense(512))\nmodel.add(Activation('relu'))\n\nmodel.add(Dense(64))\nmodel.add(Activation('relu'))\n\nmodel.add(Dropout(0.5))\nmodel.add(Dense(num_classes))\nmodel.add(Activation('softmax'))\n"

In [31]:
# Refer to https://medium.com/@syshen/%E5%85%A5%E9%96%80%E6%B7%B1%E5%BA%A6%E5%AD%B8%E7%BF%92-2-d694cad7d1e5 
# Add multiple CNN Conv2D (Same as VGG)

model = Sequential()
'''
# Block 1 
model.add(Convolution2D(64, 3, 3, activation='relu', padding='same'))    
model.add(Convolution2D(64, 3, 3, activation='relu', padding='same'))    
model.add(MaxPooling2D((2,2), strides=(2,2)))
# Block 2
model.add(Convolution2D(128, 3, 3, activation='relu', padding='same'))    
model.add(Convolution2D(128, 3, 3, activation='relu', padding='same'))    
model.add(MaxPooling2D((2,2), strides=(2,2)))
# Block 3
model.add(Convolution2D(256, 3, 3, activation='relu', padding='same'))    
model.add(Convolution2D(256, 3, 3, activation='relu', padding='same'))       
model.add(Convolution2D(256, 3, 3, activation='relu', padding='same'))    
model.add(MaxPooling2D((2,2), strides=(2,2)))
'''

model.add(Conv2D(filters=64, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(Conv2D(filters=64, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(filters=128, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(Conv2D(filters=128, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(filters=256, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(Conv2D(filters=256, 
                 kernel_size=(3,3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu', 
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('relu'))

model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [32]:
# Dump all model layers
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 120, 120, 64)      1792      
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 120, 120, 64)      36928     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 60, 60, 64)        0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 60, 60, 128)       73856     
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 60, 60, 128)       147584    
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 30, 30, 128)       0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 30, 30, 256)       295168    
__________

In [33]:
# validation_split: use 20 % to validate, so 50000*0.2 = 10000. (also means 40000 samples to train)
# epochs: 10 times

In [34]:
# initiate Adam "optimizer"
opt = keras.optimizers.Adam()

# Let's train the model using Adam
model.compile(loss='categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

model_history = model.fit(x_img_train_normalize, y_label_train_OneHot, validation_split=0.2, epochs=epochs, batch_size=batch_size, verbose=1)  

# Score trained model.
#scores = model.evaluate(x_test, y_test, verbose=1)
#print('Test accuracy:', scores[1])

Train on 2258 samples, validate on 565 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
del x_img_train_normalize, y_label_train_OneHot

# Start to Predict Data

In [36]:
# Load testing flower data
x_img_test = np.empty((0, SMALL, SMALL, 3), int)
y_label_test = np.empty((0, 1), int)
filelist_test = np.empty((0), str)

print(f'Processing testing flower')

image_cache_low = image_cache_folder_path(os.path.join('.\Day_101\ml100marathon-final-exam\image_data', 'test'), 'low')
x, count, filelist_test = load_image_and_label(image_cache_low, showfile=True)
x_img_test = np.append(x_img_test, x, axis = 0)
    
# y = [[ 0 ] for x in range(0, count)]
# y_label_test = np.append(y_label_test, np.asarray(y), axis = 0)
    
print(x_img_test.shape)
print(x_img_test[0])
# print(y_label_test)

Processing testing flower
image_array.shape = (0, 120, 120, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\test\.cache\low is 2000
(2000, 120, 120, 3)
[[[193 202 217]
  [191 200 215]
  [186 193 209]
  ...
  [149 155 171]
  [148 154 170]
  [147 153 169]]

 [[187 196 211]
  [187 196 211]
  [185 192 208]
  ...
  [153 159 175]
  [153 159 175]
  [153 159 175]]

 [[182 191 206]
  [185 194 209]
  [185 193 206]
  ...
  [153 159 175]
  [152 158 174]
  [151 157 173]]

 ...

 [[186 192 206]
  [186 192 206]
  [186 192 206]
  ...
  [124 127 132]
  [125 128 133]
  [127 130 137]]

 [[186 192 206]
  [186 192 206]
  [186 192 206]
  ...
  [124 127 132]
  [126 129 136]
  [127 130 137]]

 [[186 192 206]
  [186 192 206]
  [186 192 206]
  ...
  [125 128 135]
  [126 129 136]
  [128 131 138]]]


In [37]:
print(f'filelist_test.shape = {filelist_test.shape}')
filelist_test[:5]

filelist_test.shape = (2000,)


array(['0028624c49b3e0610ff9f1d111f5d532',
       '002c30700185b7971369258b438070d5',
       '00852f4f666acecd0c0d140365b42efd',
       '00c08828fce04e360c732cac01edad9e',
       '00d366e7877b6a78b104b57d67b60e6b'], dtype='<U32')

In [38]:
print('x_img_test:',x_img_test.shape)
print('y_label_test :',y_label_test.shape)
x_img_test_normalize = x_img_test.astype('float32') / 255.0

x_img_test: (2000, 120, 120, 3)
y_label_test : (0, 1)


In [39]:
y_label_test_OneHot = model.predict(x_img_test_normalize)

In [40]:
y_label_test_OneHot.shape

(2000, 5)

In [41]:
y_label_test_OneHot[:10]

array([[5.8115292e-02, 1.6521143e-02, 8.4852344e-01, 6.3983970e-03,
        7.0441753e-02],
       [5.9581473e-02, 3.1401467e-02, 3.4252933e-01, 5.1795948e-01,
        4.8528239e-02],
       [2.7668862e-02, 3.7142681e-03, 9.0846884e-01, 6.1968644e-03,
        5.3951085e-02],
       [6.8256028e-02, 2.7041236e-02, 6.7879206e-01, 4.8264880e-02,
        1.7764579e-01],
       [2.7161343e-02, 3.8268096e-03, 9.1031766e-01, 5.9936089e-03,
        5.2700642e-02],
       [5.4528066e-03, 7.0695608e-04, 9.7518969e-01, 4.1722425e-04,
        1.8233368e-02],
       [4.1641045e-02, 2.2977667e-01, 9.9203318e-02, 5.9252673e-01,
        3.6852274e-02],
       [1.2645437e-01, 1.1858117e-01, 2.3337859e-01, 4.0721533e-01,
        1.1437062e-01],
       [1.0665622e-02, 1.6132484e-03, 9.8113662e-01, 6.9356599e-04,
        5.8909077e-03],
       [1.4297724e-01, 7.1882802e-01, 1.0058878e-01, 3.2346305e-02,
        5.2595995e-03]], dtype=float32)

In [42]:
idx = np.argmax(y_label_test_OneHot, axis=-1)
idx.shape

(2000,)

In [43]:
submit = np.column_stack([filelist_test, idx])

In [44]:
import pandas as pd 

In [45]:
submit = pd.DataFrame(submit, columns=['id', 'flower_class'])

In [46]:
submit['id'] = submit['id'].astype(str)
submit['flower_class'] = submit['flower_class'].astype(int)

In [47]:
submit.dtypes

id              object
flower_class     int32
dtype: object

In [48]:
submit.head()

Unnamed: 0,id,flower_class
0,0028624c49b3e0610ff9f1d111f5d532,2
1,002c30700185b7971369258b438070d5,3
2,00852f4f666acecd0c0d140365b42efd,2
3,00c08828fce04e360c732cac01edad9e,2
4,00d366e7877b6a78b104b57d67b60e6b,2


In [49]:
label_array = [[0, 'daisy'], [1, 'dandelion'], [2, 'rose'], [3, 'sunflower'], [4, 'tulip']]

In [50]:
label_array = pd.DataFrame(label_array, columns=['flower_class', 'flower_name'])

In [51]:
label_array['flower_class'] = label_array['flower_class'].astype(int)
label_array['flower_name'] = label_array['flower_name'].astype(str)

In [52]:
label_array.dtypes

flower_class     int32
flower_name     object
dtype: object

In [53]:
label_array.head()

Unnamed: 0,flower_class,flower_name
0,0,daisy
1,1,dandelion
2,2,rose
3,3,sunflower
4,4,tulip


In [54]:
pd.merge(submit, label_array, how='left', on='flower_class')

Unnamed: 0,id,flower_class,flower_name
0,0028624c49b3e0610ff9f1d111f5d532,2,rose
1,002c30700185b7971369258b438070d5,3,sunflower
2,00852f4f666acecd0c0d140365b42efd,2,rose
3,00c08828fce04e360c732cac01edad9e,2,rose
4,00d366e7877b6a78b104b57d67b60e6b,2,rose
5,00e803f7bc6d21b6d6d3a98136ea4635,2,rose
6,00e9cb1dca407810856e77b31309d5ab,3,sunflower
7,014d33090eb706769ff782d8c500dc2a,3,sunflower
8,015c8f0e6b95baf9dcbb34647624c5b8,2,rose
9,0194948a29f0e891c54f88004fb4c51c,1,dandelion


In [55]:
submit.to_csv('./Day101_Final_Exam_Classfication_of_Flowers.csv', sep=',', encoding='utf-8', index=False)