In [118]:
label_dict = {0:'daisy', 1:'dandelion', 2:'rose', 3:'sunflower', 4:'tulip'}

train_data_path = './Day_101/ml100marathon-final-exam/image_data/train'

### Refer to https://github.com/lucasdupin/ml-image-scaling/blob/master/1_preprocessing.ipynb

In [119]:
import numpy as np
import matplotlib.pyplot as plt  

from PIL import Image # Make sure you have Pillow! PIL may import but WILL raise exceptions

%matplotlib inline

In [120]:
import os
import sys

SMALL=32

original_size = (80, 80)
small_size = (SMALL, SMALL)

def image_cache_root_path(image_folder):
    return os.path.join(image_folder, '.cache')

def image_cache_folder_path(image_folder, resolution):
    return os.path.join(image_cache_root_path(image_folder), resolution)
    
def image_cache_file_path(image_folder, image_file, resolution):
    return os.path.join(image_cache_folder_path(image_folder, resolution), image_file)
    

def process_image(folder, image_path, image_cache_high, image_cache_low):
    """ Resizes an image, creating a big and a reduced version """
    
    # Skip files that already were generated or directories
    if os.path.exists(image_cache_high) and os.path.exists(image_cache_low):
        return
    
    with Image.open(image_path) as original:
  
        # Ignore images smaller than our target size
        if original.size[0] < original_size[0] or original.size[1] < original_size[1]:
            print("image %s with size %s is too small, skipping..." % (image_path, original.size))
            return

        #print("resizing and cropping: %s" % image_path)
        center = (original.size[0]/2, original.size[1]/2)
        size = original_size[0]
        cropped = original.crop((center[0] - size/2, 
                                 center[1] - size/2, 
                                 center[0] + size/2,
                                 center[1] + size/2))
        # High-res version - labels
        cropped.save(image_cache_high)
        # Generate low-res version of the image - train data
        small = cropped.resize(small_size, Image.ANTIALIAS)
        small.save(image_cache_low)


def process_imagefolder_recursive(folder):
    
    print("Processing folder %s" % (folder))
    #print(".", end="")
    
    image_cache_root = image_cache_root_path(folder)
    if not os.path.exists(image_cache_root):
        os.mkdir(image_cache_root)

    image_cache_folder_high = image_cache_folder_path(folder, 'high')
    if not os.path.exists(image_cache_folder_high):
        os.mkdir(image_cache_folder_high)

    image_cache_folder_low = image_cache_folder_path(folder, 'low')
    if not os.path.exists(image_cache_folder_low):
        os.mkdir(image_cache_folder_low)

    # Get list of all image folders
    for d in sorted(os.listdir(folder)):
        
        # Ignore cache itself
        if d.startswith("."):
            print(f'skip hidden file {d}')
            continue
        
        # Recursive to folder
        if os.path.isdir(os.path.join(folder, d)):
            print(f'skip folder {d}')
            process_imagefolder_recursive(os.path.join(folder, d))
            continue

        # Check picture only
        if d.lower().endswith(('.jpg', 'png')):
            image_path = os.path.join(folder, d)
            image_cache_high = image_cache_file_path(folder, d, 'high')
            image_cache_low = image_cache_file_path(folder, d, 'low')
            process_image(folder, image_path, image_cache_high, image_cache_low)
        
process_imagefolder_recursive('.\Day_101\ml100marathon-final-exam\image_data')
        
print("Done!")

Processing folder .\Day_101\ml100marathon-final-exam\image_data
skip hidden file .cache
skip folder test
Processing folder .\Day_101\ml100marathon-final-exam\image_data\test
skip hidden file .cache
skip folder train
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train
skip hidden file .cache
skip folder daisy
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\daisy
skip hidden file .cache
skip folder dandelion
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\dandelion
skip hidden file .cache
skip folder rose
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\rose
skip hidden file .cache
skip folder sunflower
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\sunflower
skip hidden file .cache
skip folder tulip
Processing folder .\Day_101\ml100marathon-final-exam\image_data\train\tulip
skip hidden file .cache
Done!


In [121]:
import imageio

In [122]:
# Read image file to memory as RGB array
def load_image_to_rgb(image_path):
    im = imageio.imread(image_path)
    # print(im.shape)
    return im


In [123]:
import numpy as np

In [124]:
def load_image_and_label(folder, showfile=False):
    image_array = np.empty((0, SMALL, SMALL, 3), int)
    file_array = np.empty((0), str)
    count = 0
    print(f'image_array.shape = {image_array.shape}')
    folder_list = os.listdir(folder)
    print(f'folder_list count of {folder} is {len(folder_list)}')
    for d in folder_list:
        if not d.lower().endswith(('.jpg', 'png')):
            print(f'skip non image file: {d}')
            continue
        im = load_image_to_rgb(os.path.join(folder, d))
        if im is None:
            print(f'Error when loading image:{os.path.join(folder, d)}')
            continue
        # img_array[count] = im
        #print(f'im.shape = {im.shape}')
        a = np.empty((1, SMALL, SMALL, 3), int)
        a[0] = im
        image_array = np.append(image_array, a, axis=0)
        file_array = np.append(file_array, [os.path.splitext(d)[0]], axis=0)
        count += 1
        
        #print(f'count = {count}')
        # print(f'd={d}')
        # print(f'image_array.shape = {image_array.shape}')
        
    if showfile:
        return (image_array, count, file_array)
    else:
        return (image_array, count)

In [125]:

x_img_train = np.empty((0, SMALL, SMALL, 3), int)
y_label_train = np.empty((0, 1), int)

for index, flower in label_dict.items():
    print(f'Processing training data, flower={flower}, index={index}')
    image_cache_low = image_cache_folder_path(os.path.join('.\Day_101\ml100marathon-final-exam\image_data', 'train', flower), 'low')
    
    x, count = load_image_and_label(image_cache_low)
    x_img_train = np.append(x_img_train, x, axis = 0)
    
    y = [[ index ] for x in range(0, count)]
    y_label_train = np.append(y_label_train, np.asarray(y), axis = 0)
    
    print(x_img_train.shape, y_label_train.shape)
    
print(x_img_train[0])
print(y_label_train)

Processing training data, flower=daisy, index=0
image_array.shape = (0, 32, 32, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\daisy\.cache\low is 500
(500, 32, 32, 3) (500, 1)
Processing training data, flower=dandelion, index=1
image_array.shape = (0, 32, 32, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\dandelion\.cache\low is 687
(1187, 32, 32, 3) (1187, 1)
Processing training data, flower=rose, index=2
image_array.shape = (0, 32, 32, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\rose\.cache\low is 515
(1702, 32, 32, 3) (1702, 1)
Processing training data, flower=sunflower, index=3
image_array.shape = (0, 32, 32, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\sunflower\.cache\low is 488
(2190, 32, 32, 3) (2190, 1)
Processing training data, flower=tulip, index=4
image_array.shape = (0, 32, 32, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\train\

In [126]:

def plot_images_labels_prediction(images, labels, prediction, idx, num=10):
    # gcf: Get Current Figure
    fig = plt.gcf()
    fig.set_size_inches(10, 14)
    if num>25: num=25 
    for i in range(0, num):
        ax=plt.subplot(5,5, 1+i)
        ax.imshow(images[idx],cmap='binary')
                
        title = str(i) + ',' + label_dict[labels[i][0]]
        if len(prediction)>0:
            title += '=>' + label_dict[prediction[i]]
            
        ax.set_title(title,fontsize=10) 
        ax.set_xticks([]);ax.set_yticks([])        
        idx+=1 
    plt.show()

In [127]:
# plot_images_labels_prediction(x_img_train,y_label_train,[],0)

# Image normalize 

In [128]:
# Show first pixel
x_img_train[0][0][0]

array([136, 130,  98])

In [129]:
x_img_train_normalize = x_img_train.astype('float32') / 255.0


In [130]:
# Normalize R,G,B to 0.0 ~ 1.0
x_img_train_normalize[0][0][0]

array([0.53333336, 0.50980395, 0.38431373], dtype=float32)

# 轉換label 為OneHot Encoding

In [131]:
y_label_train.shape

(2823, 1)

In [132]:
y_label_train[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [133]:
from keras.utils import np_utils
y_label_train_OneHot = np_utils.to_categorical(y_label_train).astype('int')

In [134]:
y_label_train_OneHot.shape

(2823, 5)

In [135]:
y_label_train_OneHot[:5]

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [136]:
del x_img_train, y_label_train

# Keras sequential

In [137]:
import keras
# from keras.datasets import cifar10
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [138]:
batch_size = 16
num_classes = 5
epochs = 10

In [139]:
# build our CNN model, 多加幾層
model = Sequential()

model.add(Conv2D(filters=64, 
                 kernel_size=(3, 3),
                 input_shape=x_img_train_normalize.shape[1:],
                 activation='relu',
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))
#model.add(Activation('softmax'))



In [140]:
# Dump all model layers
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 32, 32, 64)        1792      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 16, 16, 64)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 16384)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               8389120   
_________________________________________________________________
activation_9 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 2565      
__________

In [141]:
# validation_split: use 20 % to validate, so 50000*0.2 = 10000. (also means 40000 samples to train)
# epochs: 10 times

In [142]:
# initiate Adam "optimizer"
opt = keras.optimizers.Adam()

# Let's train the model using Adam
model.compile(loss='categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

model_history = model.fit(x_img_train_normalize, y_label_train_OneHot, validation_split=0.2, epochs=epochs, batch_size=batch_size, verbose=1)  

# Score trained model.
#scores = model.evaluate(x_test, y_test, verbose=1)
#print('Test accuracy:', scores[1])

Train on 2258 samples, validate on 565 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [143]:
del x_img_train_normalize, y_label_train_OneHot

# Start to Predict Data

In [144]:
# Load testing flower data
x_img_test = np.empty((0, SMALL, SMALL, 3), int)
y_label_test = np.empty((0, 1), int)
filelist_test = np.empty((0), str)

print(f'Processing testing flower')

image_cache_low = image_cache_folder_path(os.path.join('.\Day_101\ml100marathon-final-exam\image_data', 'test'), 'low')
x, count, filelist_test = load_image_and_label(image_cache_low, showfile=True)
x_img_test = np.append(x_img_test, x, axis = 0)
    
y = [[ 0 ] for x in range(0, count)]
y_label_test = np.append(y_label_test, np.asarray(y), axis = 0)
    
print(x_img_test.shape, y_label_test.shape)
print(x_img_test[0])
print(y_label_test)

Processing testing flower
image_array.shape = (0, 32, 32, 3)
folder_list count of .\Day_101\ml100marathon-final-exam\image_data\test\.cache\low is 2000
(2000, 32, 32, 3) (2000, 1)
[[[135 137 115]
  [143 142 122]
  [128 122 108]
  ...
  [ 14  24   0]
  [ 19  27  12]
  [ 51  59  48]]

 [[ 84  86  64]
  [141 138 121]
  [154 146 135]
  ...
  [ 39  49  25]
  [102 110  95]
  [105 113 102]]

 [[ 57  56  36]
  [109 106  89]
  [142 133 124]
  ...
  [ 35  45  21]
  [ 56  65  48]
  [ 11  17   5]]

 ...

 [[128  59  77]
  [126  57  75]
  [124  56  71]
  ...
  [ 91  45  48]
  [100  45  51]
  [127  67  75]]

 [[144  72  84]
  [131  59  70]
  [132  60  71]
  ...
  [ 86  42  43]
  [ 95  38  44]
  [119  54  62]]

 [[148  72  82]
  [137  62  69]
  [144  69  76]
  ...
  [119  75  76]
  [106  47  53]
  [106  39  46]]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [145]:
print(f'filelist_test.shape = {filelist_test.shape}')
filelist_test[:5]

filelist_test.shape = (2000,)


array(['0028624c49b3e0610ff9f1d111f5d532',
       '002c30700185b7971369258b438070d5',
       '00852f4f666acecd0c0d140365b42efd',
       '00c08828fce04e360c732cac01edad9e',
       '00d366e7877b6a78b104b57d67b60e6b'], dtype='<U32')

In [146]:
print('x_img_test:',x_img_test.shape)
print('y_label_test :',y_label_test.shape)
x_img_test_normalize = x_img_test.astype('float32') / 255.0

x_img_test: (2000, 32, 32, 3)
y_label_test : (2000, 1)


In [147]:
y_label_test_OneHot = model.predict(x_img_test_normalize)

In [148]:
y_label_test_OneHot.shape

(2000, 5)

In [149]:
y_label_test_OneHot[:10]

array([[1.3219657e-03, 1.8079878e-03, 4.5336541e-02, 2.2863748e-03,
        7.9935929e-04],
       [2.5627911e-03, 1.7287708e-03, 3.2592849e-03, 1.3987745e-01,
        2.5442237e-04],
       [3.9744170e-04, 8.7482640e-06, 1.0051860e-03, 2.1811591e-02,
        1.1030125e-03],
       [3.6341205e-06, 7.4583646e-08, 9.3259607e-03, 9.0099435e-05,
        1.9664073e-04],
       [3.0617954e-02, 5.4759085e-03, 8.5567307e-01, 4.7584781e-03,
        8.6003326e-02],
       [5.3492299e-04, 6.5702232e-05, 7.5249869e-01, 1.4787638e-03,
        1.7333299e-02],
       [1.3364902e-02, 6.0101103e-02, 1.7739547e-02, 1.4020076e-01,
        3.3059993e-03],
       [3.2094168e-04, 6.6651835e-04, 2.1752694e-05, 7.4762739e-03,
        1.7474022e-05],
       [5.5941568e-06, 2.5678815e-05, 1.6383909e-01, 7.3045428e-04,
        7.9642632e-04],
       [2.8016358e-03, 4.9100179e-02, 2.8442871e-04, 2.6822281e-05,
        1.5178222e-06]], dtype=float32)

In [150]:
idx = np.argmax(y_label_test_OneHot, axis=-1)
idx.shape

(2000,)

In [151]:
submit = np.column_stack([filelist_test, idx])

In [152]:
import pandas as pd 

In [153]:
submit = pd.DataFrame(submit, columns=['id', 'flower_class'])

In [154]:
submit['id'] = submit['id'].astype(str)
submit['flower_class'] = submit['flower_class'].astype(int)

In [155]:
submit.dtypes

id              object
flower_class     int32
dtype: object

In [156]:
submit.head()

Unnamed: 0,id,flower_class
0,0028624c49b3e0610ff9f1d111f5d532,2
1,002c30700185b7971369258b438070d5,3
2,00852f4f666acecd0c0d140365b42efd,3
3,00c08828fce04e360c732cac01edad9e,2
4,00d366e7877b6a78b104b57d67b60e6b,2


In [157]:
label_array = [[0, 'daisy'], [1, 'dandelion'], [2, 'rose'], [3, 'sunflower'], [4, 'tulip']]

In [158]:
label_array = pd.DataFrame(label_array, columns=['flower_class', 'flower_name'])

In [159]:
label_array['flower_class'] = label_array['flower_class'].astype(int)
label_array['flower_name'] = label_array['flower_name'].astype(str)

In [160]:
label_array.dtypes

flower_class     int32
flower_name     object
dtype: object

In [161]:
label_array.head()

Unnamed: 0,flower_class,flower_name
0,0,daisy
1,1,dandelion
2,2,rose
3,3,sunflower
4,4,tulip


In [162]:
pd.merge(submit, label_array, how='left', on='flower_class')

Unnamed: 0,id,flower_class,flower_name
0,0028624c49b3e0610ff9f1d111f5d532,2,rose
1,002c30700185b7971369258b438070d5,3,sunflower
2,00852f4f666acecd0c0d140365b42efd,3,sunflower
3,00c08828fce04e360c732cac01edad9e,2,rose
4,00d366e7877b6a78b104b57d67b60e6b,2,rose
5,00e803f7bc6d21b6d6d3a98136ea4635,2,rose
6,00e9cb1dca407810856e77b31309d5ab,3,sunflower
7,014d33090eb706769ff782d8c500dc2a,3,sunflower
8,015c8f0e6b95baf9dcbb34647624c5b8,2,rose
9,0194948a29f0e891c54f88004fb4c51c,1,dandelion


In [163]:
submit.to_csv('./Day101_Final_Exam_Classfication_of_Flowers.csv', sep=',', encoding='utf-8', index=False)