In [1]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns


#model selection
from tensorflow.keras.models import Model, load_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Add,Dropout, Dense, Activation, ZeroPadding2D, \
BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalAveragePooling2D
#preprocess.
from keras.preprocessing.image import ImageDataGenerator

#dl libraries
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from keras.utils import to_categorical

# specifically for cnn
from keras.layers import Dropout, Flatten,Activation
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
import tensorflow as tf
import random as rn

# specifically for manipulating zipped images and getting numpy arrays of pixel values of images.
import cv2                  
import numpy as np  
from tqdm import tqdm
import os                   
from random import shuffle  
from zipfile import ZipFile
import PIL
import PIL.Image

import glob   
import shutil


# Data pre-processing

In [2]:
# Source directory: expected to contain one sub-directory per category
data_dir = os.path.join(os.curdir, "Flowers")

# Training data dir
training_dir = os.path.join(os.curdir, "Train")

# Test data dir
testing_dir = os.path.join(os.curdir, "Test")

# Ratio of training and testing data
train_test_ratio = 0.8


def split_dataset_into_test_and_train_sets(all_data_dir=data_dir, training_data_dir=training_dir,
                                           testing_data_dir=testing_dir, train_test_ratio=train_test_ratio,
                                           seed=None):
    """Copy the ``*.jpg`` files of every category folder into train/test folders.

    Every directory found below ``all_data_dir`` is treated as a category. Its
    ``*.jpg`` files are shuffled and partitioned at a single cut point, so each
    file is copied to exactly one of the two destinations.

    Parameters
    ----------
    all_data_dir : str
        Directory whose sub-directories hold the images of each category.
    training_data_dir : str
        Destination root for the training copies (created if missing).
    testing_data_dir : str
        Destination root for the testing copies (created if missing).
    train_test_ratio : float
        Fraction of each category's images copied to the training directory.
    seed : int or None
        When given, the shuffle is reproducible. When None, NumPy's global
        random state is used.

    Notes
    -----
    Files already present in the destination directories are not removed.
    Re-running with a different shuffle can therefore leave the same image in
    both sets; delete the destination directories first for a clean split.
    """
    # Create the destination roots if they do not exist yet
    os.makedirs(training_data_dir, exist_ok=True)
    os.makedirs(testing_data_dir, exist_ok=True)

    rng = np.random if seed is None else np.random.RandomState(seed)

    num_training_files = 0
    num_testing_files = 0

    for subdir, _dirs, _files in os.walk(all_data_dir):
        category_name = os.path.basename(subdir)

        # Skip the root directory itself; only its sub-directories are categories
        if category_name == os.path.basename(all_data_dir):
            continue

        training_data_category_dir = os.path.join(training_data_dir, category_name)
        testing_data_category_dir = os.path.join(testing_data_dir, category_name)

        # creating subdir for each sub category
        os.makedirs(training_data_category_dir, exist_ok=True)
        os.makedirs(testing_data_category_dir, exist_ok=True)

        # Sorted so that a fixed seed gives the same split regardless of the
        # order in which the file system lists the files
        file_list = sorted(glob.glob(os.path.join(subdir, '*.jpg')))

        # Report the files that are actually split, not every file in the folder
        print(str(category_name) + ' has ' + str(len(file_list)) + ' images')

        random_set = rng.permutation(file_list)

        # One cut point keeps the two subsets disjoint and exhaustive.
        # Slicing the test set independently from the end (``[-k:]``) selects
        # the whole list when k == 0 and can overlap with the training slice.
        split_index = round(len(random_set) * train_test_ratio)
        train_list = random_set[:split_index]
        test_list = random_set[split_index:]

        for image_path in train_list:
            shutil.copy(str(image_path), training_data_category_dir)
            num_training_files += 1

        for image_path in test_list:
            shutil.copy(str(image_path), testing_data_category_dir)
            num_testing_files += 1

    print("Processed " + str(num_training_files) + " training files.")
    print("Processed " + str(num_testing_files) + " testing files.")

In [3]:
# Build the Train/ and Test/ folders from the source images, spelling out the
# configured locations and ratio instead of relying on the defaults.
split_dataset_into_test_and_train_sets(
    all_data_dir=data_dir,
    training_data_dir=training_dir,
    testing_data_dir=testing_dir,
    train_test_ratio=train_test_ratio,
)

Babi has 931 images
Calimerio has 353 images
Chrysanthemum has 696 images
Hydrangeas has 518 images
Lisianthus has 969 images
Pingpong has 360 images
Rosy has 171 images
Tana has 623 images
Processed 3696 training files.
Processed 925 testing files.


In [4]:
# Image pre-processing pipelines for the training and validation images.
# Only the training pipeline augments (rotation, shifts, shear, zoom, flip);
# the validation pipeline just rescales pixel values.

image_size = 224
batch_size = 64

augmentation_options = dict(
    rescale=1./255,  # normalization
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.1,
    fill_mode='nearest',
    horizontal_flip=True,
)
train_data_gen = ImageDataGenerator(**augmentation_options)
valid_data_gen = ImageDataGenerator(rescale=1./255)


def _flow(generator, directory, side, **extra):
    """Return a categorical directory iterator yielding side x side images."""
    return generator.flow_from_directory(
        directory,
        target_size=(side, side),
        batch_size=batch_size,
        class_mode='categorical',
        **extra)


# Train and validation data at the default image size
train_generator = _flow(train_data_gen, training_dir, image_size)
valid_generator = _flow(valid_data_gen, testing_dir, image_size)

# Same data at different image resolutions
train200 = _flow(train_data_gen, training_dir, 200)
test200 = _flow(valid_data_gen, testing_dir, 200)

train250 = _flow(train_data_gen, training_dir, 250)
test250 = _flow(valid_data_gen, testing_dir, 250)

train300 = _flow(train_data_gen, training_dir, 300)
test300 = _flow(valid_data_gen, testing_dir, 300)

# "Raw" data: rescaled only, no augmentation on either split
train_raw = _flow(valid_data_gen, training_dir, 250)
test_raw = _flow(valid_data_gen, testing_dir, 250)

# Grayscale variants at 250 x 250
train_grayscale = _flow(train_data_gen, training_dir, 250, color_mode="grayscale")
valid_grayscale = _flow(valid_data_gen, testing_dir, 250, color_mode="grayscale")


Found 3696 images belonging to 8 classes.
Found 925 images belonging to 8 classes.
Found 3696 images belonging to 8 classes.
Found 925 images belonging to 8 classes.
Found 3696 images belonging to 8 classes.
Found 925 images belonging to 8 classes.
Found 3696 images belonging to 8 classes.
Found 925 images belonging to 8 classes.
Found 3696 images belonging to 8 classes.
Found 925 images belonging to 8 classes.
Found 3696 images belonging to 8 classes.
Found 925 images belonging to 8 classes.


## Base model

In [5]:
def create_cnn_basemodel(size, channels=3, num_classes=8, learning_rate=0.001):
    """Build and compile the baseline CNN classifier.

    The network is four Conv2D + MaxPooling2D stages (32, 64, 96, 96 filters)
    followed by a 512-unit dense layer and a softmax output layer.

    Parameters
    ----------
    size : int
        Height and width of the square input images.
    channels : int
        Number of input channels (3 for RGB, 1 for grayscale).
    num_classes : int
        Number of units in the softmax output layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A compiled ``Sequential`` model using categorical cross-entropy loss and
    the accuracy metric.
    """
    model = Sequential()
    model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='same', activation='relu',
                     input_shape=(size, size, channels)))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Conv2D(filters=96, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Conv2D(filters=96, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(Flatten())

    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(num_classes, activation="softmax"))

    # `lr` is a deprecated alias of `learning_rate`; use the current name
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [6]:
# One independent model per experiment
cnn200 = create_cnn_basemodel(200)
cnn250 = create_cnn_basemodel(250)
cnn300 = create_cnn_basemodel(300)

# The raw-data experiment also uses 250 x 250 inputs, but it needs its own
# model instance: `cnn_raw = cnn250` would make both names point at the same
# object, so training one would also change the weights of the other.
cnn_raw = create_cnn_basemodel(250)

  super().__init__(name, **kwargs)


### CNN with Raw Data

In [10]:
# Train the baseline CNN on the non-augmented 250 x 250 images.
# len(iterator) is the number of batches per epoch including the final
# partial batch; n // batch_size would drop the remainder, so some
# validation images would not be evaluated each epoch.
history = cnn_raw.fit(
    train_raw,
    steps_per_epoch=len(train_raw),
    validation_data=test_raw,
    validation_steps=len(test_raw),
    epochs=10,
    verbose=1)

# Training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['train', 'test'])
plt.show()

Epoch 1/10

KeyboardInterrupt: 

### CNN with image dataset size 200

In [12]:
# Train the baseline CNN on the augmented 200 x 200 images.
# len(iterator) is the number of batches per epoch including the final
# partial batch; n // batch_size would drop the remainder, so some
# validation images would not be evaluated each epoch.
history = cnn200.fit(
    train200,
    steps_per_epoch=len(train200),
    validation_data=test200,
    validation_steps=len(test200),
    epochs=10,
    verbose=1)

# Training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy for image dataset size 200')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['train', 'test'])
plt.show()

Epoch 1/10

KeyboardInterrupt: 

### CNN with image dataset size 250

In [None]:
# Train the baseline CNN on the augmented 250 x 250 images.
# len(iterator) is the number of batches per epoch including the final
# partial batch; n // batch_size would drop the remainder, so some
# validation images would not be evaluated each epoch.
history = cnn250.fit(
    train250,
    steps_per_epoch=len(train250),
    validation_data=test250,
    validation_steps=len(test250),
    epochs=10,
    verbose=1)

# Training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy for image dataset size 250')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['train', 'test'])
plt.show()

### CNN with image dataset size 300

In [None]:
# Train the baseline CNN on the augmented 300 x 300 images.
# len(iterator) is the number of batches per epoch including the final
# partial batch; n // batch_size would drop the remainder, so some
# validation images would not be evaluated each epoch.
history = cnn300.fit(
    train300,
    steps_per_epoch=len(train300),
    validation_data=test300,
    validation_steps=len(test300),
    epochs=10,
    verbose=1)

# Training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy for image dataset size 300')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['train', 'test'])
plt.show()

### Base model for grayscale

In [14]:
# Baseline CNN for single-channel (grayscale) 250 x 250 inputs: four
# convolution + max-pooling stages, then a 512-unit dense layer and an
# 8-way softmax output. The layers are listed in the order they are stacked.
grayscale_layers = [
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same', activation='relu',
           input_shape=(250, 250, 1)),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(filters=64, kernel_size=(3, 3), padding='Same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),

    Conv2D(filters=96, kernel_size=(3, 3), padding='Same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),

    Conv2D(filters=96, kernel_size=(3, 3), padding='Same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),

    Flatten(),

    Dense(512),
    Activation('relu'),
    Dense(8, activation="softmax"),
]

grayscaleModel = Sequential()
for grayscale_layer in grayscale_layers:
    grayscaleModel.add(grayscale_layer)

In [15]:
# Compile with Adam and categorical cross-entropy.
# `lr` is a deprecated alias of `learning_rate`; use the current name.
grayscaleModel.compile(optimizer=Adam(learning_rate=0.001),
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
grayscaleModel.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 250, 250, 32)      832       
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 125, 125, 32)     0         
 g2D)                                                            
                                                                 
 conv2d_13 (Conv2D)          (None, 125, 125, 64)      18496     
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 62, 62, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_14 (Conv2D)          (None, 62, 62, 96)        55392     
                                                                 
 max_pooling2d_14 (MaxPoolin  (None, 31, 31, 96)      

In [16]:
#Replace the train data to test run
history = grayscaleModel.fit(
    train_grayscale,
    steps_per_epoch=train_grayscale.n//batch_size,
    validation_data=valid_grayscale,
    validation_steps=valid_grayscale.n//batch_size,
    epochs=10,
    verbose=1)

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy for image dataset size 300')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['train', 'test'])
plt.show()

Epoch 1/10
 5/57 [=>............................] - ETA: 1:40 - loss: 2.1825 - accuracy: 0.1625

KeyboardInterrupt: 