# Data Preparation
Data preparation will include the augmented images. 
### Steps
1. Set the paths to data
2. Convert all images to the same dimensions
3. Convert to NumPy array
4. Change the data type to Unsigned integer (space and file size consideration).
5. Create the labels for the images
6. Save the data file for later use.


In [1]:
# initialization
import matplotlib.pyplot as plt
import numpy 
import PIL
import pandas as pd
import os
import glob
import time
import numpy as np
import h5py
import tensorflow as tf
import deepdish as dd

from tqdm import tqdm
from keras.datasets import mnist
from keras.models import Model, Sequential
from keras.layers import Input, Concatenate, Dense, Dropout, Flatten, Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical
from keras import backend as K
from keras.optimizers import Adam
from keras.preprocessing import image

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Select the 'tf' image dimension ordering in the Keras backend.
# NOTE(review): presumably this means channels-last (rows, cols, channels) -- confirm for the installed Keras version.
K.set_image_dim_ordering( 'tf' )

Using TensorFlow backend.


In [2]:
# Locations of the data set on disk.

# Root directory that holds the image folders and the label file.
dpath = '/home/ubuntu'
# CSV file with the labels of the training images.
label_f = '/home/ubuntu/labels.csv'
# Folder with the training images.
dpath_train = '/home/ubuntu/train'
# Folder with the test images.
dpath_test = '/home/ubuntu/test'

## Start processing the train images
* **Get the data**
* **Resize and convert to NumPy array**
* **Add the 4th axis**
* **Concatenate images**
* **Store as unsigned** — this will keep the file size small and manageable.
* **Create the labels for the images**

### Long way to create the X_train / X_test data sets. This code is kept for legacy reasons. (Python code)
**This data set will be used in the CNN models; X_train needs to be a normalized NumPy array of type "float32".**

In [3]:
# Read the label table and list the training images.
# The working directory is switched to the data root before the CSV is read,
# then to the training folder so that the relative '*.jpg' pattern below
# matches the training images.
os.chdir(dpath)
id_label = pd.read_csv(label_f)

os.chdir(dpath_train)

# File names of the training images in alphabetical order.
dogs = glob.glob('*.jpg')
dogs.sort()

# Common size for every image: 224 x 224 is the input shape needed by the
# pretrained VGG19 model used later on.
im_W = 224
im_H = 224

# create X_train and y_train  with rescaling the images

# rescale is the helper called from the image-loading loops
def rescale(image):
    """Load one image file and return it as a single-image batch.

    The image is converted to RGB before resizing so that grayscale or RGBA
    files yield the same three-channel layout as colour files and can be
    stacked with them along axis 0.

    Parameters:
        image: path of the image file to open with PIL.

    Returns:
        NumPy array of shape (1, im_H, im_W, 3) holding the resized image.
    """
    # The context manager closes the underlying file once the pixel data has
    # been copied into the array, instead of leaving one open handle per image.
    with PIL.Image.open(image) as org_image:
        # PIL takes the target size as (width, height).
        trans_image = org_image.convert('RGB').resize((im_W, im_H))
        tempAry = np.array(trans_image)
    # Leading axis of length 1 so the results can be concatenated into a batch.
    return np.expand_dims(tempAry, axis=0)

# Build X_train (images) and y_train (breed labels) in the order of `dogs`.

# Look-up table: image id (first CSV column) -> breed (second CSV column).
# setdefault keeps the first row for an id, like a top-to-bottom scan would.
breed_by_id = {}
for img_id, breed in zip(id_label.iloc[:, 0], id_label.iloc[:, 1]):
    breed_by_id.setdefault(img_id, breed)

if not dogs:
    raise FileNotFoundError("no '*.jpg' files found in " + os.getcwd())

# Every image needs a label, otherwise X_train and y_train would end up with
# different lengths. Check before the (slow) image loading starts.
missing = [dog for dog in dogs if dog[:-4] not in breed_by_id]
if missing:
    raise ValueError("no label found for %d image(s), e.g. %s"
                     % (len(missing), missing[0]))

train_batches = []   # one (1, H, W, C) array per image
y_train = []         # breed label per image, same order as the images
for dog in tqdm(dogs):
    train_batches.append(rescale(dog))
    y_train.append(breed_by_id[dog[:-4]])   # dog[:-4] strips the ".jpg" suffix

# Join once at the end: calling np.append inside the loop re-copies the whole
# array for every image, which makes the loop quadratic in the image count.
X_train = np.concatenate(train_batches, axis=0)
del train_batches    # release the per-image arrays before the float conversion
X_train = X_train.astype('float32') / 255   # normalise pixel values to [0, 1]

100%|██████████| 10222/10222 [58:19<00:00,  1.47it/s]


In [11]:
# Spot check: display the label stored at index 1 of y_train.
y_train[1]

'dingo'

In [12]:
# Turn the breed names in y_train into class indices, then one-hot vectors.

# Distinct breed names in alphabetical order; a breed's position in this list
# is its class index.
unique_Dog_Breed = sorted(set(y_train))

# [breed name, class index] pairs, one per distinct breed.
unique_Dog_Breed_Num = [[breed, idx] for idx, breed in enumerate(unique_Dog_Breed)]

# Dictionary look-up replaces scanning the pair list once for every label.
breed_to_index = {breed: idx for idx, breed in enumerate(unique_Dog_Breed)}
y_train = [breed_to_index[breed] for breed in y_train]

print("unique Dog Breed Numbers = ",len(unique_Dog_Breed_Num))
# One-hot encode the integer class indices.
y_train = to_categorical(y_train)

unique Dog Breed Numbers =  120


## Data Shuffling 
**Now shuffle the training set in case the images are stored in sorted order.**

In [13]:
# Shuffle the training data with a single random permutation.
indx = np.arange(len(dogs))
np.random.shuffle(indx)
print(indx[:10]) #check the shuffling
# Apply the same permutation to images and labels so that row i of Xs_train
# still belongs to row i of ys_train. Shuffling only the images would leave no
# label array in matching order.
Xs_train = X_train[indx]               # shuffled training images
ys_train = np.asarray(y_train)[indx]   # labels in the same shuffled order

[ 2973  9551  8383 10054   295  1125  9119  7263  1675  1950]


## Create the X_test

In [14]:
# Build X_test from the images in the test directory.

os.chdir(dpath_test)  # the relative '*.jpg' pattern is resolved against the test folder
# File names of the test images in alphabetical order.
test_dogs = sorted(glob.glob('*.jpg'))

if not test_dogs:
    raise FileNotFoundError("no '*.jpg' files found in " + os.getcwd())

# Load every image as a (1, H, W, C) array and join them once at the end.
# Calling np.append inside the loop re-copies the whole array for every image.
X_test = np.concatenate([rescale(w) for w in tqdm(test_dogs)], axis=0)

X_test = X_test.astype('float32') / 255  # normalise pixel values to [0, 1]

100%|██████████| 10357/10357 [52:25<00:00,  1.68it/s]


## Save the "X_train", "X_test", and "y_train" for future use.
**The files are written in HDF5 format, which suits large arrays; they have to be read back with an HDF5 reader.**

In [15]:
# Save X_train, y_train and X_test in HDF5 format.
os.chdir(dpath)        # the relative file names below are created in the data root
# Actually print the working directory: a bare os.getcwd() in the middle of a
# cell is evaluated and thrown away, so nothing was shown before.
print(os.getcwd())

# HDF5 is hierarchical: each array is stored as a named dataset in the file.
with h5py.File('traindata-org.hdf5','w') as f:   # "-org" marks the pre-augmentation data set
    f.create_dataset('X', data=X_train)
    f.create_dataset('Y', data=y_train)
with h5py.File('testdata.hdf5','w') as f:
    f.create_dataset('X', data=X_test)
# Check point for the shapes; y_train is one-hot encoded.
print(X_test.shape,Xs_train.shape, y_train.shape)

(10357, 224, 224, 3) (10222, 224, 224, 3) (10222, 120)


## Second Method for creating X_train

In [16]:
# Second way to build the training images, stored as uint8.

# Sort the paths: glob returns files in arbitrary directory order, and the
# images are later paired with the label rows by position.
images = sorted(glob.glob('/home/ubuntu/train/*.jpg'))
img_data = []
# Resize every image and convert it to uint8 right away, so the float32 copies
# produced by img_to_array are not all held in memory at once.
for i in tqdm(images):
    img = image.load_img(i, target_size=(im_H, im_W))
    img_data.append(image.img_to_array(img).astype('uint8'))
# Stack along a new leading axis -> (num_images, im_H, im_W, channels).
# This array is used for the transfer-learning model.
tr_img_data_np = np.stack(img_data)

# Load the labels and put the rows in the same order as the images, matching
# the file name without its extension against the first CSV column.
labels = pd.read_csv(label_f)
id_col = labels.columns[0]
image_ids = [os.path.splitext(os.path.basename(i))[0] for i in images]
labels = (labels.drop_duplicates(subset=id_col)   # keep the first row per id
                .set_index(id_col, drop=False)
                .reindex(image_ids))
if labels[id_col].isnull().any():
    raise ValueError("labels.csv has no row for %d of the training images"
                     % int(labels[id_col].isnull().sum()))
# Back to a 0..n-1 index so that labels.breed[k] is the label of image k.
labels = labels.reset_index(drop=True)

100%|██████████| 10222/10222 [00:37<00:00, 272.15it/s]


In [17]:
# Check point: shape and dtype of the uint8 training image array.
print(tr_img_data_np.shape, tr_img_data_np.dtype)

(10222, 224, 224, 3) uint8


In [18]:
# Second way to build the test images, stored as uint8.

# Sort the paths so the row order of the saved array is reproducible;
# glob returns files in arbitrary directory order.
images_Tst = sorted(glob.glob('/home/ubuntu/test/*.jpg'))
img_data_Tst = []
# Resize to the same im_H x im_W size as the training images and convert each
# image to uint8 right away to avoid holding float32 copies of the whole set.
for i in tqdm(images_Tst):
    img = image.load_img(i, target_size=(im_H, im_W))
    img_data_Tst.append(image.img_to_array(img).astype('uint8'))
# Stack along a new leading axis -> (num_images, im_H, im_W, channels).
ts_img_data_np = np.stack(img_data_Tst)

100%|██████████| 10357/10357 [00:37<00:00, 277.04it/s]


## Image Augmentation 

* **This function performs various data augmentation techniques on the dataset (Rotate, Shear, Shift, Zoom)**
* [Source](https://www.kaggle.com/dhayalkarsahilr/easy-image-augmentation-techniques "Code Source")

In [19]:
# Function will take each image and create four more images.
def augment_data(dataset, dataset_labels, augementation_factor=1, use_random_rotation=True, use_random_shear=True, use_random_shift=True, use_random_zoom=True):
    '''
    Return the dataset extended with randomly transformed copies of each image.

    For every image and every augmentation pass, the original image is added
    first, followed by one transformed copy per enabled technique, in the
    order rotation, shear, shift, zoom. Every copy gets the label of the
    image it was made from.

    @parameters:
        dataset: the feature training dataset in numpy array with shape [num_examples, num_rows, num_cols, num_channels] (since it is an image in numpy array)
        dataset_labels: the corresponding training labels in the same order as dataset; indexed as dataset_labels[k] for image k
        augementation_factor: how many augmentation passes to run per image (note the spelling of the parameter name). default: 1.
            The original image is appended once per pass, so a factor above 1 also repeats the originals.
        use_random_rotation: whether to use random rotation. default: True
        use_random_shear: whether to use random shear. default: True
        use_random_shift: whether to use random shift. default: True
        use_random_zoom: whether to use random zoom. default: True

    @returns:
        augmented_image: augmented dataset as a numpy array
        augmented_image_labels: labels corresponding to augmented dataset in order, as a numpy array

    for the augmentation techniques documentation, go here:
        https://www.tensorflow.org/api_docs/python/tf/contrib/keras/preprocessing/image/random_rotation
        https://www.tensorflow.org/api_docs/python/tf/contrib/keras/preprocessing/image/random_shear
        https://www.tensorflow.org/api_docs/python/tf/contrib/keras/preprocessing/image/random_shift
        https://www.tensorflow.org/api_docs/python/tf/contrib/keras/preprocessing/image/random_zoom
    '''
    # Axis layout passed to every transform: a single image is (rows, cols, channels).
    axes = dict(row_axis=0, col_axis=1, channel_axis=2)

    # Enabled transforms in the order they are applied. The lambdas look up
    # tf.contrib only when called, so a disabled technique never touches it.
    transforms = []
    if use_random_rotation:
        transforms.append(lambda img: tf.contrib.keras.preprocessing.image.random_rotation(img, 20, **axes))
    if use_random_shear:
        transforms.append(lambda img: tf.contrib.keras.preprocessing.image.random_shear(img, 0.2, **axes))
    if use_random_shift:
        transforms.append(lambda img: tf.contrib.keras.preprocessing.image.random_shift(img, 0.2, 0.2, **axes))
    if use_random_zoom:
        # zoom range (second arg) has to be a tuple of floats
        transforms.append(lambda img: tf.contrib.keras.preprocessing.image.random_zoom(img, (0.9, 0.9), **axes))

    augmented_image = []
    augmented_image_labels = []

    for num in tqdm(range(0, dataset.shape[0])):  # tqdm progress bar
        for _ in range(0, augementation_factor):
            sample = dataset[num]
            label = dataset_labels[num]

            # original image first ...
            augmented_image.append(sample)
            augmented_image_labels.append(label)

            # ... then one transformed copy per enabled technique
            for transform in transforms:
                augmented_image.append(transform(sample))
                augmented_image_labels.append(label)

    return np.array(augmented_image), np.array(augmented_image_labels)

## Creating the Augmented Images 
**aug_images and aug_labels are the augmented X_train and y_train.**

In [20]:
# Augment the training set: with the default settings every image is kept and
# four transformed copies are added, so the data set grows 5x.
aug_images, aug_labels = augment_data(
    dataset=tr_img_data_np,
    dataset_labels=labels.breed,
)

100%|██████████| 10222/10222 [07:50<00:00, 22.33it/s]


In [21]:
# Check point: shapes of the augmented images and labels, plus the image dtype.
print(aug_images.shape, aug_labels.shape, aug_images.dtype)
# The dtype is left as the last expression so the notebook displays it as well.
aug_images.dtype

(51110, 224, 224, 3) (51110,) uint8


dtype('uint8')

In [22]:
# Check point: shape and dtype of the one-hot encoded training labels.
print(y_train.shape, y_train.dtype)

(10222, 120) float32


In [23]:
# Display the first two rows of y_train.
y_train[:2]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,

## Save X_train and y_train data for later usage.
**This is the input file with the augmented images. Since it is very large, deepdish.io was used with compression.**

In [24]:
# Write the augmented images ('X') and their labels ('Y') to one HDF5 file
# through deepdish, using blosc compression at level 9.
Aug_train_data = '/home/ubuntu/train_aug_data.hdf5'
dd.io.save(
    Aug_train_data,
    {'X': aug_images, 'Y': aug_labels},
    compression=('blosc', 9),
)

In [25]:
# Write the uint8 test images ('X') to an HDF5 file through deepdish,
# using blosc compression at level 9.
test_data = '/home/ubuntu/test_data.hdf5'
dd.io.save(
    test_data,
    {'X': ts_img_data_np},
    compression=('blosc', 9),
)

## preprocessing of the data
* Create the X_train and y_train
* write the arrays to a file and upload

In [None]:
# Keep the augmented images as unsigned 8-bit integers.
# (Alternative for normalised floats: aug_images.astype('float32')/255)
Xs_train = aug_images.astype('uint8') 

### legacy:  saving files with h5py

In [None]:
# Legacy: save the augmented images and labels with h5py.
os.chdir(dpath)   # the relative file name below is created in the data root

# h5py cannot store NumPy unicode ('U') or generic object arrays, so text
# labels are converted to fixed-length UTF-8 byte strings first. Other label
# dtypes (e.g. numeric or one-hot) are written unchanged.
# When reading the file back, decode 'Y' with np.char.decode(..., 'utf-8').
labels_to_store = np.asarray(aug_labels)
if labels_to_store.dtype.kind in ('U', 'O'):
    labels_to_store = np.char.encode(labels_to_store.astype('U'), 'utf-8')

with h5py.File('Aug_traindata.hdf5','w') as f:
    f.create_dataset('X', data=aug_images)
    f.create_dataset('Y', data=labels_to_store)

### A shuffling of data just in case we need prior to running the models.

In [None]:
# Shuffle the augmented training data (done for the CNN-style deep learning).
indx = np.arange(len(aug_images))
np.random.shuffle(indx)
print(indx[:10]) #check the shuffling
# Apply the same permutation to images and labels so that row i of Xs_train
# still belongs to row i of ys_train.
Xs_train = aug_images[indx]               # shuffled augmented images
ys_train = np.asarray(aug_labels)[indx]   # labels in the same shuffled order