In [0]:
import numpy as np
## Pyplot from Matplotlib for visualizing the test results in plots
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import skimage.color as skcolor
from PIL import Image
import math
import h5py
import os
import zipfile
import requests
import datetime
import random

In [0]:
# Function building a zero-padding printf-style format string for integers
## length:           int, the total width the formatted integer is padded to
## return value:     String, '%0<length>d' (e.g. length 5 gives '%05d')
def formatint(length):
  ## Render the width as a decimal number first, then wrap it into the pattern.
  width = '%d' % length
  return '%0{}d'.format(width)

In [0]:
#Function for making folder (including missing parent folders) if it does not exist in the given path.
#@ Used packages:
  #@ os
## path:     String, folder path that has to exist after the call
def MakePath(path):
  ## exist_ok avoids the check-then-create race of testing os.path.exists first:
  ## the call succeeds whether or not the folder is already there.
  os.makedirs(path, exist_ok=True)

In [0]:
#Function for printing a text immediately followed by the current time
#@ Used packages:
  #@ datetime
## string:        String, the text printed in front of the timestamp
## return value:  datetime, the moment that was printed
def PrintTime(string):
    now = datetime.datetime.now()
    print(string + str(now))
    return now

In [0]:
# Class for Collecting Data (downloading the zipped dataset and extracting it)
#@ Used packages:
  #@ requests
  #@ zipfile
  #@ os
#@ Used functions:
  #@ MakePath
  #@ PrintTime
class DataCollect:
    # Class Variables (placeholders until the matching method has been called)
    ## url:                   String, url of the zipped dataset 
    ## dataset_zip_path:      String, filepath of the downloaded zip
    ## raw_dataset_path:      String, final folder path of the extracted raw dataset
    url              = "Not initialized."
    dataset_zip_path = "Not initialized."
    raw_dataset_path = "Not initialized."

    # Class Constructor
    ## url:           String, url of the zipped dataset 
    def __init__(self, url):
        ## Initializing the url class variable
        self.url = url

    # Function for downloading the dataset.
    ## target_path:   String, intended filepath of the downloaded dataset
    ## raises:        requests.HTTPError if the server answers with an error status
    def download_dataset(self, target_path):
        ## Saving the dataset zip path to class variable
        self.dataset_zip_path = target_path

        PrintTime('Start zip file download from: ' + self.url +'\n\t')

        ## Streaming the file in chunks to avoid memory overrun. The response is used as a
        ## context manager so the connection is released even if writing fails.
        with requests.get(self.url, stream = True) as r:
            ## Fail early instead of saving an HTML error page as if it were the zip.
            r.raise_for_status()
            with open(target_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    ## Filtering out keep-alive new chunks.
                    if chunk:
                        f.write(chunk)

        PrintTime('Zip file download.\n\t')

    # Function for extracting zipped dataset.
    ## raw_dataset_path:      String, intended folder path of raw dataset
    ## return value:          String, final folder path of raw dataset (with trailing separator)
    def extract_dataset(self, raw_dataset_path):
        ## Creating directory for the raw dataset, if it does not exist.
        MakePath(raw_dataset_path)

        PrintTime('Start images extraction.\n\t')

        ## Extracting dataset to the intended folder path; the archive is closed even on error.
        with zipfile.ZipFile(self.dataset_zip_path, 'r') as zip_ref:
            zip_ref.extractall(raw_dataset_path)
            member_names = zip_ref.namelist()

        ## Determining the top-level folder from the archive itself rather than from os.listdir:
        ## the listing order is arbitrary and may contain folders left over from earlier runs.
        top_level_dirs = sorted({
            name.lstrip('/').split('/', 1)[0]
            for name in member_names
            if '/' in name.lstrip('/')
        })

        ## Saving the raw dataset path to class variable. The empty last component makes
        ## os.path.join append the trailing separator. A flat archive (no folder inside)
        ## leaves the images directly in raw_dataset_path.
        if top_level_dirs:
            self.raw_dataset_path = os.path.join(raw_dataset_path, top_level_dirs[0], '')
        else:
            self.raw_dataset_path = os.path.join(raw_dataset_path, '')

        PrintTime('Images are extracted.\n\t')
        return self.raw_dataset_path

In [0]:
# Class for Preprocessing Data (square crop + resize, RGB -> LAB conversion, HDF5 export)
#@ Used packages:
  #@ from PIL Image
  #@ numpy as np
  #@ os
  #@ h5py
  #@ skimage.color as skcolor
  #@ random
  #@ datetime
#@ Used functions:
  #@ MakePath
  #@ PrintTime
class DataPreProcess:
    # Class Constructor
    ## initial_path:           String, the folder path with the raw images
    ## target_path:            String, folder path for the transformed images
    ## image_size:             tuple with 2 integer element, (width, height)
    ## train_split:            float, fraction of the files assigned to the training set
    ## validation_split:       float, fraction of the files assigned to the validation set
    ## test_split:             float, stored only; the test set is whatever remains after train and validation
    def __init__(self, initial_path, target_path, image_size, train_split = 0.9, validation_split = 0.05, test_split = 0.05):
        self.raw_dataset_path         = initial_path
        self.transformed_dataset_path = target_path
        self.image_size               = image_size
        self.train_sp                 = train_split 
        self.valid_sp                 = validation_split
        self.test_sp                  = test_split

    # Function for transforming the images of the dataset to 1:1 ratio, and target size.
    def dataset_transform(self):
        ## Creating directory for the transformed dataset, if it does not exist.
        MakePath(self.transformed_dataset_path)

        PrintTime('Start dataset transforming.\n\t')

        ## Iterating over the raw images.
        for filename in os.listdir(self.raw_dataset_path):
            source_path = os.path.join(self.raw_dataset_path, filename)
            ## Sub-folders cannot be opened as images, skip them.
            if not os.path.isfile(source_path):
                continue
            ## The context manager closes the file handle of every image, kept or not.
            with Image.open(source_path) as im:
                ## Filtering out images with improper dimensions first (the grayscale check
                ## expects multi-channel pixels), then the grayscale images.
                if self.has_proper_dim(im) and not self.is_gray_scale(im):
                    ## Making the images to 1:1 ratio, and resizing them to the target size.
                    transformed = self.crop_resize_Image(im)
                    ## Saving the images to the target directory.
                    transformed.save(os.path.join(self.transformed_dataset_path, filename), quality=90)
        PrintTime('Dataset transformed.\n\t')

    # Function for grayscale check of an image, based on randomly sampled pixels.
    ## im:            PIL.Image object, input image
    ## samples:       int, number of random pixels inspected (default 10)
    ## return value:  boolean, True if grayscale, False if not grayscale
    def is_gray_scale(self, im, samples = 10):
        w,h = im.size
        for _ in range(samples):
            ## Integer coordinates of a random pixel.
            pixel = im.getpixel((random.randint(0,w-1), random.randint(0,h-1)))
            ## Pixels with fewer than three channels carry no color information.
            if not isinstance(pixel, (tuple, list)) or len(pixel) < 3:
                return True
            r,g,b = pixel[:3]
            ## A single pixel whose channels are not all equal proves the image is colored.
            ## (`r != g != b` would only mean `r != g and g != b`.)
            if not (r == g == b):
                return False
        ## All sampled pixels had identical channel values: the image is regarded grayscale.
        return True

    # Function for dimension check of an image.
    ## im:            PIL.Image object, input image
    ## return value:  boolean, True if image has the proper dimensions (3D, 3 channels), False if not
    def has_proper_dim(self, im):
        ## Get the image data to numpy array.
        shape = np.array(im).shape
        ## The image shall have 3 dimensions, and 3 channels.
        return len(shape) == 3 and shape[2] == 3

    # Function for making the images to 1:1 ratio, and resizing them to the targetted image size.
    ## im:           PIL.Image object, input image
    ## return value: PIL.Image object, transformed image
    def crop_resize_Image(self, im):
        ## Taking out the image data (width,height).
        width,height = im.size
        ## The square side is the shorter edge; the crop box is centered on the longer edge.
        side = min(width, height)
        left = (width - side) // 2
        top  = (height - side) // 2
        ## right/bottom are derived from left/top so the box is exactly side x side,
        ## also when the difference of the edges is odd.
        box = (left, top, left + side, top + side)
        ## Cropping the image to conform 1:1 ratio, then resizing to target size.
        ## `Image` is the name imported by `from PIL import Image`; `PIL` itself is not in scope.
        return im.crop(box).resize(self.image_size, resample=Image.LANCZOS)

    #Function for converting RGB images from path to LAB images.
    ## path:          String, filepath of the image
    ## return value:  int8 array (height x width x 3), the image in LAB color space
    def path2labimage(self, path):
        ## Opening the image from path and converting it to float array;
        ## np.array reads the pixel data before the file is closed.
        with Image.open(path) as im:
            raw_image_array = np.array(im).astype('float32')
        ## Converting RGB image (scaled to 0..1) to LAB
        image_array = (skcolor.rgb2lab(raw_image_array/255.0)).astype('int8')
        ## Returning the int8 array with the LAB image values
        return image_array

    # Helper deciding which split the file with the given position belongs to.
    ## index:         int, position of the file in the file list
    ## total:         int, number of files
    ## return value:  String, 'train', 'valid' or 'test'
    def _split_of(self, index, total):
        if index < total*self.train_sp:
            return 'train'
        if index < total*(self.train_sp + self.valid_sp):
            return 'valid'
        return 'test'

    # Helper yielding (index, file name, LAB array) for every file, printing progress about once a minute.
    ## files:         list of String, file names inside the transformed dataset folder
    def _iter_lab_images(self, files):
        last_report = datetime.datetime.now()
        for index, fileName in enumerate(files):
            ## Loading screen, for estimating the time needed 
            now = datetime.datetime.now()
            if(now.minute != last_report.minute):
                print(str(index) + '/' + str(len(files)) + '\t' + str(now))
                last_report = now
            ## Converting path to LAB image
            yield index, fileName, self.path2labimage(os.path.join(self.transformed_dataset_path, fileName))

    # Helper saving every image into its own file under train/, valid/ or test/.
    def _write_separate_hdf5(self, files, target_path):
        for split in ('train', 'valid', 'test'):
            MakePath(os.path.join(target_path, split, ''))
        for index, fileName, image_array in self._iter_lab_images(files):
            split = self._split_of(index, len(files))
            ## splitext handles extensions of any length (e.g. '.jpeg'), unlike slicing off 3 characters.
            out_name = os.path.splitext(fileName)[0] + '.h5df'
            ## The context manager closes (and thereby saves) the file even if writing fails.
            with h5py.File(os.path.join(target_path, split, out_name), 'w') as h5:
                h5.create_dataset('images_dataset', data = image_array, dtype = 'int8')

    # Helper saving all images into one file with the groups 'train', 'test' and 'val'.
    def _write_single_hdf5(self, files, target_path):
        with h5py.File(os.path.join(target_path, 'dataset.h5df'), 'w') as h5:
            ## Split name -> group inside the file (the validation group is called 'val').
            groups = {
                'train': h5.create_group('train'),
                'test':  h5.create_group('test'),
                'valid': h5.create_group('val'),
            }
            ## Every group numbers its images independently, starting from 0.
            counters = dict.fromkeys(groups, 0)
            for index, fileName, image_array in self._iter_lab_images(files):
                split = self._split_of(index, len(files))
                groups[split].create_dataset('image'+str(counters[split]), data = image_array, dtype = 'int8')
                counters[split] += 1

    # Function converting the rgb images to LAB and saving them to hdf5 file(s)
    ## target_path:  String, the folder path of the output
    ## in_one_file:  Boolean, default value True; if True the dataset is saved in one file
    ##               ('dataset.h5df'), otherwise one file per image under train/, valid/, test/
    def dataset_rgb2lab_hdf5(self, target_path, in_one_file = True):
        PrintTime('Start dataset transforming.\n\t')
        ## Creating directory for the transformed dataset, if it does not exist.
        MakePath(target_path)

        ## Sorted, because os.listdir returns the names in arbitrary order and the
        ## train/valid/test assignment depends on the position in the list.
        files = sorted(os.listdir(self.transformed_dataset_path))

        if in_one_file:
            self._write_single_hdf5(files, target_path)
        else:
            self._write_separate_hdf5(files, target_path)

In [0]:
# Target size of the transformed images, (width, height).
image_size = (128,128)
# URL of the zipped dataset.
dataset_url = 'https://datasets.figure-eight.com/figure_eight_datasets/open-images/test_challenge.zip'
# File path the zip is downloaded to (inside the current working directory).
dataset_zipped_path = '/'.join((os.getcwd(), 'zipped_dataset.zip'))
# Folder the raw dataset is extracted into; the empty last element yields the trailing '/'.
raw_dataset_path = '/'.join((os.getcwd(), 'raw_dataset', ''))
# Folder for the cropped and resized images.
transformed_dataset_path = '/'.join((os.getcwd(), 'transformed_dataset', ''))
# Folder for the LAB images stored as hdf5.
hdf5_images_path = '/'.join((os.getcwd(), 'images_hdf5', ''))

In [0]:
# Collector bound to the dataset url from the configuration cell.
datac = DataCollect(url=dataset_url)
# Fetching the zipped dataset to the configured file path.
datac.download_dataset(target_path=dataset_zipped_path)
# Unpacking the downloaded zip; the resulting folder is stored in datac.raw_dataset_path.
datac.extract_dataset(raw_dataset_path=raw_dataset_path)

In [0]:
# Using the configured transformed_dataset_path: building the target inside raw_dataset_path
# would put the transformed images next to the extracted raw images.
datap = DataPreProcess(datac.raw_dataset_path, transformed_dataset_path, image_size)
#Transforming the dataset
datap.dataset_transform()
#Preparing the dataset for training
datap.dataset_rgb2lab_hdf5(hdf5_images_path, True)