In [1]:
import PIL
import os
import imageio
import numpy as np
from six.moves import cPickle as pickle

In [2]:
# Expected width and height (in pixels) of every input image; make_dataset()
# rejects images whose shape differs.
img_size = 256
# Pixel value range used by make_dataset() to centre and scale the raw data:
# (value - pixel_depth / 2) / pixel_depth.
pixel_depth = 255.0
# Expected number of colour channels per image.
num_channels = 3
# Fraction of all images that split_dataset() assigns to the training set.
ratio = 0.8
# Folder that main() hands to make_dataset() as the image source.
src_dir = 'Train'
# Label file name that main() hands to split_dataset().
src_label = "label.npy"


In [3]:
def make_dataset(folder):
    """Load every image found in ``folder`` into one normalised float32 array.

    The file name without its extension is parsed as an integer image id, and
    the image is stored at row ``id - 1``. Pixel values are centred and scaled
    as ``(value - pixel_depth / 2) / pixel_depth``.

    Files that cannot be read, or whose name is not an integer, are reported
    on stdout and skipped; their rows stay zero-filled instead of holding
    uninitialised memory.

    Args:
        folder: Directory containing the image files.

    Returns:
        ``np.ndarray`` of shape
        ``(number of files, img_size, img_size, num_channels)`` and dtype
        ``float32``.

    Raises:
        Exception: If a readable image does not have the shape
            ``(img_size, img_size, num_channels)``.
    """
    image_files = os.listdir(folder)
    # Zero-initialise so that rows of skipped files are well defined;
    # np.ndarray(...) would leave arbitrary memory contents in them.
    dataset = np.zeros(
        (len(image_files), img_size, img_size, num_channels), dtype=np.float32)
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            # Parsed inside the try block so that a stray non-numeric file
            # name (ValueError) is reported and skipped like an unreadable
            # image instead of aborting the whole load.
            num_extracted = int(os.path.splitext(image)[0])
            image_data = (imageio.imread(image_file).astype(float) - pixel_depth/2)/pixel_depth
            if image_data.shape != (img_size, img_size, num_channels):
                # Deliberately not IOError/ValueError: a wrong shape is fatal.
                raise Exception('Wrong image shape {}'.format(image_file))
            # Image ids are treated as 1-based, rows are 0-based.
            dataset[num_extracted-1, :, :, :] = image_data
        except (IOError, ValueError) as e:
            print("Could not read:", image_file, ":", e)
    return dataset

In [4]:
def make_arrays(t_size, size_of_dataset, mode):
    """Allocate an image array and a label array for one split.

    Args:
        t_size: Number of images in the training split.
        size_of_dataset: Shape tuple of the full dataset. Index 0 is read as
            the number of images, index 1 as the image side length and
            index 3 as the number of channels.
        mode: ``"train"`` to size the arrays for ``t_size`` rows, ``"test"``
            to size them for the remaining ``size_of_dataset[0] - t_size``
            rows.

    Returns:
        Tuple ``(dataset, label)``: ``dataset`` has shape
        ``(rows, img_size, img_size, num_channels)`` and dtype ``float32``;
        ``label`` has shape ``(rows, 2)`` and dtype ``int32``. Both arrays
        are uninitialised and must be filled by the caller.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    num_of_images = size_of_dataset[0]
    img_size = size_of_dataset[1]
    num_channels = size_of_dataset[3]
    # A single if/elif/else chain: with two independent ``if`` statements the
    # ``else`` branch also ran for the valid "test" mode, and an unknown mode
    # fell through to the return with nothing allocated.
    if mode == "train":
        rows = t_size
    elif mode == "test":
        rows = num_of_images - t_size
    else:
        raise ValueError(
            'bad mode {!r}: expected "train" or "test"'.format(mode))
    dataset = np.empty((rows, img_size, img_size, num_channels), dtype=np.float32)
    label = np.empty((rows, 2), dtype=np.int32)
    return dataset, label

In [9]:
def split_dataset(dataset, src_label):
    """Split ``dataset`` into randomly ordered training and test sets.

    The first ``int(ratio * number_of_images)`` entries of a random ordering
    form the training set; the remaining entries form the test set. Labels
    are read from ``src_label`` and looked up with the same row index as the
    image they belong to.

    Args:
        dataset: Array of images whose first axis indexes the images.
        src_label: Path of the ``.npy`` file holding the labels, one entry
            per image row.

    Returns:
        Tuple ``(test_dataset, train_dataset, test_label, train_label)``.

    Raises:
        ValueError: If the label file has fewer entries than ``dataset`` has
            images.
    """
    size_of_dataset = dataset.shape
    num_of_images = size_of_dataset[0]
    print('found {} images'.format(num_of_images))
    # Honour the src_label argument (it used to be ignored in favour of a
    # hard-coded file name) and fail early, before any large allocation.
    label = np.load(src_label)
    if len(label) < num_of_images:
        raise ValueError(
            'label file {} has {} entries but the dataset has {} images'.format(
                src_label, len(label), num_of_images))
    # Random order in which the images are distributed over the two splits.
    arr = np.arange(num_of_images)
    np.random.shuffle(arr)
    t_size = int(ratio*num_of_images)
    train_dataset, train_label = make_arrays(t_size, size_of_dataset, mode="train")
    # Rows are copied one at a time so that no second full-size temporary
    # copy of the image data is created.
    for row, source in enumerate(arr[:t_size]):
        train_dataset[row] = dataset[source]
        train_label[row] = label[source]
    test_dataset, test_label = make_arrays(t_size, size_of_dataset, mode="test")
    # enumerate() provides the destination row; a counter reset inside the
    # loop body would write every test sample to row 0.
    for row, source in enumerate(arr[t_size:]):
        test_dataset[row] = dataset[source]
        test_label[row] = label[source]
    return test_dataset, train_dataset, test_label, train_label

In [6]:
def randomize(dataset, labels):
    """Shuffle ``dataset`` and ``labels`` with one shared random permutation.

    A single permutation of ``labels.shape[0]`` indices is drawn and applied
    to the first axis of both arrays, so every sample keeps its label.

    Args:
        dataset: Array of samples, indexed by sample along the first axis.
        labels: Array of labels, indexed by sample along the first axis.

    Returns:
        Tuple ``(shuffled_dataset, shuffled_labels)`` of reordered copies.
    """
    order = np.random.permutation(labels.shape[0])
    return dataset[order, :, :], labels[order]

In [7]:
def main(name, force=False):
    """Build the train/test datasets and pickle them into the file ``name``.

    If ``name`` already exists the function only prints a notice and returns,
    unless ``force`` is true. Otherwise the images in ``src_dir`` are loaded,
    split into train and test parts, shuffled once more, and written to
    ``name`` as a dict with the keys ``'train_dataset'``, ``'train_labels'``,
    ``'test_dataset'`` and ``'test_labels'``.

    Args:
        name: Path of the pickle file to create.
        force: When true, rebuild and overwrite an existing file.

    Raises:
        Exception: Any error raised while writing the pickle is reported on
            stdout and then re-raised.
    """
    if os.path.exists(name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % name)
        return
    print('Pickling %s.' % name)
    # Get one big dataset from src_dir.
    dataset = make_dataset(src_dir)
    print("made dataset")
    # Split the dataset into its test and train parts.
    test_dataset, train_dataset, test_labels, train_labels = split_dataset(dataset, src_label)
    # The full array is no longer needed; drop the reference before the
    # shuffled copies are created.
    del dataset
    print("splitted in parts")
    # Additional randomisation of the order inside each split.
    train_dataset, train_labels = randomize(train_dataset, train_labels)
    test_dataset, test_labels = randomize(test_dataset, test_labels)
    print("saving")
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
        }
    try:
        # The with-statement closes the file even when dumping fails.
        with open(name, 'wb') as f:
            pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        # Report the path actually being written; referencing an undefined
        # variable here would replace the real error with a NameError.
        print('Unable to save data to', name, ':', e)
        raise
    print("Done")


In [8]:
# Entry point: create the pickle (main() skips the work when the file already
# exists) and then report the size of the resulting file.
if __name__ == '__main__': 
    name = "Dataset.pickle"
    main(name)
    # os.stat() is used only for st_size, the size of the file in bytes.
    statinfo = os.stat(name)
    print('Compressed pickle size:', statinfo.st_size)

Pickling Dataset.pickle.
made dataset
found 6000 images


TypeError: bad operand type for unary +: 'str'