In [1]:
import os

import imageio
import numpy as np
import PIL
import tensorflow as tf
from six.moves import cPickle as pickle

In [2]:
# Packages the images found in each data folder into a "<folder>.pickle" dataset.
# Images are checked against the expected size below before they are added.
# NOTE(review): load_picture does not apply the pixel_depth rescaling, so pixel
# values are stored as read from disk.
image_size = 256  # expected image height and width
pixel_depth = 255.0  # presumably the maximum value of an 8-bit channel - confirm
channels = 4  # expected size of the last image axis (presumably RGBA - confirm)
train_folder = ['Train']  # folders to pickle; each one yields "<name>.pickle"
#test_folder = './Test'

def load_picture(folder):
    """Load every readable image in ``folder`` into one float32 array.

    Files for which reading raises ``IOError`` or ``ValueError`` are reported
    on stdout and skipped. Pixel values are stored as read; no rescaling with
    ``pixel_depth`` is applied here.

    Args:
        folder: directory whose entries are all treated as image files.

    Returns:
        Array of shape ``(n, image_size, image_size, channels)`` and dtype
        float32, where ``n`` is the number of images that were loaded
        successfully, in ``os.listdir`` order.

    Raises:
        Exception: if an image was read but its shape is not
            ``(image_size, image_size, channels)``. This is not caught by the
            skip handler, so it aborts the whole load.
    """
    image_files = os.listdir(folder)
    # Allocate for the best case (every entry loads); the buffer is uninitialised.
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size, channels),
                         dtype=np.float32)
    print(dataset.shape)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = imageio.imread(image_file).astype(float)
            if image_data.shape != (image_size, image_size, channels):
                raise Exception('Wrong image shape {}'.format(image_file))
            dataset[num_images, :, :, :] = image_data
            num_images += 1
        except (IOError, ValueError) as e:
            print("Could not read:", image_file, ":", e)
    # Skipped files leave unused, uninitialised rows at the end of the buffer;
    # return only the rows that were actually filled.
    return dataset[:num_images]

In [3]:
def do_pickle(data_folders, force=False):
    """Pickle the images of each folder into ``<folder>.pickle``.

    An existing pickle file is reused unless ``force`` is true. A failure
    while writing is reported on stdout and does not stop the loop.

    Args:
        data_folders: iterable of folder paths to package.
        force: when true, rebuild pickles even if the file already exists.

    Returns:
        List of the pickle file names, one per folder, in input order. A
        name is included whether or not its file was (re)written.
    """
    pickled_paths = []
    for class_dir in data_folders:
        target = class_dir + '.pickle'
        pickled_paths.append(target)

        cached = os.path.exists(target)
        if cached and not force:
            print("You already have dataset present")
            continue

        print("Pickling {}".format(target))
        images = load_picture(class_dir)
        try:
            with open(target, 'wb') as out:
                pickle.dump(images, out, pickle.HIGHEST_PROTOCOL)
        except Exception as err:
            print("Unable to save data", err)
    return pickled_paths

In [4]:
# Always rebuild the per-folder pickles: the last cell of this notebook saves its
# merged dict as "Train.pickle" too, so a file left over from a previous run would
# hold that dict instead of the image array merge_datasets expects.
train_datasets = do_pickle(train_folder, force=True)

Pickling Train.pickle
(100, 256, 256, 4)


In [5]:
def make_arrays(nb_rows, img_size,channels):
  """Allocate uninitialised image and label buffers for ``nb_rows`` samples.

  Returns a ``(dataset, labels)`` pair: a float32 array of shape
  ``(nb_rows, img_size, img_size, channels)`` and an int32 array of shape
  ``(nb_rows, 2)``. When ``nb_rows`` is falsy both entries are ``None``.
  """
  if not nb_rows:
    return None, None
  image_shape = (nb_rows, img_size, img_size, channels)
  images = np.empty(image_shape, dtype=np.float32)
  targets = np.empty((nb_rows, 2), dtype=np.int32)
  return images, targets

def merge_datasets(pickle_files, train_size, valid_size=0):
    """Combine per-class pickles into validation and training arrays.

    Each file is loaded, shuffled along its first axis with
    ``np.random.shuffle`` and split: the first ``valid_size // num_classes``
    rows go to the validation set, the following ``train_size // num_classes``
    rows to the training set. Every row is labelled with the index of its
    file in ``pickle_files``.

    Args:
        pickle_files: sequence of paths to pickled image arrays, one per class.
        train_size: total number of training rows requested.
        valid_size: total number of validation rows requested.

    Returns:
        ``(valid_dataset, valid_labels, train_dataset, train_labels)``. A
        dataset/labels pair is ``(None, None)`` when its requested size is 0.
        Only filled rows are returned: if a size is not a multiple of the
        number of classes, the arrays are shorter than requested rather than
        ending in uninitialised rows.

    Raises:
        Exception: any error raised while loading or slicing a pickle is
            re-raised after the offending file name has been printed.
    """
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size, channels)
    train_dataset, train_labels = make_arrays(train_size, image_size, channels)

    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            # NOTE: pickle.load can execute arbitrary code; only feed it files
            # produced by this notebook or another trusted source.
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
            # Shuffle so each class contributes a random validation/training split.
            np.random.shuffle(letter_set)

            if valid_dataset is not None:
                valid_letter = letter_set[:vsize_per_class, :, :, :]
                valid_dataset[start_v:end_v, :, :, :] = valid_letter
                valid_labels[start_v:end_v] = label
                start_v += vsize_per_class
                end_v += vsize_per_class

            # make_arrays returns None for a zero-sized request, so the training
            # buffers need the same guard as the validation ones.
            if train_dataset is not None:
                train_letter = letter_set[vsize_per_class:end_l, :, :, :]
                train_dataset[start_t:end_t, :, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    # start_v / start_t now equal the number of rows written. Drop whatever
    # is left of the uninitialised buffers (non-divisible sizes).
    if valid_dataset is not None:
        valid_dataset, valid_labels = valid_dataset[:start_v], valid_labels[:start_v]
    if train_dataset is not None:
        train_dataset, train_labels = train_dataset[:start_t], train_labels[:start_t]

    return valid_dataset, valid_labels, train_dataset, train_labels

            
# Requested split sizes: total rows across all classes.
train_size = 80
valid_size = 10
test_size = 10  # referenced only by the commented-out test split below

# Build the shuffled validation and training sets from the pickled folders.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
#_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

# Sanity checks: array shapes, the validation labels, and the first value of
# every validation image.
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
#print('Testing:', test_dataset.shape, test_labels.shape)
print(valid_labels)
print(valid_dataset[:,0,0,0])

Training: (80, 256, 256, 4) (80, 2)
Validation: (10, 256, 256, 4) (10, 2)
[[0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]]
[ 56. 132. 204. 171. 180.  66. 204.  76.  86. 140.]


###
# Scratch check: read a single image from ./Test, rescale it with pixel_depth
# and print the resulting shape.
image_file = os.path.join("./Test", "20.png")
size =(len(os.listdir("./Test")))  # number of entries in ./Test; not used in the visible cells
# Rescaling: a value of 0 maps to -0.5 and a value of pixel_depth maps to 0.5.
image_data = (imageio.imread(image_file).astype(float) - pixel_depth/2)/pixel_depth
print(image_data.shape)

In [6]:
# Persist the merged training/validation split as a single dict.
# NOTE(review): this is the same path do_pickle uses for the 'Train' folder, so
# the per-folder image array stored there is overwritten by this dict; loading
# that path with merge_datasets afterwards would yield the dict, not an array.
pickle_file = "Train.pickle"

try:
  # The context manager closes the file even when pickle.dump raises.
  with open(pickle_file, 'wb') as f:
    save = {
      'train_dataset': train_dataset,
      'train_labels': train_labels,
      'valid_dataset': valid_dataset,
      'valid_labels': valid_labels,
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  print('completed')
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

completed
