In [7]:
import os
import numpy as np

from tqdm import tqdm
from joblib import Parallel, delayed
from skimage.io import imread, imshow, imsave
from skimage.transform import resize
from skimage.exposure import rescale_intensity
from data_helper import load_image_paths

%matplotlib inline

In [2]:
IMAGE_FOLDER_PATH = 'dataset/frames/'

image_paths = load_image_paths(image_folder_path=IMAGE_FOLDER_PATH,
                               data_type='train',
                               label_type='obj',
                               image_type='left')

image_paths += load_image_paths(image_folder_path=IMAGE_FOLDER_PATH,
                                data_type='train',
                                label_type='obj',
                                image_type='right')

image_paths += load_image_paths(image_folder_path=IMAGE_FOLDER_PATH,
                                data_type='train',
                                label_type='obj',
                                image_type='head')

image_paths += load_image_paths(image_folder_path=IMAGE_FOLDER_PATH,
                                data_type='test',
                                label_type='obj',
                                image_type='left')

image_paths += load_image_paths(image_folder_path=IMAGE_FOLDER_PATH,
                                data_type='test',
                                label_type='obj',
                                image_type='right')

image_paths += load_image_paths(image_folder_path=IMAGE_FOLDER_PATH,
                                data_type='test',
                                label_type='obj',
                                image_type='head')

In [3]:
len(image_paths)

41652

In [38]:
def preprocess(image_path, smallest_size=256):
    image = imread(image_path)
    
    # Resize image.
    height, width = float(image.shape[0]), float(image.shape[1])
    scale = smallest_size / width if (height > width) else smallest_size / height
    new_height = int(height*scale)
    new_width = int(width*scale)
    resized_image = resize(image, output_shape=(new_height, new_width))
    
    # Normalize to 0-255 uint8
    resized_image = rescale_intensity(resized_image, out_range=(0, 255)).astype(np.uint8)
    
    # Save image
    # Create a new path for saving.
    dataset_folder_idx = [i for i, name in enumerate(image_path.split('/')) if name == 'dataset'][0]
    resized_folder_idx = dataset_folder_idx + 1
    
    f_names = image_path.split('/')
    f_names.insert(resized_folder_idx, 'resized')
    
    resized_image_path = os.path.join(*f_names)
    resized_image_folder = os.path.join(*resized_image_path.split('/')[:-1])
    
    # Handle race condition when making directory.
    if not os.path.exists(resized_image_folder):
        while True:
            try:
                os.makedirs(resized_image_folder)
                break
            except: pass
    
    imsave(resized_image_path, resized_image)
    
def preprocess_in_batch(image_paths):
    for image_path in image_paths:
        preprocess(image_path)

def preprocess_in_parallel(image_paths, num_splits=100, num_job=-1):
    batch_size = len(image_paths) // num_splits + 1
    with Parallel(n_jobs=num_job, backend='threading', verbose=50) as parallel:
        parallel(delayed(preprocess_in_batch)(image_paths[i*batch_size:(i+1)*batch_size]) 
                 for i in range(num_splits))

In [None]:
preprocess_in_parallel(image_paths)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  5.8min
