# Prepare data from labels.csv -> subdirectories for Keras

Convert from training images in a single directory with an accompanying labels.csv (see "Dog Breeds" Kaggle competition) to folder structure necessary for Keras data generators.

In [1]:
import numpy as np
import pandas as pd
import os, shutil
from tqdm import tqdm, tqdm_notebook

In [2]:
from subprocess import check_output
src_folder        = 'imgs/trus/'
train_folder      = 'data/train'
validation_folder = 'data/validation'
ext = '.jpg'
min_train_samples = 40

# Create train folder
if not os.path.exists(train_folder):
    os.makedirs(train_folder)

In [10]:
categories = [c for c in os.listdir(validation_folder) if os.path.isdir(os.path.join(validation_folder, c))]

In [11]:
import cv2
from matplotlib import pyplot as plt

def to_RGB(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

def synthesize_image(src_img_path, num_samples, file_id):
    src_img = cv2.imread(src_img_path, cv2.IMREAD_COLOR)

    shifted = cv2.pyrMeanShiftFiltering(src_img, sp=6, sr=9)
    border = cv2.copyMakeBorder(shifted, 1, 1, 1, 1, borderType=cv2.BORDER_CONSTANT, value=[255,255,255])
    mask = np.zeros((border.shape[0]+2, border.shape[1]+2), np.uint8)
    flags = 8 | cv2.FLOODFILL_MASK_ONLY
    tolerance = (1,)*3
    cv2.floodFill(image=border, mask=mask, seedPoint=(0,0), newVal=255, loDiff=tolerance, upDiff=tolerance, flags=flags)
    mask = mask[2:-2,2:-2]

    wt_bk = np.float(np.count_nonzero(mask))/mask.size*100.
    if wt_bk < 5.0:
        return None

    for i in range(num_samples):
        bk = np.random.randint(low=0, high=256, size=(src_img.shape[0],src_img.shape[1],3))
        new_img = np.zeros_like(src_img)
        new_img[mask==0] = src_img[mask==0]
        new_img[np.logical_and((shifted==255).all(axis=-1), mask==0)] = bk[np.logical_and((shifted==255).all(axis=-1), mask==0)]
        new_img[mask==1] = bk[mask==1]

        f, ext = os.path.splitext(src_img_path)
        new_img_path = '{}-{}{}'.format(f, file_id+i, ext)
        cv2.imwrite(new_img_path, new_img)
    
    return i+1

In [12]:
print ("BEFORE: \n Src: {} files, Train: {} files, Validation: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]), 
    sum([len(files) for r, d, files in os.walk(train_folder)]), 
    sum([len(files) for r, d, files in os.walk(validation_folder)]))
      )

BEFORE: 
 Src: 34038 files, Train: 0 files, Validation: 2053 files


## NOTE: After running the cell below, no coming back - the files are moved. 

In [13]:
for category in tqdm_notebook(categories, total=len(categories)):
    print ('Category: {}'.format(category))
    src_cat_dir = os.path.join(src_folder, str(category))
    train_cat_dir = os.path.join(train_folder, str(category))
    valid_cat_dir = os.path.join(validation_folder, str(category))
    
    # create category directory in train/
    if not os.path.exists(train_cat_dir):
        os.makedirs(train_cat_dir)
    
    # then move files from src to train/
    for file in os.listdir(src_cat_dir):
        shutil.copy(os.path.join(src_cat_dir, file), os.path.join(train_cat_dir, file))
            
    # then synthesize as necessary - create roughly min_train_samples per category (doesn't always end up 
    # at min_train_samples because some images cannot be synthesized because of non-white backgrounds)
    orig_samples = os.listdir(train_cat_dir)
    for orig_sample in orig_samples:
        synthesize_image(os.path.join(train_cat_dir, orig_sample), 
                         (min_train_samples - len(os.listdir(train_cat_dir)))//len(orig_samples)+1,
                         len(os.listdir(train_cat_dir)))

Category: Razor_E_Glow_Electric_Scooter
Category: Me_Reader_Jr_-_Sesame_Street_Book
Category: Fisher-Price_Laugh_Learn_Jumperoo
Category: Razor_Jetts_Adjustable_Skates
Category: Fisher-Price_Laugh_Learn_Around_the_Town_Learning_Table
Category: VTech_Ultimate_Alphabet_Activity_Cube
Category: Disney_Pixar_Cars_3_Willy_s_Butte_Transforming_Track_Set
Category: Little_Tikes_Remote_Control_Bumper_Cars_Set
Category: Huffy_20_inch_Drastic_Green_Machine
Category: Thomas_Friends_Jumbo_Mega_Playmat_with_Vehicle
Category: Radio_Flyer_Wagon
Category: VTech_Go_Go_Smart_Wheels_Fire_Command_Rescue_Center
Category: Globber_3_Wheel_5-in-1_Scooter
Category: LeapFrog_LeapStart_Interactive_Learning_System
Category: Imaginarium_5_Way_Activity_Cube
Category: Fisher-Price_Brilliant_Basics_Stroller_Styled_Walker
Category: Marvel_Spider-Man_6_Volt_Ride_On
Category: Disney_Frozen_Magical_Adventure_Activity_Ride_On
Category: Fisher-Price_Little_People_Pony_Stable
Category: Yvolution_Y_Velo_Flippa_Tricycle_and_Bal

In [14]:
print ("AFTER: \n Src: {} files, Train: {} files, Validation: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]), 
    sum([len(files) for r, d, files in os.walk(train_folder)]), 
    sum([len(files) for r, d, files in os.walk(validation_folder)]))
      )

AFTER: 
 Src: 34038 files, Train: 1128 files, Validation: 2053 files


In [15]:
df = pd.DataFrame(index=categories)
for category in tqdm_notebook(categories, total=len(categories)):
    df.loc[category, 'train'] = len(os.listdir(os.path.join(train_folder, str(category))))
    df.loc[category, 'valid'] = len(os.listdir(os.path.join(validation_folder, str(category))))
df




Unnamed: 0,train,valid
Razor_E_Glow_Electric_Scooter,19.0,16.0
Me_Reader_Jr_-_Sesame_Street_Book,4.0,8.0
Fisher-Price_Laugh_Learn_Jumperoo,14.0,11.0
Razor_Jetts_Adjustable_Skates,30.0,11.0
Fisher-Price_Laugh_Learn_Around_the_Town_Learning_Table,26.0,5.0
VTech_Ultimate_Alphabet_Activity_Cube,34.0,2.0
Disney_Pixar_Cars_3_Willy_s_Butte_Transforming_Track_Set,24.0,11.0
Little_Tikes_Remote_Control_Bumper_Cars_Set,21.0,5.0
Huffy_20_inch_Drastic_Green_Machine,33.0,9.0
Thomas_Friends_Jumbo_Mega_Playmat_with_Vehicle,28.0,11.0
