# Prepare data

This will take imgs from ebay (more than `min_files_per_category` imgs per class), and split them in the `train_valid_ratio` between training and validation folders

In [1]:
import numpy as np
import pandas as pd
import os, shutil, random
from tqdm import tqdm, tqdm_notebook

In [2]:
from subprocess import check_output
src_folder        = 'imgs/src'
train_folder      = 'data/train'
valid_folder = 'data/validation'
test_folder = 'imgs/test'
train_valid_ratio = 0.8
min_files_per_category = 10
ext = '.jpg'

if not os.path.exists(train_folder):
    os.makedirs(train_folder)
    
if not os.path.exists(valid_folder):
    os.makedirs(valid_folder)    

In [3]:
all_categories = [c for c in os.listdir(src_folder) if os.path.isdir(os.path.join(src_folder, c))]
processed_categories = []

In [4]:
print ("BEFORE: \n Src: {} files in {} classes, Train: {} files in {} classes, Validation: {} files in {} classes, Test: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]),
    len(all_categories),
    sum([len(files) for r, d, files in os.walk(train_folder)]),
    len(processed_categories),
    sum([len(files) for r, d, files in os.walk(valid_folder)]),
    len(processed_categories),
    sum([len(files) for r, d, files in os.walk(test_folder)]))
      )

BEFORE: 
 Src: 2349 files in 49 classes, Train: 0 files in 0 classes, Validation: 0 files in 0 classes, Test: 32 files


## NOTE: After running the cell below, no coming back - the files are moved. 

In [5]:
for category in tqdm_notebook(all_categories, total=len(all_categories)):
    src_cat_dir = os.path.join(src_folder, str(category))
    files = [f for f in os.listdir(src_cat_dir) if os.path.isfile(os.path.join(src_cat_dir, f))]

    if len(files) < min_files_per_category:
        print ('Ignoring category: {}'.format(category))
        continue

    else:
        print ('Processing category: {}'.format(category))
        processed_categories.extend([category])
        train_cat_dir = os.path.join(train_folder, str(category))
        valid_cat_dir = os.path.join(valid_folder, str(category))
        test_cat_dir = os.path.join(test_folder, str(category))

        # create category directory in train/
        if not os.path.exists(train_cat_dir):
            os.makedirs(train_cat_dir)

        # create category directory in valid/
        if not os.path.exists(valid_cat_dir):
            os.makedirs(valid_cat_dir)

        # then move files from src to train/ and valid/
        random.shuffle(files)
        for i, file in enumerate(files):
            if i < train_valid_ratio * len(files):
                shutil.copy(os.path.join(src_cat_dir, file), os.path.join(train_cat_dir, file))
            else:
                shutil.copy(os.path.join(src_cat_dir, file), os.path.join(valid_cat_dir, file))

        # finally, if there are any test images for this category, move them to validation folder as well
        # if os.path.exists(test_cat_dir):
        #    files = [f for f in os.listdir(test_cat_dir) if os.path.isfile(os.path.join(test_cat_dir, f))]
        #    for file in files:
        #        shutil.copy(os.path.join(test_cat_dir, file), os.path.join(valid_cat_dir, file))

HBox(children=(IntProgress(value=0, max=49), HTML(value='')))

Processing category: Razor_E_Glow_Electric_Scooter
Ignoring category: Me_Reader_Jr_-_Sesame_Street_Book
Processing category: Fisher-Price_Laugh_Learn_Jumperoo
Processing category: Razor_Jetts_Adjustable_Skates
Ignoring category: Fisher-Price_Laugh_Learn_Around_the_Town_Learning_Table
Ignoring category: VTech_Ultimate_Alphabet_Activity_Cube
Processing category: kid_ring_stacker
Processing category: Disney_Pixar_Cars_3_Willy_s_Butte_Transforming_Track_Set
Ignoring category: Little_Tikes_Remote_Control_Bumper_Cars_Set
Ignoring category: Huffy_20_inch_Drastic_Green_Machine
Processing category: Thomas_Friends_Jumbo_Mega_Playmat_with_Vehicle
Processing category: kid_zoo_puzzle
Processing category: kid_zoo_sing-along
Processing category: kid_mobile_phone
Processing category: Radio_Flyer_Wagon
Processing category: VTech_Go_Go_Smart_Wheels_Fire_Command_Rescue_Center
Ignoring category: Globber_3_Wheel_5-in-1_Scooter
Processing category: LeapFrog_LeapStart_Interactive_Learning_System
Ignoring cat

In [6]:
print ("AFTER: \n Src: {} files in {} classes, Train: {} files in {} classes, Validation: {} files in {} classes, Test: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]),
    len(all_categories),
    sum([len(files) for r, d, files in os.walk(train_folder)]),
    len(processed_categories),
    sum([len(files) for r, d, files in os.walk(valid_folder)]),
    len(processed_categories),
    sum([len(files) for r, d, files in os.walk(test_folder)]))
      )

AFTER: 
 Src: 2349 files in 49 classes, Train: 1725 files in 33 classes, Validation: 418 files in 33 classes, Test: 32 files


In [7]:
df = pd.DataFrame(index=processed_categories)
for category in tqdm_notebook(processed_categories, total=len(processed_categories)):
    df.loc[category, 'train'] = len(os.listdir(os.path.join(train_folder, str(category))))
    df.loc[category, 'valid'] = len(os.listdir(os.path.join(valid_folder, str(category))))

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [8]:
df

Unnamed: 0,train,valid
Razor_E_Glow_Electric_Scooter,12.0,3.0
Fisher-Price_Laugh_Learn_Jumperoo,8.0,2.0
Razor_Jetts_Adjustable_Skates,8.0,2.0
kid_ring_stacker,13.0,3.0
Disney_Pixar_Cars_3_Willy_s_Butte_Transforming_Track_Set,8.0,2.0
Thomas_Friends_Jumbo_Mega_Playmat_with_Vehicle,9.0,2.0
kid_zoo_puzzle,15.0,3.0
kid_zoo_sing-along,16.0,4.0
kid_mobile_phone,13.0,3.0
Radio_Flyer_Wagon,512.0,127.0
