# Prepare data

This will take imgs from ebay (more than 10 imgs per class), and split them into 8 for training and 2 for validation.

In [1]:
import numpy as np
import pandas as pd
import os, shutil, random
from tqdm import tqdm, tqdm_notebook

In [2]:
from subprocess import check_output
src_folder        = 'imgs/src'
train_folder      = 'data/train'
valid_folder = 'data/validation'
test_folder = 'imgs/test'
train_valid_ratio = 0.8
ext = '.jpg'

if not os.path.exists(train_folder):
    os.makedirs(train_folder)
    
if not os.path.exists(valid_folder):
    os.makedirs(valid_folder)    

In [3]:
categories = [c for c in os.listdir(src_folder) if os.path.isdir(os.path.join(src_folder, c))]

In [4]:
print ("BEFORE: \n Src: {} files, Train: {} files, Validation: {} files, Test: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]),
    sum([len(files) for r, d, files in os.walk(train_folder)]),
    sum([len(files) for r, d, files in os.walk(valid_folder)]),
    sum([len(files) for r, d, files in os.walk(test_folder)]))
      )

BEFORE: 
 Src: 1172 files, Train: 0 files, Validation: 0 files, Test: 116 files


## NOTE: After running the cell below, no coming back - the files are moved. 

In [5]:
for category in tqdm_notebook(categories, total=len(categories)):
    print ('Category: {}'.format(category))
    src_cat_dir = os.path.join(src_folder, str(category))
    train_cat_dir = os.path.join(train_folder, str(category))
    valid_cat_dir = os.path.join(valid_folder, str(category))
    test_cat_dir = os.path.join(test_folder, str(category))
    
    # create category directory in train/
    if not os.path.exists(train_cat_dir):
        os.makedirs(train_cat_dir)
    
    # create category directory in valid/
    if not os.path.exists(valid_cat_dir):
        os.makedirs(valid_cat_dir)

    # then move files from src to train/ and valid/
    files = [f for f in os.listdir(src_cat_dir) if os.path.isfile(os.path.join(src_cat_dir, f))]
    random.shuffle(files)
    for i, file in enumerate(files):
        if i < train_valid_ratio * len(files):
            shutil.copy(os.path.join(src_cat_dir, file), os.path.join(train_cat_dir, file))
        else:
            shutil.copy(os.path.join(src_cat_dir, file), os.path.join(valid_cat_dir, file))
            
    # finally, if there are any test images for this category, move them to validation folder as well
    if os.path.exists(test_cat_dir):
        files = [f for f in os.listdir(test_cat_dir) if os.path.isfile(os.path.join(test_cat_dir, f))]
        for file in files:
            shutil.copy(os.path.join(test_cat_dir, file), os.path.join(valid_cat_dir, file))

Category: kid_ring_stacker
Category: kid_zoo_puzzle
Category: kid_zoo_sing-along
Category: kid_mobile_phone
Category: VTech_Go_Go_Smart_Wheels_Fire_Command_Rescue_Center
Category: LeapFrog_LeapStart_Interactive_Learning_System
Category: LeapFrog_Learn_Groove_Musical_Table
Category: kid_whiteboard_chalkboard
Category: VTech_Sit-to-Stand_Learning_Walker
Category: kid_bounceroo
Category: VTech_Alphabet_Apple
Category: LeapFrog_Shapes_Sharing_Picnic_Basket
Category: Fisher-Price_Kick_and_Play_Piano_Gym
Category: Paw_Patrol_Jungle_Rescue_Paw_Terrain_Vehicle
Category: Baby_Einstein_Sea_Dreams_Soother
Category: kid_pink_unicorn
Category: VTech_Disney_Minnie_ABC_Fashion_Purse
Category: Radio_Flyer_Fold_2_Go_Trike
Category: VTech_Kidizoom_Smartwatch_DX2
Category: Fisher-Price_Auto_Rock_n_Play_Sleeper
Category: kid_shopping_cart



In [6]:
print ("AFTER: \n Src: {} files, Train: {} files, Validation: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]), 
    sum([len(files) for r, d, files in os.walk(train_folder)]), 
    sum([len(files) for r, d, files in os.walk(valid_folder)]))
      )

AFTER: 
 Src: 1172 files, Train: 947 files, Validation: 341 files


In [7]:
df = pd.DataFrame(index=categories)
for category in tqdm_notebook(categories, total=len(categories)):
    df.loc[category, 'train'] = len(os.listdir(os.path.join(train_folder, str(category))))
    df.loc[category, 'valid'] = len(os.listdir(os.path.join(valid_folder, str(category))))




In [17]:
df

Unnamed: 0,train,valid
kid_ring_stacker,13.0,3.0
kid_zoo_puzzle,15.0,3.0
kid_zoo_sing-along,16.0,4.0
kid_mobile_phone,13.0,3.0
VTech_Go_Go_Smart_Wheels_Fire_Command_Rescue_Center,26.0,6.0
LeapFrog_LeapStart_Interactive_Learning_System,31.0,7.0
LeapFrog_Learn_Groove_Musical_Table,50.0,42.0
kid_whiteboard_chalkboard,22.0,5.0
VTech_Sit-to-Stand_Learning_Walker,84.0,20.0
kid_bounceroo,30.0,7.0
