# Prepare data

This will take imgs from ebay (more than 10 imgs per class), and split them into 8 for training and 2 for validation.

In [8]:
import numpy as np
import pandas as pd
import os, shutil
from tqdm import tqdm, tqdm_notebook

In [9]:
from subprocess import check_output
src_folder        = 'imgs/ebay/'
train_folder      = 'data/train'
valid_folder = 'data/validation'
ext = '.jpg'

if not os.path.exists(train_folder):
    os.makedirs(train_folder)
    
if not os.path.exists(valid_folder):
    os.makedirs(valid_folder)    

In [10]:
categories = [c for c in os.listdir(src_folder) if (os.path.isdir(os.path.join(src_folder, c)) 
                                                    and len(os.listdir(os.path.join(src_folder, c))) > 10)]

In [12]:
print ("BEFORE: \n Src: {} files, Train: {} files, Validation: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]), 
    sum([len(files) for r, d, files in os.walk(train_folder)]), 
    sum([len(files) for r, d, files in os.walk(valid_folder)]))
      )

BEFORE: 
 Src: 2053 files, Train: 0 files, Validation: 0 files


## NOTE: After running the cell below, no coming back - the files are moved. 

In [13]:
for category in tqdm_notebook(categories, total=len(categories)):
    print ('Category: {}'.format(category))
    src_cat_dir = os.path.join(src_folder, str(category))
    train_cat_dir = os.path.join(train_folder, str(category))
    valid_cat_dir = os.path.join(valid_folder, str(category))
    
    # create category directory in train/
    if not os.path.exists(train_cat_dir):
        os.makedirs(train_cat_dir)
    
    # create category directory in valid/
    if not os.path.exists(valid_cat_dir):
        os.makedirs(valid_cat_dir)

    # then move files from src to train/
    files = [f for f in os.listdir(src_cat_dir) if os.path.isfile(os.path.join(src_cat_dir, f))] 
    
    for i, file in enumerate(np.random.choice(files, 10)):
        if i < 8:
            shutil.copy(os.path.join(src_cat_dir, file), os.path.join(train_cat_dir, file))
        else:
            shutil.copy(os.path.join(src_cat_dir, file), os.path.join(valid_cat_dir, file))

Category: Razor_E_Glow_Electric_Scooter
Category: Fisher-Price_Laugh_Learn_Jumperoo
Category: Razor_Jetts_Adjustable_Skates
Category: Disney_Pixar_Cars_3_Willy_s_Butte_Transforming_Track_Set
Category: Thomas_Friends_Jumbo_Mega_Playmat_with_Vehicle
Category: Radio_Flyer_Wagon
Category: VTech_Go_Go_Smart_Wheels_Fire_Command_Rescue_Center
Category: LeapFrog_LeapStart_Interactive_Learning_System
Category: Marvel_Spider-Man_6_Volt_Ride_On
Category: Fisher-Price_Little_People_Pony_Stable
Category: Baby_Einstein_Caterpillar_and_Friends_Activity_Gym
Category: LeapFrog_Learn_Groove_Musical_Table
Category: VTech_2-in-1_Learn_Zoom_Motorbike
Category: PAW_Patrol_Air_Patroller_Plane
Category: VTech_Sit-to-Stand_Learning_Walker
Category: VTech_Alphabet_Apple
Category: LeapFrog_Shapes_Sharing_Picnic_Basket
Category: Fisher-Price_Kick_and_Play_Piano_Gym
Category: Paw_Patrol_Jungle_Rescue_Paw_Terrain_Vehicle
Category: Baby_Einstein_Sea_Dreams_Soother
Category: VTech_Disney_Minnie_ABC_Fashion_Purse
Cate

In [15]:
print ("AFTER: \n Src: {} files, Train: {} files, Validation: {} files".format(
    sum([len(files) for r, d, files in os.walk(src_folder)]), 
    sum([len(files) for r, d, files in os.walk(train_folder)]), 
    sum([len(files) for r, d, files in os.walk(valid_folder)]))
      )

AFTER: 
 Src: 2053 files, Train: 179 files, Validation: 51 files


In [16]:
df = pd.DataFrame(index=categories)
for category in tqdm_notebook(categories, total=len(categories)):
    df.loc[category, 'train'] = len(os.listdir(os.path.join(train_folder, str(category))))
    df.loc[category, 'valid'] = len(os.listdir(os.path.join(valid_folder, str(category))))




In [17]:
df

Unnamed: 0,train,valid
Razor_E_Glow_Electric_Scooter,5.0,2.0
Fisher-Price_Laugh_Learn_Jumperoo,5.0,2.0
Razor_Jetts_Adjustable_Skates,5.0,2.0
Disney_Pixar_Cars_3_Willy_s_Butte_Transforming_Track_Set,7.0,2.0
Thomas_Friends_Jumbo_Mega_Playmat_with_Vehicle,6.0,2.0
Radio_Flyer_Wagon,8.0,2.0
VTech_Go_Go_Smart_Wheels_Fire_Command_Rescue_Center,8.0,2.0
LeapFrog_LeapStart_Interactive_Learning_System,7.0,2.0
Marvel_Spider-Man_6_Volt_Ride_On,4.0,2.0
Fisher-Price_Little_People_Pony_Stable,6.0,2.0
