# Prepare data from labels.csv -> subdirectories for Keras

Convert a set of training images stored in a single directory with an accompanying labels.csv (see the "Dog Breeds" Kaggle competition) into the one-subfolder-per-class directory structure required by Keras data generators.

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import os
from tqdm import tqdm, tqdm_notebook

In [44]:
from subprocess import check_output
# Where the original Kaggle download lives.
src_folder = 'kaggle/train/'      # flat directory holding every training image
labels_loc = 'kaggle/labels.csv'  # CSV read later for the `id` and `breed` columns
ext = '.jpg'                      # appended to each image id to form its file name

### Prep for **development** or **actual training**?

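To prepare only a limited number of categories for **development**, set `limit_categories` to N (any non-zero value) and point `train_folder` and `validation_folder` at separate development directories. Set `limit_categories = 0` to prepare the full dataset for actual training.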

In [45]:
# --- Development subset (active) ---------------------------------------------
# Number of breeds to sample; any non-zero value enables the limited mode.
limit_categories = 16
# Output roots used for the development subset.
train_folder = 'devel/data/train'
validation_folder = 'devel/data/validation'

# --- Full training run (swap these in for the values above) ------------------
# limit_categories = 0
# train_folder      = 'data/train'
# validation_folder = 'data/validation'

# Per-breed caps on moved files; only consulted when limit_categories is non-zero.
limit_train_samples = 5
limit_valid_samples = 2

In [46]:
# Fraction of the labelled images that is routed to the validation folder.
validation_split = 0.2

# Make sure both output roots exist (os.makedirs also creates missing parents).
for output_root in (train_folder, validation_folder):
    if os.path.exists(output_root):
        continue
    os.makedirs(output_root)

In [47]:
import random

# Create categories folders
labels = pd.read_csv(labels_loc)

# Sorted list of the distinct breed names that appear in the label file.
categories = sorted(labels['breed'].dropna().unique())

if limit_categories:
    # Development mode: keep only a random subset of the breeds.
    # A dedicated, seeded generator makes the subset identical on every run
    # (the global NumPy seed is only set in a later cell) and leaves the
    # global random state untouched.
    CATEGORY_SEED = 2018
    category_rng = np.random.RandomState(CATEGORY_SEED)
    # Never ask for more breeds than exist; choice(replace=False) would raise.
    n_selected = min(limit_categories, len(categories))
    # .tolist() keeps `categories` a plain list of str in both modes.
    categories = category_rng.choice(categories, n_selected, replace=False).tolist()

# One sub-folder per selected breed under both the train and validation roots.
for category in tqdm_notebook(categories):
    for parent_folder in (train_folder, validation_folder):
        category_folder = os.path.join(parent_folder, str(category))
        if not os.path.exists(category_folder):
            os.makedirs(category_folder)




In [48]:
# Seeded random split: draw one uniform number in [0, 1) per label row.
# Rows whose draw falls below (1 - validation_split) are assigned to train,
# all remaining rows to validation.
SEED = 2018
np.random.seed(seed=SEED)
rnd = np.random.random(len(labels))
train_cutoff = 1 - validation_split
train_idx = rnd < train_cutoff
valid_idx = ~train_idx  # exact complement of the train mask

## NOTE: Running the cell below cannot be undone - the image files are moved (not copied) out of the source folder.

In [49]:
def _count_files(folder):
    """Return the number of files found anywhere below `folder` (recursive)."""
    return sum(len(files) for _, _, files in os.walk(folder))


def _print_file_counts(stage):
    """Print the current file counts of the source, train and validation trees."""
    print ("{}: \n Src: {} files, Train: {} files, Validation: {} files".format(
        stage,
        _count_files(src_folder),
        _count_files(train_folder),
        _count_files(validation_folder)))


def _move_image(image_id, breed, dest_root):
    """Move one image from src_folder into dest_root/<breed>/.

    Uses os.rename, so the file leaves the source folder; the call raises if
    the source file is no longer there (e.g. when this cell is run twice).
    """
    filename = str(image_id) + ext
    os.rename(os.path.join(src_folder, filename),
              os.path.join(dest_root, breed, filename))


_print_file_counts("BEFORE")

# Per-breed tallies of the files moved into each split.
category_train_count = dict()
category_valid_count = dict()

# Set membership is O(1) per row; `categories` may be a list or a NumPy array.
selected_breeds = set(str(c) for c in categories) if limit_categories else None

for i, row in tqdm_notebook(enumerate(labels.itertuples()), total=len(labels)):
    # Pick the destination, tally and cap that belong to this row's split.
    if train_idx[i]:
        dest_root, moved_count, sample_limit = train_folder, category_train_count, limit_train_samples
    else:
        dest_root, moved_count, sample_limit = validation_folder, category_valid_count, limit_valid_samples

    if limit_categories:
        # Development mode: only the selected breeds, and at most
        # `sample_limit` files per breed and split (a limit of 0 moves none).
        if row.breed not in selected_breeds:
            continue
        if moved_count.get(row.breed, 0) >= sample_limit:
            continue

    _move_image(row.id, row.breed, dest_root)
    moved_count[row.breed] = moved_count.get(row.breed, 0) + 1

_print_file_counts("AFTER")

BEFORE: 
 Src: 10222 files, Train: 0 files, Validation: 0 files



AFTER: 
 Src: 10110 files, Train: 80 files, Validation: 32 files
