# Prepare data from labels.csv -> subdirectories for Keras

Convert a set of training images stored in a single directory with an accompanying `labels.csv` (see the "Dog Breeds" Kaggle competition) into the one-subdirectory-per-class folder structure required by Keras data generators.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import os
from tqdm import tqdm, tqdm_notebook

In [None]:
from subprocess import check_output
# Image file extension, the folder holding the raw images, and the two
# target trees (train / validation) that the images are sorted into.
ext = '.jpg'
src_folder = 'data/'
train_folder = 'data/train'
validation_folder = 'data/validation'

# Make sure both target folders exist before anything is moved into them.
for target_dir in (train_folder, validation_folder):
    if os.path.exists(target_dir):
        continue
    os.makedirs(target_dir)

In [None]:
# Create one sub-folder per breed under BOTH the train and the validation
# folder, so that every image has a destination whichever split it lands in.
labels = pd.read_csv('data/labels.csv')

# Distinct values of the 'breed' column (the index of the group-by result).
categories = list(labels.groupby('breed').count().index)

for category in tqdm_notebook(categories):
    for parent_folder in (train_folder, validation_folder):
        # exist_ok=True makes the cell safe to re-run. Each parent is handled
        # on its own, so a breed folder that already exists under train can no
        # longer cause the validation one to be skipped.
        os.makedirs(os.path.join(parent_folder, str(category)), exist_ok=True)

In [None]:
# Random split: draw one uniform number in [0, 1) per row of `labels`;
# rows whose draw is below 0.8 go to train, the rest go to validation.
# The seed is fixed so the split is reproducible.
SEED = 2018
np.random.seed(SEED)
rnd = np.random.random(size=len(labels))
valid_idx = rnd >= 0.8
train_idx = ~valid_idx

## NOTE: Running the cell below is irreversible - the image files are moved, not copied.

In [None]:
def _count_images(folder, recursive=True):
    """Count the files whose name ends with `ext` inside `folder`.

    With recursive=False only the files directly inside `folder` are counted.
    This matters for the source folder: the train and validation folders are
    sub-folders of it, so a recursive count would include the images that
    have already been moved and would never change.
    """
    total = 0
    for _root, _dirs, files in os.walk(folder):
        total += sum(1 for name in files if name.endswith(ext))
        if not recursive:
            break  # the first os.walk entry is `folder` itself
    return total


def _report_counts(stage):
    """Print how many images currently sit in the source, train and validation folders."""
    print("{}: \n Src: {} files, Train: {} files, Validation: {} files".format(
        stage,
        _count_images(src_folder, recursive=False),
        _count_images(train_folder),
        _count_images(validation_folder)))


_report_counts("BEFORE")

# Move every labelled image into <split folder>/<breed>/, where the split is
# given by the boolean mask `train_idx` (True -> train, False -> validation).
for row, to_train in tqdm_notebook(zip(labels.itertuples(), train_idx), total=len(labels)):
    filename = str(row.id) + ext
    dest_dir = os.path.join(train_folder if to_train else validation_folder, row.breed)
    # Create the breed folder on demand so the move cannot fail because a
    # destination folder is missing.
    os.makedirs(dest_dir, exist_ok=True)
    os.rename(os.path.join(src_folder, filename), os.path.join(dest_dir, filename))

_report_counts("AFTER")