# Building the HDF5 dataset

In [None]:
from config import dogs_vs_cats_config as config

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras_example.preprocessing.aspectawareprocessor import AspectAwareProcessor
from keras_example import HDF5DatasetWriter

from imutils import paths

import numpy as np
import progressbar
import json
import cv2
import os

In [None]:
# Collect every image path found under the configured images directory.
trainPaths = list(paths.list_images(config.IMAGES_PATH))

# The class label is the part of the file name before the first "."
# (presumably names look like "<class>.<id>.<ext>" -- TODO confirm against
# the dataset on disk).
# NOTE: the path separator lives at `os.path.sep`; `os.paths` does not exist
# and raised AttributeError here.
trainLabels = [p.split(os.path.sep)[-1].split(".")[0] for p in trainPaths]

# Encode the string class names as integer class indices.
le = LabelEncoder()
trainLabels = le.fit_transform(trainLabels)

In [None]:
def _stratified_holdout(imagePaths, imageLabels, holdoutSize):
    """Split off `holdoutSize` samples, stratified on the labels.

    Returns the 4-item result of `train_test_split`:
    (remaining paths, held-out paths, remaining labels, held-out labels).
    The seed is fixed at 42 so the split is reproducible.
    """
    return train_test_split(
        imagePaths,
        imageLabels,
        test_size=holdoutSize,
        stratify=imageLabels,
        random_state=42,
    )


# First carve the test set out of the full image list ...
trainPaths, testPaths, trainLabels, testLabels = _stratified_holdout(
    trainPaths, trainLabels, config.NUM_TEST_IMAGES
)

# ... then carve the validation set out of what is left for training.
trainPaths, valPaths, trainLabels, valLabels = _stratified_holdout(
    trainPaths, trainLabels, config.NUM_VAL_IMAGES
)



In [None]:
# One entry per split: (split name, image paths, labels, output HDF5 file).
datasets = [
    ('train', trainPaths, trainLabels, config.TRAIN_HDF5),
    ('val', valPaths, valLabels, config.VAL_HDF5),
    ('test', testPaths, testLabels, config.TEST_HDF5),
]

# Every image is passed through this preprocessor before being stored;
# the writer below is sized for 256x256x3 images.
aap = AspectAwareProcessor(256,256)

# Per-image channel means, collected from the training split only.
(R,G,B) = ([],[],[])

# NOTE: the loop variable is deliberately NOT called `paths` -- that name is
# the `imutils.paths` module imported at the top of the notebook, and
# rebinding it here broke any later re-run of the cell that lists the images.
for (dType, imagePaths, labels, outputPath) in datasets:
    print(f"[INFO] building {outputPath}")
    writer = HDF5DatasetWriter((len(imagePaths), 256,256,3), outputPath)

    # Close the writer even when an image fails part-way through, so the
    # output file is not left open.
    try:
        widgets = [
            f"Building Dataset {dType}: ",
            progressbar.Percentage(),
            " ",
            progressbar.Bar(),
            " ",
            progressbar.ETA()
        ]

        pbar = progressbar.ProgressBar(
            maxval=len(imagePaths),
            widgets=widgets
            ).start()

        # `done` counts images processed so far (1-based) and drives the bar.
        for (done, (path,label)) in enumerate(zip(imagePaths, labels), start=1):
            # NOTE(review): cv2.imread returns None for a file it cannot
            # decode; that case is not handled here.
            image = cv2.imread(path)
            image = aap.preprocess(image)

            if dType == "train":
                # OpenCV loads images in BGR channel order, hence (b, g, r).
                (b,g,r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)

            writer.add([image], [label])
            # update() without a value only redraws; pass the count so the
            # bar actually advances.
            pbar.update(done)

        pbar.finish()
    finally:
        writer.close()

In [None]:
print("[INFO] serializing means...")

# Average the per-image channel means collected earlier into a single
# value per channel, keyed "R", "G", "B" in that order.
D = {
    channel: np.mean(values)
    for (channel, values) in (("R", R), ("G", G), ("B", B))
}

# Store the means as JSON at the configured location.
with open(config.DATASET_MEAN, "w") as meanFile:
    json.dump(D, meanFile)