## Serializing the dataset

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tesis_lib.preprocessing.aspectawareprocessor import AspectAwareProcessor
from tesis_lib.io.hdf5datasetwriter import HDF5DatasetWriter

from imutils import paths
import numpy as np
import progressbar
import json
import cv2
import os

In [None]:
# Root directory of the dataset; the "Training" and "Test" folders (one
# sub-folder per class) and the generated "hdf5" folder live under it.
DATASET_PATH = './DB'

# Side length in pixels: passed as both arguments to AspectAwareProcessor and
# used for the stored image shape (IM_SIZE, IM_SIZE, 3).
IM_SIZE = 256
# Number of target classes.
# NOTE(review): not referenced in the visible cells; presumably used by later code, confirm.
NUM_CLASSES = 2

In [None]:
# Recreate the HDF5 output directory from scratch so that files left over from
# a previous run cannot be mixed with the ones produced by this run.
import shutil

DATASET_HDF5_PATH = os.path.sep.join([DATASET_PATH, 'hdf5'])
if os.path.exists(DATASET_HDF5_PATH):
    # shutil.rmtree instead of the `!rm -r` shell escape: it works outside
    # IPython and on any OS, and the path is never interpolated into a shell
    # command line (spaces or metacharacters in the path are harmless).
    shutil.rmtree(DATASET_HDF5_PATH)

os.mkdir(DATASET_HDF5_PATH)

In [None]:
# Build the stratified training/validation split from the "Training" folder.

# Preprocessor applied to every image before it is written to HDF5.
aap = AspectAwareProcessor(IM_SIZE, IM_SIZE)
# Per-image channel means of the training images; filled while the training
# split is serialized and averaged afterwards.
(R, G, B) = ([], [], [])

path = os.path.sep.join([DATASET_PATH, "Training"])
# os.listdir (and the directory walk behind list_images) return entries in
# arbitrary, filesystem-dependent order. Sorting both makes the input to
# train_test_split deterministic, so random_state really yields the same split
# on every machine and every run.
class_paths = [
    os.path.sep.join([path, im_class]) for im_class in sorted(os.listdir(path))
]

imagePaths = []
for cp in class_paths:
    imagePaths.extend(sorted(paths.list_images(cp)))
# The class label is the name of the directory that directly contains the image.
labels = [pt.split(os.path.sep)[-2] for pt in imagePaths]

# Map class names to integer ids; `le` is kept so the same mapping can be
# applied to other splits.
le = LabelEncoder()
labels = le.fit_transform(labels)

# train_size/test_size are absolute sample counts: 450 training and 100
# validation images, stratified by class. The folder must therefore contain at
# least 550 images; any surplus is left out of both splits.
(trainPaths, valPaths, trainLabels, valLabels) = train_test_split(
    imagePaths,
    labels,
    train_size=450,
    test_size=100,
    stratify=labels,
    random_state=42)

# Sanity check: every path has exactly one label.
assert trainLabels.shape[0] == len(trainPaths)
assert valLabels.shape[0] == len(valPaths)

In [None]:
# Report the size of each split (the second line previously mislabelled the
# validation count as "Training").
print(f"Training data points = {trainLabels.shape[0]}")
print(f"Validation data points = {valLabels.shape[0]}")

Training data points = 450
Training data points = 100


In [None]:
# Draw the stratified test sample from the "Test" folder.
path = os.path.sep.join([DATASET_PATH, "Test"])
# Sorted for the same reason as in the training cell: directory listings come
# back in arbitrary order, which would make the sample below irreproducible
# despite the fixed random_state.
class_paths = [
    os.path.sep.join([path, im_class]) for im_class in sorted(os.listdir(path))
]

imagePaths = []
for cp in class_paths:
    imagePaths.extend(sorted(paths.list_images(cp)))
# The class label is the name of the directory that directly contains the image.
labels = [pt.split(os.path.sep)[-2] for pt in imagePaths]

# Reuse the encoder fitted on the training labels instead of fitting a new one:
# this guarantees that a class name maps to the same integer id in every split,
# and raises if the test folder contains a class never seen in training.
labels = le.transform(labels)

# train_test_split is used only to draw a stratified sample of 124 images; the
# 450-image "train" part is discarded. The folder must contain at least
# 450 + 124 images for these absolute sizes to be valid.
(_, testPaths, _, testLabels) = train_test_split(
    imagePaths,
    labels,
    train_size=450,
    test_size=124,
    stratify=labels,
    random_state=42)

# Sanity check: every path has exactly one label.
assert testLabels.shape[0] == len(testPaths)

In [None]:
# Report the size of the test split (previously mislabelled as "Training").
print(f"Test data points = {testLabels.shape[0]}")

Training data points = 124


In [None]:
# Serialize each split into its own HDF5 file and, for the training split,
# collect the per-image channel means.
# Each entry: (split name, image paths, integer labels, output file).
DATA_PATHS = [
    ('train', trainPaths, trainLabels,
     os.path.sep.join([DATASET_HDF5_PATH, 'Training.hdf5'])),
    ('val', valPaths, valLabels,
     os.path.sep.join([DATASET_HDF5_PATH, 'Validation.hdf5'])),
    ('test', testPaths, testLabels,
     os.path.sep.join([DATASET_HDF5_PATH, 'Testing.hdf5'])),
]

for (dType, imagePaths, labels, output) in DATA_PATHS:
    # Start from a clean file for this split.
    if os.path.exists(output):
        os.remove(output)
    # NOTE(review): the writer is sized for len(imagePaths) images, but skipped
    # images are never added, so fewer rows than declared may be written.
    # Whether the remainder stays as unfilled rows depends on HDF5DatasetWriter;
    # confirm before relying on the stored dataset length.
    writer = HDF5DatasetWriter((len(imagePaths), IM_SIZE, IM_SIZE, 3), output)

    widgets = [
        f"Building {dType} Set: ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.ETA()
    ]

    pbar = progressbar.ProgressBar(
        maxval=len(imagePaths),
        widgets=widgets
    ).start()

    # try/finally so the writer is closed even if an unexpected error
    # interrupts the loop.
    try:
        for (i, (path, label)) in enumerate(zip(imagePaths, labels)):
            image = cv2.imread(path)
            if image is None:
                # cv2.imread reports an unreadable or missing file by returning
                # None instead of raising, so check for it explicitly.
                # `display` is the IPython/Jupyter built-in.
                display(f"[WARNING] Skipped {os.path.basename(path)}")
                continue

            try:
                image = aap.preprocess(image)
            except Exception:
                # Best effort: one image that cannot be preprocessed must not
                # abort the whole split.
                display(f"[WARNING] Skipped {os.path.basename(path)}")
                continue

            if dType == "train":
                # cv2.imread loads channels in BGR order, hence the (b, g, r)
                # unpacking of the per-channel means.
                (b, g, r) = cv2.mean(image)[:3]
                R.append(r)
                G.append(g)
                B.append(b)

            writer.add([image], [label])
            pbar.update(i)

        pbar.finish()
    finally:
        writer.close()

Building train Set: 100% |######################################| Time: 0:00:29
Building val Set: 100% |########################################| Time: 0:00:08
Building test Set: 100% |#######################################| Time: 0:00:11


In [None]:
# Average the per-image channel means collected from the training split and
# store them as JSON next to the HDF5 files.
print("[INFO] serializing means...")
if not (R and G and B):
    # np.mean of an empty list is NaN, which json would write as the literal
    # `NaN` (not valid JSON); fail loudly instead.
    raise ValueError(
        "no training image means were collected; "
        "run the serialization cell before computing the means")

# float() turns the NumPy scalars into plain Python floats for serialization.
D = {
    "R": float(np.mean(R)),
    "G": float(np.mean(G)),
    "B": float(np.mean(B))
}
# Built from DATASET_HDF5_PATH so the file follows the dataset root instead of
# a hard-coded './DB/hdf5' location.
with open(os.path.sep.join([DATASET_HDF5_PATH, 'diat_ret.json']), "w") as f:
    json.dump(D, f)

[INFO] serializing means...
