In [25]:
import h5py
import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm import tqdm
# NOTE: the star import is kept in its original position so that the names
# imported after it still take precedence over anything it exports.
from image_preprocessing.PicHandler import *
import os
import json
from typing import Tuple

In [7]:
# Root directory of the local dataset copy. The trailing separator is required
# because the sub-directory names below are appended by plain concatenation.
# Every backslash is escaped ('\H' is an invalid escape sequence).
dataset_path = 'D:\\projects\\datasets\\HKR\\20200923_Dataset_Words_Public\\'
ann_path = dataset_path + 'ann'  # directory of the JSON annotation files
img_path = dataset_path + 'img'  # directory of the '.jpg' images
# HDF5 file that the generated dataset is written to.
dataset_filename = '../../resources/dataset_light/dataset.hdf5'

In [8]:
def read_json(ann_path_name: str) -> Tuple[str, str]:
    """Read one annotation file and return its ``(description, name)`` pair.

    Args:
        ann_path_name: Path to a JSON file whose top-level object contains the
            keys ``"description"`` and ``"name"``.

    Returns:
        The pair <transcription, file name>, i.e.
        ``(data["description"], data["name"])``.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file content is not valid JSON.
        KeyError: One of the two required keys is missing.
    """
    # Binary mode lets the json module auto-detect the UTF-8/16/32 encoding.
    with open(ann_path_name, 'rb') as file:
        # Parse the whole file rather than only its first line, so that
        # pretty-printed (multi-line) JSON is handled as well.
        data = json.load(file)
    return data["description"], data["name"]

In [9]:
def show(img):
    """Draw ``img`` with matplotlib after casting it to unsigned 8-bit values."""
    as_uint8 = img.astype(np.uint8)
    plt.imshow(as_uint8)

In [58]:
def pad(arr, new_shape):
    """Centre a 2-D array inside a canvas of shape ``new_shape`` filled with 255.

    The missing rows/columns are split evenly between both sides; when the
    amount is odd, the extra row goes to the bottom and the extra column to
    the right.

    Args:
        arr: 2-D array no larger than ``new_shape`` along either axis.
        new_shape: Target ``(rows, columns)``.

    Returns:
        A new array of shape ``new_shape`` containing ``arr`` surrounded by 255.
    """
    missing_rows = new_shape[0] - arr.shape[0]
    missing_cols = new_shape[1] - arr.shape[1]

    # divmod yields the "before" amount and the odd leftover in one step.
    top, row_leftover = divmod(missing_rows, 2)
    left, col_leftover = divmod(missing_cols, 2)

    widths = ((top, top + row_leftover), (left, left + col_leftover))
    return np.pad(arr, widths, mode='constant', constant_values=(255, ))

In [59]:
# (rows, columns) that every image is brought to before being stored.
default_shape = (128, 1024)


def _fit_within(shape, max_shape):
    """Return the (rows, columns) ``shape`` must shrink to so it fits ``max_shape``, keeping aspect ratio."""
    rows, cols = shape[0], shape[1]
    max_rows, max_cols = max_shape
    if rows > max_rows:
        rows, cols = max_rows, max(1, int(cols * max_rows / rows))
    # Checked independently of the branch above: an image can still be too
    # wide after its height has been reduced.
    if cols > max_cols:
        rows, cols = max(1, int(rows * max_cols / cols)), max_cols
    return rows, cols


def create_dataset(n_samples):
    """Build ``n_samples`` padded word images together with their transcriptions.

    Annotation files are drawn at random (without replacement) from
    ``ann_path``. Each referenced image is loaded through ``PicHandler``,
    binarised with its adaptive filter, shrunk (aspect ratio preserved) when it
    exceeds ``default_shape`` and finally padded to ``default_shape``.

    Args:
        n_samples: Number of annotation files to sample.

    Returns:
        ``(images, labels)``: a list of arrays of shape ``default_shape`` and
        the list of the corresponding transcriptions.

    Raises:
        ValueError: ``n_samples`` exceeds the number of files in ``ann_path``
            (raised by ``random.sample``).
    """
    images, labels = [], []
    files = random.sample(os.listdir(ann_path), n_samples)
    for filename in tqdm(files):
        word, img_name = read_json(os.path.join(ann_path, filename))
        ph = PicHandler(os.path.join(img_path, img_name + '.jpg'))
        ph.apply_adaptive_bin_filter()

        arr = ph.get_image()
        target_shape = _fit_within(arr.shape, default_shape)
        if target_shape != tuple(arr.shape[:2]):
            # A single resize covers both "too tall" and "too wide".
            # NOTE(review): the recorded outputs show uint8 for images[0] but
            # float values in the stored dataset, which suggests `resize`
            # returns floats - confirm its dtype and value range.
            arr = resize(arr, target_shape)

        images.append(pad(arr, default_shape))
        labels.append(word)

    return images, labels

In [60]:
# Build the in-memory dataset from 2000 randomly sampled annotation files.
images, labels = create_dataset(2000)

100%|██████████| 2000/2000 [03:27<00:00,  9.65it/s]


In [83]:
# Write the images and their transcriptions to the HDF5 file (overwriting it).
with h5py.File(dataset_filename, 'w') as f:
    # NOTE(review): if the list mixes uint8 and float arrays, the stored
    # dataset is promoted to float - confirm the intended dtype.
    f.create_dataset('images', data=images, compression="gzip", compression_opts=4)
    # Store the text labels as variable-length UTF-8 strings: a plain list of
    # `str` becomes a NumPy 'U' array, which h5py does not support.
    utf8_dtype = h5py.string_dtype(encoding='utf-8')
    f.create_dataset('labels', data=np.array(labels, dtype=utf8_dtype), dtype=utf8_dtype)

In [81]:
# Inspect the element type of the first image array.
images[0].dtype

dtype('uint8')

In [84]:
# Re-open the written file read-only and print the first stored image.
with h5py.File(dataset_filename, 'r') as f:
    data = f['images']
    print(data[0])

[[255. 255. 255. ... 255. 255. 255.]
 [255. 255. 255. ... 255. 255. 255.]
 [255. 255. 255. ... 255. 255. 255.]
 ...
 [255. 255. 255. ... 255. 255. 255.]
 [255. 255. 255. ... 255. 255. 255.]
 [255. 255. 255. ... 255. 255. 255.]]
