## 訓練データの読み込みとHDF5への変換

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.datasets import fashion_mnist
from sklearn.preprocessing import OneHotEncoder

### 訓練データの読み込み

In [2]:
from pathlib import Path

# Output location for the converted dataset: ./data/train_data.hdf5
path = Path(".")
dir_path_data = path.joinpath("data")
file_path_hdf5 = dir_path_data.joinpath("train_data.hdf5")

# Load the Fashion-MNIST train/test splits (images and integer labels).
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Insert a channel axis right after the sample axis: (N, 28, 28) -> (N, 1, 28, 28).
x_train = np.expand_dims(x_train, axis=1)
x_test = np.expand_dims(x_test, axis=1)

# Report the resulting array shapes.
print("x_train.shape=", x_train.shape)
print("y_train.shape=", y_train.shape)
print("x_test.shape=", x_test.shape)
print("y_test.shape=", y_test.shape)

x_train.shape= (60000, 1, 28, 28)
y_train.shape= (60000,)
x_test.shape= (10000, 1, 28, 28)
y_test.shape= (10000,)


### データ拡張

In [3]:
import random

import Augmentor

# Fashion-MNIST labels are the integers 0-9. Fixing the one-hot width keeps the
# augmented labels column-aligned with the training labels even if the random
# sample happens to miss a class (an encoder fitted on the sample would not).
NUM_CLASSES = 10
# How many augmented samples to draw, and the seed used for the draw.
NUM_AUGMENTED = 20000
AUGMENT_SEED = 0

# NOTE(review): Augmentor appears to draw from Python's `random` module, so
# seeding it here is intended to make the augmented set repeatable across
# Restart & Run All -- confirm against the installed Augmentor version.
random.seed(AUGMENT_SEED)

# Build an in-memory pipeline over the training images and their integer labels.
p = Augmentor.DataPipeline(x_train, y_train.tolist())
# Zoom (factor 1.1-1.2), applied with probability 0.5.
p.zoom(probability=0.5, min_factor=1.1, max_factor=1.2)
# Optional extra augmentations, currently disabled:
# # rotation
# p.rotate(probability=0.5, max_left_rotation=10, max_right_rotation=10)
# # distortion
# p.random_distortion(probability=0.5, grid_width=2, grid_height=2, magnitude=8)

# Draw the augmented samples together with their labels.
aug_x, aug_label = p.sample(NUM_AUGMENTED)
aug_x = np.array(aug_x)

# One-hot encode by looking up rows of the identity matrix. This replaces
# OneHotEncoder(sparse=False), whose `sparse` argument no longer exists in
# current scikit-learn releases, and yields the same float32 (N, 10) layout.
aug_label = np.asarray(aug_label, dtype=np.int64).reshape(-1)
aug_label = np.eye(NUM_CLASSES, dtype=np.float32)[aug_label]

print(aug_x.shape)
print(aug_label.shape)

(20000, 1, 28, 28)
(20000, 10)


In [4]:
# One-hot encode the integer labels.
# Fashion-MNIST labels are the integers 0-9, so the encoding width is fixed
# instead of being inferred separately from each split. (The previous code
# re-fitted the encoder on the test labels, and relied on
# OneHotEncoder(sparse=False), which current scikit-learn no longer accepts.)
NUM_CLASSES = 10

# Guard against re-running this cell on already-encoded / already-concatenated
# arrays, which would otherwise fail with an obscure indexing error.
if y_train.ndim != 1 or y_test.ndim != 1:
    raise RuntimeError(
        "y_train / y_test are not 1-D integer labels; "
        "re-run the data loading cell before this one."
    )

# Row i of the identity matrix is the one-hot vector for class i.
one_hot_table = np.eye(NUM_CLASSES, dtype=np.float32)
y_train = one_hot_table[y_train]
y_test = one_hot_table[y_test]

# Append the augmented samples (and their labels) to the training set.
x_train = np.concatenate([x_train, aug_x], 0)
y_train = np.concatenate([y_train, aug_label], 0)

print("x_train.shape=", x_train.shape)
print("y_train.shape=", y_train.shape)

x_train.shape= (80000, 1, 28, 28)
y_train.shape= (80000, 10)


### HDF5へ変換

In [5]:
import h5py

# Opening the file in 'w' mode fails if the target directory is missing, and
# nothing earlier in the notebook creates ./data -- so create it here.
file_path_hdf5.parent.mkdir(parents=True, exist_ok=True)

# Write the training set to HDF5 ('w' creates the file, replacing any existing one).
with h5py.File(file_path_hdf5, mode='w') as f:
    # Images are stored as float32; the one-hot labels (0/1 values) as int8.
    f.create_dataset('x_train', x_train.shape, dtype=np.float32)
    f.create_dataset('y_train', y_train.shape, dtype=np.int8)

    # Copy the in-memory arrays into the datasets (converted to the dataset dtype).
    f['x_train'][...] = x_train
    f['y_train'][...] = y_train

In [6]:
# Re-open the file read-only and confirm the stored dataset shapes.
with h5py.File(file_path_hdf5, mode='r') as f:
    dset, dset_labels = f['x_train'], f['y_train']
    print(dset.shape, dset_labels.shape, sep="\n")


(80000, 1, 28, 28)
(80000, 10)
