In [None]:
import struct
from PIL import Image
import numpy as np
import pandas as pd

def read_record_ETL8G(f):
    """Read one fixed-size record from ``f`` and decode its embedded image.

    Exactly 8199 bytes are consumed and unpacked as big-endian fields. The
    8128-byte payload (tuple index 14) is decoded as a 128x127 image packed
    at 4 bits per pixel and then converted to PIL mode ``'L'``.

    Args:
        f: Binary file-like object positioned at the start of a record.

    Returns:
        tuple: The unpacked fields with the decoded PIL image appended as
        the final element, so callers can reach it via ``record[-1]``.

    Raises:
        struct.error: If fewer than 8199 bytes could be read.
    """
    raw_record = f.read(8199)
    fields = struct.unpack('>2H8sI4B4H2B30x8128s11x', raw_record)
    # The '30x' / '11x' pad bytes produce no tuple entries, which is why the
    # 8128s payload lands at index 14.
    packed_pixels = fields[14]
    glyph = Image.frombytes('F', (128, 127), packed_pixels, 'bit', 4).convert('L')
    return (*fields, glyph)

def read_kanji():
    """Collect the non-hiragana images of the ETL8G files into ``kanji.npz``.

    Reads ``ETL8G/ETL8G_01`` through ``ETL8G/ETL8G_32``. Each file is consumed
    as 5 consecutive datasets of 956 records. Records whose 8-byte field at
    index 2 contains ``b'.HIRA'`` or ``b'.WO.'`` are skipped. Every other
    record's image is stored at ``kanji[char, (file - 1) * 5 + dataset]``,
    where ``char`` counts the kept records within the current dataset.

    The resulting uint8 array of shape ``[883, 160, 127, 128]`` is written
    with ``np.savez_compressed``; as the only positional array it is stored
    under NumPy's default key ``arr_0``.

    Raises:
        FileNotFoundError: If one of the ETL8G files is missing.
        struct.error: If a file ends before all expected records were read.
    """
    # These must be NumPy calls: pandas exposes no top-level zeros()/uint8.
    kanji = np.zeros([883, 160, 127, 128], dtype=np.uint8)
    for i in range(1, 33):
        filename = 'ETL8G/ETL8G_{:02d}'.format(i)
        with open(filename, 'rb') as f:
            for dataset in range(5):
                # Row index restarts for every dataset; only kept records advance it.
                char = 0
                for _ in range(956):
                    r = read_record_ETL8G(f)
                    if not (b'.HIRA' in r[2] or b'.WO.' in r[2]):
                        # r[-1] is the PIL image appended by read_record_ETL8G.
                        kanji[char, (i - 1) * 5 + dataset] = np.array(r[-1])
                        char += 1
    np.savez_compressed("kanji.npz", kanji)

# Build kanji.npz from the raw ETL8G files (reads all 32 input files).
read_kanji()

In [None]:
import skimage.transform
import numpy as np
from sklearn.model_selection import train_test_split

# Number of kanji classes kept, and the target image size after resizing.
kanji = 879
rows = 48
cols = 48
samples_per_char = 160
# Fixed seed so the saved train/test split is identical on every run.
split_seed = 42

# Merge the leading two axes of the saved array into a single image axis,
# so image index = character * 160 + sample.
kan = np.load("kanji.npz")['arr_0'].reshape([-1, 127, 128]).astype(np.float32)

# Scale all pixel values by the global maximum.
kan = kan/np.max(kan)

train_images = np.zeros([kanji * samples_per_char, rows, cols], dtype=np.float32)

arr = np.arange(kanji)
train_labels = np.repeat(arr, samples_per_char)

# 4 characters were actually hiragana, so delete these 4 extras.
# Every kept character is shifted down by the number of dropped characters
# that precede it, which keeps train_images aligned with train_labels.
hiragana_chars = (88, 219, 349, 457)
kept_chars = [c for c in range(kanji + 4) if c not in hiragana_chars]
for new_char, old_char in enumerate(kept_chars):
    src_start = old_char * samples_per_char
    dst_start = new_char * samples_per_char
    for sample in range(samples_per_char):
        train_images[dst_start + sample] = skimage.transform.resize(
            kan[src_start + sample], (rows, cols))

train_images, test_images, train_labels, test_labels = train_test_split(
    train_images, train_labels, test_size=0.2, random_state=split_seed)

np.savez_compressed("kanji_train_images.npz", train_images)
np.savez_compressed("kanji_train_labels.npz", train_labels)
np.savez_compressed("kanji_test_images.npz", test_images)
np.savez_compressed("kanji_test_labels.npz", test_labels)

In [None]:
import matplotlib.pyplot as plt
# Preview the first 25 training images in a 5x5 grid, without ticks or grid.
fig = plt.figure(figsize=(6, 6))
# The face colour must be a valid Matplotlib colour spec, otherwise
# set_facecolor raises ValueError; use an explicit white background.
fig.patch.set_facecolor('white')
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i], cmap=plt.cm.binary)
plt.show()