In [39]:
#import libraries
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution2D
from keras.layers.convolutional import MaxPooling2D
from tensorflow.keras.utils import to_categorical
from PIL import Image
from keras import backend as K
import numpy as np
import pandas as pd
import zipfile
import sklearn
import glob
import random
import matplotlib.pyplot as plt
import os

np.random.seed(2)

In [2]:
#Read csv files with labels into a dataframe
datalabels = pd.read_csv('chinese_mnist.csv', low_memory = False)
datalabels.head()

Unnamed: 0,suite_id,sample_id,code,value,character
0,1,1,10,9,九
1,1,10,10,9,九
2,1,2,10,9,九
3,1,3,10,9,九
4,1,4,10,9,九


In [5]:
print("dataframe rows:", datalabels.shape[0]) 
print("image files :", len(os.listdir("data/")))

dataframe rows: 15000
image files : 15000


In [42]:
# Match labels to image file names, code taken from Kaggle User Richard Kuo's classifier, found at https://www.kaggle.com/rkuo2000/chinese-mnist
def file_path_col(data):    
    file_path = f"input_{data[0]}_{data[1]}_{data[2]}.jpg" #input_1_1_10.jpg    
    return file_path

In [9]:
datalabels["file_path"] = datalabels.apply(file_path_col, axis = 1)
datalabels.head()

Unnamed: 0,suite_id,sample_id,code,value,character,file_path
0,1,1,10,9,九,input_1_1_10.jpg
1,1,10,10,9,九,input_1_10_10.jpg
2,1,2,10,9,九,input_1_2_10.jpg
3,1,3,10,9,九,input_1_3_10.jpg
4,1,4,10,9,九,input_1_4_10.jpg


In [12]:
#use sklearn to split data into training and testing sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(datalabels, test_size=0.2, random_state=0, stratify=datalabels["code"].values, shuffle = True)

print(train.shape[0])
print(test.shape[0])

12000
3000


In [40]:
#split data further into testing and val sets
val_data, test_data   = train_test_split(datalabels, test_size = 0.5, random_state = 42, shuffle = True, stratify = datalabels.code.values)

In [17]:
print(f"Train set rows: {train.shape[0]}")
print(f"Test set rows: {test.shape[0]}")
print(f"Val set rows: {val_data.shape[0]}")

Train set rows: 12000
Test set rows: 3000
Val set rows: 7500


In [18]:
import skimage.io
import skimage.transform

In [43]:

# One hot encoder for all 15 labels, taken from Richard Kuo's submission, found at https://www.kaggle.com/rkuo2000/chinese-mnist
def character_encoder(data, var = "character"):
    x = np.stack(data["file_path"].apply(read_image))
    y = pd.get_dummies(data[var], drop_first = False)
    return x, y

In [21]:
file_paths = list(datalabels.file_path)
#Reading and resizing images in the dataset, code adapted from NIKOLA BOZHINOV's submission to kaggle for the dataset, found at https://www.kaggle.com/vislupus/cnn-model-chinese-mnist-98-9-accuracy
def read_image(file_paths):
    image = skimage.io.imread("data/" + file_paths)
    #resize image
    image = skimage.transform.resize(image, (64, 64, 1), mode="reflect") 
    return image[:, :, :]

In [30]:
#actually call the functions to get the arrays we'll be working from
X_train, y_train = character_encoder(train)
X_val, y_val = character_encoder(val_data)
X_test, y_test = character_encoder(test)

#normalize inputs
X_train = X_train / 255
X_test = X_test / 255

print(X_train.shape, ",", y_train.shape)
print(X_val.shape, ",", y_val.shape)
print(X_test.shape, ",", y_test.shape)

(12000, 64, 64, 1) , (12000, 15)
(7500, 64, 64, 1) , (7500, 15)
(3000, 64, 64, 1) , (3000, 15)


In [36]:
def create_model():
    
    model = Sequential()
    num_classes = 15
    input_shape = (64, 64, 1)
    model.add(Convolution2D(32,kernel_size=(3,3),padding='same',activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Convolution2D(64, kernel_size = (3, 3), activation = 'relu', padding = 'same'))
    model.add(MaxPooling2D(pool_size = (2, 2)))
    model.add(Convolution2D(128, kernel_size = (3, 3), activation = 'relu', padding = 'same'))
    model.add(MaxPooling2D(pool_size = (2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation = 'relu'))
    model.add(Dense(num_classes, activation = 'softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

    return model

In [24]:
model = create_model

In [28]:
#fit model, parameters not optimized
model().fit(X_train, y_train, batch_size=128, epochs=20, validation_data=(X_val, y_val))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_9 (Conv2D)           (None, 64, 64, 32)        320       
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 32, 32, 32)       0         
 2D)                                                             
                                                                 
 conv2d_10 (Conv2D)          (None, 32, 32, 64)        18496     
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 16, 16, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 16, 16, 128)       73856     
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 8, 8, 128)       

<keras.callbacks.History at 0x18100175130>

In [29]:
#test model to gauge accuracy
ModelLoss, ModelAccuracy = model().evaluate(X_test, y_test)

print(f'Test Loss is {ModelLoss}')
print(f'Test Accuracy is {ModelAccuracy}')

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 64, 64, 32)        320       
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 32, 32, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_13 (Conv2D)          (None, 32, 32, 64)        18496     
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 16, 16, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_14 (Conv2D)          (None, 16, 16, 128)       73856     
                                                                 
 max_pooling2d_14 (MaxPoolin  (None, 8, 8, 128)       

In [27]:
#save the model to an h5 file
model().save("mnistmodelmk2.h5")
print("The model has successfully saved")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           (None, 64, 64, 32)        320       
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 32, 32, 32)       0         
 2D)                                                             
                                                                 
 conv2d_7 (Conv2D)           (None, 32, 32, 64)        18496     
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 16, 16, 64)       0         
 2D)                                                             
                                                                 
 conv2d_8 (Conv2D)           (None, 16, 16, 128)       73856     
                                                                 
 max_pooling2d_8 (MaxPooling  (None, 8, 8, 128)       