In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from glob import glob
import time, gc
import cv2
import pyarrow.parquet as pq
import pyarrow as pa

from tensorflow import keras
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.models import clone_model
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,Dropout,BatchNormalization, Input
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import os
import json

Using TensorFlow backend.


ModuleNotFoundError: No module named 'sklearn'

In [2]:
# Parent of the current working directory; the data/ and models/ paths used
# later in the notebook are built relative to it.
parent_directory = os.path.split(os.getcwd())[0]

def get_dummies(df):
    """One-hot encode every column of ``df`` and join the results side by side.

    Each column is cast to ``str`` before encoding, so the resulting indicator
    columns are labelled with the string form of the original values. No
    per-column prefix is added, so equal values in different columns produce
    duplicate column labels.

    Args:
        df: pandas DataFrame whose columns are to be encoded.

    Returns:
        A DataFrame holding the indicator columns of every input column,
        concatenated along ``axis=1`` in the input column order.
    """
    encoded = [pd.get_dummies(df[name].astype(str)) for name in df]
    return pd.concat(encoded, axis=1)

# IMG_SIZE=64
# Shape of one preprocessed image as fed to the network:
# (IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS).
# These are plain module-level constants; a `global` statement is only
# meaningful inside a function, so none is needed here.
IMG_X_SIZE = 87
IMG_Y_SIZE = 106
N_CHANNELS = 1

In [3]:
# Load the four preprocessed parquet shards and stack them into one frame.
# (Paths are laid out for a GCP or local checkout.)
image_shards = [
    pq.read_table(f"{parent_directory}/data/preprocessed/preprop_{shard_index}.parquet").to_pandas()
    for shard_index in range(4)
]
# The id column is not a pixel feature, so it is dropped before training.
train_images = pd.concat(image_shards).drop(columns=['image_id'])
# Release the per-shard frames now that they have been merged.
del image_shards

In [4]:
# The CNN expects input of shape `(batch_size, h, w, channels)`, so turn each
# flat row of the frame into one image tensor; -1 lets numpy infer the batch size.
image_shape = (-1, IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS)
train_images = np.reshape(train_images.values, image_shape)

In [5]:
# Read the label table and one-hot encode each of the three target columns.
# NOTE(review): assumes the rows of train.csv are in the same order as the
# rows of the preprocessed image shards -- confirm against the preprocessing step.
train_labels = pd.read_csv(parent_directory+"/data/train.csv")
Y_train_root, Y_train_vowel, Y_train_consonant = (
    pd.get_dummies(train_labels[target_column]).values
    for target_column in ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic')
)
del train_labels
# print(f'Training images: {train_images.shape}')
# print(f'Training labels root: {Y_train_root.shape}')
# print(f'Training labels vowel: {Y_train_vowel.shape}')
# print(f'Training labels consonants: {Y_train_consonant.shape}')

In [6]:
# below this should take around 5 minutes
# train_test_split returns [a_train, a_test, b_train, b_test, ...], so the
# even positions are the "train" halves and the odd positions the "test" halves.

# Stage 1: keep 70% for training, hold out 30%.
first_split = train_test_split(
    train_images, Y_train_root, Y_train_vowel, Y_train_consonant,
    test_size=0.3, random_state=666)
x_train, y_train_root, y_train_vowel, y_train_consonant = first_split[0::2]
del train_images

# Stage 2: divide the hold-out into validation (67% of it) and test (33% of it),
# i.e. roughly 20% / 10% of the full data set.
second_split = train_test_split(*first_split[1::2], test_size=0.33, random_state=666)
x_val, y_val_root, y_val_vowel, y_val_consonant = second_split[0::2]
x_test, y_test_root, y_test_vowel, y_test_consonant = second_split[1::2]
del first_split, second_split
# print(f'x_train size: {x_train.shape}')
# print(f'x_val size: {x_val.shape}')
# print(f'x_test size: {x_test.shape}')

In [7]:
class MultiOutputDataGenerator(keras.preprocessing.image.ImageDataGenerator):
    """ImageDataGenerator whose ``flow`` serves a dict of targets, one per model output.

    The parent generator handles a single target array. This subclass
    concatenates the per-output target arrays column-wise, lets the parent
    batch and augment them together with the images, and then splits each
    batch of targets back into a ``{output_name: targets}`` dict.
    """

    def flow(self,
             x,
             y=None,
             batch_size=32,
             shuffle=True,
             sample_weight=None,
             seed=None,
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             subset=None):
        """Yield augmented batches with targets keyed by output name.

        This is a generator function, so argument errors surface when the
        first batch is requested, not when ``flow`` is called.

        Args:
            x: input images, passed unchanged to the parent ``flow``.
            y: mapping ``{output_name: 2-D target array}``; every array must
                have one row per sample. If ``None``, the parent's unlabeled
                batches are yielded as-is.
            batch_size, shuffle, sample_weight, seed, save_to_dir,
            save_prefix, save_format, subset: forwarded to the parent ``flow``.

        Yields:
            ``(images, target_dict)`` tuples, or
            ``(images, target_dict, sample_weights)`` when ``sample_weight``
            is given. ``target_dict`` maps each key of ``y`` to the matching
            column slice of the batch targets.

        Raises:
            ValueError: if ``y`` is an empty mapping.
        """
        # Forward every option to the parent; previously all but batch_size
        # and shuffle were silently ignored (so e.g. `seed` had no effect).
        parent_kwargs = dict(batch_size=batch_size,
                             shuffle=shuffle,
                             sample_weight=sample_weight,
                             seed=seed,
                             save_to_dir=save_to_dir,
                             save_prefix=save_prefix,
                             save_format=save_format,
                             subset=subset)

        if y is None:
            # No targets: behave exactly like the parent generator.
            yield from super().flow(x, None, **parent_kwargs)
            return

        ordered_outputs = list(y)
        if not ordered_outputs:
            raise ValueError("y must contain at least one output-name -> targets entry")

        target_lengths = {output: y[output].shape[1] for output in ordered_outputs}
        # A single concatenate avoids re-copying the growing array once per output.
        targets = np.concatenate([y[output] for output in ordered_outputs], axis=1)

        for batch in super().flow(x, targets, **parent_kwargs):
            # The parent yields (x, y) or, when sample weights are set, (x, y, w).
            flowx, flowy = batch[0], batch[1]
            target_dict = {}
            start = 0
            for output in ordered_outputs:
                stop = start + target_lengths[output]
                target_dict[output] = flowy[:, start:stop]
                start = stop

            if sample_weight is None:
                yield flowx, target_dict
            else:
                yield flowx, target_dict, batch[2]

In [8]:
# Preparing the data generator
# Data augmentation for creating more training data
datagen = MultiOutputDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    rotation_range=8,  # randomly rotate images in the range (degrees, 0 to 180)
    zoom_range=0.15,  # randomly zoom image
    width_shift_range=0.15,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.15,  # randomly shift images vertically (fraction of total height)
    horizontal_flip=False,  # randomly flip images
    vertical_flip=False)  # randomly flip images
# datagen.fit(...) is intentionally not called: it only computes dataset
# statistics for featurewise_center, featurewise_std_normalization and
# zca_whitening, all of which are disabled above. Calling it would just copy
# the whole training set as floats (minutes of time, a lot of memory) with no
# effect on the generated batches.

In [9]:
# Halve the learning rate when an output's training accuracy has not improved
# for 3 epochs, never going below 1e-5.
# The monitored names must match the output layer names used in build_model
# ("dense_root", "dense_vowel", "dense_consonant"); Keras reports each metric as
# "<output layer name>_<metric name>". The auto-generated names "dense_3/4/5"
# never exist for this model, so callbacks monitoring them never trigger.
# NOTE(review): older Keras releases abbreviate the suffix to "_acc" -- confirm
# against the keys of history.history for the installed version.
# All three callbacks act on the same optimizer, so any one of them plateauing
# reduces the shared learning rate.
lr_reduction_options = dict(patience=3,
                            verbose=1,
                            factor=0.5,
                            min_lr=0.00001)
learning_rate_reduction_root = ReduceLROnPlateau(monitor='dense_root_accuracy',
                                                 **lr_reduction_options)
learning_rate_reduction_vowel = ReduceLROnPlateau(monitor='dense_vowel_accuracy',
                                                  **lr_reduction_options)
learning_rate_reduction_consonant = ReduceLROnPlateau(monitor='dense_consonant_accuracy',
                                                      **lr_reduction_options)

In [10]:
def _conv_block(tensor, filters, activation, dropout_prob, trailing_batch_norm):
    """Apply one convolutional stage: 4x conv3x3 -> BN -> max-pool -> conv5x5 [-> BN] -> dropout."""
    for _ in range(4):
        tensor = Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation=activation)(tensor)
    tensor = BatchNormalization(momentum=0.15)(tensor)
    tensor = MaxPool2D(pool_size=(2, 2))(tensor)
    tensor = Conv2D(filters=filters, kernel_size=(5, 5), padding='SAME', activation=activation)(tensor)
    if trailing_batch_norm:
        tensor = BatchNormalization(momentum=0.15)(tensor)
    return Dropout(rate=dropout_prob)(tensor)


def build_model(activation, dropout_prob):
    """Build the three-headed CNN (grapheme root, vowel diacritic, consonant diacritic).

    The network chains five convolutional stages with 16, 32, 64, 128 and 256
    filters, followed by two dense layers (1024 and 512 units) and three
    softmax heads that share the 512-unit representation.

    Args:
        activation: activation used by every convolutional and hidden dense layer.
        dropout_prob: dropout rate applied after each convolutional stage and
            after the first dense layer.

    Returns:
        An uncompiled Keras ``Model`` taking inputs of shape
        ``(IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS)`` with outputs, in order,
        ``dense_root`` (168 classes), ``dense_vowel`` (11) and
        ``dense_consonant`` (7).
    """
    inputs = Input(shape=(IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS))

    # (filters, batch norm after the 5x5 conv?) for each of the five stages.
    # The first two stages have no trailing batch norm, as in the original layout.
    stages = ((16, False), (32, False), (64, True), (128, True), (256, True))

    # Every stage consumes the previous stage's output. If the second stage
    # were fed `inputs` again, the first stage would be disconnected from the
    # graph and the model would only have four stages.
    features = inputs
    for filters, trailing_batch_norm in stages:
        features = _conv_block(features, filters, activation, dropout_prob, trailing_batch_norm)

    # dense layers
    features = Flatten()(features)
    features = Dense(1024, activation=activation)(features)
    features = Dropout(rate=dropout_prob)(features)
    dense = Dense(512, activation=activation)(features)

    # softmax heads; the layer names are the keys used for targets and metrics
    head_root = Dense(168, activation='softmax', name="dense_root")(dense)
    head_vowel = Dense(11, activation='softmax', name="dense_vowel")(dense)
    head_consonant = Dense(7, activation='softmax', name="dense_consonant")(dense)

    return Model(inputs=inputs, outputs=[head_root, head_vowel, head_consonant])

In [11]:
# model.summary()
# from keras.utils import plot_model
# plot_model(model, to_file=f'{parent_directory}/figures/final_model_architecture.png')

In [12]:
# Training hyper-parameters used to build, compile and fit the model.
activation, dropout_prob = 'relu', 0.2
optimizer = 'adam'
batch_size, epochs = 256, 30

In [None]:
# Build and compile the model; the same loss and metric apply to all three heads.
model = build_model(activation, dropout_prob)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
callbacks = [learning_rate_reduction_root, learning_rate_reduction_vowel, learning_rate_reduction_consonant]

# Targets are keyed by output layer name for the augmenting generator; the
# validation targets are passed as a list in the model's output order.
train_targets = {'dense_root': y_train_root,
                 'dense_vowel': y_train_vowel,
                 'dense_consonant': y_train_consonant}
history = model.fit_generator(
    datagen.flow(x_train, train_targets, batch_size=batch_size),
    epochs=epochs,
    validation_data=(x_val, [y_val_root, y_val_vowel, y_val_consonant]),
    steps_per_epoch=x_train.shape[0] // batch_size,
    callbacks=callbacks
    )

# Make sure the output directory exists and save the trained model first, so
# a failure while writing the history cannot discard hours of training.
models_directory = parent_directory + "/models"
os.makedirs(models_directory, exist_ok=True)
model.save(models_directory + "/final_model_v1.h5")

# Convert the history values to np.float64 (a float subclass) so that
# json.dump can serialize them.
for key in history.history.keys():
    history.history[key] = [np.float64(val) for val in history.history[key]]
with open(models_directory + "/final_model.json", "w") as fp:
    json.dump(history.history, fp, sort_keys=True, indent=4)

Epoch 1/30
Epoch 2/30




Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Drop the training and test arrays (the validation arrays are kept), then
# ask the garbage collector to reclaim the memory.
del x_train, y_train_root, y_train_vowel, y_train_consonant
del x_test, y_test_root, y_test_vowel, y_test_consonant
gc.collect()