In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from glob import glob
import time, gc
import cv2
import pyarrow.parquet as pq
import pyarrow as pa

from tensorflow import keras
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.models import clone_model
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,Dropout,BatchNormalization, Input
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import os
import json
import pickle

Using TensorFlow backend.


In [2]:
# Project root: the directory one level above the current working directory.
# All data/ and models/ paths in this notebook are built from it.
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

def get_dummies(df):
    """One-hot encode every column of ``df`` and join the results side by side.

    Each column is cast to ``str`` before encoding, so numeric codes are
    treated as categories. The indicator columns are labelled with the
    category value only; the name of the source column is not included.

    Args:
        df: pandas DataFrame whose columns are all categorical.

    Returns:
        A DataFrame with one indicator column per distinct value of each
        input column, in input-column order.
    """
    encoded_columns = [pd.get_dummies(df[name].astype(str)) for name in df]
    return pd.concat(encoded_columns, axis=1)

In [3]:
# Image geometry shared by the reshape step and the model's Input layer.
# These are ordinary module-level constants: a `global` statement is a no-op
# at module scope, so none is needed here.
# IMG_SIZE=64
IMG_X_SIZE = 87   # size of the first spatial axis after reshaping
IMG_Y_SIZE = 106  # size of the second spatial axis after reshaping
N_CHANNELS = 1    # number of channels per image

In [4]:
# Preparing the preprocessed data for fitting in the model
# this is for GCP or local
# The preprocessed pixels are stored as four parquet shards (preprop_0 ..
# preprop_3). They are read in shard order and stacked row-wise; the
# `image_id` column is dropped so that only pixel columns remain.
# Building the frame in one expression avoids the in-place drop and lets the
# per-shard frames be released as soon as the concatenation is done.
train_images = pd.concat(
    [
        pq.read_table(parent_directory+"/data/preprocessed/preprop_"+str(shard)+".parquet").to_pandas()
        for shard in range(4)
    ]
).drop(columns=['image_id'])

In [5]:
# The CNN consumes a 4-D array `(batch_size, h, w, channels)`, so every row of
# the pixel table is unflattened into an IMG_X_SIZE x IMG_Y_SIZE x N_CHANNELS image.
# NOTE: this rebinds `train_images` from a DataFrame to an ndarray, so the cell
# cannot be re-run without re-running the loading cell first.
train_images = np.reshape(train_images.values, (-1, IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS))

In [6]:
# Load the label table and one-hot encode each of the three target columns.
# NOTE(review): rows are presumably in the same order as the concatenated
# image shards - confirm, since nothing here joins on image_id.
train_labels = pd.read_csv(parent_directory+"/data/train.csv")
Y_train_root, Y_train_vowel, Y_train_consonant = (
    pd.get_dummies(train_labels[target]).values
    for target in ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic')
)
del train_labels
# print(f'Training images: {train_images.shape}')
# print(f'Training labels root: {Y_train_root.shape}')
# print(f'Training labels vowel: {Y_train_vowel.shape}')
# print(f'Training labels consonants: {Y_train_consonant.shape}')

In [7]:
# below this should take around 5 minutes
# Two-stage split with a fixed seed. First 70% of the samples go to training;
# the held-out 30% is then split again, 67% of it becoming the validation set
# and the remaining 33% the test set.
(x_train, x_test,
 y_train_root, y_test_root,
 y_train_vowel, y_test_vowel,
 y_train_consonant, y_test_consonant) = train_test_split(
    train_images, Y_train_root, Y_train_vowel, Y_train_consonant,
    test_size=0.3, random_state=666)
# The full image array is no longer needed once the first split exists.
del train_images
(x_val, x_test,
 y_val_root, y_test_root,
 y_val_vowel, y_test_vowel,
 y_val_consonant, y_test_consonant) = train_test_split(
    x_test, y_test_root, y_test_vowel, y_test_consonant,
    test_size=0.33, random_state=666)
# print(f'x_train size: {x_train.shape}')
# print(f'x_val size: {x_val.shape}')
# print(f'x_test size: {x_test.shape}')

In [8]:
class MultiOutputDataGenerator(keras.preprocessing.image.ImageDataGenerator):
    """ImageDataGenerator whose ``flow`` accepts a dict of targets.

    The parent ``flow`` handles a single target array. For a multi-output
    model the per-output target arrays are stacked column-wise into one
    array, passed through the parent generator (so they stay aligned with
    the shuffled/augmented images), and split back into a dict per batch.
    """

    def flow(self,
             x,
             y=None,
             batch_size=32,
             shuffle=True,
             sample_weight=None,
             seed=None,
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             subset=None):
        """Yield augmented batches with targets keyed by output name.

        Args:
            x: image array forwarded to the parent ``flow``.
            y: dict mapping output name -> 2-D target array with one row per
                sample. Any other value (including ``None``) is forwarded
                unchanged to the parent ``flow``.
            batch_size, shuffle, sample_weight, seed, save_to_dir,
            save_prefix, save_format, subset: forwarded unchanged to the
                parent ``flow``.

        Yields:
            ``(batch_x, {output_name: batch_targets})``, with the batch's
            sample weights as a third element when ``sample_weight`` is given.
            When ``y`` is not a dict, whatever the parent ``flow`` yields.

        Raises:
            ValueError: if ``y`` is an empty dict.
        """
        # Forward every option to the parent; previously only batch_size and
        # shuffle were passed on, so seed/sample_weight/etc. were silently ignored.
        parent_kwargs = dict(batch_size=batch_size,
                             shuffle=shuffle,
                             sample_weight=sample_weight,
                             seed=seed,
                             save_to_dir=save_to_dir,
                             save_prefix=save_prefix,
                             save_format=save_format,
                             subset=subset)

        # No dict of targets: behave exactly like the parent generator.
        if not isinstance(y, dict):
            yield from super().flow(x, y, **parent_kwargs)
            return

        if not y:
            raise ValueError("y must contain at least one output")

        # Record which column range of the stacked array belongs to each output.
        column_ranges = {}
        start = 0
        for output, target in y.items():
            stop = start + target.shape[1]
            column_ranges[output] = (start, stop)
            start = stop

        # One concatenation instead of re-copying the growing array per output.
        stacked_targets = np.concatenate(list(y.values()), axis=1)

        for batch in super().flow(x, stacked_targets, **parent_kwargs):
            flowx, flowy = batch[0], batch[1]
            target_dict = {
                output: flowy[:, lo:hi]
                for output, (lo, hi) in column_ranges.items()
            }
            if sample_weight is None:
                yield flowx, target_dict
            else:
                # The parent appends the batch's sample weights as a third item.
                yield flowx, target_dict, batch[2]

In [9]:
# Preparing the data generator (should take two minutes)
# Augmentation creates extra training variety on the fly: small random
# rotations, zooms and shifts. Flips and all normalisation options are off.
datagen = MultiOutputDataGenerator(
    # --- random geometric augmentation ---
    rotation_range=8,          # degree range for random rotations
    zoom_range=0.15,           # random zoom range
    width_shift_range=0.15,    # horizontal shift, as a fraction of total width
    height_shift_range=0.15,   # vertical shift, as a fraction of total height
    horizontal_flip=False,     # no random horizontal flips
    vertical_flip=False,       # no random vertical flips
    # --- normalisation / whitening (all disabled) ---
    featurewise_center=False,             # do not zero the dataset mean
    samplewise_center=False,              # do not zero each sample's mean
    featurewise_std_normalization=False,  # do not divide by the dataset std
    samplewise_std_normalization=False,   # do not divide by each sample's std
    zca_whitening=False,                  # no ZCA whitening
)
# fit() computes dataset-level statistics; it performs no augmentation itself.
# NOTE(review): those statistics are only used by the featurewise_* and
# zca_whitening options, all disabled above - this call may be unnecessary;
# confirm before removing it.
datagen.fit(x_train)

In [10]:
"""
Not going to use exponential anymore after realizing it sucks
"""
# need to edit these when we run the actual model and not doing hyperparameter tuning
# initial_learning_rate = 0.01
# decay_steps = 5 # this would be more like 10 or 20, since we'll be running more epochs
# decay_rate = 0.1
# learning_rate_exp_root = tf.keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate = initial_learning_rate, decay_steps = decay_steps, decay_rate=decay_rate, name="lr_expD_root")
# learning_rate_exp_vowel = tf.keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate = initial_learning_rate, decay_steps = decay_steps, decay_rate=decay_rate, name="lr_expD_vowel")
# learning_rate_exp_consonant = tf.keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate = initial_learning_rate, decay_steps = decay_steps, decay_rate=decay_rate, name="lr_expD_consonant")
# LR_scheduler_exp_root = tf.keras.callbacks.LearningRateScheduler(learning_rate_exp_root)
# LR_scheduler_exp_vowel = tf.keras.callbacks.LearningRateScheduler(learning_rate_exp_vowel)
# LR_scheduler_exp_consonant = tf.keras.callbacks.LearningRateScheduler(learning_rate_exp_consonant)

# def exponential_decay_fn(epoch):
#     return 0.5 * 0.1 **(epoch / 3) # 1st var is initial lr, 2nd is decay_rate, 3rd is decay_steps, i think
# lr_exp_root = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
# lr_exp_vowel = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
# lr_exp_consonant = keras.callbacks.LearningRateScheduler(exponential_decay_fn)

learning_rate_reduction_root = ReduceLROnPlateau(monitor='dense_3_accuracy', 
                                            patience=3, 
                                            verbose=1,
                                            factor=0.5, 
                                            min_lr=0.00001)
learning_rate_reduction_vowel = ReduceLROnPlateau(monitor='dense_4_accuracy', 
                                            patience=3, 
                                            verbose=1,
                                            factor=0.5, 
                                            min_lr=0.00001)
learning_rate_reduction_consonant = ReduceLROnPlateau(monitor='dense_5_accuracy', 
                                            patience=3, 
                                            verbose=1,
                                            factor=0.5, 
                                            min_lr=0.00001)

In [11]:
def _conv_stage(tensor, filters, activation, dropout_prob, batchnorm_before_dropout=True):
    """Apply one convolutional stage and return the resulting tensor.

    Stage layout: four 3x3 convolutions -> BatchNorm -> 2x2 max-pool ->
    one 5x5 convolution -> (optional BatchNorm) -> Dropout.
    """
    for _ in range(4):
        tensor = Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation=activation)(tensor)
    tensor = BatchNormalization(momentum=0.15)(tensor)
    tensor = MaxPool2D(pool_size=(2, 2))(tensor)
    tensor = Conv2D(filters=filters, kernel_size=(5, 5), padding='SAME', activation=activation)(tensor)
    if batchnorm_before_dropout:
        tensor = BatchNormalization(momentum=0.15)(tensor)
    return Dropout(rate=dropout_prob)(tensor)


def build_model(activation, dropout_prob):
    """Build the (uncompiled) three-headed CNN.

    Args:
        activation: activation name used by every convolutional and hidden
            dense layer (e.g. "relu", "tanh").
        dropout_prob: dropout rate applied after each convolutional stage and
            after the first dense layer.

    Returns:
        A Keras ``Model`` taking images of shape
        ``(IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS)`` and producing three softmax
        outputs named ``dense_root`` (168 classes), ``dense_vowel`` (11) and
        ``dense_consonant`` (7). These names are the keys used for the target
        dict and for the logged metrics.
    """
    inputs = Input(shape = (IMG_X_SIZE, IMG_Y_SIZE, N_CHANNELS))

    # Stage 1 (32 filters). The input shape is already fixed by `Input`, so no
    # `input_shape=` is passed to the first convolution.
    # NOTE(review): unlike the later stages, stage 1 has no BatchNorm between
    # the 5x5 convolution and Dropout. Kept as-is to preserve the architecture;
    # confirm whether the omission was intentional.
    features = _conv_stage(inputs, 32, activation, dropout_prob, batchnorm_before_dropout=False)

    # Stages 2-4 double the filter count each time.
    for filters in (64, 128, 256):
        features = _conv_stage(features, filters, activation, dropout_prob)

    # Shared dense trunk.
    features = Flatten()(features)
    features = Dense(1024, activation=activation)(features)
    features = Dropout(rate=dropout_prob)(features)
    dense = Dense(512, activation=activation)(features)

    # One softmax head per target.
    head_root = Dense(168, activation = 'softmax', name = "dense_root")(dense)
    head_vowel = Dense(11, activation = 'softmax', name = "dense_vowel")(dense)
    head_consonant = Dense(7, activation = 'softmax', name = "dense_consonant")(dense)

    return Model(inputs=inputs, outputs=[head_root, head_vowel, head_consonant])

In [12]:
# Hyperparameter grid: the tuning cell walks every combination of these lists.
activations = ["tanh", "relu"]
dropout_probs = [0.2, 0.4]
optimizers = ['adam', 'nadam']
batch_sizes = [256, 128]
# lr_schedulers = ['exp', 'power']

# Number of training epochs per candidate model.
epochs = 10

In [16]:
# TUNE THE MODEL
# Grid search over activation / dropout / optimizer / batch size. Each trained
# model's per-epoch metrics are written to models/model_<index>.json, and the
# results of this run are collected in models/histories.json.
os.makedirs(parent_directory+"/models", exist_ok=True)

# Grid indices to train in this run; every other combination is skipped.
# Edit this set to choose which models to (re)train.
models_to_train = {8, 12}

# Enumerated in the same order as four nested loops (batch size varies
# fastest), so model_<index> numbering is stable between runs.
hyperparameter_grid = [
    (activation, dropout_prob, optimizer, batch_size)
    for activation in activations
    for dropout_prob in dropout_probs
    for optimizer in optimizers
    for batch_size in batch_sizes
]

histories = {}
for counter, (activation, dropout_prob, optimizer, batch_size) in enumerate(hyperparameter_grid):
    if counter not in models_to_train:
        continue
    print("==========================================================================================")
    print("Training model_"+str(counter) +":")
    print("\t Activation: " + activation)
    print("\t Dropout Probability: " + str(dropout_prob))
    print("\t Optimizer: " + optimizer)
    print("\t Batch Size: " + str(batch_size))
    model = build_model(activation, dropout_prob)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    callbacks=[learning_rate_reduction_root, learning_rate_reduction_vowel, learning_rate_reduction_consonant]
    history = model.fit_generator(
            datagen.flow(
                x_train, {'dense_root': y_train_root, 'dense_vowel': y_train_vowel, 'dense_consonant': y_train_consonant},
                batch_size=batch_size),
            epochs = epochs, validation_data = (x_val, [y_val_root, y_val_vowel, y_val_consonant]),
            steps_per_epoch=x_train.shape[0] // batch_size,
            callbacks=callbacks
        )
    # float32 metric values are not JSON serializable, so convert every value
    # to a plain Python float (built as a new dict; history.history is untouched).
    history_dict = {
        key: [float(val) for val in values]
        for key, values in history.history.items()
    }
    # add history to histories
    histories["model_" + str(counter)] = (activation, dropout_prob, optimizer, batch_size, history_dict)
    # save this model's history as its own json file
    with open(parent_directory+"/models/model_" + str(counter)+".json", "w") as fp:
        json.dump(history_dict, fp, sort_keys = True, indent = 4)
    # Drop the references and collect before building the next model.
    del model
    del history
    gc.collect()

with open(parent_directory+"/models/histories.json", "w") as fp:
    json.dump(histories, fp, sort_keys = True, indent = 4)

Training model_8:
	 Activation: relu
	 Dropout Probability: 0.2
	 Optimizer: adam
	 Batch Size: 256
Epoch 1/10
Epoch 2/10
  1/549 [..............................] - ETA: 48s - loss: 3.9388 - dense_root_loss: 2.9029 - dense_vowel_loss: 0.6232 - dense_consonant_loss: 0.4128 - dense_root_accuracy: 0.2045 - dense_vowel_accuracy: 0.8182 - dense_consonant_accuracy: 0.8636



Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model_12:
	 Activation: relu
	 Dropout Probability: 0.4
	 Optimizer: adam
	 Batch Size: 256
Epoch 1/10
Epoch 2/10
  1/549 [..............................] - ETA: 48s - loss: 5.7625 - dense_root_loss: 4.1924 - dense_vowel_loss: 0.9676 - dense_consonant_loss: 0.6025 - dense_root_accuracy: 0.0682 - dense_vowel_accuracy: 0.6591 - dense_consonant_accuracy: 0.8409



Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [117]:
# Write the collected tuning histories to disk. This targets the same file the
# tuning cell writes at the end of its run, so it simply overwrites it with the
# current contents of `histories`.
histories_path = parent_directory+"/models/histories.json"
with open(histories_path, "w") as histories_file:
    json.dump(histories, histories_file, sort_keys=True, indent=4)

In [116]:
# Display the collected tuning results: model name -> tuple of hyperparameters
# followed by the per-epoch metric history.
# NOTE(review): this prints the entire nested structure; the same data is
# written to models/histories.json.
histories

{'model_0': ('tanh',
  0.2,
  'adam',
  'exp',
  {'dense_consonant_accuracy': [0.7153489589691162,
    0.830986499786377,
    0.8645355105400085,
    0.8792933821678162,
    0.8850440382957458,
    0.8889134526252747,
    0.8886640071868896,
    0.8907376527786255,
    0.8902531266212463,
    0.8910797238349915],
   'dense_consonant_loss': [0.8337407112121582,
    0.5041878819465637,
    0.410670667886734,
    0.3695290684700012,
    0.34943297505378723,
    0.34197208285331726,
    0.337189257144928,
    0.33394184708595276,
    0.33507055044174194,
    0.3337043225765228],
   'dense_root_accuracy': [0.08045167475938797,
    0.3248724341392517,
    0.4970569908618927,
    0.5793760418891907,
    0.6145996451377869,
    0.6340250372886658,
    0.6380725502967834,
    0.648861289024353,
    0.6514408588409424,
    0.6503862142562866],
   'dense_root_loss': [4.170259475708008,
    2.5790419578552246,
    1.8359626531600952,
    1.528229832649231,
    1.39825439453125,
    1.3231263160705

In [None]:
# Release the training and test arrays and force a garbage-collection pass.
# NOTE(review): x_val and the y_val_* arrays are not released here - confirm
# whether they are still needed afterwards.
del (x_train, x_test,
     y_train_root, y_test_root,
     y_train_vowel, y_test_vowel,
     y_train_consonant, y_test_consonant)
gc.collect()