In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import ast
from glob import glob

import gc

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what the competition input directory contains.
print(os.listdir("../input"))

# Training CSVs. glob() returns paths in arbitrary, filesystem-dependent order,
# so sort them to make the order in which the files are loaded reproducible.
train_files = sorted(glob("../input/train_simplified/*.csv"))

# Any results you write to the current directory are saved as output.

In [None]:
# Data configuration

# Side length, in pixels, of each (square) rasterized image
IMAGE_SIZE = 32

# Number of distinct classes in the problem
NUM_CLASSES = 340

# Maximum number of input-file lines read per iteration
CHUNKSIZE = 1024

# Instances taken for each class: the overall budget of 49707919//96 instances
# shared evenly between the classes, with the remainder discarded
EACH_SET_SIZE = (49707919 // 96) // NUM_CLASSES

# Total instances used for training and validation; by construction an exact
# multiple of NUM_CLASSES
NUM_ROWS_TOTAL = EACH_SET_SIZE * NUM_CLASSES

print("total instances:", NUM_ROWS_TOTAL)
print("images per class:", EACH_SET_SIZE)

In [None]:
from PIL import Image, ImageDraw
from dask import bag

# Rasterization algorithm from https://www.kaggle.com/jpmiller/image-based-cnn#
def draw_it(strokes, imheight=32, imwidth=32):
    """Rasterize a stroke-encoded drawing into a 2-D float array.

    Parameters
    ----------
    strokes : str
        Python-literal text of a list of strokes, parsed with
        ``ast.literal_eval``. Each stroke is indexed as ``stroke[0]`` (x
        coordinates) and ``stroke[1]`` (y coordinates).
    imheight, imwidth : int
        Height and width, in pixels, of the returned raster.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(imheight, imwidth)`` holding the resized image's
        pixel values divided by 255 (background 255 -> 1.0, ink 0 -> 0.0).
    """
    # Draw on a fixed 256x256 mode-"P" canvas: background value 255, ink value 0.
    image = Image.new("P", (256, 256), color=255)
    image_draw = ImageDraw.Draw(image)
    for stroke in ast.literal_eval(strokes):
        xs, ys = stroke[0], stroke[1]
        # Join consecutive points with line segments. A stroke with a single
        # point produces no segment and is therefore not drawn.
        for k in range(len(xs) - 1):
            image_draw.line([xs[k], ys[k], xs[k + 1], ys[k + 1]], fill=0, width=5)
    # PIL's resize() takes (width, height). Passing (imheight, imwidth) only
    # worked because callers use square sizes; any other size came out transposed.
    image = image.resize((imwidth, imheight))
    return np.array(image) / 255.

In [None]:
# Preallocate storage for every training image and its label in one shot, so
# the loading loop can fill it in place instead of growing arrays.
# float32 instead of NumPy's default float64 halves the footprint of this
# NUM_ROWS_TOTAL x IMAGE_SIZE x IMAGE_SIZE x 1 array; float32 is also Keras'
# default float type, so no precision the network would use is given up.
raster_array = np.zeros((NUM_ROWS_TOTAL, IMAGE_SIZE, IMAGE_SIZE, 1), dtype=np.float32)
# Labels start out as None and are filled in by the loading loop.
classes = pd.Series([None] * NUM_ROWS_TOTAL)

In [None]:
# Read every training file in chunks and rasterize each chunk straight into the
# preallocated arrays, so the raw stroke data is never held in memory all at once.
next_row = 0  # first row of raster_array / classes that is still unfilled
for path in train_files:
    # nrows caps how many rows are taken from each file; chunksize bounds how
    # many of them are parsed and rasterized at a time.
    for chunk in pd.read_csv(path, index_col="key_id", chunksize=CHUNKSIZE, nrows=EACH_SET_SIZE):
        if chunk.empty:
            # Nothing to rasterize; an empty result could not be reshaped and
            # assigned below.
            continue
        rasters = np.array(
            bag.from_sequence(chunk.drawing.values)
               .map(draw_it, imheight=IMAGE_SIZE, imwidth=IMAGE_SIZE)
               .compute()
        )
        stop = next_row + rasters.shape[0]
        # Replace whitespace runs in the class names with "_". The pattern is a
        # raw string because "\s" is not a valid escape in a normal literal.
        # `.values` hands over a plain array so the labels are written purely
        # by position, independent of the chunk's key_id index.
        classes[next_row:stop] = chunk["word"].replace(r"\s+", "_", regex=True).values
        # Add the trailing channel axis before storing the images.
        raster_array[next_row:stop] = rasters.reshape((*rasters.shape, 1))
        next_row = stop

# If fewer rows were read than were preallocated, drop the unfilled tail so
# all-zero images with a None label never reach training.
if next_row < NUM_ROWS_TOTAL:
    print("loaded only", next_row, "of", NUM_ROWS_TOTAL, "rows; trimming unused storage")
    raster_array = raster_array[:next_row]
    classes = classes.iloc[:next_row]


In [None]:
# Shuffle raster_array and classes in unison
import numpy.core.defchararray as np_f

def shuffle_in_unison(a, b):
    """Shuffle ``a`` and ``b`` in place along their first axis, using the same permutation.

    Parameters
    ----------
    a, b : array-like
        Mutable sequences accepted by ``np.random.shuffle``; they must have
        the same length.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length, since they could then not be
        shuffled in step with each other.
    """
    if len(a) != len(b):
        raise ValueError(
            "cannot shuffle in unison: lengths differ ({} != {})".format(len(a), len(b))
        )
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    # Rewind the global generator to where it was before shuffling `a`, so the
    # second shuffle replays exactly the same sequence of random draws.
    np.random.set_state(rng_state)
    np.random.shuffle(b)
# Work on a plain NumPy copy of the labels: np.random.shuffle needs a writable
# array, and np.array() (unlike `.values`) also accepts an ndarray, so this
# cell can be re-run after `classes` has already been converted.
classes = np.array(classes)
shuffle_in_unison(raster_array, classes)

In [None]:
# One-hot encode the class labels with pandas.get_dummies, then preview the
# first 10 rows of the result
dummies = pd.get_dummies(classes)
dummies.head(n=10)

In [None]:
# Preview the first 100 rasterized drawings on a 10x10 grid of subplots
plt.figure(figsize=(16, 16))
for position, raster in enumerate(raster_array[:100], start=1):
    plt.subplot(10, 10, position)
    plt.gca().invert_yaxis()
    plt.axis("off")
    # Drop the channel axis: imshow is given a 2-D (IMAGE_SIZE, IMAGE_SIZE) array
    plt.imshow(raster.reshape((IMAGE_SIZE, IMAGE_SIZE)), cmap="binary")

plt.show()

In [None]:
# Split training data for validation
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split repeatable for a given input order
train_X, valid_X, train_label, valid_label = train_test_split(
    raster_array,
    dummies,
    test_size=0.2,
    random_state=13,
)

In [None]:
# Shapes of the training images, validation images, training labels and
# validation labels, in that order
tuple(part.shape for part in (train_X, valid_X, train_label, valid_label))

In [None]:
# Drop the notebook-level reference to the full image array now that the
# train/validation split has been made, then trigger a garbage collection.
# NOTE: this cell cannot be re-run on its own; `raster_array` no longer exists afterwards.
del raster_array
gc.collect()

In [None]:
# Training and model constants.
# NUM_CLASSES is deliberately not redefined here: it is already set in the data
# configuration cell, and a second hard-coded copy could silently drift out of
# sync with it.
BATCH_SIZE = 1024                    # samples per batch, for both fit and predict
EPOCHS = 40                          # upper bound on the number of training epochs
SHAPE = (IMAGE_SIZE, IMAGE_SIZE, 1)  # input shape of the first convolution layer
FILTER_SIZE = (4, 4)                 # kernel size of both convolution layers
PROB_DO_HIDDEN = 0.3                 # dropout rate used after each block of the network

In [None]:
# CNN Architecture 
import keras
from keras.models import Sequential,Input,Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Activation

# Two convolution blocks (conv -> ReLU -> 2x2 max-pool -> dropout), then a
# 1024-unit dense layer with ReLU and dropout, and finally a softmax layer with
# one output per class. Layers are listed in the order they are stacked.
model = Sequential([
    # Block 1: 32 filters; this layer also fixes the network's input shape
    Conv2D(32, kernel_size=FILTER_SIZE, strides=1, input_shape=SHAPE),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(PROB_DO_HIDDEN),

    # Block 2: 64 filters
    Conv2D(64, kernel_size=FILTER_SIZE, strides=1),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(PROB_DO_HIDDEN),

    # Fully connected hidden layer
    Flatten(),
    Dense(1024),
    Activation('relu'),
    Dropout(PROB_DO_HIDDEN),

    # Classifier head
    Dense(NUM_CLASSES),
    Activation('softmax'),
])

In [None]:
# Source: https://stackoverflow.com/questions/42327006/how-to-calculate-top5-accuracy-in-keras
import functools
def top3_acc(y_true, y_pred):
    """Top-3 categorical accuracy, for use as a Keras metric.

    Thin wrapper around ``keras.metrics.top_k_categorical_accuracy`` with
    ``k=3``. Being a plain function it already carries the name 'top3_acc',
    so no ``__name__`` has to be patched onto a ``functools.partial`` object.
    """
    return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)

In [None]:
# Categorical cross-entropy loss, Adam with its default settings, and both
# plain and top-3 accuracy reported as metrics
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy', top3_acc],
)

In [None]:
# Print Keras' summary of the compiled model
model.summary()

In [None]:
from keras.callbacks import EarlyStopping
# Stop training once EarlyStopping's default monitored quantity has gone
# int(EPOCHS*0.15) epochs without improving
early_stopping_monitor = EarlyStopping(patience=int(EPOCHS*0.15))

# Train on the training split and evaluate on the validation split each epoch
history = model.fit(
    train_X,
    train_label,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_data=(valid_X, valid_label),
    callbacks=[early_stopping_monitor],
)

In [None]:
def _plot_history_curves(history, metric, title, ylabel, fallback=None):
    """Plot the per-epoch training and validation curves of one recorded metric.

    Parameters
    ----------
    history : object
        Object returned by ``model.fit``; its ``history`` attribute is read as
        a mapping from metric name to a list of per-epoch values.
    metric : str
        Key of the training curve; the validation curve is read from
        ``'val_' + metric``.
    title, ylabel : str
        Figure title and y-axis label.
    fallback : str, optional
        Alternative key to use when ``metric`` was not recorded.

    Raises
    ------
    KeyError
        If neither ``metric`` nor ``fallback`` is present in the history.
    """
    records = history.history
    if metric not in records and fallback is not None:
        metric = fallback
    plt.plot(records[metric])
    plt.plot(records['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    # The 'Test' curve is the validation data passed to model.fit
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


# Training & validation accuracy. Older Keras records metrics=['accuracy']
# under 'acc'; Keras >= 2.3 records it under 'accuracy', hence the fallback.
_plot_history_curves(history, 'acc', 'Model accuracy', 'Accuracy', fallback='accuracy')

# Training & validation top-3 accuracy
_plot_history_curves(history, 'top3_acc', 'Model accuracy (top 3)', 'Accuracy')

# Training & validation loss
_plot_history_curves(history, 'loss', 'Model loss', 'Loss')

In [None]:
# Training is finished: drop the train/validation arrays and labels, then
# trigger a garbage collection before the test data is loaded.
# NOTE: this cell cannot be re-run on its own; the deleted names no longer exist afterwards.
del train_X, valid_X, train_label, valid_label
gc.collect()

In [None]:
# Read the test file, indexed by key_id
test = pd.read_csv('../input/test_simplified.csv', index_col="key_id")

# Keep the ids; they are written next to the predictions in the submission
testidx = test.index

# Preview the data. Only the last expression of a cell is displayed, so this
# has to come last; in the middle of the cell it had no visible effect.
test.head()

In [None]:
# Rasterize the test drawings with the same routine and image size that were
# used for the training data
imagebag = bag.from_sequence(test.drawing.values)
imagebag = imagebag.map(draw_it, imheight=IMAGE_SIZE, imwidth=IMAGE_SIZE)

# Evaluate the bag and stack the resulting rasters into a single array
test_X = np.array(imagebag.compute())

In [None]:
# Reshape the test rasters to (n, IMAGE_SIZE, IMAGE_SIZE, 1), i.e. with a
# trailing axis of size 1 matching the model's input shape, and display the
# resulting shape
test_X = test_X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)
test_X.shape

In [None]:
# Run the trained model on the rasterized test set, BATCH_SIZE rows per batch
test_Y = model.predict(test_X, batch_size=BATCH_SIZE)

In [None]:
# For every test row, take the column indices of the 3 largest predicted
# scores: negating the scores makes the ascending argsort list them best-first
ans = np.argsort(-test_Y)[:,:3]

In [None]:
# Display the top-3 class indices computed for the test rows
ans

In [None]:
# Build the submission rows: one per test drawing, pairing its key_id with the
# names of its three top-ranked classes joined by spaces.
# The class names are taken as a plain NumPy array first: indexing a pandas
# Index with the 2-D `ans` array is deprecated in pandas 1.x and raises in 2.x.
class_names = np.asarray(dummies.columns)
out_aux = [
    [key_id, " ".join(map(str, top_words))]
    for key_id, top_words in zip(testidx, class_names[ans])
]

output = pd.DataFrame(columns=['key_id', 'word'], data=out_aux)

In [None]:
# Preview the first 10 submission rows
output.head(n=10)

In [None]:
# Write the submission file to the working directory, without the DataFrame's row index
output.to_csv("submission.csv", index=False)