In [27]:
import pandas as pd
import numpy as np
import cv2
import os
import re
import sys
from collections import Counter
import seaborn as sns
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from multiprocessing import Pool

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.applications.resnet_v2 import ResNet152V2


from datetime import datetime
import pickle
import scipy.stats as stats
import os
import sklearn.metrics

In [None]:
# Read the training label table, the test CSV and the class lookup in one pass.
y, yEval, classMap = (
    pd.read_csv(csvPath)
    for csvPath in ("data/train.csv", "data/test.csv", "data/class_map.csv")
)

In [None]:
# Names of the target columns of `y`; models are trained and scored once per entry.
labels = ["grapheme_root","vowel_diacritic","consonant_diacritic"]

In [None]:
# Index the label table by image_id. The guard makes re-running this cell a
# no-op: after the first run "image_id" is the index, not a column, and an
# unconditional set_index would raise KeyError.
if y.index.name != "image_id":
    y = y.set_index("image_id")

In [None]:
def _readImageShards(pattern, shardCount=4):
    """Read the numbered parquet shards matching `pattern` into one frame indexed by image_id."""
    shardFrames = []
    for shardNo in range(shardCount):
        shardFrames.append(pq.read_table(pattern.format(shardNo)).to_pandas())
    return pd.concat(shardFrames).set_index("image_id")


# Training pixels and evaluation pixels, one row per image.
df = _readImageShards('data/train_image_data_{0}.parquet')
dfEval = _readImageShards('data/test_image_data_{0}.parquet')

In [None]:
# Target size handed to cv2.resize in transformImg; the getModel* builders also read it for their input shape.
size=(60,60)

def transformImg(img):
    """Invert, centre and downsample one grayscale image.

    The image is inverted (255 - pixel), translated so that its
    intensity-weighted centroid lands on the geometric centre of the frame,
    expanded to 3 channels and resized to the module-level `size`.

    Args:
        img: 2-D array of pixel intensities. The notebook passes 137x236
            slices; assumes 8-bit values so that `255 - img` is a valid
            inversion -- TODO confirm dtype of the parquet columns.

    Returns:
        The processed image as an array with 3 channels, resized to `size`.
    """
    img = 255 - img
    # Take the frame size from the image itself instead of hard-coding 137x236,
    # so the centring stays correct for any input resolution.
    height, width = img.shape[:2]
    mu = cv2.moments(img, False)
    # m00 is the total intensity. A completely blank image has m00 == 0 and
    # therefore no centroid; skip the shift instead of dividing by zero.
    if mu["m00"] != 0:
        cx, cy = mu["m10"] / mu["m00"], mu["m01"] / mu["m00"]
        M = np.float32([[1, 0, (width / 2) - cx], [0, 1, (height / 2) - cy]])
        img = cv2.warpAffine(img, M, (width, height))
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    img = cv2.resize(img, (size[0], size[1]))
    return img

# Turn every flattened row of `df` back into a 137x236 image.
rawImgs = df.values.reshape(-1, 137, 236)

# Preprocess in parallel. chunksize hands the workers batches of images so the
# pool does not pay one inter-process round trip per image; imap still yields
# results in input order, so row k of X corresponds to row k of df.
with Pool() as p:
    imap = p.imap(transformImg, rawImgs, chunksize=256)
    imgs = list(tqdm(imap, total=rawImgs.shape[0]))

# Stack into one float32 array (one 3-channel image per row of df) and scale by
# 1/255. Dividing in place avoids allocating a second full-size float32 array.
X = np.asarray(imgs).astype(np.float32)
X /= 255.0

In [None]:
# Spot-check one preprocessed sample; X[15] is drawn as-is, without any reshape.
plt.imshow(X[15])

In [28]:
def getModel(cat=168, channels=3):
    """Build the small baseline CNN.

    Two unpadded 3x3 convolutions, one 2x2 max-pool, dropout, one hidden dense
    layer and a softmax output.

    Args:
        cat: number of output classes (width of the softmax layer).
        channels: number of channels of the input images. Defaults to 3
            because transformImg converts every image with
            cv2.COLOR_GRAY2RGB; a 1-channel input layer would reject the
            prepared image array at fit time.

    Returns:
        An uncompiled keras Sequential model expecting (*size, channels) inputs.
    """
    return Sequential([
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(*size, channels)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.25),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(cat, activation='softmax'),
    ])

def getModel2(cat=168, channels=3):
    """Build a CNN with two convolutional stages (32 and 64 filters).

    Each stage is: two 3x3 convolutions, batch normalisation, 2x2 max-pool,
    one 5x5 convolution and dropout. The stages are followed by dense layers
    of 256 and 192 units and a softmax output.

    Args:
        cat: number of output classes (width of the softmax layer).
        channels: number of channels of the input images. Defaults to 3
            because transformImg converts every image with
            cv2.COLOR_GRAY2RGB; a 1-channel input layer would reject the
            prepared image array at fit time.

    Returns:
        An uncompiled keras Sequential model expecting (*size, channels) inputs.
    """
    model = Sequential()
    for stageNo, filters in enumerate((32, 64)):
        # Only the very first layer of the network declares the input shape.
        inputSpec = {"input_shape": (*size, channels)} if stageNo == 0 else {}
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation='relu', **inputSpec))
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation='relu'))
        model.add(BatchNormalization(momentum=0.15))
        model.add(MaxPool2D(pool_size=(2, 2)))
        model.add(Conv2D(filters=filters, kernel_size=(5, 5), padding='SAME', activation='relu'))
        model.add(Dropout(rate=0.3))

    model.add(Flatten())
    for units in (256, 192):
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(0.10))
    model.add(Dense(cat, activation="softmax"))
    return model

def getModel3(cat=168, channels=3):
    """Build a CNN with three convolutional stages (32, 64 and 128 filters).

    Each stage is: two 3x3 convolutions, batch normalisation, 2x2 max-pool,
    one 5x5 convolution and dropout. The stages are followed by dense layers
    of 512, 256 and 192 units and a softmax output.

    Args:
        cat: number of output classes (width of the softmax layer).
        channels: number of channels of the input images. Defaults to 3
            because transformImg converts every image with
            cv2.COLOR_GRAY2RGB; a 1-channel input layer would reject the
            prepared image array at fit time.

    Returns:
        An uncompiled keras Sequential model expecting (*size, channels) inputs.
    """
    model = Sequential()
    for stageNo, filters in enumerate((32, 64, 128)):
        # Only the very first layer of the network declares the input shape.
        inputSpec = {"input_shape": (*size, channels)} if stageNo == 0 else {}
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation='relu', **inputSpec))
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation='relu'))
        model.add(BatchNormalization(momentum=0.15))
        model.add(MaxPool2D(pool_size=(2, 2)))
        model.add(Conv2D(filters=filters, kernel_size=(5, 5), padding='SAME', activation='relu'))
        model.add(Dropout(rate=0.3))

    model.add(Flatten())
    for units in (512, 256, 192):
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(0.10))
    model.add(Dense(cat, activation="softmax"))
    return model


def getModel4(cat=168, channels=3):
    """Build a CNN with four convolutional stages (32, 64, 128 and 256 filters).

    Each stage is: two 3x3 convolutions, batch normalisation, 2x2 max-pool,
    one 5x5 convolution and dropout. The stages are followed by dense layers
    of 512, 256 and 192 units and a softmax output.

    Args:
        cat: number of output classes (width of the softmax layer).
        channels: number of channels of the input images. Defaults to 3
            because transformImg converts every image with
            cv2.COLOR_GRAY2RGB; a 1-channel input layer would reject the
            prepared image array at fit time.

    Returns:
        An uncompiled keras Sequential model expecting (*size, channels) inputs.
    """
    model = Sequential()
    for stageNo, filters in enumerate((32, 64, 128, 256)):
        # Only the very first layer of the network declares the input shape.
        inputSpec = {"input_shape": (*size, channels)} if stageNo == 0 else {}
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation='relu', **inputSpec))
        model.add(Conv2D(filters=filters, kernel_size=(3, 3), padding='SAME', activation='relu'))
        model.add(BatchNormalization(momentum=0.15))
        model.add(MaxPool2D(pool_size=(2, 2)))
        model.add(Conv2D(filters=filters, kernel_size=(5, 5), padding='SAME', activation='relu'))
        model.add(Dropout(rate=0.3))

    model.add(Flatten())
    for units in (512, 256, 192):
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(0.10))
    model.add(Dense(cat, activation="softmax"))
    return model


def getResBased(cat=168):
    """Put a new softmax classifier on top of an ImageNet-initialised ResNet152V2.

    The backbone is created without its original top, its output is globally
    average-pooled and fed into a single Dense layer of `cat` units. No layer
    is frozen in this function.

    Args:
        cat: number of output classes (width of the softmax layer).

    Returns:
        An uncompiled keras Model mapping the backbone input to class probabilities.
    """
    backbone = ResNet152V2(weights="imagenet", include_top=False)
    pooled = GlobalAveragePooling2D()(backbone.output)
    # Disabled extra hidden layer: pooled = Dense(256, activation="relu")(pooled)
    probabilities = Dense(cat, activation="softmax")(pooled)
    return Model(inputs=backbone.input, outputs=probabilities)

In [9]:
# X was built row-by-row from df, so re-order the labels by df's image_id index
# to pair every image with its own label explicitly (a KeyError here means an
# image has no label row). random_state pins the shuffle so the hold-out set,
# and every score computed on it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y.loc[df.index], train_size=0.9, random_state=42)

In [10]:
# Stops a fit once val_loss has not improved for 3 epochs in a row; this single instance is passed to every model.fit call in the notebook.
early_stopping =  EarlyStopping(monitor='val_loss', min_delta=0.0, patience=3)

In [29]:
# Work on the first target column; the classifier head gets one output per distinct value in it.
label = labels[0]
model = getResBased(cat=len(set(y[label])))

Downloading data from https://github.com/keras-team/keras-applications/releases/download/resnet/resnet152v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [30]:
# Print the layer table of the current `model`.
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_6[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
pool1_pad (ZeroPadding2D)       (None, None, None, 6 0           conv1_conv[0][0]                 
____________________________________________________________________________________________

In [None]:
# Pin the one-hot width to the model's own output width. Left to infer it,
# to_categorical uses max(label) + 1 of whatever split it is given, which no
# longer matches the softmax layer if a split happens to miss the top class id.
numClasses = model.output_shape[-1]
model.compile(optimizer=Adam(), metrics=["acc"], loss="categorical_crossentropy")
model.fit(
    X_train,
    to_categorical(y_train[label], num_classes=numClasses),
    epochs=20,
    validation_data=(X_test, to_categorical(y_test[label], num_classes=numClasses)),
    callbacks=[early_stopping],
)

In [34]:
# Predicted class id per hold-out image = position of the largest model output in each row.
y_pred = model.predict(X_test).argmax(axis=1)

In [35]:
# Hold-out accuracy of `y_pred` against the true values of the currently selected `label`.
accuracy_score(y_test[label],y_pred)

0.879008165704043

In [None]:
# NOTE(review): looks like a leftover scratch cell -- nothing else reads its output; confirm and delete.
print("hey")

In [None]:
# Train the ensemble: 100 rounds, and in every round one fresh model per
# architecture for each target column. Every trained model is pickled to disk.
for i in tqdm(range(100)):
    for label in tqdm(labels):
        cat = len(set(y[label]))

        # All members for this label go into ./<label>/ -- the directory the
        # prediction cell lists and unpickles from. (The previous code created
        # models/<label> but then opened ./<label>/models/..., which never existed.)
        modelDir = "./{0}".format(label)
        os.makedirs(modelDir, exist_ok=True)

        # Encode the targets once per label, with the width pinned to `cat` so it
        # always equals the softmax width even if a split misses the top class id.
        yTrainOneHot = to_categorical(y_train[label], num_classes=cat)
        yTestOneHot = to_categorical(y_test[label], num_classes=cat)

        # getModel5 is not defined anywhere in this notebook and raised NameError
        # here, so only the four existing builders are used. Models are built one
        # at a time rather than all up front to keep only one in memory.
        # NOTE(review): X_train is 3-channel (transformImg emits RGB); the builders'
        # input layers must accept that channel count.
        for build in (getModel, getModel2, getModel3, getModel4):
            model = build(cat)
            model.compile(optimizer=Adam(), metrics=["acc"], loss="categorical_crossentropy")
            model.fit(
                X_train,
                yTrainOneHot,
                epochs=20,
                validation_data=(X_test, yTestOneHot),
                callbacks=[early_stopping],
            )
            # File name is the save time in whole seconds.
            stamp = int(float(datetime.now().timestamp()))
            with open("{0}/{1}.dump".format(modelDir, stamp), 'wb') as f:
                pickle.dump(model, f)

In [None]:
# Majority vote of all saved models, per target column, on the hold-out set.
ans = pd.DataFrame(index=y_test.index)
for label in labels:
    modelDir = "./{0}".format(label)
    y_preds = []
    for fileName in tqdm(sorted(os.listdir(modelDir))):
        modelPath = "{0}/{1}".format(modelDir, fileName)
        # Sub-directories cannot be opened as model dumps; skip them.
        if not os.path.isfile(modelPath):
            continue
        # NOTE(security): pickle.load executes arbitrary code from the file --
        # only load dumps that were written by this notebook.
        with open(modelPath, "rb") as f:
            model = pickle.load(f)
        try:
            # Was `X_testl`: the NameError was swallowed by a bare except, so
            # no model ever contributed a prediction.
            y_preds.append(np.argmax(model.predict(X_test), axis=1))
        except Exception as exc:
            # Best effort: skip a member that cannot predict, but say which and why.
            print("skipping {0}: {1!r}".format(modelPath, exc))
    if not y_preds:
        raise RuntimeError("no usable models found in {0}".format(modelDir))
    # Rows are models, columns are samples; the column-wise mode is the vote.
    y_pred = stats.mode(np.asarray(y_preds), axis=0)[0]
    # reshape flattens the result regardless of whether this SciPy version
    # keeps the reduced axis.
    ans[label] = np.asarray(y_pred).reshape(-1,)

In [None]:
# Macro-averaged recall of the voted predictions, one value per target column in `labels` order.
scores = [
    sklearn.metrics.recall_score(y_test[target], ans[target], average='macro')
    for target in labels
]
# Weighted mean of the recalls; the first label counts twice as much as each of the other two.
final_score = np.average(scores, weights=[2,1,1])

In [None]:
# Display the weighted macro-recall stored in final_score.
final_score