In [1]:
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

import skimage.io as io
import pickle

import time

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.models import Sequential
#for colab
from tensorflow.keras.optimizers import Adam

# from keras.optimizers import Adam
from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, Concatenate
from keras.models import Model

from tensorflow.keras.layers import BatchNormalization,Dropout
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import Concatenate
from keras.layers.core import Lambda, Flatten, Dense
from keras.initializers import glorot_uniform

from tensorflow.keras.layers import Layer
from keras.regularizers import l2
from keras import backend as K

from tensorflow.keras.applications.inception_v3 import InceptionV3
# from tensorflow.keras.preprocessing import image
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

from sklearn.metrics import confusion_matrix, classification_report


# Loading data
A function that loads the image locations and labels.<br>
input: the data path.<br>
output:<br>
&emsp;&emsp;dataHiero -> a dataframe with index= location of images and label= their labels <br>
&emsp;&emsp;img_groups -> a dictionary in the shape of { "label" : [array of locations of images labeled with this label] }

In [None]:
path="../GlyphDataset/Dataset/Manual/Preprocessed/"

def loadData(folderPictures=path):
    """Index the glyph image files stored under ``folderPictures``.

    Every sub-folder of ``folderPictures`` is scanned for files named
    ``<name>_<label>.png``. Each image is identified by the key
    ``"<folder>_<name>"``.

    Parameters
    ----------
    folderPictures : str
        Root directory that contains one sub-folder per picture.

    Returns
    -------
    dataHiero : pandas.DataFrame
        Columns ``index`` (image key) and ``label``; rows labelled
        ``'UNKNOWN'`` and rows whose class has a single image are dropped.
    img_groups : dict
        ``{label: [image keys]}`` for every class with more than one image
        (``'UNKNOWN'`` is kept here if it has more than one image).
    """
    folders = next(os.walk(folderPictures))[1]
    img_groups = {}
    img_list = {}

    for folder in folders:
        for img_file in os.listdir(os.path.join(folderPictures, folder)):
            # Remove only the ".png" suffix. str.strip('.png') would strip any
            # leading/trailing '.', 'p', 'n' or 'g' character and silently
            # corrupt names/labels that start or end with one of them.
            stem = img_file[:-len('.png')] if img_file.endswith('.png') else img_file
            name, label = stem.split("_")
            key = folder + "_" + name

            # Multiple images per class: group the keys by label.
            img_groups.setdefault(label, []).append(key)
            img_list[key] = [label]

    # Remove classes that contain a single hieroglyph.
    for k, v in list(img_groups.items()):
        if len(v) == 1:
            del img_groups[k]

    # Randomly pick N classes; N is currently the number of remaining classes,
    # so every class is kept (the call is preserved to keep RNG usage unchanged).
    nclass = len(img_groups.keys())
    list_of_class = random.sample(list(img_groups.keys()), nclass)
    short_dico = {x: img_groups[x] for x in list_of_class if x in img_groups}

    dataHiero = pd.DataFrame.from_dict(img_list, orient='index')
    dataHiero.columns = ["label"]
    dataHiero = dataHiero[dataHiero.label != 'UNKNOWN']
    dataHiero = dataHiero.loc[dataHiero['label'].isin(list(short_dico))]

    # Move the image keys from the index into a regular "index" column.
    dataHiero = dataHiero.reset_index()

    return dataHiero,img_groups

A function that takes the image groups and loads those images.<br>
input: img_groups dictionary<br>
output:<br>
&emsp;&emsp;X -> np array of the images<br>
&emsp;&emsp;y -> np array of labels<br>
&emsp;&emsp;glyph_sizes -> a dictionary in the form of {'label' : (starting index, ending index in X and y)}


In [None]:
def read_images(img_groups,path):
    """Load every image listed in ``img_groups`` from disk.

    Each entry of ``img_groups[label]`` is a ``"<folder>_<name>"`` key; the
    corresponding file is ``path + folder + '/' + name + '_' + label + '.png'``.

    Returns a tuple ``(X, y, glyph_sizes)``:
    ``X`` is the array of images, ``y`` a column vector of labels, and
    ``glyph_sizes`` maps each label to the inclusive ``(first, last)``
    positions of its images in ``X``/``y``.
    """
    images = []
    labels = []
    glyph_sizes = {}
    start = 0
    for glyph in img_groups:
        members = img_groups[glyph]
        stop = start
        for member in members:
            folder, name = member.split('_')
            images.append(io.imread(path + folder + '/' + name + '_' + glyph + '.png'))
            labels.append(glyph)
            stop += 1
        # Inclusive range occupied by this glyph in the output arrays.
        glyph_sizes[glyph] = (start, stop - 1)
        start = stop

    return np.array(images), np.array(labels).reshape((-1, 1)), glyph_sizes
            
    

In [None]:
dataHiero,img_groups=loadData(folderPictures=path)
dataHiero.head()
# img_groups

In [None]:
X,y,sizes=read_images(img_groups,path)

In [None]:
len(np.unique(y))

In [None]:
type(X)
print(y.shape)
print(X.shape)
sizes['D21'][1]

saving the images into a pickle

In [None]:
#train val split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=42)

In [None]:
X_test.shape

In [None]:
def get_sizes(X,Y):
    """Group row positions by label.

    ``Y`` holds one label per row (the label is read from ``row[0]``).
    Returns ``{label: [positions]}`` with positions in ascending order;
    only as many rows as the shorter of ``X`` and ``Y`` are visited.
    """
    positions = {}
    for row, (_, target) in enumerate(zip(X, Y)):
        positions.setdefault(target[0], []).append(row)
    return positions
    

In [None]:
sizes=get_sizes(X_train,y_train)
X=X_train
y=y_train

In [None]:
#saving data as pickle
with open("train.pickle", "wb") as f:
    pickle.dump((X,y,sizes),f)

In [None]:
sizes_val=get_sizes(X_test,y_test)
Xval=X_test
yval=y_test

In [None]:
#saving data as pickle
with open("test.pickle", "wb") as f:
    pickle.dump((Xval,yval,sizes_val),f)

**Creating all pairs**

In [None]:
def create_all_pairs(X,y):
    """Enumerate every unordered pair of samples and label it same/different.

    Returns ``(X, pairs, labels)`` where ``pairs`` has shape ``(2, P)`` and
    holds the positions of the first and second image of each pair, and
    ``labels[k]`` is 1 when both images share a label and 0 otherwise.
    The pairs are shuffled (consistently with their labels) before returning.
    """
    n_samples = y.shape[0]
    _, width, height = X.shape  # X is expected to be (n, w, h); values are not used further

    first_idx = []
    second_idx = []
    same_class = []
    for a in range(n_samples):
        for b in range(a + 1, n_samples):
            same_class.append(1 if y[a] == y[b] else 0)
            first_idx.append(a)
            second_idx.append(b)

    same_class = np.asarray(same_class)
    first_idx, second_idx, same_class = shuffle(first_idx, second_idx, same_class)

    return X, np.asarray([first_idx, second_idx]), np.asarray(same_class)

In [None]:
images_all_train,pairs_all_train,labels_all_train=create_all_pairs(X,y)

In [None]:
images_all_test,pairs_all_test,labels_all_test=create_all_pairs(Xval,yval)

In [None]:
print(pairs_all_test.shape)
pairs_all_train.shape

In [None]:
print(labels_all_train[labels_all_train==0].shape)
labels_all_train[labels_all_train==1].shape

In [None]:
# Persist the exhaustive training pairs (images, pair indices, pair labels).
# NOTE(review): `data_path` is first assigned in a later cell (under
# "reading the Data with pickle"), so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
with open(data_path+"pairs_all_train.pickle", "wb") as f:
    pickle.dump((images_all_train,pairs_all_train,labels_all_train),f)

In [None]:
# Persist the exhaustive test pairs (images, pair indices, pair labels).
# NOTE(review): `data_path` is first assigned in a later cell (under
# "reading the Data with pickle"), so on a fresh kernel this cell raises
# NameError unless that cell has been run first.
with open(data_path+"pairs_all_test.pickle", "wb") as f:
    pickle.dump((images_all_test,pairs_all_test,labels_all_test),f)

# reading the Data with pickle

In [2]:
# #colab
# data_path= '/content/drive/MyDrive/hiero_cv/'
#local|
# data_path='./'
#kaggle
data_path='../input/oneshot/'

In [None]:
# Mount Google Drive only when running on Colab. On Kaggle or a local kernel the
# google.colab package does not exist, and an unguarded import would stop a
# "Restart & Run All" even though `data_path` points elsewhere.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

In [25]:
with open(data_path+"train.pickle", "rb") as f:
    (X,y,sizes) = pickle.load(f)

In [26]:
with open(data_path+"test.pickle", "rb") as f:
    (Xval,yval,sizes_val) = pickle.load(f)

In [6]:
# read digital data
with open(data_path+"train_digital.pickle", "rb") as f:
    (X_digital,y_digital,sizes_digital) = pickle.load(f)

In [None]:
#read all pairs train
with open(data_path+"pairs_all_train.pickle", "rb") as f:
    (images_all_train,pairs_all_train,labels_all_train) = pickle.load(f)

In [None]:
#read all pairs test
with open(data_path+"pairs_all_test.pickle", "rb") as f:
    (images_all_test,pairs_all_test,labels_all_test) = pickle.load(f)

In [None]:
labels_all_train.shape

# The Model

In [127]:
def get_siamese_model(input_shape):
    """
        Build the Siamese network used to score the similarity of two glyph images.

        Model architecture based on the one provided in: http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf

        input_shape: shape of a single input image, passed to both Input layers
                     and to the first convolution.
        returns: an uncompiled Keras Model taking [left image, right image] and
                 producing one sigmoid similarity score per pair.
    """
    
    # Define the tensors for the two input images
    left_input = Input(input_shape,name='Input_1')
    right_input = Input(input_shape,name='Input_2')
    
    # Convolutional encoder; the same Sequential instance is applied to both
    # inputs below, so both branches share their weights.
    model = Sequential(name='CNN')
    model.add(Conv2D(64, (3,3), activation='relu', input_shape=input_shape, kernel_regularizer=l2(2e-4)))
    model.add(MaxPooling2D())
    model.add(Conv2D(64, (3,3), activation='relu', kernel_regularizer=l2(2e-4)))
    model.add(MaxPooling2D())
    model.add(Conv2D(128, (4,4), activation='relu', kernel_regularizer=l2(2e-4)))
    model.add(MaxPooling2D())
    model.add(Conv2D(256, (4,4), activation='relu',  kernel_regularizer=l2(2e-4)))
    model.add(Flatten())

    # 4096-unit embedding produced for each image
    model.add(Dense(4096, activation='relu',
                   kernel_regularizer=l2(1e-3)))
    # model.add(Dropout(0.2))
    # Generate the encodings (feature vectors) for the two images
    encoded_l = model(left_input)
    encoded_r = model(right_input)
    
    # Add a customized layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]),name='Subtract')
    
    # Element-wise |left - right|, then two fully connected layers on top of it
    L1_distance = L1_layer([encoded_l, encoded_r])
    L1_distance = Dense(512,activation='relu',kernel_regularizer=l2(1e-3),name='Dense_1')(L1_distance)
    L1_distance = Dense(256,activation='relu',kernel_regularizer=l2(1e-3),name='Dense_2')(L1_distance)
    # Add a dense layer with a sigmoid unit to generate the similarity score

    prediction = Dense(1,activation='sigmoid',name='Output')(L1_distance)

    
    
    # prediction = Lambda(cosine_distance, output_shape=1)([encoded_l, encoded_r])
    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input,right_input],outputs=prediction)
    
    # return the model
    return siamese_net

In [128]:
model = get_siamese_model((75, 50, 1))
model.summary()

In [129]:
optimizer = Adam(learning_rate= 0.001)
model.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=['accuracy'])

In [10]:
tf.keras.utils.plot_model(
    model, to_file='model.png',
    show_layer_names=True, rankdir='TB'
)



# Functions

## a function to predict which glyph

In [11]:
def create_glyphlist(X,sizes):
    """Pick one random reference ("anchor") image per glyph class.

    ``sizes`` maps each label to the positions of its images in ``X``.
    The ``'UNKNOWN'`` class is skipped. Returns ``(images, labels)`` where
    ``images`` has shape ``(n_classes, w, h, 1)`` and ``labels`` lists the
    class of each anchor in the same order.
    """
    _, width, height = X.shape
    anchors = []
    names = []
    for glyph, candidates in sizes.items():
        if glyph == 'UNKNOWN':
            continue
        picked = np.random.choice(candidates)
        anchors.append(X[picked].reshape(width, height, 1))
        names.append(glyph)
    return np.asarray(anchors), np.asarray(names)

In [12]:
anchor_img, anchor_label=create_glyphlist(X,sizes)

**A function that creates pairs from the image that we want to test and the anchor list.**

In [13]:
def whichGlyph_pair(image,anchor_img,anchor_label):
    """Pair one query image with every anchor image.

    The query is repeated once per anchor and reshaped to match the anchors;
    the three arrays are then shuffled together so rows stay aligned.
    Returns ``(repeated query, shuffled anchors, shuffled anchor labels)``.
    """
    n_anchor, width, height, _ = anchor_img.shape

    repeated = np.asarray([image for _unused in range(n_anchor)])
    repeated = repeated.reshape(n_anchor, width, height, 1)

    shuffled_labels, repeated, shuffled_anchors = shuffle(anchor_label, repeated, anchor_img)

    return repeated, shuffled_anchors, shuffled_labels
    

**A function that compares the image with each character from the anchor list and outputs the probabilities.**

In [14]:
def whichGlyph(model,image,anchor_img,anchor_label):
    """Score one image against every anchor with the Siamese model.

    Returns ``(probs, anchors, labels)``: the model's prediction for each
    (image, anchor) pair plus the anchors and their labels in the (shuffled)
    order used for prediction.
    """
    query_batch, shuffled_anchors, shuffled_labels = whichGlyph_pair(image, anchor_img, anchor_label)
    return model.predict([query_batch, shuffled_anchors]), shuffled_anchors, shuffled_labels
    

## creating pairs of images

**A function that creates pairs of images with a given batch size, with y = 1 if they are similar and 0 if they are different.**

In [15]:
def createPairs(X,y,sizes,batch_size):
    """Build a random batch of image pairs: half same-class, half different-class.

    Parameters
    ----------
    X : ndarray of shape (n, w, h)
        Image array.
    y : unused
        Kept for interface compatibility.
    sizes : dict
        ``{label: [positions of that label's images in X]}``; needs at least
        two labels so that a different-class pair can be drawn.
    batch_size : int
        Number of pairs to generate; must be even because pairs are produced
        two at a time (one positive, one negative).

    Returns
    -------
    pairs : list of two ndarrays of shape (batch_size, w, h, 1)
    label : sequence of 1 (same class) / 0 (different class), aligned with pairs

    Raises
    ------
    ValueError
        If ``batch_size`` is odd (previously an IndexError part-way through)
        or ``sizes`` has fewer than two classes (previously an endless loop).
    """
    if batch_size % 2:
        raise ValueError("batch_size must be even, got {}".format(batch_size))
    class_names = list(sizes)  # built once instead of on every draw
    if batch_size > 0 and len(class_names) < 2:
        raise ValueError("at least two classes are needed to build negative pairs")

    label = []
    _, w, h = X.shape
    # Pre-allocated input batches for the two branches of the network.
    input1 = np.zeros((batch_size, w, h, 1))
    input2 = np.zeros((batch_size, w, h, 1))

    for i in range(0, batch_size, 2):
        random_key1 = random.choice(class_names)
        # index1/index3 form the positive pair, index2 starts the negative pair.
        # Drawn with replacement, so a positive pair may repeat the same image.
        index1, index3 = np.random.choice(sizes[random_key1], size=2)
        index2 = np.random.choice(sizes[random_key1])

        # Redraw until the second class differs from the first.
        random_key2 = random.choice(class_names)
        while random_key2 == random_key1:
            random_key2 = random.choice(class_names)
        index4 = np.random.choice(sizes[random_key2])

        # Row i holds the positive pair (1, 3); row i+1 the negative pair (2, 4).
        input1[i,:,:,:] = X[index1].reshape(w, h, 1)
        input1[i+1,:,:,:] = X[index2].reshape(w, h, 1)
        input2[i,:,:,:] = X[index3].reshape(w, h, 1)
        input2[i+1,:,:,:] = X[index4].reshape(w, h, 1)
        label += [1, 0]

    input1,input2,label = shuffle(input1,input2,label)
    pairs=[input1,input2]

    return pairs,label
pairs,label=createPairs(X,y,sizes,32)   

In [16]:
def get_batch_all(images,pairs_all,labels_all,batch_size,it):
    """Materialise batch number ``it`` of the pre-computed pair list.

    ``pairs_all[0]`` / ``pairs_all[1]`` hold the image positions of the first
    and second element of each pair; pairs ``it*batch_size`` up to (excluding)
    ``(it+1)*batch_size`` are copied into two ``(batch_size, w, h, 1)`` arrays
    and shuffled together with their labels.
    Returns ``([input1, input2], labels)``.
    """
    _, width, height = images.shape
    left = np.zeros((batch_size, width, height, 1))
    right = np.zeros((batch_size, width, height, 1))

    first = it * batch_size
    last = first + batch_size
    for slot, pair_pos in enumerate(range(first, last)):
        left[slot, :, :, :] = images[pairs_all[0][pair_pos]].reshape(width, height, 1)
        right[slot, :, :, :] = images[pairs_all[1][pair_pos]].reshape(width, height, 1)

    left, right, batch_labels = shuffle(left, right, labels_all[first:last])

    return [left, right], batch_labels


In [None]:
# pp,mm=get_batch_all(images_all_train,pairs_all_train,labels_all_train,5)

## Testing

### Testing accuracy random

In [17]:
def calc_accuracy(N,Xval,yval,anchor_img,anchor_label,model,sizes,top_k=3):
    """Estimate top-1 and top-k accuracy on ``N`` randomly drawn samples.

    Each sample is scored against all anchors with ``whichGlyph``; it counts as
    a top-1 hit when the best-scoring anchor carries its label, and as a top-k
    hit when its label is among the ``top_k`` best-scoring anchors.

    Parameters
    ----------
    N : int
        Number of random trials (samples are drawn with replacement).
    Xval, yval : ndarray
        Images and their labels; the label is read from ``yval[i][0]``.
    anchor_img, anchor_label : ndarray
        Reference image per class and its label.
    model : object with ``predict``
        Siamese model passed on to ``whichGlyph``.
    sizes : unused
        Kept for interface compatibility.
    top_k : int, default 3
        Size of the "first k" window.

    Returns
    -------
    (accuracy_first, accuracy_first_k) : tuple of float in [0, 1]
    """
    count_first=0
    count_first3=0
    for i in range(N):
        ind=random.choice(range(yval.shape[0]))

        predicted,anchor_imgs,targets=whichGlyph(model,Xval[ind],anchor_img,anchor_label)
        # Ascending order of score: the best match is last.
        sort_index = np.argsort(np.asarray(predicted).reshape(len(predicted),))
        if targets[sort_index[-1]] == yval[ind][0]:
            count_first+=1
        # Take the last top_k entries. The previous fixed slice [127:] was only
        # the top 3 when there were exactly 130 anchors.
        if yval[ind][0] in targets[sort_index[-top_k:]]:
            count_first3+=1
    accuracy_first=count_first/N
    accuracy_first3=count_first3/N

    return accuracy_first, accuracy_first3

In [18]:
def predict_random(N,Xval,yval,anchor_img,anchor_label,model,sizes):
    """Predict the class of ``N`` samples drawn class-uniformly.

    For every trial a class is picked at random from ``sizes`` and then one of
    its samples; the prediction is the label of the best-scoring anchor.
    Returns ``(predicted labels, actual labels)`` as two arrays of length ``N``.
    """
    guessed = []
    truth = []
    for _ in range(N):
        glyph = random.choice(list(sizes))
        ind = np.random.choice(sizes[glyph])

        scores, _anchors, labels = whichGlyph(model, Xval[ind], anchor_img, anchor_label)
        ranking = np.argsort(np.asarray(scores).reshape(len(scores),))
        guessed.append(labels[ranking[-1]])
        truth.append(yval[ind][0])

    return np.asarray(guessed), np.asarray(truth)

**a function to test accuracy**

In [19]:
def test_accuracy_score(N,Xval,yval,anchor_img,anchor_label,model,sizes):
    count_first=0
    for i in range(N):
        random_key = random.choice(list(sizes))
        ind = np.random.choice(sizes[random_key])
        
#         ind=random.choice(range(yval.shape[0]))
        predicted,anchor_imgs,targets=whichGlyph(model,Xval[ind],anchor_img,anchor_label)
        if targets[np.argmax(predicted)]== yval[ind][0]:
            count_first+=1
    percent_correct = (100.0 * count_first / N)
    print("Got an average of {}% accuracy in {} samples. \n".format(percent_correct,N))
    return percent_correct

## Under sampling - IGNORE

In [None]:
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)
# #undersampling training data

# in_df=pd.DataFrame({'0':pairs_all_train[0],'1':pairs_all_train[1]})
# in_df,labels_tr= rus.fit_resample(in_df,labels_all_train)
# p0,p1,labels_tr= shuffle(in_df['0'].to_numpy(),in_df['1'].to_numpy(),labels_tr)
# pairs_all_train=[p0,p1] 
# labels_all_train= labels_tr

# #undersampling test data

# test_df=pd.DataFrame({'0':pairs_all_test[0],'1':pairs_all_test[1]})
# test_df,labels_te= rus.fit_resample(test_df,labels_all_test)
# p0t,p1t,labels_te=shuffle(test_df['0'].to_numpy(),test_df['1'].to_numpy(),labels_te)
# pairs_all_test= [p0t,p1t]
# labels_all_test=labels_te


In [None]:
# labels_all_train.shape

In [None]:
print(labels_all_train[labels_all_train==0].shape)
print(labels_all_train[labels_all_train==1].shape)

# Training On Digital data

In [20]:
# Hyper parameters
evaluate_every = 200 # interval for evaluating on one-shot tasks
batch_size = 128 #64#32
n_iter = 10000 #5400 # No. of training iterations 20000
# N_way = 20 # how many classes for testing one-shot tasks
n_val = 256 # how many one-shot tasks to validate on
best = -1
# epochs=2


In [None]:
# labels_all_train.shape

In [21]:
model_path = './weights/'
# model_2_path= '/content/drive/MyDrive/hiero_cv/'
# model_cos_path= '/content/drive/MyDrive/hiero_cv/weights_cos/'
# model_regul='/content/drive/MyDrive/hiero_cv/regul_weights/'


In [22]:
# Pre-train the Siamese model on the digital glyph set.
# Each iteration draws a fresh random batch of pairs and performs one gradient step;
# every `evaluate_every` iterations the model is checked on the validation data.
print("Starting training process for Digital data !")
print("-------------------------------------")
train_loss=[]        # loss of every training batch
validation_loss=[]   # loss of each periodic validation batch
t_start = time.time()
for i in range(1, n_iter+1):
    (inputs,targets) = createPairs(X_digital,y_digital,sizes_digital,batch_size)
    targets=np.asarray(targets)
    # With metrics=['accuracy'] compiled in, the result is indexed as [loss, accuracy].
    loss = model.train_on_batch(inputs, targets)
    train_loss.append(loss[0])
    if i % evaluate_every == 0:
        print("\n ------------- \n")
        print("Time for {0} iterations: {1} mins".format(i, (time.time()-t_start)/60.0))
        print(f"Train Loss: {loss[0]} , accuracy : {loss[1]}")

        # Pair-level validation on Xval/yval (not on the digital set).
        (inputs_val,targets_val) = createPairs(Xval,yval,sizes_val,n_val)
        targets_val=np.asarray(targets_val)
        val_loss= model.test_on_batch(inputs_val, targets_val)
        # Classification accuracy against the anchors; test_accuracy_score prints it,
        # the returned value is not used further in this cell.
        val_acc = test_accuracy_score(n_val,Xval,yval,anchor_img,anchor_label,model,sizes_val)
        
        validation_loss.append(val_loss[0])
        # model.save_weights(os.path.join(model_2_path, 'weights.{}.h5'.format(i)))
        print(f"validation Loss: {val_loss[0]}, val accuracy : {val_loss[1]} ") 

In [None]:
# #plot loss
# plt.figure(1)
# plt.subplot(211)
# plt.plot(range(2, len(train_loss)),train_loss[2:])
# plt.subplot(212)
# plt.plot(range(1, len(validation_loss)+1),validation_loss)


In [None]:
# inputs,targets = get_batch_all(images_all_test,pairs_all_test,labels_all_test,1000,0)
# model.evaluate(inputs,targets )

## Train on actual data

In [28]:
# Hyper parameters
evaluate_every = 200 # interval for evaluating on one-shot tasks
batch_size = 128 #64#32
n_iter = 15000 #5400 # No. of training iterations 20000
# N_way = 20 # how many classes for testing one-shot tasks
n_val = 256 # how many one-shot tasks to validate on
best = -1
# epochs=2

In [29]:
# Train the Siamese model on the real (photographed) glyph set X/y.
# Each iteration draws a fresh random batch of pairs and performs one gradient step;
# every `evaluate_every` iterations the model is checked on the validation data.
# The banner previously said "Digital data" (copied from the pre-training cell).
print("Starting training process for actual data !")
print("-------------------------------------")
train_loss=[]        # loss of every training batch
validation_loss=[]   # loss of each periodic validation batch
t_start = time.time()
for i in range(1, n_iter+1):
    (inputs,targets) = createPairs(X,y,sizes,batch_size)
    targets=np.asarray(targets)
    # With metrics=['accuracy'] compiled in, the result is indexed as [loss, accuracy].
    loss = model.train_on_batch(inputs, targets)
    train_loss.append(loss[0])
    if i % evaluate_every == 0:
        print("\n ------------- \n")
        print("Time for {0} iterations: {1} mins".format(i, (time.time()-t_start)/60.0))
        print(f"Train Loss: {loss[0]} , accuracy : {loss[1]}")

        # Pair-level validation on the held-out split.
        (inputs_val,targets_val) = createPairs(Xval,yval,sizes_val,n_val)
        targets_val=np.asarray(targets_val)
        val_loss= model.test_on_batch(inputs_val, targets_val)
        # Classification accuracy against the anchors; test_accuracy_score prints it.
        val_acc = test_accuracy_score(n_val,Xval,yval,anchor_img,anchor_label,model,sizes_val)

        validation_loss.append(val_loss[0])
        # model.save_weights(os.path.join(model_2_path, 'weights.{}.h5'.format(i)))
        print(f"validation Loss: {val_loss[0]}, val accuracy : {val_loss[1]} ")

## Training on less represented class

In [30]:
# Classes with at most 10 training images, used for the fine-tuning pass below.
less_represented = {}
for m, members in sizes.items():
    if len(members) <= 10:
        less_represented[m] = members

In [None]:
# ccc=[]
# for m in sizes:
#   if len(sizes[m]) <= 10:
#     ccc.append(m)
# print(len(ccc))

In [31]:
# Hyper parameters
evaluate_every = 200 # interval for evaluating on one-shot tasks
batch_size = 32 #64#32
n_iter = 1500 #5400 # No. of training iterations 20000
N_way = 20 # how many classes for testing one-shot tasks
n_val = 256 # how many one-shot tasks to validate on
best = -1

In [32]:
optimizer = Adam(1e-6)
model.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=['accuracy'])

In [33]:
# Fine-tune on pairs drawn only from the classes in `less_represented`,
# tracking the best anchor-classification accuracy seen on the validation split.
print("Starting training process for the less represented classes !")
print("-------------------------------------")
t_start = time.time()
for i in range(1, n_iter+1):
    # Pairs are sampled from X/y but restricted to the under-represented classes.
    (inputs,targets) = createPairs(X,y,less_represented,batch_size)
    targets=np.asarray(targets)
    loss = model.train_on_batch(inputs, targets)
    if i % evaluate_every == 0:
        print("\n ------------- \n")
        print("Time for {0} iterations: {1} mins".format(i, (time.time()-t_start)/60.0))
        print("Train Loss: {0}, accuracy : {1}".format(loss[0],loss[1])) 
#         val_acc = test_oneshot(model,Xval,yval,sizes_val, N_way, n_val, verbose=True)
        val_acc = test_accuracy_score(n_val,Xval,yval,anchor_img,anchor_label,model,sizes_val)
        # model.save_weights(os.path.join(model_2_path, 'weights.{}.h5'.format(i)))
        # `best` starts at -1 (hyper-parameter cell), so the first evaluation always updates it.
        if val_acc >= best:
            print("Current best: {0}, previous best: {1}".format(val_acc, best))
            best = val_acc

## loading model from weights

In [None]:
# model_path=

In [None]:
model.save_weights(os.path.join(data_path, 'weights_BN.h5'))

In [None]:
model.load_weights(os.path.join(data_path,'weights_finetuned.h5'))

# language model

In [None]:
# !pip install ipynb

In [None]:
from collections import Counter, defaultdict
def lm_next(model,prev):
    """Return the language model's next-glyph scores for a given context.

    Parameters
    ----------
    model : mapping
        ``{context: {next glyph: score}}``. A one-glyph context is keyed by the
        glyph itself, a longer context by the tuple of its glyphs.
    prev : sequence of glyphs, or a single glyph string
        The preceding glyph(s).

    Returns
    -------
    dict
        ``{glyph: score}`` ordered by descending score, or ``{'None': 0}`` when
        the model has no continuation for this context.

    Notes
    -----
    The lookup used to be ``eval('model' + str(prev))``, which built Python
    source from the context's repr. Indexing directly gives the same key for
    list contexts without evaluating generated code.
    """
    if isinstance(prev, str):
        key = prev
    else:
        context = tuple(prev)
        # model['a'] for a single glyph, model['a', 'b'] (tuple key) otherwise.
        key = context[0] if len(context) == 1 else context
    pred = dict(model[key])
    next_scores = sorted(pred.items(), key=lambda item: item[1],reverse=True)
    out = dict(next_scores)
    if not out:
        out ={'None':0}
    return out

In [None]:
# Load the pre-built language model.
# dill is imported under the name `pickle`, so from this cell on the notebook-level
# name `pickle` refers to dill instead of the standard-library module imported at the top.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
import dill as pickle
with open("language_model_sent.pkl", "rb") as f:
    language_model = pickle.load(f)

In [None]:
# Load the glyph sentences used to evaluate the language model.
# dill is imported under the name `pickle`, so the notebook-level name `pickle`
# refers to dill instead of the standard-library module after this cell runs.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
import dill as pickle
with open("lm_sentences.pickle", "rb") as f:
    sentences = pickle.load(f)

In [None]:
def choose_next(clf_3,score_3,language_model,prev):
    """Re-rank the classifier's candidates with the language model.

    Parameters
    ----------
    clf_3 : sequence of labels
        Candidate glyphs proposed by the classifier.
    score_3 : array-like
        Classifier score of each candidate (any shape; it is flattened).
    language_model : mapping
        Passed to ``lm_next`` together with ``prev``.
    prev : sequence
        Previously predicted glyph(s) used as language-model context.

    Returns
    -------
    The candidate from ``clf_3`` maximising
    ``normalised classifier score + 2 * softmax(language-model score)``.
    """
    freq = lm_next(language_model, prev)
    # Language-model score of each candidate; 0 when the model never saw it.
    freq_3 = np.array([freq.get(pred, 0) for pred in clf_3], dtype=float)
    score_3 = np.array(score_3).flatten()

    # Normalise the classifier scores to sum to 1. If they are all zero, keep
    # zeros so the language model decides instead of producing NaNs.
    total = np.sum(score_3)
    score_3_sum = score_3 / total if total != 0 else np.zeros_like(score_3, dtype=float)

    # Softmax with the maximum subtracted first: mathematically identical, but
    # large scores no longer overflow np.exp into inf/inf = NaN.
    shifted = np.exp(freq_3 - np.max(freq_3))
    freq_3_exp = shifted / np.sum(shifted)

    # The language-model term is weighted twice as much as the classifier term.
    scores = score_3_sum + 2*freq_3_exp
    predicted = clf_3[np.argmax(scores)]
    return predicted

In [None]:
def predict_lm(Xtest,anchor_img,anchor_label,model,language_model,sizes):
    """Classify a sequence of glyph images, using the language model for context.

    The first image takes the classifier's best label (falling back to the
    second best if the best is ``'UNKNOWN'``). Every following image passes its
    three best candidates and their scores to ``choose_next`` together with the
    previously predicted glyph.

    Returns ``(preds, clf_preds)``: the context-aware predictions and the
    classifier-only predictions, one entry per image.
    """
    preds = []
    clf_preds = []
    for new in Xtest:
        predicted, anchor_imgs, targets = whichGlyph(model, new, anchor_img, anchor_label)
        ranking = np.argsort(np.asarray(predicted).reshape(len(predicted),))
        top_label = targets[ranking[-1]]
        if not preds:
            # No context yet: rely on the classifier alone.
            if top_label == 'UNKNOWN':
                top_label = targets[ranking[-2]]
            preds.append(top_label)
            clf_preds.append(top_label)
        else:
            top3 = ranking[-3:]
            chosen = choose_next(targets[top3], predicted[top3], language_model, preds[-1:])
            preds.append(chosen)
            clf_preds.append(top_label)
    return preds, clf_preds


In [None]:
def test_lm(Xtest,ytest,anchor_img,anchor_label,model,language_model,sizes):
    preds,clf_preds=predict_lm(Xtest,anchor_img,anchor_label,model,language_model,sizes)
    preds=np.asarray(preds)
    clf_preds=np.asarray(clf_preds)
    ytest=np.asarray(ytest)
    accuracy_lm = sum(preds==ytest)/len(ytest)
    accuracy_clf = sum(clf_preds==ytest)/len(ytest)
    return accuracy_lm, accuracy_clf

In [None]:
# test_sentences
test_sentences = np.random.choice(np.asarray(sentences),replace=False, size=500)

In [None]:
# Compare language-model-assisted predictions with classifier-only predictions
# over the sampled test sentences, accumulating per-glyph hit counts.
t_start = time.time()

count=0          # total number of glyphs evaluated
correct_lm=0     # glyphs predicted correctly with language-model re-ranking
correct_clf=0    # glyphs predicted correctly by the classifier alone
i=0              # number of sentences processed
for word in test_sentences:
    i+=1
    image_lm_test=[]
    for w in word:
        count+=1
        # Represent each glyph of the sentence by a random validation image of that class.
        ind = np.random.choice(sizes_val[w])
        image_lm_test.append(Xval[ind])
#     print("word = ",np.asarray(word).flatten())
#     accuracy=test_lm(image_lm_test,word,anchor_img,anchor_label,model,language_model,sizes)
    preds_lm,preds_clf=predict_lm(image_lm_test,anchor_img,anchor_label,model,language_model,sizes_val)
    preds_lm=np.asarray(preds_lm)
    preds_clf=np.asarray(preds_clf)
    correct_lm+=sum(preds_lm==np.asarray(word))
    correct_clf+=sum(preds_clf==np.asarray(word))
    # Report the running accuracies every 25 sentences.
    if i%25 == 0:
        print(f" language model accuracy : {correct_lm/count} , clf accuracy :{correct_clf/count}")
        print("{0} sentences took {1} mins".format(i,(time.time()-t_start)/60.0))
#     accuracies.append(accuracy)
# print("accuracy fn took {0} mins".format((time.time()-t_start)/60.0))

In [None]:
lm_acc=correct_lm/count
print("language model: ",correct_lm)
print("classifier: ",correct_clf)
print("difference: ",correct_lm-correct_clf)
print("total count: ",count)
lm_acc

In [None]:
# Classifier-only baseline on the same test sentences: every glyph is
# classified independently, without language-model context.
t_start = time.time()
count=0      # total number of glyphs evaluated
correct=0    # glyphs whose best-scoring anchor has the right label
i=0          # number of sentences processed
for word in test_sentences :
    i+=1
    for w in word:
        count+=1
        # Random validation image of the glyph's class.
        ind = np.random.choice(sizes_val[w])
        predicted,anchor_imgs,targets=whichGlyph(model,Xval[ind],anchor_img,anchor_label)
        pred=targets[ np.argmax(predicted)]
        # NOTE(review): compares against the whole row yval[ind]; other cells use yval[ind][0].
        if pred == yval[ind]:
            correct+=1
    # Report the running accuracy every 25 sentences.
    if i%25 == 0:
        print(correct/count)
        print("{0} sentences took {1} mins".format(i,(time.time()-t_start)/60.0))
accuracy=correct/count

In [None]:
accuracy

In [None]:
count=0
for sent in test_sentences:
    count+=len(sent)
count

## Multi anchor array

In [34]:
def create_multi_anchor(N,X,sizes):
    """Draw ``N`` independent anchor sets with ``create_glyphlist``.

    Returns ``(images, labels)`` where index ``k`` along the first axis holds
    the anchor images / labels of the k-th set.
    """
    anchor_sets = [create_glyphlist(X, sizes) for _ in range(N)]
    images = [anchor_set[0] for anchor_set in anchor_sets]
    labels = [anchor_set[1] for anchor_set in anchor_sets]
    return np.asarray(images), np.asarray(labels)

In [35]:
multi_anchor_img,multi_anchor_label=create_multi_anchor(2,X,sizes)

In [36]:
def test_multi_anchor_random(N,Xval,yval,multi_anchor_img,multi_anchor_label,model,sizes):
  count_first=0
  multi_N=multi_anchor_img.shape[0]
  final_scores=np.zeros((multi_anchor_label[0].shape[0],1))
  for i in range(N):
    if i%100 == 0:
      print("test ",i)
#     random_key = random.choice(list(sizes))
#     ind = np.random.choice(sizes[random_key])
    ind=random.choice(range(yval.shape[0]))

    # print(yval[ind][0])
    for j in range(multi_N):
      predicted,anchor_imgs,targets=whichGlyph(model,Xval[ind],multi_anchor_img[j],multi_anchor_label[j])
      zipped_lists = zip(targets,predicted)
      sorted_pairs = sorted(zipped_lists)
      tuples = zip(*sorted_pairs)
      targets,predicted = [ list(tuple) for tuple in  tuples]
      # print(targets[:3]," - ",predicted[:3])
      final_scores = np.asarray(final_scores) + np.asarray(predicted)

    final_pred = targets[np.argmax(final_scores)]
    # print(final_pred ,'  - ', yval[ind][0],' - ',count_first)
    if final_pred == yval[ind][0]:
        count_first+=1
    final_scores=np.zeros((multi_anchor_label[0].shape[0],1))
    
  percent_correct = (100.0 * count_first / N)
  return percent_correct



In [37]:
t_start = time.time()
acc=test_multi_anchor_random(1000,Xval,yval,multi_anchor_img,multi_anchor_label,model,sizes_val)
print(f'testing:\nfound first accuracy with multi anchors = {acc}')
print("accuracy fn took {0} mins".format((time.time()-t_start)/60.0))

In [None]:

    # print(m,":",len(sizes[m]))

# Some Testing

In [41]:
t_start = time.time()
acc1,acc3=calc_accuracy(1000,Xval,yval,anchor_img,anchor_label,model,sizes_val)
print(f'testing:\nfound first accuracy = {acc1} , first 3 accuracy = {acc3}')
print("accuracy fn took {0} mins".format((time.time()-t_start)/60.0))

In [39]:
t_start = time.time()
acc1,acc3=calc_accuracy(250,X,y,anchor_img,anchor_label,model,sizes)
print(f'training:\nfound first accuracy = {acc1} , first 3 accuracy = {acc3} ')
print("accuracy fn took {0} mins".format((time.time()-t_start)/60.0))

In [42]:
t_start = time.time()
y_pred,y_true= predict_random(1000,Xval,yval,anchor_img,anchor_label,model,sizes_val)
print("accuracy fn took {0} mins".format((time.time()-t_start)/60.0))

In [None]:
sizes['M40']

In [None]:
sizes_val['M40']
plt.imshow(X[1114],cmap='gray')

In [99]:
# Table of misclassified samples with the number of validation images of the true class.
comp=pd.DataFrame({'actual':y_true,"pred":y_pred})
# .copy() makes `diff` an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning on a slice of `comp`.
diff = comp.query("actual != pred").copy()
# Column-wise map instead of a row-wise apply; also works when `diff` is empty.
diff['count'] = diff['actual'].map(lambda label: len(sizes_val[label]))
diff.head()

In [125]:
diff['count'].value_counts()
diff[diff['count']==1]['actual'].value_counts()
diff[diff['count']==1].head()

In [126]:
# Show one misclassification: a training image of the true class, a validation
# image of the true class, and a validation image of the predicted class.
g='M12'     # true class
pr='D156'   # predicted class
print(f"train: {len(sizes[g])} , test : {len(sizes_val[g])}")
tr=sizes[g][0]         # position in X (training set)
te=sizes_val[g][0]     # position in Xval (validation set)
pre=sizes_val[pr][0]   # position in Xval (validation set)
f, ax = plt.subplots(1,3)
ax[0].imshow(X[tr],cmap='gray')
ax[0].set_title('anchor')
ax[1].imshow(Xval[te],cmap='gray')
ax[1].set_title('test');
# `pre` comes from sizes_val, so it must index Xval; indexing X with it
# displayed an unrelated training image.
ax[2].imshow(Xval[pre],cmap='gray')
ax[2].set_title('predicted');

In [43]:
# confusion_matrix = pd.crosstab(index=y_true, columns=y_pred, rownames=['True'], colnames=['predictions']).astype(int)
# sns.heatmap(confusion_matrix, annot=True, fmt='.2f', cmap="YlGnBu").set_title('Confusion Matrix')
print(classification_report(y_true,y_pred))

In [None]:
y[y=='UNKNOWN'].shape

In [None]:
# t_start = time.time()
# print(calc_accuracy(1,Xval,yval,anchor_img,anchor_label,model))
# print("accuracy fn took {0} sec".format((time.time()-t_start)))

### Testing new images

In [None]:
def preprocess(img, size=(50, 75)):
    """Convert an image to grayscale and resize it to the network input size.

    Parameters
    ----------
    img : ndarray
        Grayscale ``(h, w)``, RGB ``(h, w, 3)`` or RGBA ``(h, w, 4)`` image in
        the channel order returned by ``skimage.io.imread``, which is how this
        notebook loads its images.
    size : tuple of int, default (50, 75)
        Target ``(width, height)`` as expected by ``cv2.resize``; the default
        yields arrays of shape ``(75, 50)``.

    Returns
    -------
    ndarray
        The resized single-channel image.
    """
    if len(img.shape) == 3:
        channels = img.shape[2]
        if channels == 3:
            # The input is RGB, so use the RGB conversion; COLOR_BGR2GRAY would
            # apply the red and blue luminance weights to the wrong channels.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        elif channels == 4:
            # PNGs with an alpha channel: drop alpha while converting.
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)

    img = cv2.resize(img, size)
    return img

In [None]:
test_img1=io.imread('3.jpg')
test_img1=preprocess(test_img1)
print(test_img1.shape)
io.imshow(test_img1)


In [None]:
print(targets[64])
targets

In [None]:
predicted[100]

In [None]:
predicted,anchor_imgs,targets=whichGlyph(model,test_img1.reshape(75,50,1),anchor_img,anchor_label)

In [None]:
sort_index = np.argsort(np.asarray(predicted).reshape(len(predicted),))
sort_index 

In [None]:
io.imshow(anchor_imgs[100].reshape(75,50))