In [None]:
import cv2
from glob import glob
import numpy as np
from PIL import Image, ImageOps
import pandas as pd

import tensorflow as tf
import PIL.ImageOps

from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model
from tensorflow.keras import Sequential, Input, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Add
from tensorflow.keras.layers import Activation, Dense, Flatten, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

## Connect Line With Session

In [None]:
import os

# Folder with the line images and the table holding one 'diag' label per row.
directory = "Lines"
csv = pd.read_csv("updatedDataset.csv")

# Collect into plain lists and convert once at the end: np.append copies the
# whole array on every call, and the np.str alias no longer exists in NumPy.
line_paths = []
line_targets = []

# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and therefore the batch taken later) reproducible between runs and machines.
for filename in sorted(os.listdir(directory)):
    line_paths.append(f"{directory}/{filename}")

    # The row index into the CSV starts at character 7 of the file name; its
    # width (1, 2 or 3 digits) is inferred from the total length of the name.
    digits = {17: 1, 18: 2}.get(len(filename), 3)
    i = int(filename[7:7 + digits])
    line_targets.append(csv['diag'][i])

# String arrays, as before: paths of the images and the matching 'diag' values.
img_files = np.array(line_paths, dtype=str)
img_targets = np.array(line_targets, dtype=str)

In [None]:
# Quick look at the first 20 image paths and the labels attached to them.
print(img_files[:20], img_targets[:20])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string targets into integer class ids; the fitted encoder is kept so
# the mapping can be inspected or inverted later.
encoder = LabelEncoder()
encoded_img_targets = encoder.fit_transform(img_targets)

# Show the first two raw targets next to their encoded values.
print("Writer ID        : ", img_targets[0:2])
print("Encoded writer ID: ", encoded_img_targets[0:2])

In [None]:
# Side length (pixels) of the square crops; line images are resized to this
# height before cropping, and it is also the input size of the CNN.
CROP_SIZE = 113
# Number of classes used when one-hot encoding the labels in generate_data.
NUM_LABELS = 2
# Number of line images that generate_data takes per batch.
BATCH_SIZE = 16

## Augmentation

In [None]:
from sklearn.utils import shuffle
from PIL import Image
import random

def get_augmented_sample(sample, label, sample_ratio):
    """Cut randomly positioned square crops out of one line image.

    The image is loaded as 8-bit grayscale, resized so that its height equals
    ``CROP_SIZE`` (aspect ratio preserved) and then cropped at randomly chosen
    horizontal offsets.

    Args:
        sample: Path (or file object) of the image, handed to ``Image.open``.
        label: Label attached to every crop taken from this image.
        sample_ratio: Fraction of the available horizontal offsets to sample.

    Returns:
        A tuple ``(images, labels)``: a list of ``CROP_SIZE x CROP_SIZE`` arrays
        and a list containing ``label`` once per crop. Both lists are empty when
        no offset is selected (e.g. the resized image is not wider than
        ``CROP_SIZE``).

    Raises:
        ValueError: From ``random.sample`` when ``sample_ratio`` asks for more
            crops than there are offsets, or for a negative number of crops.
    """
    # Image.open keeps the file open lazily; convert() reads the pixel data, so
    # the file can be closed as soon as the grayscale copy exists.
    with Image.open(sample) as source:
        img = source.convert('L')
    img_width, img_height = img.size

    # Scale so the height becomes CROP_SIZE while keeping the aspect ratio.
    # The width is clamped to 1 because resize() rejects a zero-sized image.
    height_fac = CROP_SIZE / img_height
    size = (max(1, int(img_width * height_fac)), CROP_SIZE)

    # LANCZOS is the filter formerly exposed as Image.ANTIALIAS (removed in Pillow 10).
    new_img = img.resize(size, Image.LANCZOS)
    new_img_width = new_img.size[0]

    # Candidate left edges of the crops; an empty range when the image is too narrow.
    x_coord = range(0, new_img_width - CROP_SIZE)
    num_crops = int(len(x_coord) * sample_ratio)
    random_x_coord = random.sample(x_coord, num_crops)

    # One pixel array per crop, each paired with the label of the source image.
    images = [
        np.asarray(new_img.crop((x, 0, x + CROP_SIZE, CROP_SIZE)))
        for x in random_x_coord
    ]
    labels = [label] * len(images)

    return (images, labels)

In [None]:
import operator
from functools import reduce
from keras.utils import to_categorical

def generate_data(samples, labels, batch_size, sample_ratio):
    """Build one augmented, normalized batch from the first ``batch_size`` samples.

    Despite its name this is not a generator: it returns a single ``(X, y)``
    pair built from ``samples[:batch_size]`` only; later samples are never read.

    Args:
        samples: Sequence of image paths.
        labels: Sequence of integer class ids, aligned with ``samples``.
        batch_size: Number of leading samples to augment (must be >= 1).
        sample_ratio: Forwarded to ``get_augmented_sample``.

    Returns:
        ``(X_train, y_train)`` where ``X_train`` is a float32 array of shape
        ``(n_crops, CROP_SIZE, CROP_SIZE, 1)`` scaled by 1/255 and ``y_train``
        is the one-hot encoding of the crop labels with ``NUM_LABELS`` columns.
        With no samples (or no crops) both arrays have zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    batch_samples = samples[:batch_size]
    batch_labels = labels[:batch_size]

    # Augment every sample of the batch and gather all crops in flat lists.
    crops = []
    crop_labels = []
    for sample, label in zip(batch_samples, batch_labels):
        augmented_samples, augmented_labels = get_augmented_sample(sample, label, sample_ratio)
        crops.extend(augmented_samples)
        crop_labels.extend(augmented_labels)

    # Add the trailing channel axis and scale the pixel values by 1/255.
    X_train = np.array(crops, dtype='float32')
    X_train = X_train.reshape(len(crops), CROP_SIZE, CROP_SIZE, 1)
    X_train /= 255

    # One-hot encode the per-crop labels.
    y_train = to_categorical(np.array(crop_labels, dtype=int), NUM_LABELS)

    return X_train, y_train

In [None]:
# Build the (X, y) pair, sampling 30% of the possible crop offsets of each line image.
train_generator = generate_data(
    samples=img_files,
    labels=encoded_img_targets,
    batch_size=BATCH_SIZE,
    sample_ratio=0.3,
)

In [None]:
def resize_image(img):
    """Resize ``img`` to a square whose side is ``round(CROP_SIZE / 2)`` pixels."""
    target_side = round(CROP_SIZE / 2)
    target_shape = [target_side, target_side]
    return tf.image.resize(img, target_shape)

## CNN

In [None]:
# Use the Keras bundled with TensorFlow everywhere: the first cell already imports
# from tensorflow.keras, resize_image relies on tf.image, and mixing standalone
# `keras` objects with `tensorflow.keras` ones is a common source of errors.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Lambda, Activation
from tensorflow.keras.layers import Convolution2D, ZeroPadding2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import metrics

model = Sequential()

# Define network input shape
model.add(ZeroPadding2D((1, 1), input_shape=(CROP_SIZE, CROP_SIZE, 1)))
# Resize images to allow for easy computation
model.add(Lambda(resize_image))

# CNN model - Building the model suggested in paper
model.add(Convolution2D(filters=32, kernel_size=(5, 5), strides=(2, 2), padding='same', name='conv1'))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name='pool1'))

model.add(Convolution2D(filters=64, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv2'))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name='pool2'))

model.add(Convolution2D(filters=128, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv3'))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name='pool3'))

model.add(Flatten())
model.add(Dropout(0.5))

model.add(Dense(512, name='dense1'))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(256, name='dense2'))
model.add(Activation('relu'))
model.add(Dropout(0.5))

# The output has 50 units rather than NUM_LABELS: pretrained checkpoint weights
# are loaded into this architecture later, and only an intermediate layer is
# used as a feature extractor.
model.add(Dense(50, name='output'))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['acc'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

## Loading the weights of the CNN trained on the IAM dataset and extracting features (for each checkpoint we generate two CSV files, one from layer -6 and one from layer -5, by changing the layer index (-6) in the cell below)

In [None]:

# Checkpoint to load (files check_20 / check_100 / check_200 are referenced in
# this notebook) and index, counted from the end of the CNN, of the layer whose
# output is exported as features (-6 or -5).
CHECKPOINT_EPOCH = 100
FEATURE_LAYER_INDEX = -6

model.load_weights(f"model_checkpoints/check_{CHECKPOINT_EPOCH}.hdf5")

# Keep the full CNN bound to `model` and give the truncated network its own
# name: rebinding `model` here made the cell fail (or pick the wrong layer)
# when it was executed a second time, e.g. to export the other layer.
feature_extractor = Model(inputs=model.inputs, outputs=model.layers[FEATURE_LAYER_INDEX].output)

# train_generator is the (X, y) pair returned by generate_data.
trainX, trainY = train_generator

feature_frames = []
diag = []
for crop, one_hot in zip(trainX, trainY):
    # NOTE(review): `crop` is a single (CROP_SIZE, CROP_SIZE, 1) array, so
    # predict() is called without a batch axis and returns several feature rows
    # per crop. Confirm that this is intended rather than
    # feature_extractor.predict(trainX).
    features = feature_extractor.predict(crop)
    feature_frames.append(pd.DataFrame(features))

    # NOTE(review): the stored value is column 0 of the one-hot label, i.e. 1
    # for encoded class 0 and 0 otherwise - confirm this is the wanted 'diag'.
    # It is repeated once per feature row (the count was hard-coded to 113).
    diag.extend([int(one_hot[0])] * len(features))

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(feature_frames, ignore_index=True) if feature_frames else pd.DataFrame()
df["diag"] = diag
df.to_csv(f"CNNFeatureExtractionEpoch{CHECKPOINT_EPOCH}.{FEATURE_LAYER_INDEX}layer.csv")

## Classification using the selected extracted features

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Choose the feature file to classify; keep exactly one read_csv line active.
#features_df = pd.read_csv("CNNFeatureExtractionEpoch200.-6layer.csv")
#features_df = pd.read_csv("CNNFeatureExtractionEpoch200.-5layer.csv")
#features_df = pd.read_csv("CNNFeatureExtractionEpoch20.-6layer.csv")
#features_df = pd.read_csv("CNNFeatureExtractionEpoch20.-5layer.csv")
#features_df = pd.read_csv("CNNFeatureExtractionEpoch100.-6layer.csv")
features_df = pd.read_csv("CNNFeatureExtractionEpoch100.-5layer.csv")

# Drop the unnamed first column (presumably the index written by DataFrame.to_csv).
features_df = features_df.drop(columns=["Unnamed: 0"])

# NOTE(review): an integer test_size is an absolute number of rows (20 rows),
# not a percentage - confirm that 0.2 was not intended.
train, test = train_test_split(features_df, test_size=20)

y = features_df['diag']

# Separate the 'diag' target from the feature columns for both partitions.
trainX, trainY = train.drop(columns=["diag"]), train["diag"]
testX, testY = test.drop(columns=["diag"]), test["diag"]

trainX

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Ridge classifier with strong regularization; fit() returns the estimator itself.
# Note that this rebinds the name `model`.
model = RidgeClassifier(alpha=100).fit(trainX, trainY)

# Mean accuracy on the held-out rows.
print(model.score(X=testX, y=testY))


In [None]:
# Logistic regression with default settings, scored on the held-out rows.
logModel = LogisticRegression().fit(trainX, trainY)

print(logModel.score(X=testX, y=testY))

In [None]:
# Single decision tree with default settings, scored on the held-out rows.
tree = DecisionTreeClassifier().fit(trainX, trainY)

print(tree.score(X=testX, y=testY))

In [None]:
# Random forest of 100 trees, scored on the held-out rows.
rndforest = RandomForestClassifier(n_estimators=100).fit(trainX, trainY)
print(rndforest.score(X=testX, y=testY))

### The accuracy varies considerably between random splits, so this block repeats the evaluation several times and reports the average results

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import statistics
# One accuracy list per classifier, filled with one score per random split.
accRidge, accLogReg, accDecTree, accRnFr = [], [], [], []

ridModel = RidgeClassifier(alpha=100)
logModel = LogisticRegression()
tree = DecisionTreeClassifier()
clf = RandomForestClassifier(n_estimators=100)

# NOTE(review): the extraction cell of this notebook writes files named like
# "CNNFeatureExtractionEpoch100.-6layer.csv"; confirm that this un-suffixed
# file exists. Other inputs listed by the author: ...Epoch20.csv, ...Epoch200.csv.
features_df = pd.read_csv("CNNFeatureExtractionEpoch100.csv")
features_df = features_df.drop(columns=["Unnamed: 0"])

# (display name, estimator, accuracy list), in the order used for fitting and reporting.
evaluations = [
    ("RidgeClassifier", ridModel, accRidge),
    ("LogisticRegression", logModel, accLogReg),
    ("DecisionTreeClassifier", tree, accDecTree),
    ("RandomForestClassifier", clf, accRnFr),
]

# Ten independent random splits; every estimator is refitted on each split.
for i in range(10):
    train, test = train_test_split(features_df, test_size=20)
    trainX, trainY = train.drop(columns=["diag"]), train["diag"]
    testX, testY = test.drop(columns=["diag"]), test["diag"]

    for _, estimator, scores in evaluations:
        estimator.fit(trainX, trainY)
        scores.append(estimator.score(testX, testY))

# Mean, best and worst accuracy per classifier, with a rule between the sections.
for position, (name, _, scores) in enumerate(evaluations):
    if position > 0:
        print("---------------------------------------------------")
    print("Media Acc " + name + ": " + str(statistics.mean(scores)))
    print("Accuracy piu elevata " + str(max(scores)))
    print("Accuracy piu bassa " + str(min(scores)))