In [1]:
# Import libraries:
import os
from keras.models import Model
from keras.optimizers import Adam
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Dropout, Flatten
from pathlib import Path
import numpy as np

In [2]:
BATCH_SIZE = 8

# Implement the necessary pre-processing for our image to work with
# the pre-trained VGG
train_generator = ImageDataGenerator(rotation_range=90, 
                                     brightness_range=[0.1, 0.7],
                                     width_shift_range=0.5, 
                                     height_shift_range=0.5,
                                     horizontal_flip=True, 
                                     vertical_flip=True,
                                     validation_split=0.15,
                                     preprocessing_function=preprocess_input) # VGG16 preprocessing

test_generator = ImageDataGenerator(preprocessing_function=preprocess_input) # VGG16 preprocessing

train_data_dir = 'documents-101_ward/training/'
test_data_dir = 'documents-101_ward/test/'

# I named the 3 classes identified by the clustering in this way.
# The class "others" is more of a miscellanea class (it could have been
# divided into further classes)
class_subset = ['others','cost_authorization','coupon_registration']

traingen = train_generator.flow_from_directory(train_data_dir,
                                               target_size=(224, 224),
                                               class_mode='categorical',
                                               classes=class_subset,
                                               subset='training',
                                               batch_size=BATCH_SIZE, 
                                               shuffle=False,
                                               seed=42)

validgen = train_generator.flow_from_directory(train_data_dir,
                                               target_size=(224, 224),
                                               class_mode='categorical',
                                               classes=class_subset,
                                               subset='validation',
                                               batch_size=BATCH_SIZE,
                                               shuffle=True,
                                               seed=42)

testgen = test_generator.flow_from_directory(test_data_dir,
                                             target_size=(224, 224),
                                             class_mode=None,
                                             classes=class_subset,
                                             batch_size=1,
                                             shuffle=False,
                                             seed=42)

Found 128 images belonging to 3 classes.
Found 21 images belonging to 3 classes.
Found 50 images belonging to 3 classes.


In [3]:
# Define the model using the pre-trained VGG16
def create_model(input_shape, n_classes, optimizer='rmsprop', fine_tune=0):

    # Load the convolutional layer pre-trained on the ImageNet data,
    # but not the fully-connected layers.
    conv_base = VGG16(include_top=False,
                     weights='imagenet', 
                     input_shape=input_shape)
    
    # Layers to freeze during training: 
    if fine_tune > 0:
        for layer in conv_base.layers[:-fine_tune]:
            layer.trainable = False
    else:
        for layer in conv_base.layers:
            layer.trainable = False

    # Create the new fully-connected layers
    top_model = conv_base.output
    top_model = Flatten(name="flatten")(top_model)
    top_model = Dense(4096, activation='relu')(top_model)
    top_model = Dense(1072, activation='relu')(top_model)
    top_model = Dropout(0.2)(top_model)
    output_layer = Dense(n_classes, activation='softmax')(top_model)
    
    # Group into a Model object
    model = Model(inputs=conv_base.input, outputs=output_layer)

    # Compiles the model
    model.compile(optimizer=optimizer, 
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model


In [4]:
# Define training parameters:
input_shape = (224, 224, 3)
optim_1 = Adam(learning_rate=0.001)
n_classes=3

n_steps = traingen.samples // BATCH_SIZE
n_val_steps = validgen.samples // BATCH_SIZE
n_epochs = 10

# Create a model with all the pre-trained layers frozen:
vgg_model = create_model(input_shape, n_classes, optim_1, fine_tune=0)

# Import to live plot the 
from livelossplot.inputs.keras import PlotLossesCallback

plot_loss_1 = PlotLossesCallback()

# Checkpoints to save best weights
tl_checkpoint_1 = ModelCheckpoint(filepath='tl_model_v1.weights.best.ward.hdf5',
                                  save_best_only=True,
                                  verbose=1)

# Early stopping criteria
early_stop = EarlyStopping(monitor='val_loss',
                           patience=10,
                           restore_best_weights=True,
                           mode='min')

# Model training: in this case we will train only the new fully-connected
# layer for doing the prediction, the pre-trained layer will
# perform the feature extraction:
vgg_history = vgg_model.fit(traingen,
                            batch_size=BATCH_SIZE,
                            epochs=n_epochs,
                            validation_data=validgen,
                            steps_per_epoch=n_steps,
                            validation_steps=n_val_steps,
                            callbacks=[tl_checkpoint_1, early_stop],
                            verbose=1)

2022-06-06 20:40:34.228615: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-06-06 20:40:34.228800: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-06 20:40:34.230880: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-06-06 20:40:35.083602: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-06-06 20:40:35.083989: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2592230000 Hz


Epoch 1/10

Epoch 00001: val_loss improved from inf to 6.33277, saving model to tl_model_v1.weights.best.ward.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 6.33277 to 0.00000, saving model to tl_model_v1.weights.best.ward.hdf5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.00000
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.00000
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.00000
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.00000
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.00000
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.00000
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.00000
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.00000


In [5]:
# Prediction:
# Load the best trained weights:
vgg_model.load_weights('tl_model_v1.weights.best.ward.hdf5') 

true_classes = testgen.classes
class_indices = traingen.class_indices
class_indices = dict((v,k) for k,v in class_indices.items())

vgg_preds = vgg_model.predict(testgen)
vgg_pred_classes = np.argmax(vgg_preds, axis=1)

from sklearn.metrics import accuracy_score

vgg_acc = accuracy_score(true_classes, vgg_pred_classes)
print("Classification accuracy: {:.2f}%".format(vgg_acc * 100))

Classification accuracy: 70.00%


In [None]:
# Try again with the same model, but letting backrpopagation update
# the weights of the last two layers:
traingen.reset()
validgen.reset()
testgen.reset()

# Smaller learning rate
optim_2 = Adam(lr=0.0001)

# Re-create the model with the parameter fine_tuning=2:
vgg_model_ft = create_model(input_shape, n_classes, optim_2, fine_tune=2)

plot_loss_2 = PlotLossesCallback()

# Retrain the model:
vgg_ft_history = vgg_model_ft.fit(traingen,
                                  batch_size=BATCH_SIZE,
                                  epochs=n_epochs,
                                  validation_data=validgen,
                                  steps_per_epoch=n_steps, 
                                  validation_steps=n_val_steps,
                                  callbacks=[tl_checkpoint_1, early_stop],
                                  verbose=1)

Epoch 1/10

Epoch 00001: val_loss did not improve from 0.00000
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.00000
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.00000
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.00000
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.00000
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.00000
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.00000
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.00000
Epoch 9/10
 2/16 [==>...........................] - ETA: 8s - loss: 0.3092 - accuracy: 0.8125 

In [None]:
# Prediction (once again):
# Load the best trained weights:
vgg_model_ft.load_weights('tl_model_v1.weights.best.ward.hdf5')

vgg_preds_ft = vgg_model_ft.predict(testgen)
vgg_pred_classes_ft = np.argmax(vgg_preds_ft, axis=1)
vgg_acc_ft = accuracy_score(true_classes, vgg_pred_classes_ft)
print("Classification accuracy with last 2 layers re-trained: {:.2f}%".format(vgg_acc_ft * 100))

**FINAL COMMENTS**

The accuracy is not very high, and I think the main cause is the low number of data. The training set is 149 images and it seems that some classes only have around 10 images. Also, the network has been trained on recognising images that can be very different from each other (e.g. a car vs a dog). This task instead requires to distinguish between images that are all documents, so the difference between them is more subtle (I have tried just to feed them to the pre-trained VGG16 with the default 1000 classes and they were all classified as "menus").

Also I could have probably defined at least a couple more classes in the clustering (4 or 5 classes in total). After doing the classification with the VGG net, I played around a bit more with the clustering, and I noticed that the classes defined by the agglomerative clustering when setting k=4 or 5 where actually breaking down the "big" class into smaller classes, which made sense, by checking the pictures manually.   