## Notboook to fintune pretrained SSD model to detect webpage elements

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import pickle
from six import BytesIO
from PIL import Image, ImageDraw, ImageFont
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

In [2]:
from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/Users/amishra162/Documents/Coursera/work/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ["dlopen(/Users/amishra162/Documents/Coursera/work/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 0x0006): tried: '/Users/amishra162/Documents/Coursera/work/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file)"]


In [3]:
# function to load image and convert into numpy array

def load_image_into_numpy_array(path):
    
    img_data = tf.io.gfile.GFile(path, 'rb').read()
    image = (Image.open(BytesIO(img_data)))
    (im_width, im_height) = image.size
    
    return np.array(image.getdata()).reshape(
        (im_height, im_width, 3)).astype(np.float32)

In [5]:
og_train_df = pd.read_csv('./data/train/_annotations.csv')

# Arranging the bounding box coordinates as per SSD model 
# ymin xmin ymax xmax
train_df = og_train_df.iloc[:,[0,1,2,3,5,4,7,6]]
val_df = pd.read_csv('./data/valid/_annotations.csv')
test_df = pd.read_csv('./data/test/_annotations.csv')

print("Train data shape: ",train_df.shape)
print("Val data shape: ",val_df.shape)
print("Test data shape: ",test_df.shape)

Train data shape:  (76820, 8)
Val data shape:  (20954, 8)
Test data shape:  (10656, 8)


In [6]:
# Normalizing the bounding box cordinates

train_df['ymin'] = train_df['ymin']/train_df.ymax.max()

train_df['xmin'] = train_df['xmin']/train_df.xmax.max()

train_df['ymax'] = train_df['ymax']/train_df.ymax.max()

train_df['xmax'] = train_df['xmax']/train_df.xmax.max()

In [7]:
train_df.head()

Unnamed: 0,filename,width,height,class,ymin,xmin,ymax,xmax
0,lavanguardia_com_png.rf.1070b85a3d6b62256b2ff8...,1024,768,text,0.763021,0.022461,0.955729,0.665039
1,lavanguardia_com_png.rf.1070b85a3d6b62256b2ff8...,1024,768,link,0.936198,0.443359,0.955729,0.567383
2,lavanguardia_com_png.rf.1070b85a3d6b62256b2ff8...,1024,768,text,0.936198,0.022461,0.980469,0.642578
3,lavanguardia_com_png.rf.1070b85a3d6b62256b2ff8...,1024,768,text,0.861979,0.71582,0.880208,0.789062
4,lavanguardia_com_png.rf.1070b85a3d6b62256b2ff8...,1024,768,button,0.846354,0.683594,0.897135,0.821289


In [8]:
# Class distribution
classes = train_df['class'].unique()
num_classes = len(classes)
train_df.groupby(by='class').count()['filename']

class
button     23872
field       1558
heading     6470
iframe       608
image      12486
label        180
link        9246
text       22400
Name: filename, dtype: int64

In [9]:
print("Number of train images: ",train_df['filename'].unique().shape)

Number of train images:  (1688,)


In [10]:
#loading all the images as numpy array
img_path = train_df.filename.unique()
train_images_np = [load_image_into_numpy_array('./data/train/'+path) for path in img_path]

In [11]:
# Getting bounding box coordinates for each images and then converting them to tensors
# Label Encoding the classes for each bounding box and then creating a category index dictionary for each class
# classes are later one hot encoded to tensors

gt_boxes = np.array([train_df[train_df['filename']==i].drop(columns=['filename','width','height','class']).to_numpy() for i in img_path])
gt_classes = np.array([train_df[train_df['filename']==i]['class'] for i in img_path])

class_label_list = []
for i in range(gt_classes.shape[0]):
    class_label_list.append(gt_classes[i].to_list())

LE = LabelEncoder()
label_encoder = LE.fit(classes)
encoded_labels= [label_encoder.transform(gt_classes[i]) for i in range(gt_classes.shape[0])]

category_index = {}
for x in range(len(label_encoder.classes_)):
    category_index[x+1] = {'id':x+1,'name':label_encoder.classes_[x]}

gt_classes_one_hot_tensors = [tf.one_hot(encoded_labels[i], num_classes) for i in range(len(encoded_labels))]


  gt_boxes = np.array([train_df[train_df['filename']==i].drop(columns=['filename','width','height','class']).to_numpy() for i in img_path])
  gt_classes = np.array([train_df[train_df['filename']==i]['class'] for i in img_path])


In [12]:
# Converting the image numpy array into tensors
# converting ground truth boxes to tensors
train_image_tensors = []

gt_box_tensors = []

for (train_image_np, gt_box_np) in zip(train_images_np, gt_boxes):
    
    train_image_tensors.append(tf.expand_dims(tf.convert_to_tensor(
        train_image_np, dtype=tf.float32), axis=0))
    
    gt_box_tensors.append(tf.convert_to_tensor(gt_box_np, dtype=tf.float32))

### Model definition and training

In [13]:
tf.keras.backend.clear_session()

pipeline_config = './ssd_resnet50_v1_fpn_640x640_coco17_tpu-8/pipeline.config'

# loading piepline config
configs = config_util.get_configs_from_pipeline_file(pipeline_config)

In [14]:
# modifying the last layer to number of classes in our problem i.e 10
model_config = configs.get('model')
model_config.ssd.num_classes = num_classes
model_config.ssd.freeze_batchnorm = True

In [15]:
# Building model with our modified configuration
detection_model = model_builder.build(model_config=model_config, is_training=True)

In [16]:
# we create a temprory checkpoint to load only the required layer checkpoint

tmp_box_predictor_checkpoint = tf.train.Checkpoint(
    _base_tower_layers_for_heads = detection_model._box_predictor._base_tower_layers_for_heads,
    _box_prediction_head = detection_model._box_predictor._box_prediction_head
)  
    

In [17]:
tmp_model_checkpoint = tf.train.Checkpoint(
    _box_predictor = tmp_box_predictor_checkpoint,
    _feature_extractor = detection_model._feature_extractor
)          
          

In [18]:
checkpoint_path = './ssd_resnet50_v1_fpn_640x640_coco17_tpu-8/checkpoint/ckpt-0'

checkpoint = tf.train.Checkpoint(
    model=tmp_model_checkpoint
)
checkpoint.restore(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x34d316d30>

In [19]:
# use the detection model's `preprocess()` method and pass a dummy image
tmp_image, tmp_shapes = detection_model.preprocess(tf.zeros([1, 640, 640, 3]))

# run a prediction with the preprocessed image and shapes
tmp_prediction_dict = detection_model.predict(tmp_image, tmp_shapes)

# postprocess the predictions into final detections
tmp_detections = detection_model.postprocess(tmp_prediction_dict, tmp_shapes)


In [20]:
len(detection_model.trainable_variables) 

269

In [21]:
tf.keras.backend.set_learning_phase(True)

batch_size = 5
num_batches = int(len(img_path)/batch_size)
learning_rate = 0.0001
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
epochs = 5



In [22]:
# We will be fintuning all the layers in our network since our data is different from the coco dataset
to_fine_tune = []
for v in detection_model.trainable_variables:
    to_fine_tune.append(v)

In [23]:
@tf.function
def train_step_fn(image_list,
                groundtruth_boxes_list,
                groundtruth_classes_list,
                model,
                optimizer,
                vars_to_fine_tune):

    model.provide_groundtruth(
        groundtruth_boxes_list=groundtruth_boxes_list,
        groundtruth_classes_list=groundtruth_classes_list
    )


    with tf.GradientTape() as tape:
        preprocessed_image_list = []
        true_shape_list = []

        for img in image_list:
            processed_img, true_shape = model.preprocess(img)
            preprocessed_image_list.append(processed_img)
            true_shape_list.append(true_shape)

        preprocessed_image_tensor = tf.concat(preprocessed_image_list, axis=0)
        true_shape_tensor = tf.concat(true_shape_list, axis=0)

        prediction_dict = model.predict(preprocessed_image_tensor, true_shape_tensor)

        losses_dict = model.loss(prediction_dict, true_shape_tensor)
        
        total_loss = losses_dict['Loss/localization_loss'] + losses_dict['Loss/classification_loss']

        gradients = tape.gradient([total_loss], vars_to_fine_tune)

        optimizer.apply_gradients(zip(gradients, vars_to_fine_tune))
                
    return total_loss

In [24]:
# Training model for 5 epochs

tf.config.run_functions_eagerly(True)

loss_monitor = {}
for i in range(epochs):
    for idx in range(num_batches):
        all_keys = list(range(len(train_images_np)))
        random.shuffle(all_keys)
        example_keys = all_keys[:batch_size]

        gt_boxes_list = [gt_box_tensors[key] for key in example_keys]
        gt_classes_list = [gt_classes_one_hot_tensors[key] for key in example_keys]
        
        image_tensors = [train_image_tensors[key] for key in example_keys]

        total_loss = train_step_fn(image_tensors, 
                                gt_boxes_list, 
                                gt_classes_list,
                                detection_model,
                                optimizer,
                                to_fine_tune
                                )

        if idx % 10 == 0:
            print('Epcoh ' + str(i) + 'batch ' + str(idx) + ' of ' + str(num_batches)
            + ', loss=' +  str(total_loss.numpy()), flush=True)
            loss_monitor[i] = {str(idx):str(total_loss.numpy())}



Epcoh 0batch 0 of 337, loss=1.7261318
Epcoh 0batch 10 of 337, loss=1.5729825
Epcoh 0batch 20 of 337, loss=1.4748225
Epcoh 0batch 30 of 337, loss=1.3079951
Epcoh 0batch 40 of 337, loss=1.2571833
Epcoh 0batch 50 of 337, loss=1.1960187
Epcoh 0batch 60 of 337, loss=1.345959
Epcoh 0batch 70 of 337, loss=1.1256416
Epcoh 0batch 80 of 337, loss=1.1253576
Epcoh 0batch 90 of 337, loss=0.94458616
Epcoh 0batch 100 of 337, loss=1.1281161
Epcoh 0batch 110 of 337, loss=0.91451395
Epcoh 0batch 120 of 337, loss=0.83039725
Epcoh 0batch 130 of 337, loss=1.1309582
Epcoh 0batch 140 of 337, loss=0.8774462
Epcoh 0batch 150 of 337, loss=0.97723883
Epcoh 0batch 160 of 337, loss=0.865576
Epcoh 0batch 170 of 337, loss=0.9814497
Epcoh 0batch 180 of 337, loss=0.9388566
Epcoh 0batch 190 of 337, loss=0.8057493
Epcoh 0batch 200 of 337, loss=1.1015687
Epcoh 0batch 210 of 337, loss=0.97636473
Epcoh 0batch 220 of 337, loss=0.76434267
Epcoh 0batch 230 of 337, loss=0.85308707
Epcoh 0batch 240 of 337, loss=0.9904704
Epcoh 

In [25]:
tf.saved_model.save(detection_model, 'web_element_detection', signatures=None, options=None)





INFO:tensorflow:Assets written to: web_element_detection/assets


INFO:tensorflow:Assets written to: web_element_detection/assets


In [122]:
# Save new pipeline config
new_pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
config_util.save_pipeline_config(new_pipeline_proto, './web_element_detection/new_config')

exported_ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt_manager = tf.train.CheckpointManager(exported_ckpt, directory="./web_element_detection/checkpoint/", max_to_keep=None)

ckpt_manager.save()


INFO:tensorflow:Writing pipeline config file to ./web_element_detection/new_config/pipeline.config


INFO:tensorflow:Writing pipeline config file to ./web_element_detection/new_config/pipeline.config


'./web_element_detection/checkpoint/ckpt-1'

In [None]:
with open('./web_element_detection/classes.pkl', 'wb') as f:
    pickle.dump(category_index, f)