In [2]:
import os
import shutil
from ast import literal_eval
from glob import glob

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

2022-05-28 10:56:17.173046: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# Data Preparation
In this notebook, the data will be loaded, a train-test split will be performed, and the data will be saved to tfrecord files.

# Load Data

## Part 1: Images

Create a dataframe containing the image names. We need this so we can perform a train-test-split on the images as well.

In [3]:
# Location of dataset
DATASET_PATH = "../data/01_raw"

# List all images in the folder
image_list = [
    filename.split("/")[-1].split(".")[0]
    for filename in glob(DATASET_PATH + "/images/*.jpg")
]
image_ids = pd.DataFrame(image_list).rename(columns={0: "image_id"})
print("Number of images in folder: {}".format(len(image_ids)))

image_ids.head()

Number of images in folder: 98


Unnamed: 0,image_id
0,28483ab7-42f0-4343-af24-f8076dbdf2f9
1,ea75050f-d6a4-4356-9314-0f17b05e972c
2,3542adbf-145b-4986-8f01-76cfffa9dcca
3,b33b9176-6d05-4603-9c2a-782b5d0ea140
4,5529501e-5b67-4a9f-bedd-efb785b78ce2


## Part 2: Annotations

Import the annotations. We perform transformations on import:
1. convert the string representation of the bounding boxes to python objects

In [None]:
# convert a string record into a valid python object
def convert_string_to_python_object(x):
    return literal_eval(x.rstrip("\r\n"))

# read the CSV with annotations
labels = pd.read_csv(
    DATASET_PATH + "/annotations.csv",
    converters={"bounds": convert_string_to_python_object},
)


labels.head()

# Train Test Split

The train test split is performed on the image names. We split the dataset into three parts:
- **train**: the split used for training
- **val**: the split used to evaluate the generalization performance of the model during training and to tune the hyperparameters for generalization
- **test**: the split to finally evaluate the model after the training has finished

In [None]:
train_image_names, test_image_names = train_test_split(image_list, test_size=0.2)
val_image_names, test_image_names = train_test_split(test_image_names, test_size=0.5)

print(
    f"The trainings dataset contains {len(train_image_names)} images. Thats {round(len(train_image_names)/len(image_list) *100)}%."
)
print(
    f"The test dataset contains {len(test_image_names)} images. Thats {round(len(test_image_names)/len(image_list) *100)}%."
)
print(
    f"The validation dataset contains {len(val_image_names)} images. Thats {round(len(val_image_names)/len(image_list) *100)}%."
)

## Export The Data
The images are saved into new, seperated folders according to the splits.

The annotations are saved into seperate .csv files according to the splits.

In [None]:
# add bounding box information
train_df = labels[labels["image_id"].isin(train_image_names)]
valid_df = labels[labels["image_id"].isin(val_image_names)]
test_df = labels[labels["image_id"].isin(test_image_names)]

# write datasets to disk
os.makedirs("../data/04_feature/annotations", exist_ok=True)
train_df.to_csv("../data/04_feature/annotations/train.csv", index=False)
valid_df.to_csv("../data/04_feature/annotations/valid.csv", index=False)
test_df.to_csv("../data/04_feature/annotations/test.csv", index=False)

# copy images to model_input folder
os.makedirs("../data/04_feature/images/train", exist_ok=True)
for image_name in train_image_names:
    shutil.copy(
        f"../data/01_raw/images/{image_name}.jpg",
        f"../data/04_feature/images/train/{image_name}.jpg",
    )
os.makedirs("../data/04_feature/images/validation", exist_ok=True)
for image_name in val_image_names:
    shutil.copy(
        f"../data/01_raw/images/{image_name}.jpg",
        f"../data/04_feature/images/validation/{image_name}.jpg",
    )
os.makedirs("../data/04_feature/images/test", exist_ok=True)
for image_name in test_image_names:
    shutil.copy(
        f"../data/01_raw/images/{image_name}.jpg",
        f"../data/04_feature/images/test/{image_name}.jpg",
    )

# Convert data to tensorflow records dataset

The Tensorflow object detection API requires the data to be in a specific tensorflow-records format. The tf-records format is a binary, dict-like structure that contains both the images and their annotations.

In [2]:
from __future__ import annotations

from ast import literal_eval
from pathlib import Path

import tensorflow.compat.v1 as tf
from object_detection.utils import dataset_util, label_map_util

## Define util functions
`create_tf_records_from_files()`

This method takes an image path, an annotations path and the image_size and creates a list of tensorflow records, each record stands for one image and its annotations.

The methods requires the images to be square, and the image_path to only contain the images that have annotations in the annotations path.

The method performs the following transformations:
- image-data is read from file and encoded to binary
- bounding box coordinates are extracted from the input strings
- all string (image file name, class name, ..) are encoded with utf8
- bounding box coordinates are put into relative form (they are devided by their respecive image dimension)

In [3]:
def create_tf_records_from_files(
    image_path: str | Path, annotations_file_path: str | Path, image_size: int
) -> list[tf.train.Example]:

    image_path = Path(image_path)
    annotations_file_path = Path(annotations_file_path)

    annotations = pd.read_csv(annotations_file_path)

    def expand_bbox_coordinates(row: pd.Series):
        bbox_as_tuple = literal_eval(row["bounds"])
        row["bbox_x1"] = bbox_as_tuple[0]
        row["bbox_y1"] = bbox_as_tuple[1]
        row["bbox_x2"] = bbox_as_tuple[2]
        row["bbox_y2"] = bbox_as_tuple[3]

        return row

    annotations = annotations.apply(expand_bbox_coordinates, 1)

    annotations.drop(["class", "bounds"], axis=1, inplace=True)

    image_filenames = os.listdir(image_path)

    tf_records = []

    for image_filename in image_filenames:
        with tf.gfile.GFile(image_path / image_filename, "rb") as fid:
            encoded_jpg = fid.read()

        image_width = image_size
        image_height = image_size

        filename = image_filename.encode("utf8")
        image_format = b"jpg"

        annotations_filtered = annotations[
            annotations["image_id"] == image_filename[:-4]
        ]

        xmins = annotations_filtered["bbox_x1"].values / image_width
        xmaxs = annotations_filtered["bbox_x2"].values / image_width
        ymins = annotations_filtered["bbox_y1"].values / image_height
        ymaxs = annotations_filtered["bbox_y2"].values / image_height

        class_name = "OST".encode("utf8")

        classes_text = [class_name for i in range(len(xmins))]
        classes = [1 for i in range(len(xmins))]

        tf_records.append(
            tf.train.Example(
                features=tf.train.Features(
                    feature={
                        "image/height": dataset_util.int64_feature(image_height),
                        "image/width": dataset_util.int64_feature(image_width),
                        "image/filename": dataset_util.bytes_feature(filename),
                        "image/source_id": dataset_util.bytes_feature(filename),
                        "image/encoded": dataset_util.bytes_feature(encoded_jpg),
                        "image/format": dataset_util.bytes_feature(image_format),
                        "image/object/bbox/xmin": dataset_util.float_list_feature(
                            xmins
                        ),
                        "image/object/bbox/xmax": dataset_util.float_list_feature(
                            xmaxs
                        ),
                        "image/object/bbox/ymin": dataset_util.float_list_feature(
                            ymins
                        ),
                        "image/object/bbox/ymax": dataset_util.float_list_feature(
                            ymaxs
                        ),
                        "image/object/class/text": dataset_util.bytes_list_feature(
                            classes_text
                        ),
                        "image/object/class/label": dataset_util.int64_list_feature(
                            classes
                        ),
                    }
                )
            )
        )

    return tf_records

`save_tf_records_to_file()`

save a list of tf_records to a specified output_path

In [None]:
def save_tf_records_to_file(
    tf_records: list[tf.train.Example], output_path: str | Path
):
    writer = tf.python_io.TFRecordWriter(output_path)

    for record in tf_records:
        writer.write(record.SerializeToString())
    writer.close()

`label_map()`

create a single object label map and and save it to a file. 
The label map is required by the training script of the TensorFlow object detection API.

In [None]:
def label_map(file_path: str, objname: str):
    with open(Path(file_path), "a") as the_file:
        the_file.write("item\n")
        the_file.write("{\n")
        the_file.write("id :{}".format(int(1)))
        the_file.write("\n")
        the_file.write("name :'{0}'".format(str(objname)))
        the_file.write("\n")
        the_file.write("}\n")

## Write data
Apply the defined methods to all three splits.

In [4]:
# create label map with one object: oil-storage-tank (OSD)
label_map("../data/04_feature/annotations/label_map.pbtxt", "OSD")

# create tfrecords from images and annotations 
tf_train_records = create_tf_records_from_files(
    "../data/04_feature/images/train/",
    "../data/04_feature/annotations/train.csv",
    1280,
)

tf_test_records = create_tf_records_from_files(
    "../data/04_feature/images/test/",
    "../data/04_feature/annotations/test.csv",
    1280,
)

tf_valid_records = create_tf_records_from_files(
    "../data/04_feature/images/validation/",
    "../data/04_feature/annotations/valid.csv",
    1280,
)

# write tfrecord files to disk 
save_tf_records_to_file(tf_train_records, "../data/05_model_input/train.record")

save_tf_records_to_file(tf_test_records, "../data/05_model_input/test.record")

save_tf_records_to_file(tf_valid_records, "../data/05_model_input/valid.record")