In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import pickle
import matplotlib.pyplot as plt
from PIL import Image

from sklearn.model_selection import train_test_split

os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf

import keras
import keras.backend as K

from tqdm import tqdm

from concurrent.futures import ThreadPoolExecutor

%matplotlib inline

2025-07-01 16:38:50.447025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751387930.650283      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751387930.711256      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# 4. Regression metrics

In [2]:
@tf.function
def rmse(y_true, y_pred):
    """Root-mean-squared error over all values passed in one call.

    Both inputs are cast to float32 and reshaped to a single column, so a
    ``(batch,)`` tensor and a ``(batch, 1)`` tensor are compared element by
    element instead of being broadcast against each other.

    Args:
        y_true: ground-truth values (any shape with the same number of
            elements as ``y_pred``).
        y_pred: predicted values.

    Returns:
        Scalar float32 tensor ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    targets = tf.reshape(tf.cast(y_true, tf.float32), [-1, 1])
    predictions = tf.reshape(tf.cast(y_pred, tf.float32), [-1, 1])

    residuals = targets - predictions
    mean_squared_error = tf.reduce_mean(tf.square(residuals))
    return tf.sqrt(mean_squared_error)

@tf.function
def eva(y_true, y_pred):
    """Explained variance: ``1 - Var(y_true - y_pred) / Var(y_true)``.

    Inputs are cast to float32 and flattened to a single column first.
    The Keras epsilon is added to the denominator so constant targets
    (zero variance) do not cause a division by zero.

    Args:
        y_true: ground-truth values.
        y_pred: predicted values.

    Returns:
        Scalar float32 tensor with the explained-variance score of the
        values passed in this call.
    """
    targets = tf.reshape(tf.cast(y_true, tf.float32), [-1, 1])
    predictions = tf.reshape(tf.cast(y_pred, tf.float32), [-1, 1])

    residual_variance = tf.math.reduce_variance(targets - predictions)
    target_variance = tf.math.reduce_variance(targets)

    return 1.0 - residual_variance / (target_variance + tf.keras.backend.epsilon())

@tf.function
def r2_score(y_true, y_pred):
    """Coefficient of determination: ``1 - SS_res / SS_tot``.

    Inputs are cast to float32 and flattened to a single column first.
    ``SS_tot`` is computed around the mean of the targets passed in this
    call, and the Keras epsilon guards against a zero denominator.

    Args:
        y_true: ground-truth values.
        y_pred: predicted values.

    Returns:
        Scalar float32 tensor with the R² score of the values passed in
        this call.
    """
    targets = tf.reshape(tf.cast(y_true, tf.float32), [-1, 1])
    predictions = tf.reshape(tf.cast(y_pred, tf.float32), [-1, 1])

    residual_sum_of_squares = tf.reduce_sum(tf.square(targets - predictions))
    centered_targets = targets - tf.reduce_mean(targets)
    total_sum_of_squares = tf.reduce_sum(tf.square(centered_targets))

    return 1 - (residual_sum_of_squares / (total_sum_of_squares + K.epsilon()))

# 5. ResNet 8/18/34 construction

In [3]:
def resnet_block(x, filters, stride=1, use_projection=False):
    """Basic two-convolution residual block (conv-BN-ReLU, conv-BN, add, ReLU).

    The shortcut is the identity unless a projection (1x1 convolution with
    the same stride, followed by batch normalisation) is requested or
    required. A projection is required whenever the main path changes the
    tensor shape, i.e. when ``stride`` is not 1 or when the incoming channel
    count differs from ``filters``; in those cases it is added automatically
    because the element-wise ``Add`` would otherwise fail on mismatched shapes.

    Args:
        x: input feature map (a Keras tensor).
        filters: number of output channels of both 3x3 convolutions.
        stride: stride of the first 3x3 convolution (int, or tuple/list).
        use_projection: force a projection shortcut even if shapes match.

    Returns:
        The output Keras tensor of the block.
    """
    shortcut = x

    # Decide up front whether the identity shortcut would be shape-compatible.
    if isinstance(stride, (tuple, list)):
        changes_resolution = any(s != 1 for s in stride)
    else:
        changes_resolution = stride != 1
    # NOTE(review): assumes channels-last tensors (the Keras default).
    in_channels = x.shape[-1]
    changes_channels = in_channels is not None and in_channels != filters
    needs_projection = use_projection or changes_resolution or changes_channels

    # Main path. Biases are omitted because each convolution is followed by
    # batch normalisation, which has its own learned offset.
    x = keras.layers.Conv2D(filters, 3, strides=stride, padding='same', use_bias=False)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.ReLU()(x)

    x = keras.layers.Conv2D(filters, 3, strides=1, padding='same', use_bias=False)(x)
    x = keras.layers.BatchNormalization()(x)

    # Shortcut path (created after the main path, as before, so that
    # auto-generated layer names keep their order).
    if needs_projection:
        shortcut = keras.layers.Conv2D(filters, 1, strides=stride, use_bias=False)(shortcut)
        shortcut = keras.layers.BatchNormalization()(shortcut)

    x = keras.layers.Add()([x, shortcut])
    x = keras.layers.ReLU()(x)
    return x

In [4]:
def ResNet8_DroNet(input_shape=(200, 200, 1), dropout_rate=0.3):
    """Build the small residual backbone named "ResNet8_DroNet".

    Layout: a 5x5 stride-2 stem convolution with 32 filters, followed by
    three stages (32, 64, 128 filters) of two residual blocks each. The
    second and third stages start with a stride-2 projection block.

    Args:
        input_shape: shape of one input image, without the batch dimension.
        dropout_rate: accepted for interface compatibility; this function
            does not currently use it.

    Returns:
        A ``keras.Model`` mapping the input image to the last feature map
        (no pooling or head).
    """
    inputs = keras.Input(shape=input_shape)

    # Stem
    features = keras.layers.Conv2D(32, 5, strides=2, padding='same', use_bias=False)(inputs)
    features = keras.layers.BatchNormalization()(features)
    features = keras.layers.ReLU()(features)

    # (filters, downsample at stage entry)
    stages = ((32, False), (64, True), (128, True))
    for stage_filters, downsample in stages:
        if downsample:
            features = resnet_block(features, stage_filters, stride=2, use_projection=True)
        else:
            features = resnet_block(features, stage_filters)
        features = resnet_block(features, stage_filters)

    return keras.models.Model(inputs, features, name="ResNet8_DroNet")

In [5]:
def ResNet18(input_shape=(224, 224, 3)):
    """Build a ResNet-18 style backbone (no pooling, no classification head).

    Layout: 7x7 stride-2 stem with 64 filters and a 3x3 stride-2 max-pool,
    then four stages (64, 128, 256, 512 filters) of two residual blocks
    each. Every stage except the first opens with a stride-2 projection
    block.

    Args:
        input_shape: shape of one input image, without the batch dimension.

    Returns:
        A ``keras.Model`` named "ResNet18_backbone" that outputs the final
        feature map.
    """
    inputs = keras.Input(shape=input_shape)

    # Stem
    features = keras.layers.Conv2D(64, 7, strides=2, padding='same', use_bias=False)(inputs)
    features = keras.layers.BatchNormalization()(features)
    features = keras.layers.ReLU()(features)
    features = keras.layers.MaxPooling2D(3, strides=2, padding='same')(features)

    # Conv2_x .. Conv5_x: (filters, downsample at stage entry)
    stage_table = [(64, False), (128, True), (256, True), (512, True)]
    for stage_filters, downsample in stage_table:
        if downsample:
            features = resnet_block(features, stage_filters, stride=2, use_projection=True)
        else:
            features = resnet_block(features, stage_filters)
        features = resnet_block(features, stage_filters)

    return keras.models.Model(inputs, features, name="ResNet18_backbone")

In [6]:
def ResNet34(input_shape=(224, 224, 3)):
    """Build a ResNet-34 style backbone (no pooling, no classification head).

    Layout: 7x7 stride-2 stem with 64 filters and a 3x3 stride-2 max-pool,
    then four stages with 3, 4, 6 and 3 residual blocks using 64, 128, 256
    and 512 filters. Every stage except the first opens with a stride-2
    projection block.

    Args:
        input_shape: shape of one input image, without the batch dimension.

    Returns:
        A ``keras.Model`` named "ResNet34_backbone" that outputs the final
        feature map.
    """
    inputs = keras.Input(shape=input_shape)

    # Stem
    features = keras.layers.Conv2D(64, 7, strides=2, padding='same', use_bias=False)(inputs)
    features = keras.layers.BatchNormalization()(features)
    features = keras.layers.ReLU()(features)
    features = keras.layers.MaxPooling2D(3, strides=2, padding='same')(features)

    # Conv2_x .. Conv5_x: (filters, number of blocks, downsample at stage entry)
    stage_table = [
        (64, 3, False),
        (128, 4, True),
        (256, 6, True),
        (512, 3, True),
    ]
    for stage_filters, num_blocks, downsample in stage_table:
        for block_index in range(num_blocks):
            if downsample and block_index == 0:
                features = resnet_block(features, stage_filters, stride=2, use_projection=True)
            else:
                features = resnet_block(features, stage_filters)

    return keras.models.Model(inputs, features, name="ResNet34_backbone")

# Grid search configurations

In [7]:
from itertools import product

# Hyper-parameter grid: each (learning_rate, batch_size) pair is one training run.
learning_rates = [1e-3]
batch_sizes = [64]

configurations = [*product(learning_rates, batch_sizes)]

# Model descriptions

In [8]:
# One entry per model to benchmark. Required keys: "backbone" (with "name")
# and "input" (with "shape"). Everything else is optional and receives a
# default in the verification loop below.
models_descriptions = [
    {
        "name": "MobileNet_scratch",
        "backbone": {"name": "MobileNet"},
        "input": {"shape": [224, 224, 3]},
    },
    {
        "name": "MobileNet_scratch_gray",
        "backbone": {"name": "MobileNet"},
        "input": {"shape": [224, 224, 3], "grayscale": True},
    },
]

# Format verification: check required keys and fill in defaults, mutating
# each description in place.
for model_description in models_descriptions:
    if "backbone" not in model_description:
        raise KeyError("backbone not specified at:\n" + str(model_description))

    backbone_description = model_description["backbone"]
    if "name" not in backbone_description:
        raise KeyError("backbone.name not specified at:\n" + str(model_description))

    # Optional keys and their defaults.
    backbone_description.setdefault("weights", None)
    model_description.setdefault("name", backbone_description["name"])
    model_description.setdefault("head_description", [])
    model_description.setdefault("weights", None)

    if "input" not in model_description:
        raise KeyError("input not specified at:\n" + str(model_description))

    input_description = model_description["input"]
    if "shape" not in input_description:
        raise KeyError("input.shape not specified at:\n" + str(model_description))

    input_description.setdefault("grayscale", False)

# Load dataset and build pipeline

In [9]:
def load_samples_dataframe(path: str) -> pd.DataFrame:
    """Load the sample index of a dataset folder into a DataFrame.

    Reads ``<path>/market_dataset_xy.txt``, a space-separated file without
    a header row, and names its five columns ``"file path"``, ``"_"``,
    ``"datetime"``, ``"vel_y"`` and ``"vel_x"``. The ``"file path"`` column
    is then rewritten from a bare image name to ``<path>/images/<name>``.

    Args:
        path: dataset root folder containing the index file and an
            ``images`` sub-folder.

    Returns:
        A DataFrame with one row per sample and the five columns above.
    """
    images_folder = os.path.join(path, "images")

    samples_df = pd.read_csv(
        os.path.join(path, "market_dataset_xy.txt"),
        sep=" ",
        header=None,
        names=["file path", "_", "datetime", "vel_y", "vel_x"],
        # Keep image names as text: purely numeric names would otherwise be
        # parsed as numbers (losing leading zeros) and break os.path.join.
        dtype={"file path": str},
    )

    samples_df["file path"] = samples_df["file path"].map(
        lambda image_name: os.path.join(images_folder, image_name)
    )

    return samples_df

In [10]:
@tf.function
def augment(image, label):
    """Apply random training-time augmentation to one sample.

    Steps, in order: random brightness shift, random contrast change,
    horizontal flip with probability 0.5, additive Gaussian noise
    (always applied), and a final clip to ``[0, 1]``.

    When the image is flipped, ``vel_x`` changes sign so the label stays
    consistent with the mirrored image; ``vel_y`` is never modified.

    Args:
        image: image tensor. NOTE(review): presumably scaled to [0, 1],
            since the result is clipped to that range -- confirm.
        label: dict with the keys ``"vel_x"`` and ``"vel_y"``.

    Returns:
        ``(augmented_image, {"vel_x": ..., "vel_y": ...})``.
    """
    # Photometric jitter
    jittered = tf.image.random_brightness(image, max_delta=0.2)
    jittered = tf.image.random_contrast(jittered, lower=0.8, upper=1.2)

    # Horizontal mirror, decided once and shared by the image and its label
    mirror = tf.random.uniform([]) > 0.5
    mirrored = tf.cond(
        mirror,
        lambda: tf.image.flip_left_right(jittered),
        lambda: jittered,
    )
    augmented_label = {
        "vel_x": tf.cond(mirror, lambda: -label["vel_x"], lambda: label["vel_x"]),
        "vel_y": label["vel_y"],
    }

    # Additive Gaussian pixel noise, then clip to the valid range
    pixel_noise = tf.random.normal(shape=tf.shape(mirrored), mean=0.0, stddev=0.025)
    noisy = tf.clip_by_value(mirrored + pixel_noise, 0.0, 1.0)

    return noisy, augmented_label

In [11]:
def load_image(path, mode="color"):
    """Read a JPEG file and decode it according to ``mode``.

    Args:
        path: file path (string or string tensor) of a JPEG image.
        mode: one of
            ``"color"``      -- decode to 3 channels;
            ``"grayscale1"`` -- decode to 1 channel;
            ``"grayscale3"`` -- decode to 3 channels, convert to grayscale,
            then replicate back to 3 channels.

    Returns:
        The decoded image tensor (values as produced by the JPEG decoder,
        not rescaled).

    Raises:
        ValueError: if ``mode`` is not one of the values above.
    """
    encoded = tf.io.read_file(path)

    if mode == "color":
        return tf.image.decode_jpeg(encoded, channels=3)

    if mode == "grayscale1":
        return tf.image.decode_jpeg(encoded, channels=1)

    if mode == "grayscale3":
        rgb = tf.image.decode_jpeg(encoded, channels=3)
        gray = tf.image.rgb_to_grayscale(rgb)
        return tf.image.grayscale_to_rgb(gray)

    raise ValueError(f"Invalid mode: {mode}")

In [12]:
def build_train_val_test_datasets(samples_dataframe, input_description: dict, seed=42):
    """Split the samples 60/20/20 and build one ``tf.data`` pipeline per split.

    Every sample is loaded, resized to the requested resolution and scaled
    to ``[0, 1]``; the result is cached. The training split is additionally
    reshuffled every epoch and passed through ``augment``. The returned
    datasets are not batched.

    Args:
        samples_dataframe: DataFrame with the columns ``"file path"``,
            ``"vel_x"`` and ``"vel_y"``.
        input_description: dict with ``"shape"`` (``[height, width,
            channels]``) and ``"grayscale"`` (bool; only consulted when
            ``channels`` is not 1).
        seed: random state used for both train/test splits.

    Returns:
        ``{"train": ds, "val": ds, "test": ds}`` where every element is
        ``(image, {"vel_x": ..., "vel_y": ...})``.
    """
    # 60 % train; the remaining 40 % is halved into validation and test.
    train_frame, holdout_frame = train_test_split(samples_dataframe, test_size=0.4, random_state=seed)
    val_frame, test_frame = train_test_split(holdout_frame, test_size=0.5, random_state=seed)

    target_shape = input_description["shape"]
    resolution = target_shape[:2]
    channels = target_shape[2]

    # Pick the decoding mode understood by load_image.
    if channels == 1:
        mode = "grayscale1"
    elif input_description["grayscale"] and channels == 3:
        mode = "grayscale3"
    else:
        mode = "color"

    print("Created", mode, "dataset with resolution", resolution)

    def process_sample(path, label):
        pixels = load_image(path, mode=mode)
        pixels = tf.image.resize(pixels, resolution)
        # Pin the static shape, which the decoder leaves partly unknown.
        pixels.set_shape(target_shape)
        return tf.cast(pixels, tf.float32) / 255.0, label

    def frame_to_dataset(frame, training=False):
        labels = {
            "vel_x": tf.constant(frame["vel_x"].astype("float32").tolist()),
            "vel_y": tf.constant(frame["vel_y"].astype("float32").tolist()),
        }
        file_paths = tf.constant(frame["file path"].tolist())

        dataset = tf.data.Dataset.from_tensor_slices((file_paths, labels))
        dataset = dataset.map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
        # Cache the decoded images; shuffling and augmentation come after
        # the cache so they are redone on every pass.
        dataset = dataset.cache()

        if not training:
            return dataset

        dataset = dataset.shuffle(len(frame), reshuffle_each_iteration=True)
        return dataset.map(augment, num_parallel_calls=tf.data.AUTOTUNE)

    splits = (
        ("train", train_frame, True),
        ("val", val_frame, False),
        ("test", test_frame, False),
    )
    return {
        split_name: frame_to_dataset(frame, training=is_training)
        for split_name, frame, is_training in splits
    }

In [13]:
def instantiate_backbone(backbone_name, weights=None):
    """Create the feature-extraction backbone identified by ``backbone_name``.

    Two families are supported:

    * Backbones defined in this notebook ("ResNet-8", "ResNet-18",
      "ResNet-34"). They have no pre-trained weights; if ``weights`` is
      given, an informational message is printed and the model is built
      with fresh weights.
    * ``keras.applications`` backbones ("ResNet-50", "ResNet-50V2",
      "MobileNet", "MobileNetV2", "EfficientNetB0"), built without the
      classification top and without pooling for a ``(224, 224, 3)`` input.

    Args:
        backbone_name: one of the names listed above.
        weights: ``None`` (or any falsy value) for random initialisation,
            or ``"imagenet"`` for the ``keras.applications`` backbones.

    Returns:
        A ``keras.Model`` producing the backbone's final feature map.

    Raises:
        AssertionError: if ``weights`` is truthy but not ``"imagenet"`` for
            a ``keras.applications`` backbone.
        ValueError: if ``backbone_name`` is not supported.
    """
    # Notebook-defined backbones: name -> (builder, fixed input shape).
    custom_backbones = {
        "ResNet-8": (ResNet8_DroNet, (200, 200, 1)),
        "ResNet-18": (ResNet18, (224, 224, 3)),
        "ResNet-34": (ResNet34, (224, 224, 3)),
    }

    # keras.applications backbones: name -> (constructor attribute, extra kwargs).
    # Constructors are looked up by name so only the requested one is resolved.
    application_backbones = {
        "ResNet-50": ("ResNet50", {}),
        "ResNet-50V2": ("ResNet50V2", {}),
        "MobileNet": ("MobileNet", {"alpha": 1.0}),
        "MobileNetV2": ("MobileNetV2", {"alpha": 1.0}),
        "EfficientNetB0": ("EfficientNetB0", {}),
    }

    if backbone_name in custom_backbones:
        builder, input_shape = custom_backbones[backbone_name]
        if weights is not None:
            print(f"[INFO] No pre-trained weights available for {backbone_name}")
        return builder(input_shape=input_shape)

    if backbone_name in application_backbones:
        constructor_name, extra_kwargs = application_backbones[backbone_name]
        assert not weights or weights in ["imagenet"], f"[ERROR] no {weights} weights found for {backbone_name}"
        constructor = getattr(keras.applications, constructor_name)
        return constructor(
            include_top=False,
            input_shape=(224, 224, 3),
            weights="imagenet" if weights == "imagenet" else None,
            pooling=None,
            **extra_kwargs,
        )

    raise ValueError(f"Backbone {backbone_name} not implemented.")


In [14]:
def build_model(model_description: dict):
    """Assemble the two-output regression model for one model description.

    Pipeline: input -> rescale from [0, 1] to [0, 255] -> backbone-specific
    preprocessing (if the backbone has one) -> backbone -> global average
    pooling -> dropout -> optional dense head -> two single-unit linear
    outputs named ``"vel_x"`` and ``"vel_y"``.

    Args:
        model_description: dict with
            ``"name"``: model name;
            ``"input"["shape"]``: input shape without the batch dimension;
            ``"backbone"["name"]`` / ``"backbone"["weights"]``: forwarded
            to ``instantiate_backbone``; when weights are requested the
            backbone is frozen;
            ``"head_description"``: iterable of ``(nodes,
            use_batch_normalization, use_dropout)`` triples, one per hidden
            dense layer.

    Returns:
        An uncompiled ``keras.Model`` with outputs ``[vel_x, vel_y]``.
    """
    input_shape = tuple(model_description["input"]["shape"])

    backbone_name = model_description["backbone"]["name"]
    backbone_weights = model_description["backbone"]["weights"]

    # Preprocessing functions expected by the keras.applications backbones.
    # keras.applications.resnet.preprocess_input is already fixed to the
    # "caffe" convention and takes no ``mode`` argument.
    preprocess_by_backbone = {
        "MobileNet":       keras.applications.mobilenet.preprocess_input,
        "MobileNetV2":     keras.applications.mobilenet_v2.preprocess_input,
        "ResNet-50":       keras.applications.resnet.preprocess_input,
        "ResNet-50V2":     keras.applications.resnet_v2.preprocess_input,
        "EfficientNetB0":  keras.applications.efficientnet.preprocess_input,
    }

    inputs = keras.layers.Input(input_shape)
    x = keras.layers.Rescaling(255.)(inputs)  # Map [0, 1] inputs to [0, 255]

    # Backbones without an entry receive the rescaled image unchanged. A raw
    # TensorFlow op such as tf.identity must not be applied here, because the
    # tensor is a symbolic Keras tensor during functional-model construction.
    preprocess = preprocess_by_backbone.get(backbone_name)
    if preprocess is not None:
        x = preprocess(x)

    backbone = instantiate_backbone(backbone_name, weights=backbone_weights)
    if backbone_weights:
        # Pre-trained backbone: keep its weights fixed, train only the head.
        backbone.trainable = False

    x = backbone(x)
    x = keras.layers.GlobalAveragePooling2D(name="global_pool")(x)
    x = keras.layers.Dropout(0.3, name="dropout")(x)

    for i, (nodes, use_batch_normalization, use_dropout) in enumerate(model_description["head_description"]):
        # Batch normalisation has its own offset, so the dense bias is redundant.
        use_bias = not use_batch_normalization

        x = keras.layers.Dense(nodes, use_bias=use_bias, activation="linear", name=f"dense_{i}")(x)

        if use_batch_normalization:
            x = keras.layers.BatchNormalization(name=f"batchnorm_{i}")(x)

        x = keras.layers.ReLU(name=f"relu_{i}")(x)

        if use_dropout:
            x = keras.layers.Dropout(0.3, name=f"dropout_{i}")(x)

    output_x = keras.layers.Dense(1, name="vel_x")(x)
    output_y = keras.layers.Dense(1, name="vel_y")(x)

    return keras.models.Model(
        inputs=inputs,
        outputs=[output_x, output_y],
        name=model_description["name"]
    )

In [15]:
def train_model(
    model: keras.Model,
    train_dataset: tf.data.Dataset,
    validation_dataset: tf.data.Dataset,
    warmup_epochs: int = 40,
    max_epochs: int = 200,
    verbose: int = 0,
):
    """Train ``model`` in two phases and return it with the merged history.

    Phase 1 runs ``warmup_epochs`` epochs without callbacks. Phase 2 runs up
    to ``max_epochs`` further epochs with early stopping on ``val_loss``
    (restoring the best weights of that phase) and learning-rate halving
    when ``val_loss`` plateaus.

    Args:
        model: compiled Keras model.
        train_dataset: batched training dataset.
        validation_dataset: batched validation dataset.
        warmup_epochs: number of epochs in the callback-free first phase.
        max_epochs: upper bound on the epochs of the second phase.
        verbose: verbosity passed to both ``model.fit`` calls.

    Returns:
        ``(model, history)`` where ``history`` is a DataFrame with one row
        per trained epoch (both phases concatenated, index reset). Columns
        logged in only one phase are NaN for the other.
    """
    warmup_history = model.fit(
        train_dataset,
        validation_data=validation_dataset,
        verbose=verbose,
        epochs=warmup_epochs,
    )

    # The early-stopping patience is three times the plateau patience, so
    # the learning rate can be reduced a few times before training stops.
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=45,
            min_delta=1e-4,
            mode="min",
            restore_best_weights=True,
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            patience=15,
            factor=0.5,
            min_lr=1e-7,
            mode="min"
        ),
    ]

    main_history = model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=max_epochs,
        verbose=verbose,
        callbacks=callbacks,
    )

    full_history = pd.concat([
        pd.DataFrame(warmup_history.history),
        pd.DataFrame(main_history.history),
    ]).reset_index(drop=True)

    return model, full_history

# Results structure

In [16]:
# results[model name][configuration] -> evaluation metrics; one (initially
# empty) dict per model, filled in by the benchmark loop.
results = {
    model_description["name"]: {}
    for model_description in models_descriptions
}

# Benchmark loop

In [17]:
samples_dataframe = load_samples_dataframe("/kaggle/input/marketplace-navigation-dataset/dataset")
# For a quick smoke test, train on a small random subset instead:
# samples_dataframe = samples_dataframe.sample(frac=0.05)

round_number = 1
seed = round_number + 1

# Seed Python, NumPy and the backend RNGs so that weight initialisation,
# shuffling and augmentation are repeatable for a given round (best effort:
# some GPU kernels remain non-deterministic).
keras.utils.set_random_seed(seed)


def _dataset_key(model_description):
    """Key under which a model's datasets are shared: (grayscale flag, input shape)."""
    input_description = model_description["input"]
    return input_description["grayscale"], tuple(input_description["shape"])


# Build each distinct input pipeline once; models with the same input
# description share the same train/val/test datasets.
datasets = {}

for model_description in models_descriptions:
    dataset_key = _dataset_key(model_description)

    if dataset_key not in datasets:
        datasets[dataset_key] = build_train_val_test_datasets(
            samples_dataframe,
            model_description["input"],
            seed
        )

for configuration in configurations:
    learning_rate, batch_size = configuration

    for model_description in models_descriptions:
        print("Training model", model_description["name"], " for configuration", configuration)

        splits = datasets[_dataset_key(model_description)]

        train_ds = splits["train"].batch(batch_size).prefetch(tf.data.AUTOTUNE)
        val_ds   = splits["val"].batch(batch_size).prefetch(tf.data.AUTOTUNE)
        test_ds  = splits["test"].batch(batch_size).prefetch(tf.data.AUTOTUNE)

        model = build_model(model_description)
        model.compile(
            loss={
                "vel_x": "mean_squared_error",
                "vel_y": "mean_squared_error",
            },
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            metrics={
                "vel_x": [rmse, eva, r2_score],
                "vel_y": [rmse, eva, r2_score],
            },
        )
        model, history = train_model(model, train_ds, val_ds)

        evaluation_metrics = model.evaluate(test_ds, return_dict=True)

        results.setdefault(model_description["name"], {})[configuration] = evaluation_metrics

        file_name = f"r{round_number}_{model.name}_{learning_rate:.0e}_{batch_size}"

        model.save(f"{file_name}.keras")
        history.to_csv(f"{file_name}.csv")
        # Rewrite the results file after every run so partial results
        # survive an interrupted session.
        with open("results.pkl", "wb") as results_file:
            pickle.dump(results, results_file)

Created color dataset with resolution [224, 224]


I0000 00:00:1751387942.870311      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Created grayscale3 dataset with resolution [224, 224]
Training model MobileNet_scratch  for configuration (0.001, 64)


I0000 00:00:1751387989.305433      56 service.cc:148] XLA service 0x7b12b4002ca0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751387989.306212      56 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1751387991.419309      56 cuda_dnn.cc:529] Loaded cuDNN version 90300
E0000 00:00:1751387997.284956      56 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1751387997.486986      56 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
I0000 00:00:1751388003.327253      56 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
E0000 00:00:1751388009.776900      58 gpu_timer.cc:82] Delay kernel t

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - loss: 0.0645 - vel_x_eva: 0.8711 - vel_x_loss: 0.0491 - vel_x_r2_score: 0.8672 - vel_x_rmse: 0.2072 - vel_y_eva: 0.9812 - vel_y_loss: 0.0157 - vel_y_r2_score: 0.9809 - vel_y_rmse: 0.1248
Training model MobileNet_scratch_gray  for configuration (0.001, 64)
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 6s/step - loss: 0.0689 - vel_x_eva: 0.8598 - vel_x_loss: 0.0541 - vel_x_r2_score: 0.8527 - vel_x_rmse: 0.2228 - vel_y_eva: 0.9827 - vel_y_loss: 0.0147 - vel_y_r2_score: 0.9822 - vel_y_rmse: 0.1208
