MODELO DE PRUEBA DE DETECCIÓN DE OBJETOS EN IMÁGENES. USA UN DATASET GRANDE (DESCARGA DE ~870 MiB): CUIDADO AL EJECUTAR.

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

# Load a lightweight subset (the first 10% of the training split) for quick tests.
# `info` carries the dataset metadata returned because with_info=True.
dataset, info = tfds.load(
    name='voc/2007',
    split='train[:10%]',
    with_info=True,
)

# Downscale + normalize the image, convert the boxes, and fix the number of targets per image
def preprocess_sample(sample, max_objects=5):
    """Turn one dataset sample into ``(image, (boxes, labels))`` training tensors.

    The detection head in this notebook predicts a fixed number of object
    slots per image, so the variable-length annotations are truncated or
    zero-padded to exactly ``max_objects`` entries.

    Args:
        sample: Feature dict yielded by the dataset. Reads ``sample['image']``,
            ``sample['objects']['bbox']`` and ``sample['objects']['label']``.
        max_objects: Number of object slots to emit per image (Python int).
            Must equal the number of slots the model predicts.

    Returns:
        A tuple ``(image, (boxes, labels))`` where
        * ``image`` is a float32 tensor of shape (224, 224, 3) divided by 255,
        * ``boxes`` has shape (max_objects, 4) as
          [x_center, y_center, width, height],
        * ``labels`` has shape (max_objects,).
        Unused slots are filled with zeros.
    """
    image = tf.image.resize(sample['image'], (224, 224))
    image = tf.cast(image, tf.float32) / 255.0

    # Keep at most `max_objects` annotations; extra objects are dropped.
    boxes = sample['objects']['bbox'][:max_objects]
    labels = sample['objects']['label'][:max_objects]

    # Convert [ymin, xmin, ymax, xmax] -> [x_center, y_center, width, height].
    # NOTE(review): assumes the bbox coordinates are already relative to the
    # image size (0..1), so resizing the image does not invalidate them.
    boxes = tf.stack([
        (boxes[:, 1] + boxes[:, 3]) / 2,  # x_center
        (boxes[:, 0] + boxes[:, 2]) / 2,  # y_center
        boxes[:, 3] - boxes[:, 1],        # width
        boxes[:, 2] - boxes[:, 0],        # height
    ], axis=-1)

    # Zero-pad up to `max_objects` so every sample has the same target shape.
    # NOTE(review): padded label slots get id 0, which is also a real class
    # id; a mask / sample weights would be needed to ignore them in the loss.
    num_missing = max_objects - tf.shape(boxes)[0]
    boxes = tf.pad(boxes, [[0, num_missing], [0, 0]])
    labels = tf.pad(labels, [[0, num_missing]])

    # Pin the static shapes so Keras can match them against the model outputs.
    boxes = tf.ensure_shape(boxes, [max_objects, 4])
    labels = tf.ensure_shape(labels, [max_objects])

    return image, (boxes, labels)

# Apply the preprocessing, then batch: samples are grouped 8 at a time with
# padding so variable-length targets can be stacked, and batches are
# prefetched so input preparation overlaps with training.
dataset = dataset.map(preprocess_sample)
dataset = dataset.padded_batch(8)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

[1mDownloading and preparing dataset 868.85 MiB (download: 868.85 MiB, generated: Unknown size, total: 868.85 MiB) to /Users/gmr/tensorflow_datasets/voc/2007/4.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/2 [00:00<?, ? url/s]
Extraction completed...: 0 file [00:00, ? file/s]
Dl Size...: 0 MiB [00:00, ? MiB/s]
Dl Completed...:  50%|█████     | 1/2 [00:00<00:00,  1.62 url/s]


TooManyRedirects: Exceeded 30 redirects.

In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, Input, Model

# Build the ResNet50 feature extractor: ImageNet weights, without the final
# classification layer, for 224x224 RGB inputs.
resnet_backbone = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Freeze the backbone so its weights are not updated during training.
resnet_backbone.trainable = False

In [None]:
# Fixed sizes of the detection head.
MAX_OBJECTS = 5   # object slots predicted per image
NUM_CLASSES = 20  # VOC object classes

# Input -> frozen backbone -> global pooled feature vector
input_img = Input(shape=(224, 224, 3))
x = resnet_backbone(input_img, training=False)  # training=False keeps BatchNorm in inference mode
x = layers.GlobalAveragePooling2D()(x)
# NOTE(review): the input pipeline scales pixels to [0, 1]; confirm this matches
# the preprocessing the ImageNet ResNet50 weights expect
# (tf.keras.applications.resnet50.preprocess_input).

# Box head: MAX_OBJECTS slots x 4 values, each squashed to (0, 1) by the sigmoid
bbox_output = layers.Dense(MAX_OBJECTS * 4, activation='sigmoid')(x)
bbox_output = layers.Reshape((MAX_OBJECTS, 4), name="bboxes")(bbox_output)

# Class head: produce raw scores, reshape to (slots, classes), and only THEN
# apply softmax over the class axis. Applying softmax on the flat
# MAX_OBJECTS * NUM_CLASSES vector would normalize across all slots at once,
# so no slot would carry a valid per-object class distribution.
class_logits = layers.Dense(MAX_OBJECTS * NUM_CLASSES)(x)
class_logits = layers.Reshape((MAX_OBJECTS, NUM_CLASSES))(class_logits)
class_output = layers.Softmax(axis=-1, name="labels")(class_logits)

# Final model; the output layer names ("bboxes", "labels") are the keys used
# when compiling with per-output losses and metrics.
model = Model(inputs=input_img, outputs=[bbox_output, class_output])
model.summary()

In [None]:
# Configure training: Adam optimizer with one loss and one metric per named
# output ('bboxes' -> box regression, 'labels' -> integer-encoded classes).
model.compile(
    optimizer='adam',
    loss=dict(
        bboxes='mean_squared_error',
        labels='sparse_categorical_crossentropy',
    ),
    metrics=dict(
        bboxes='mae',
        labels='accuracy',
    ),
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping monitors `val_loss`, which only exists when validation data is
# passed to fit(). Hold out the first few batches for validation and train on
# the rest; take()/skip() split the batched dataset into disjoint parts as
# long as its iteration order is deterministic (no shuffling is applied here).
VAL_BATCHES = 5
val_dataset = dataset.take(VAL_BATCHES)
train_dataset = dataset.skip(VAL_BATCHES)

# Stop after 3 epochs without val_loss improvement and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[early_stop]
)

In [None]:
def draw_boxes(image, boxes, colors=None):
    """Draw centre-format boxes on a single image.

    Args:
        image: A single image tensor (height, width, channels); a batch
            dimension is added before drawing.
        boxes: Tensor of boxes for that image, last axis being
            [x_center, y_center, width, height] in relative (0..1) coordinates.
        colors: Optional 2-D list/tensor of colours to cycle through, one row
            per colour. Defaults to a single red RGB colour.

    Returns:
        The tensor returned by ``tf.image.draw_bounding_boxes`` for a batch of
        one image (index ``[0]`` to get the drawn image).
    """
    if colors is None:
        colors = [[1.0, 0.0, 0.0]]  # red
    colors = tf.convert_to_tensor(colors, dtype=tf.float32)

    half_w = boxes[..., 2] / 2
    half_h = boxes[..., 3] / 2
    # Convert [x, y, w, h] -> [ymin, xmin, ymax, xmax], the order the drawing op expects.
    corners = tf.stack([
        boxes[..., 1] - half_h,  # ymin
        boxes[..., 0] - half_w,  # xmin
        boxes[..., 1] + half_h,  # ymax
        boxes[..., 0] + half_w,  # xmax
    ], axis=-1)
    # Clip the corners (not the centre/size values): a box centred near the
    # border can otherwise extend outside the [0, 1] image range.
    corners = tf.clip_by_value(corners, 0.0, 1.0)

    # The TF2 draw_bounding_boxes signature requires the `colors` argument.
    return tf.image.draw_bounding_boxes(
        tf.expand_dims(image, 0), corners[None, :, :], colors
    )

# Usage example: draw the ground-truth boxes of the first image of one batch.
# `plt` was used without ever being imported, which raises NameError on a
# fresh kernel; import it here so the cell runs on its own.
import matplotlib.pyplot as plt

for image, (boxes, labels) in dataset.take(1):
    drawn = draw_boxes(image[0], boxes[0])  # first sample of the batch
    plt.imshow(drawn[0])                    # drop the batch dimension added for drawing
    plt.axis('off')
    plt.show()