# Fast R-CNN implementation

In [34]:
import tensorflow as tf
from tensorflow.keras.layers import Layer
import keras
from keras import Input, Model
from keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D
)
from keras.applications import VGG16
import numpy as np

In [35]:
# Fixed spatial size (height, width) every ROI crop is resampled to before the dense head.
roi_pool_size = (7, 7)

# Number of object classes; the classification head adds one extra output
# on top of this (presumably the background class).
num_classes = 20

# Image tensor shape (height, width, channels) fed to the VGG16 backbone.
input_shape = (224, 224, 3)

Feature extractor

In [36]:
def build_feature_extractor(input_shape):
    """
    Build the convolutional backbone used to compute the shared feature map.

    Instantiates VGG16 with ImageNet weights and without its dense
    classification head, then truncates it at the ``block5_conv3`` layer.

    Args:
        input_shape: Shape of a single input image, passed to VGG16 as
            ``input_shape``.

    Returns:
        A Keras ``Model`` mapping the VGG16 input to the ``block5_conv3``
        activations.
    """
    vgg = VGG16(weights="imagenet", include_top=False, input_shape=input_shape)
    last_conv_output = vgg.get_layer("block5_conv3").output
    return Model(inputs=vgg.input, outputs=last_conv_output)

Simple example of ROI Pooling

In [37]:
def roi_pooling(features, rois, pool_size):
    """
    Crop every ROI out of the feature map and resize it to a fixed size.

    This is a ``tf.image.crop_and_resize``-based stand-in for ROI pooling:
    each box is resampled to ``pool_size`` rather than max-pooled over bins.

    Args:
        features: Feature map tensor of shape (batch_size, H, W, C).
        rois: ROI coordinates of shape (batch_size, num_rois, 4), normalized
            to the [0, 1] range. ``tf.image.crop_and_resize`` interprets each
            box as ``[y1, x1, y2, x2]``.
        pool_size: (height, width) of the fixed-size output for each ROI.

    Returns:
        Tensor of shape (batch_size * num_rois, pool_height, pool_width, C).
        The ROIs of all images are flattened along the first axis, grouped by
        image (all ROIs of image 0 first, then image 1, ...).
    """
    def crop_and_resize(args):
        feature_map, boxes = args
        batch_size = tf.shape(boxes)[0]
        num_rois = tf.shape(boxes)[1]

        # Merge the ROIs of every image into a single (batch * num_rois, 4) tensor.
        flat_boxes = tf.reshape(boxes, (-1, 4))
        # For each flattened box, the index of the image it belongs to:
        # [0, 0, ..., 1, 1, ...] with num_rois repeats per image.
        box_indices = tf.repeat(tf.range(batch_size), repeats=num_rois)

        return tf.image.crop_and_resize(
            feature_map, boxes=flat_boxes, box_indices=box_indices, crop_size=pool_size
        )

    # `Lambda` is not imported anywhere in the notebook, so it is referenced
    # through the already-imported `keras` package to avoid a NameError on a
    # fresh kernel.
    return keras.layers.Lambda(crop_and_resize)([features, rois])

Create model

In [38]:
def build_fast_rcnn(input_shape, num_classes, roi_pool_size):
    """
    Assemble a Fast R-CNN style model: backbone -> ROI pooling -> dense heads.

    Args:
        input_shape: Shape of a single input image (height, width, channels).
        num_classes: Number of object classes; the classifier emits
            ``num_classes + 1`` probabilities (presumably the extra slot is
            the background class).
        roi_pool_size: (height, width) every ROI is resampled to.

    Returns:
        A Keras ``Model`` with inputs ``[image_input, roi_input]`` and outputs
        ``[class_output, bbox_output]``. Because ROI pooling flattens the ROIs
        of all images along the first axis, both outputs have
        ``batch_size * num_rois`` rows.
    """
    # Model inputs: the image and its normalized ROI boxes (batch, num_rois, 4).
    image_input = Input(shape=input_shape, name="image_input")
    roi_input = Input(shape=(None, 4), name="roi_input")

    # Backbone. Use the shared helper, which stops at block5_conv3, so ROIs are
    # cropped from the last convolutional feature map rather than from the
    # output of VGG16's final max-pooling layer.
    feature_extractor = build_feature_extractor(input_shape)
    features = feature_extractor(image_input)

    # Fixed-size features for every ROI.
    roi_pooled = roi_pooling(features, roi_input, roi_pool_size)

    # Fully connected layers shared by both heads.
    x = Flatten()(roi_pooled)
    x = Dense(1024, activation='relu')(x)
    x = Dense(1024, activation='relu')(x)

    # Heads: per-ROI class probabilities and a 4-value box regression.
    class_output = Dense(num_classes + 1, activation='softmax', name="class_output")(x)
    bbox_output = Dense(4, activation='linear', name="bbox_output")(x)

    return Model(inputs=[image_input, roi_input], outputs=[class_output, bbox_output])

In [39]:
# Build the network from the notebook-level configuration and print its layer table.
model = build_fast_rcnn(
    input_shape=input_shape,
    num_classes=num_classes,
    roi_pool_size=roi_pool_size,
)
model.summary()

## Example of use

In [40]:
batch_size = 2
height, width = 224, 224

# Seeded generator so the demo input is identical on every Restart & Run All.
rng = np.random.default_rng(0)
images = rng.random((batch_size, height, width, 3), dtype=np.float32)

# ROI boxes in pixel coordinates, two per image.
rois_px = np.array([
    [[50, 50, 150, 150], [30, 30, 100, 100]],  # ROIs for the first image
    [[20, 20, 120, 120], [60, 60, 200, 200]]   # ROIs for the second image
], dtype=np.float32)

# Normalize ROI coordinates: columns 0 and 2 are divided by the image height,
# columns 1 and 3 by the width. A new array is produced so the pixel-space
# boxes stay available.
rois = rois_px / np.array([height, width, height, width], dtype=np.float32)

# Prediction
class_preds, bbox_preds = model.predict([images, rois])

# Both heads emit one row per ROI across the whole batch.
num_rois_total = batch_size * rois.shape[1]
assert class_preds.shape[0] == bbox_preds.shape[0] == num_rois_total

print("Class predictions shape:", class_preds.shape)
print("Bounding box predictions shape:", bbox_preds.shape)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 692ms/step
Class predictions shape: (4, 21)
Bounding box predictions shape: (4, 4)
