<a href="https://colab.research.google.com/github/Aestivation/CNN-Transformer-Model/blob/KITTY-Dataset/CNN%2BTransformer_Hybrid_Model_KITTI_DATASET_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Connect Google Colab to Google Drive (if using Colab)
from google.colab import drive
drive.mount('/content/drive')

# Download KITTI dataset from Kaggle
!kaggle datasets download -d klemenko/kitti-dataset
!unzip -q kitti-dataset.zip -d /content/kitti_data/

# Define dataset paths
IMG_DIR = "/content/kitti_data/data_object_image_2/training/image_2/"
LABEL_DIR = "/content/kitti_data/data_object_label_2/training/label_2/"

# Check the number of images in the dataset.
# os.listdir() returns entries in arbitrary order, so sort the listing to make
# the "first" image the same on every run.
image_files = sorted(os.listdir(IMG_DIR))
print("Total images in KITTI dataset:", len(image_files))

# Fail with a clear message instead of an IndexError on image_files[0] below.
assert image_files, f"No images found in {IMG_DIR}"

# Display a sample image from the dataset
sample_img = load_img(os.path.join(IMG_DIR, image_files[0]))
plt.imshow(sample_img)
plt.axis("off")
plt.title("Sample KITTI Image")
plt.show()


In [None]:
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Image size for training, in Keras `target_size` order: (height, width)
IMG_SIZE = (224, 224)


def _read_first_bbox(label_path):
    """Return (x1, y1, x2, y2) from the first line of a label file, or None.

    None is returned when the file is missing, empty, or its first line has
    fewer than eight whitespace-separated fields.
    """
    if not os.path.exists(label_path):
        return None
    with open(label_path, "r") as f:
        first_line = f.readline()
    parts = first_line.strip().split()
    if len(parts) < 8:
        return None
    # Fields 4..7 are read as the box corners, in pixels of the original image.
    # NOTE(review): the object class in parts[0] is not checked, so whatever
    # object happens to be listed first is used - confirm this is intended.
    return tuple(map(float, parts[4:8]))


# Function to load and preprocess images and labels
def load_kitti_data(img_dir, label_dir, img_size=IMG_SIZE, max_samples=1000):
    """Load images and one normalised bounding box per image.

    Args:
        img_dir: Directory containing the image files.
        label_dir: Directory containing one ``.txt`` label file per image,
            named after the image file.
        img_size: ``(height, width)`` every image is resized to.
        max_samples: Maximum number of image files to consider.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays with one row per kept
        sample. ``images`` holds the resized pixels scaled to [0, 1];
        ``labels`` holds ``[x1, y1, x2, y2]`` expressed as fractions of the
        original image width (x) and height (y). Images whose label file is
        missing, empty or malformed are skipped so that both arrays stay
        aligned.
    """
    images, labels = [], []
    # Sorted so the selected subset is reproducible (os.listdir order is arbitrary).
    image_files = sorted(os.listdir(img_dir))[:max_samples]  # Load only a subset

    for img_file in image_files:
        # Look up the label first: an image without a usable box must not be
        # added, otherwise images and labels end up with different lengths.
        label_file = os.path.splitext(img_file)[0] + ".txt"
        bbox = _read_first_bbox(os.path.join(label_dir, label_file))
        if bbox is None:
            continue
        x1, y1, x2, y2 = bbox

        img_path = os.path.join(img_dir, img_file)

        # The box is given in pixels of the un-resized image, so it has to be
        # normalised by that image's size. PIL reports size as (width, height).
        orig_w, orig_h = load_img(img_path).size

        # Load the resized image and normalize pixel values
        img = load_img(img_path, target_size=img_size)
        images.append(img_to_array(img) / 255.0)

        # Normalize bounding box coordinates: x by width, y by height
        labels.append([x1 / orig_w, y1 / orig_h, x2 / orig_w, y2 / orig_h])

    return np.array(images), np.array(labels)

# Load dataset
X_train, y_train = load_kitti_data(IMG_DIR, LABEL_DIR)

# Each image needs exactly one label row; catch a misaligned dataset here
# rather than deep inside model.fit().
assert len(X_train) == len(y_train), (
    f"Image/label count mismatch: {len(X_train)} images vs {len(y_train)} labels"
)

# Print dataset shape
print(f"Images shape: {X_train.shape}, Labels shape: {y_train.shape}")

# Show a sample image with its bounding box
def show_sample_image(img, bbox):
    """Display an image with a bounding box drawn on it.

    Args:
        img: Image array with pixel values scaled to [0, 1]; assumed to be
            laid out as (height, width, channels).
        bbox: ``(x1, y1, x2, y2)`` as fractions of the image width (x) and
            height (y).
    """
    img = (img * 255).astype(np.uint8)  # Convert back to 0-255 range

    # Scale by the image's own size instead of the global IMG_SIZE so that
    # x always uses the width and y the height, whatever the resize target was.
    height, width = img.shape[:2]
    x1, y1, x2, y2 = bbox
    x1, x2 = int(x1 * width), int(x2 * width)
    y1, y2 = int(y1 * height), int(y2 * height)

    # Draw on a copy so the converted image is left untouched. plt.imshow
    # interprets the channels as RGB, so (255, 0, 0) is rendered in red.
    img_with_box = img.copy()
    img_with_box = cv2.rectangle(img_with_box, (x1, y1), (x2, y2), (255, 0, 0), 2)

    plt.imshow(img_with_box)
    plt.axis("off")
    plt.show()

# Visual sanity check: draw the first loaded image together with the
# bounding box stored for it in y_train.
show_sample_image(X_train[0], y_train[0])


In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

# Define the CNN Backbone (ResNet50)
def build_cnn_feature_extractor(input_shape=(224, 224, 3)):
    """Build a frozen ResNet50 that outputs a sequence of patch features.

    Args:
        input_shape: ``(height, width, channels)`` of the input images.

    Returns:
        A Keras ``Model`` mapping an image batch to a tensor of shape
        ``(batch, patches, channels)``, where ``patches`` is the flattened
        spatial grid of the last ResNet50 feature map; ``(batch, 49, 2048)``
        for the default 224x224 input.
    """
    # Load ResNet50 without the top classification layer (head)
    # NOTE(review): the ImageNet weights are normally paired with
    # keras.applications.resnet50.preprocess_input - confirm that the pixel
    # scaling applied by the caller is compatible.
    base_cnn = ResNet50(weights="imagenet", include_top=False, input_shape=input_shape)

    # Freeze pre-trained weights to prevent modification during training
    base_cnn.trainable = False

    # Preserve spatial structure: flatten the (H, W) grid into one patch axis.
    # The sizes are read from the backbone instead of being hard-coded to
    # (49, 2048), so a non-default input_shape also works.
    feat_h, feat_w, feat_channels = base_cnn.output.shape[1:]
    cnn_output = Reshape((feat_h * feat_w, feat_channels))(base_cnn.output)

    # Define the final feature extractor model
    cnn_model = Model(inputs=base_cnn.input, outputs=cnn_output)

    return cnn_model

# Build the CNN feature extractor with its default input shape; this
# module-level instance is reused when the hybrid model is assembled.
cnn_model = build_cnn_feature_extractor()

# Print the layer-by-layer summary of the feature extractor
cnn_model.summary()



In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Layer

# Transformer input dimensions (shared by the projection layer and the encoder block)
D_MODEL = 512  # Embedding width of each patch token
NUM_PATCHES = 49  # Sequence length: one token per cell of the 7x7 CNN feature map

# Define a custom layer to reshape CNN features for Transformer input
class ReshapeCNNFeatures(Layer):
    """Project CNN patch features to the Transformer embedding size.

    Args:
        embed_dim: Output feature size of each patch after the projection.
        num_patches: Number of patches per image.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_patches, **kwargs):
        super().__init__(**kwargs)
        self.dense = Dense(embed_dim)  # Reduce feature size to match Transformer D_MODEL
        self.num_patches = num_patches
        self.embed_dim = embed_dim

    def call(self, inputs, training=None):
        """Return the inputs projected and shaped ``(-1, num_patches, embed_dim)``."""
        x = self.dense(inputs)  # Convert CNN features to embedding space
        # Pin the patch and embedding axes; the batch axis is inferred.
        return tf.reshape(x, (-1, self.num_patches, self.embed_dim))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "num_patches": self.num_patches})
        return config

# Create the projection layer once at module level, using the Transformer
# dimensions defined above (D_MODEL-wide embeddings, NUM_PATCHES tokens).
reshape_layer = ReshapeCNNFeatures(D_MODEL, NUM_PATCHES)


In [None]:
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Dense

# Define Transformer Encoder Block
class TransformerBlock(tf.keras.layers.Layer):
    """Single Transformer encoder block: self-attention followed by an MLP.

    Both sub-layers are wrapped in dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Feature size of the input and output tokens.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after attention and after the MLP.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # NOTE(review): key_dim is the per-head size, so every head is
        # embed_dim wide here; embed_dim // num_heads is the more common
        # choice - confirm this is intended.
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),  # Expanding feature space
            Dense(embed_dim)  # Compressing back to original dimensions
        ])
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the encoder block.

        Args:
            inputs: Token tensor; the residual additions require its last
                axis to be ``embed_dim`` wide.
            training: Forwarded to the dropout layers. ``None`` leaves the
                choice of mode to Keras.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        # Apply self-attention mechanism
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.norm1(inputs + attn_output)  # Residual connection & normalization

        # Pass through feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)

        # No value printing here: while the model is being built the tensors
        # are symbolic and have no .numpy().
        return self.norm2(out1 + ffn_output)  # Final residual connection & normalization

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dropout

# Define the Hybrid CNN + Transformer Model
def build_cnn_transformer_model():
    """Assemble the CNN + Transformer bounding-box regressor.

    Uses the module-level ``cnn_model`` and ``reshape_layer`` instances and
    the ``D_MODEL`` constant.

    Returns:
        A Keras ``Model`` mapping a ``(224, 224, 3)`` image to four sigmoid
        outputs in [0, 1], interpreted as ``(x1, y1, x2, y2)``.
    """
    inputs = Input(shape=(224, 224, 3))  # Input image

    # CNN Feature Extraction
    cnn_features = cnn_model(inputs)

    # Reshape CNN features for Transformer input
    cnn_features = reshape_layer(cnn_features)

    # Apply Transformer Encoder
    transformer_block = TransformerBlock(embed_dim=D_MODEL, num_heads=4, ff_dim=1024, rate=0.2)
    # training=None leaves the train/inference mode to Keras, so dropout is
    # driven by fit()/predict() instead of being pinned on by a hard-coded
    # training=True.
    transformer_output = transformer_block(cnn_features, training=None)

    # Flatten the Transformer output
    x = Flatten()(transformer_output)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.3)(x)

    # Output layer predicts bounding box (x1, y1, x2, y2)
    outputs = Dense(4, activation="sigmoid")(x)

    return Model(inputs, outputs)

# Build the hybrid model once at module level; the training and prediction
# cells below operate on this `model` instance.
model = build_cnn_transformer_model()
model.summary()


In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError
from tensorflow.keras.losses import Huber

# Compile the model for bounding-box regression
model.compile(
    optimizer=Adam(learning_rate=0.001),  # Adam with a fixed initial learning rate of 1e-3
    loss=Huber(delta=1.0),  # Huber loss on the four box coordinates
    metrics=[MeanAbsoluteError()]  # Report mean absolute coordinate error
)

# Train the model on the loaded images and their boxes
model.fit(
    X_train, y_train,
    epochs=30,  # Number of passes over the training data
    batch_size=32,  # Number of samples per gradient update
    validation_split=0.1  # Hold out 10% of the data for validation
)


In [None]:

# Function to draw bounding boxes on images
def draw_bounding_box(image, bbox, color=(0, 255, 0)):
    """Return a uint8 copy of the image with a bounding box drawn on it.

    Args:
        image: Image array with pixel values scaled to [0, 1]; assumed to be
            laid out as (height, width, channels).
        bbox: ``(x1, y1, x2, y2)`` as fractions of the image width (x) and
            height (y).
        color: Colour tuple passed to ``cv2.rectangle``.

    Returns:
        A new uint8 array in the 0-255 range containing the drawn rectangle.
    """
    img = (image * 255).astype(np.uint8)  # Convert back to 0-255 range

    # Scale by the image's own size instead of the global IMG_SIZE so that
    # x always uses the width and y the height.
    height, width = img.shape[:2]
    x1, y1, x2, y2 = bbox
    x1, x2 = int(x1 * width), int(x2 * width)
    y1, y2 = int(y1 * height), int(y2 * height)

    # Draw on a copy so the converted image is left untouched.
    img_with_box = img.copy()
    img_with_box = cv2.rectangle(img_with_box, (x1, y1), (x2, y2), color, 2)

    return img_with_box

# Select a few samples to visualise.
# NOTE: these are taken from X_train / y_train, so this is a sanity check of
# the fit, not an evaluation on unseen data.
num_samples = 5
sample_images = X_train[:num_samples]
true_bboxes = y_train[:num_samples]

# Predict bounding boxes for the selected images
predicted_bboxes = model.predict(sample_images)

# Plot results: labelled box on the top row, predicted box on the bottom row,
# so each prediction can be judged against its target.
# squeeze=False keeps `axes` two-dimensional even when only one column is drawn.
num_shown = len(sample_images)  # May be smaller than num_samples for a tiny dataset
fig, axes = plt.subplots(2, num_shown, figsize=(3 * num_shown, 6), squeeze=False)
for i in range(num_shown):
    img_with_true = draw_bounding_box(sample_images[i], true_bboxes[i], color=(255, 0, 0))
    img_with_pred = draw_bounding_box(sample_images[i], predicted_bboxes[i])

    axes[0, i].imshow(img_with_true)
    axes[0, i].set_title("Ground truth")
    axes[0, i].axis("off")

    axes[1, i].imshow(img_with_pred)
    axes[1, i].set_title("Prediction")
    axes[1, i].axis("off")
plt.show()
