<a href="https://colab.research.google.com/github/Aestivation/CNN-Transformer-Model/blob/main/cnn_transformer_kitti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Install the Kaggle API token under ~/.kaggle for the `kaggle` command used later.
# Assumes kaggle.json has already been uploaded to /content — TODO confirm.

# Create the .kaggle directory if it does not exist
!mkdir -p ~/.kaggle

# Copy the kaggle.json file to the correct location (the copy in /content is kept)
!cp /content/kaggle.json ~/.kaggle/

# Restrict the token to owner read/write to avoid permission errors
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import numpy as np
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Download KITTI dataset from Kaggle
!kaggle datasets download -d klemenko/kitti-dataset
!unzip -q kitti-dataset.zip -d /content/kitti_data/

# Define dataset paths (locations inside the archive extracted to /content/kitti_data/)
IMG_DIR = "/content/kitti_data/data_object_image_2/training/image_2/"
LABEL_DIR = "/content/kitti_data/data_object_label_2/training/label_2/"

# Maximum number of images to load; used as the default `num_samples`
# of load_kitti_data()
NUM_SAMPLES = 5000

# Data Augmentation to enhance the dataset
# NOTE(review): data_gen is not referenced by any other cell visible here, and
# these geometric transforms would not update the bounding-box labels — confirm
# before wiring it into training.
data_gen = ImageDataGenerator(
    rotation_range=10,       # Random rotation up to 10 degrees
    width_shift_range=0.1,   # Random horizontal shift up to 10% of width
    height_shift_range=0.1,  # Random vertical shift up to 10% of height
    horizontal_flip=True     # Randomly flip images horizontally
)



In [None]:
def load_kitti_data(img_dir, label_dir, num_samples=NUM_SAMPLES):
    """Load resized KITTI images and one normalized bounding box per image.

    Images are read with OpenCV (the channel order is left exactly as
    ``cv2.imread`` returns it), resized to 224x224 and scaled to [0, 1].
    The target of each image is the box found in fields 4..7 of the first
    line of its label file, divided by the original image width/height.

    Args:
        img_dir: Directory containing the image files.
        label_dir: Directory containing the matching ``.txt`` label files.
        num_samples: Maximum number of images to load.

    Returns:
        Tuple ``(images, labels)`` of float32 arrays. ``images`` has shape
        ``(n, 224, 224, 3)`` and ``labels`` has shape ``(n, 4)`` holding
        ``[x1, y1, x2, y2]``. Images without a usable label get the
        full-frame box ``[0, 0, 1, 1]``. Both arrays are empty when no image
        could be loaded.
    """
    img_size = (224, 224)
    default_box = [0.0, 0.0, 1.0, 1.0]  # Fallback target when no label is usable
    images, labels = [], []

    for img_file in sorted(os.listdir(img_dir)):
        if len(images) >= num_samples:
            break  # Limit the number of samples to reduce memory usage

        img = cv2.imread(os.path.join(img_dir, img_file))
        if img is None:
            # cv2.imread returns None for unreadable / non-image files; skip
            # them instead of crashing on `img.shape`.
            continue

        height, width = img.shape[:2]
        # Cast to float32 before scaling: dividing the uint8 image directly
        # would produce float64 and double the memory of the whole dataset.
        images.append(cv2.resize(img, img_size).astype(np.float32) / 255.0)

        # Label file shares the image's base name, whatever its extension
        label_name = os.path.splitext(img_file)[0] + '.txt'
        box = _read_first_kitti_box(os.path.join(label_dir, label_name), width, height)
        labels.append(default_box if box is None else box)

    return np.array(images, dtype=np.float32), np.array(labels, dtype=np.float32)


def _read_first_kitti_box(label_path, img_width, img_height):
    """Return the first label line's box as normalized [x1, y1, x2, y2], or None.

    None is returned when the file is missing, empty, or its first line does
    not contain four numeric values in fields 4..7.
    """
    if not os.path.exists(label_path):
        return None

    with open(label_path, 'r') as f:
        parts = f.readline().split()

    try:
        x1, y1, x2, y2 = map(float, parts[4:8])
    except ValueError:
        # Too few fields (including an empty file) or non-numeric values
        return None

    # Normalize bounding box coordinates based on original image size
    return [x1 / img_width, y1 / img_height, x2 / img_width, y2 / img_height]

# Build the image and bounding-box arrays from the KITTI folders
X_train, y_train = load_kitti_data(img_dir=IMG_DIR, label_dir=LABEL_DIR)



In [None]:
def build_cnn_model():
    """Build the frozen VGG16 feature extractor.

    The ImageNet-pretrained VGG16 (without its classification top) is applied
    to 224x224x3 inputs with all of its weights frozen. Its output is
    global-average-pooled and reshaped into a sequence containing a single
    feature vector.

    Returns:
        A keras.Model named "CNN_Feature_Extractor" whose output has shape
        (1, feature_dim), where feature_dim is the pooled channel count.
    """
    image_input = keras.Input(shape=(224, 224, 3))

    backbone = keras.applications.VGG16(include_top=False, weights="imagenet", input_tensor=image_input)
    backbone.trainable = False  # Keep the pretrained weights fixed

    # Collapse the spatial grid into one vector, then add a length-1 sequence axis
    pooled = layers.GlobalAveragePooling2D()(backbone.output)
    feature_dim = pooled.shape[-1]
    single_token = layers.Reshape((1, feature_dim))(pooled)

    return keras.Model(image_input, single_token, name="CNN_Feature_Extractor")

# Instantiate the feature extractor once at module level; it is reused by
# build_cnn_transformer_model() in a later cell.
cnn_model = build_cnn_model()
cnn_model.summary()


In [None]:
def build_transformer_block(embed_dim=512, num_heads=4, ff_dim=1024, dropout_rate=0.3):
    """Build one Transformer encoder block as a standalone Keras model.

    The block applies multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer's output goes through dropout, is
    added back to the sub-layer's input, and is layer-normalized.

    Args:
        embed_dim: Feature size of the input/output vectors; also passed as
            ``key_dim`` to the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied after each sub-layer.

    Returns:
        A keras.Model named "Transformer_Block" taking inputs of shape
        (1, embed_dim).
    """
    tokens = keras.Input(shape=(1, embed_dim))

    # Self-attention sub-layer: query and value are both the input tokens
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    attended = self_attention(tokens, tokens)
    attended = layers.Dropout(dropout_rate)(attended)
    attn_normed = layers.LayerNormalization(epsilon=1e-6)(tokens + attended)

    # Feed-forward sub-layer: expand to ff_dim, project back to embed_dim
    feed_forward = keras.Sequential([
        layers.Dense(ff_dim, activation="relu"),
        layers.Dense(embed_dim),
    ])
    ff_out = feed_forward(attn_normed)
    ff_out = layers.Dropout(dropout_rate)(ff_out)
    block_output = layers.LayerNormalization(epsilon=1e-6)(attn_normed + ff_out)

    return keras.Model(tokens, block_output, name="Transformer_Block")

# Instantiate the Transformer block with its default hyper-parameters; it is
# reused by build_cnn_transformer_model() in a later cell.
transformer_block = build_transformer_block()
transformer_block.summary()


In [None]:
def build_cnn_transformer_model():
    """Assemble the end-to-end bounding-box regressor.

    Chains the module-level ``cnn_model`` and ``transformer_block`` and adds a
    small dense head that outputs four sigmoid-activated values
    (Bounding Box: x1, y1, x2, y2).

    Returns:
        A keras.Model named "CNN_Transformer_Detector" taking 224x224x3 inputs.
    """
    image_input = keras.Input(shape=(224, 224, 3))

    # CNN feature extraction followed by the Transformer block
    encoded = transformer_block(cnn_model(image_input))

    # Regression head: flatten -> hidden layer with dropout -> 4 box values
    head = layers.Flatten()(encoded)
    head = layers.Dense(256, activation="relu")(head)
    head = layers.Dropout(0.2)(head)
    bbox = layers.Dense(4, activation="sigmoid")(head)

    return keras.Model(image_input, bbox, name="CNN_Transformer_Detector")

# Build the full detector; relies on cnn_model and transformer_block having
# been created by the earlier cells.
model = build_cnn_transformer_model()
model.summary()


In [None]:
# Compile for box regression: Adam optimizer, MSE loss, MAE reported as a metric
adam_optimizer = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=adam_optimizer, loss="mse", metrics=["mae"])

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Early stopping: watch validation loss, give up after 5 epochs without
# improvement, and ask Keras to restore the best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Checkpointing: only write the model to disk when validation loss improves
checkpoint = ModelCheckpoint("best_model.h5", monitor='val_loss', save_best_only=True)

# Train with 20% of the samples held out for validation
training_callbacks = [early_stopping, checkpoint]
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
    verbose=1,
    validation_split=0.2,
    callbacks=training_callbacks,
)



In [None]:
# Draw the per-epoch training and validation loss curves on one axes
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training & Validation Loss Over Epochs')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Select the last 20 samples of X_train for a visual sanity check.
# NOTE: model.fit was called with validation_split=0.2, which holds out the
# trailing samples of the arrays, so these are validation samples rather than
# an independent test set.
X_test = X_train[-20:]
y_test = y_train[-20:]  # Matching ground-truth boxes (not drawn below)

# Predict Bounding Box coordinates using the model
predicted_bboxes = model.predict(X_test)

# Display some prediction samples (at most 5, fewer if less data is available)
for i in range(min(5, len(X_test))):
    # Restore the 0-255 scale; round before the uint8 cast so that values such
    # as 254.9999 are not truncated to 254.
    img = np.clip(np.rint(X_test[i] * 255), 0, 255).astype(np.uint8)
    # The images were loaded with cv2.imread (BGR channel order) while
    # matplotlib expects RGB, so reverse the channel axis for display.
    img = img[..., ::-1]
    bbox = predicted_bboxes[i]

    # Convert Bounding Box from normalized scale to pixel values
    h, w = img.shape[:2]
    x1, y1, x2, y2 = int(bbox[0] * w), int(bbox[1] * h), int(bbox[2] * w), int(bbox[3] * h)

    # Display image with predicted Bounding Box
    fig, ax = plt.subplots()
    ax.imshow(img)
    ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='r', facecolor='none'))
    ax.axis("off")
    plt.show()



In [None]:
# Persist the trained model to disk so it can be reloaded later
saved_model_path = "cnn_transformer_kitti_trained.h5"
model.save(saved_model_path)
print("Model was saved.")

