In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model
import os
import cv2
import numpy as np

In [2]:
# === Beta-VAE Model ===
class BetaVAE(Model):
    def __init__(self, latent_dim=128, beta=1.0):
        super(BetaVAE, self).__init__()
        self.latent_dim = latent_dim
        self.beta = beta  # Weight for KL loss

        # Encoder
        self.encoder = tf.keras.Sequential([
            layers.InputLayer(input_shape=(64, 64, 3)),
            layers.Conv2D(64, (3, 3), strides=2, padding="same", activation="relu"),
            layers.Conv2D(128, (3, 3), strides=2, padding="same", activation="relu"),
            layers.Conv2D(256, (3, 3), strides=2, padding="same", activation="relu"),
            layers.Flatten(),
            layers.Dense(128, activation="relu"),
            layers.Dense(2 * latent_dim)  # Output mean and log variance
        ])

        # Decoder
        self.decoder = tf.keras.Sequential([
            layers.InputLayer(input_shape=(latent_dim,)),
            layers.Dense(8 * 8 * 256, activation="relu"),
            layers.Reshape((8, 8, 256)),
            layers.Conv2DTranspose(128, (3, 3), strides=2, padding="same", activation="relu"),
            layers.Conv2DTranspose(64, (3, 3), strides=2, padding="same", activation="relu"),
            layers.Conv2DTranspose(3, (3, 3), strides=2, padding="same", activation="tanh")
        ])

    def reparameterize(self, mean, log_var):
        """Reparameterization trick: Sample from N(mean, std)."""
        eps = tf.random.normal(shape=tf.shape(mean))
        return mean + tf.exp(0.5 * log_var) * eps

    def call(self, x):
        """Forward pass through encoder and decoder."""
        z_params = self.encoder(x)
        mean, log_var = tf.split(z_params, num_or_size_splits=2, axis=1)
        z = self.reparameterize(mean, log_var)
        x_reconstructed = self.decoder(z)

        # KL divergence loss (Beta-VAE)
        kl_loss = -0.5 * tf.reduce_mean(1 + log_var - tf.square(mean) - tf.exp(log_var))
        self.add_loss(self.beta * kl_loss)

        return x_reconstructed

In [3]:
# === Load KITTI Dataset ===
def load_kitti_data(left_images_dir, right_images_dir, image_size=(64, 64)):
    left_images, right_images = [], []

    for root, _, files in os.walk(left_images_dir):
        for file in files:
            if file.endswith(".png") or file.endswith(".jpg"):
                img_path = os.path.join(root, file)
                img = cv2.imread(img_path)
                img = cv2.resize(img, image_size) / 127.5 - 1  # Normalize to [-1, 1]
                left_images.append(img)

    for root, _, files in os.walk(right_images_dir):
        for file in files:
            if file.endswith(".png") or file.endswith(".jpg"):
                img_path = os.path.join(root, file)
                img = cv2.imread(img_path)
                img = cv2.resize(img, image_size) / 127.5 - 1  # Normalize to [-1, 1]
                right_images.append(img)

    return np.array(left_images), np.array(right_images)

In [4]:
# === Paths ===
left_train_dir = "/kaggle/input/kitti-dataset/data_object_image_2/training/image_2"
right_train_dir = "/kaggle/input/kitti-dataset/data_object_image_3/training/image_3"
left_test_dir= "/kaggle/input/kitti-dataset/data_object_image_2/testing/image_2"
right_test_dir="/kaggle/input/kitti-dataset/data_object_image_3/testing/image_3"

In [5]:
# === Load Data ===
train_L, train_R = load_kitti_data(left_train_dir, right_train_dir)
print(f"Train Left Images {train_L.shape}")
print(f"Train Right Images {train_R.shape}")

test_L, test_R = load_kitti_data(left_test_dir, right_test_dir)
print(f"Test Left Images {test_L.shape}")
print(f"Test Right Images {test_R.shape}")

Train Left Images (7481, 64, 64, 3)
Train Right Images (7481, 64, 64, 3)
Test Left Images (7518, 64, 64, 3)
Test Right Images (7518, 64, 64, 3)


In [6]:
# === Train Beta-VAE ===
vae = BetaVAE(latent_dim=128, beta=1.0)
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss="mse")
vae.fit(train_L, train_R, epochs=1000, batch_size=128,validation_data=(test_L,test_R))  # Train on left images



Epoch 1/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 193ms/step - loss: 0.4516 - val_loss: 0.3891
Epoch 2/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.3999 - val_loss: 0.3588
Epoch 3/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.3748 - val_loss: 0.3449
Epoch 4/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.3594 - val_loss: 0.3315
Epoch 5/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.3424 - val_loss: 0.3250
Epoch 6/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.3359 - val_loss: 0.3190
Epoch 7/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 0.3283 - val_loss: 0.3116
Epoch 8/1000
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.3170 - val_loss: 0.3066
Epoch 9/1000
[1m59/59[0m [3

<keras.src.callbacks.history.History at 0x79d34f746ef0>

In [21]:
# === Conditional GAN (CGAN) Components ===
# Generator
def build_generator(latent_dim):
    noise_input = layers.Input(shape=(latent_dim,))
    label_input = layers.Input(shape=(1,), dtype="int32")

    label_embedding = layers.Embedding(input_dim=2, output_dim=latent_dim)(label_input)
    label_embedding = layers.Flatten()(label_embedding)

    combined_input = layers.Multiply()([noise_input, label_embedding])
    x = layers.Dense(8 * 8 * 256, activation="relu")(combined_input)
    x = layers.Reshape((8, 8, 256))(x)
    x = layers.Conv2DTranspose(128, (3, 3), strides=2, padding="same", activation="relu")(x)
    x = layers.Conv2DTranspose(64, (3, 3), strides=2, padding="same", activation="relu")(x)
    x = layers.Conv2DTranspose(3, (3, 3), strides=2, padding="same", activation="tanh")(x)

    return Model([noise_input, label_input], x, name="Generator")

# Discriminator
def build_discriminator():
    image_input = layers.Input(shape=(64, 64, 3))
    label_input = layers.Input(shape=(1,), dtype="int32")

    label_embedding = layers.Embedding(input_dim=2, output_dim=np.prod((64, 64, 3)))(label_input)
    label_embedding = layers.Flatten()(label_embedding)
    label_embedding = layers.Reshape((64, 64, 3))(label_embedding)

    combined_input = layers.Concatenate()([image_input, label_embedding])
    x = layers.Conv2D(64, (3, 3), strides=2, padding="same", activation="relu")(combined_input)
    x = layers.Conv2D(128, (3, 3), strides=2, padding="same", activation="relu")(x)
    x = layers.Flatten()(x)
    x = layers.Dense(1, activation="sigmoid")(x)

    return Model([image_input, label_input], x, name="Discriminator")

In [22]:
# === Compile CGAN ===
latent_dim = 128
generator = build_generator(latent_dim)
discriminator = build_discriminator()

# Compile Discriminator
discriminator.compile(optimizer=tf.keras.optimizers.Adam(0.0002), loss="binary_crossentropy")

# === Combined Model (CGAN) ===
discriminator.trainable = False  # Freeze discriminator when training the generator

# Input Layers
z = layers.Input(shape=(latent_dim,))
label = layers.Input(shape=(1,))

# Generator Output
generated_image = generator([z, label])

# Discriminator Output
validity = discriminator([generated_image, label])

# Combined Model (Generator + Discriminator)
combined = Model([z, label], validity)
combined.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss="binary_crossentropy")

In [27]:
# === Train CGAN ===
def train_cgan(generator, discriminator, combined, epochs=500, batch_size=64):
    for epoch in range(epochs):
        idx = np.random.randint(0, train_L.shape[0], batch_size)
        real_images = train_L[idx]
        real_labels = np.zeros((batch_size, 1))  # Label 0 for left images

        idx = np.random.randint(0, train_R.shape[0], batch_size)
        real_images_r = train_R[idx]
        real_labels_r = np.ones((batch_size, 1))  # Label 1 for right images

        real_images = np.concatenate([real_images, real_images_r])
        real_labels = np.concatenate([real_labels, real_labels_r])

        # Train Discriminator
        noise = np.random.normal(0, 1, (batch_size * 2, latent_dim))
        fake_images = generator.predict([noise, real_labels])

        d_loss_real = discriminator.train_on_batch([real_images, real_labels], np.ones((batch_size * 2, 1)))
        d_loss_fake = discriminator.train_on_batch([fake_images, real_labels], np.zeros((batch_size * 2, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train Generator
        noise = np.random.normal(0, 1, (batch_size * 2, latent_dim))
        g_loss = combined.train_on_batch([noise, real_labels], np.ones((batch_size * 2, 1)))  

        # Extract the first element if loss is a list or tuple
        d_loss_value = d_loss[0] if isinstance(d_loss, (list, tuple)) else d_loss
        g_loss_value = g_loss[0] if isinstance(g_loss, (list, tuple)) else g_loss
        
        print(f"Epoch {epoch+1}/{epochs} | D Loss: {d_loss_value:.4f} | G Loss: {g_loss_value:.4f}")


In [None]:
# Train CGAN
train_cgan(generator, discriminator, combined, epochs=500, batch_size=64)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 1/5000 | D Loss: 0.7449 | G Loss: 0.7397
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 2/5000 | D Loss: 0.7437 | G Loss: 0.7400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 3/5000 | D Loss: 0.7430 | G Loss: 0.7401
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 4/5000 | D Loss: 0.7424 | G Loss: 0.7401
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 5/5000 | D Loss: 0.7421 | G Loss: 0.7401
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 6/5000 | D Loss: 0.7416 | G Loss: 0.7399
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 7/5000 | D Loss: 0.7412 | G Loss: 0.7397
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Epoch 8/5000 | D Loss: 0.7408 | G Loss: 0.7395
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[