<a href="https://colab.research.google.com/github/DivyaMeenaSundaram/Deep_Learning_Lab/blob/main/CIFAR_10_Transformer_comparision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 and rescale pixel intensities to float32 values in [0, 1]
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = (split.astype('float32') / 255.0 for split in (x_train, x_test))

# Hold out the first 20% of the training samples as the validation set;
# the remaining 80% stay as training data.
val_size = int(0.2 * x_train.shape[0])
x_val, x_train = x_train[:val_size], x_train[val_size:]
y_val, y_train = y_train[:val_size], y_train[val_size:]

# Create patches from images
def create_patches(images, patch_size):
    """Cut each image into non-overlapping square patches and flatten them.

    Args:
        images: 4-D array or tensor whose ``.shape`` unpacks as
            (batch, height, width, channels).
        patch_size: Side length of each square patch, in pixels.

    Returns:
        Tensor of shape
        (batch, num_patches, patch_size * patch_size * channels).
    """
    n_images, img_h, img_w, n_channels = images.shape
    # Stride equals the window size, so patches tile the image without
    # overlap; "VALID" padding drops any partial patch at the border.
    window = [1, patch_size, patch_size, 1]
    tiles = tf.image.extract_patches(
        images=images,
        sizes=window,
        strides=window,
        rates=[1, 1, 1, 1],
        padding="VALID",
    )
    tokens_per_image = (img_h // patch_size) * (img_w // patch_size)
    flat_dim = patch_size * patch_size * n_channels
    return tf.reshape(tiles, [n_images, tokens_per_image, flat_dim])

# Swin Block Implementation
class SwinBlock(layers.Layer):
    """Windowed self-attention block followed by an MLP, each with a residual.

    Tokens are laid out on a square grid, optionally cyclically shifted,
    split into non-overlapping ``window_size`` x ``window_size`` windows and
    attended within each window. The attended tokens are then put back into
    their original raster order (windows merged, shift undone) before being
    added to the block input.

    NOTE: no attention mask is applied for shifted windows, so tokens that
    wrap around the grid edge can attend to each other.

    Args:
        dim: Channel size of the incoming tokens; also the MLP output size.
        num_heads: Number of attention heads.
        window_size: Side length of each attention window, in tokens. The
            grid side must be divisible by it.
        shift_size: Cyclic shift (in tokens) applied before windowing;
            0 disables shifting.
        mlp_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate used in the attention layer and the MLP.
    """

    def __init__(self, dim, num_heads, window_size, shift_size, mlp_dim, dropout_rate=0.1, **kwargs):
        super(SwinBlock, self).__init__(**kwargs)
        self.dim = dim
        self.window_size = window_size
        self.shift_size = shift_size
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.dropout_rate = dropout_rate

        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        # In Keras, key_dim is the projection size of each individual head.
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=dim, dropout=dropout_rate)
        self.dropout1 = layers.Dropout(dropout_rate)

        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='relu'),
            layers.Dropout(dropout_rate),
            layers.Dense(dim),
            layers.Dropout(dropout_rate)
        ])

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super(SwinBlock, self).get_config()
        config.update({
            'dim': self.dim,
            'num_heads': self.num_heads,
            'window_size': self.window_size,
            'shift_size': self.shift_size,
            'mlp_dim': self.mlp_dim,
            'dropout_rate': self.dropout_rate,
        })
        return config

    def call(self, x):
        """Apply window attention and the MLP to tokens of shape (batch, num_patches, dim).

        ``num_patches`` must be a perfect square (tokens form a square grid).
        Returns a tensor with the same shape as ``x``.
        """
        input_shape = tf.shape(x)
        batch_size, num_patches = input_shape[0], input_shape[1]
        side = tf.cast(tf.sqrt(tf.cast(num_patches, tf.float32)), tf.int32)
        ws = self.window_size
        grid = side // ws  # number of windows along each grid axis

        # Keep the un-normalized input for the residual (pre-norm layout).
        shortcut = x

        tokens = self.norm1(x)
        tokens = tf.reshape(tokens, (batch_size, side, side, self.dim))

        if self.shift_size > 0:
            tokens = tf.roll(tokens, shift=[-self.shift_size, -self.shift_size], axis=[1, 2])

        # Window partition: (B, H, W, C) -> (B * num_windows, ws * ws, C).
        tokens = tf.reshape(tokens, (batch_size, grid, ws, grid, ws, self.dim))
        tokens = tf.transpose(tokens, (0, 1, 3, 2, 4, 5))
        windows = tf.reshape(tokens, (-1, ws * ws, self.dim))

        # Self-attention restricted to the tokens of each window.
        attended = self.attention(windows, windows)
        attended = self.dropout1(attended)

        # Window merge: the exact inverse of the partition above, so every
        # token returns to the grid position it was taken from.
        attended = tf.reshape(attended, (batch_size, grid, grid, ws, ws, self.dim))
        attended = tf.transpose(attended, (0, 1, 3, 2, 4, 5))
        attended = tf.reshape(attended, (batch_size, side, side, self.dim))

        # Undo the cyclic shift so tokens line up with the residual branch.
        if self.shift_size > 0:
            attended = tf.roll(attended, shift=[self.shift_size, self.shift_size], axis=[1, 2])

        attended = tf.reshape(attended, input_shape)
        x = shortcut + attended

        # Feed-forward sub-block with its own residual connection.
        return x + self.mlp(self.norm2(x))

# Swin Transformer Model
def swin_transformer(num_patches, patch_dim, window_size, num_heads, mlp_dim, num_layers, num_classes, dropout_rate=0.1):
    """Build a classifier that stacks SwinBlocks directly on flattened patches.

    Blocks alternate between unshifted windows (even index) and windows
    shifted by ``window_size // 2`` (odd index). The resulting tokens are
    mean-pooled and fed to a softmax layer.

    Args:
        num_patches: Number of tokens per image.
        patch_dim: Size of each flattened patch; also used as the block width.
        window_size: Attention window side length, in tokens.
        num_heads: Attention heads per block.
        mlp_dim: Hidden width of each block's MLP.
        num_layers: Number of SwinBlocks to stack.
        num_classes: Number of output classes.
        dropout_rate: Dropout rate forwarded to every block.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping
        (batch, num_patches, patch_dim) to class probabilities.
    """
    inputs = layers.Input(shape=(num_patches, patch_dim))

    half_window = window_size // 2
    features = inputs
    for layer_idx in range(num_layers):
        is_shifted = layer_idx % 2 == 1
        block = SwinBlock(
            dim=patch_dim,
            num_heads=num_heads,
            window_size=window_size,
            shift_size=half_window if is_shifted else 0,
            mlp_dim=mlp_dim,
            dropout_rate=dropout_rate,
        )
        features = block(features)

    pooled = layers.GlobalAveragePooling1D()(features)
    outputs = layers.Dense(num_classes, activation='softmax')(pooled)
    return tf.keras.Model(inputs, outputs)

# Model Parameters
PATCH_SIZE = 4  # side length of each square image patch, in pixels
WINDOW_SIZE = 4  # side length of each attention window, in patches
num_patches = (32 // PATCH_SIZE) ** 2  # 64 patches for 32x32 images
patch_dim = PATCH_SIZE * PATCH_SIZE * 3  # flattened RGB patch: 4 * 4 * 3 = 48 values
model = swin_transformer(
    num_patches=num_patches,
    patch_dim=patch_dim,
    window_size=WINDOW_SIZE,
    num_heads=4,
    mlp_dim=128,
    num_layers=4,
    num_classes=10,
    dropout_rate=0.1
)

# Compile the model.
# The labels are passed to fit/evaluate as loaded (not one-hot encoded),
# hence the sparse variant of the cross-entropy loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Prepare the data: turn every split into (samples, num_patches, patch_dim)
x_train_patches = create_patches(x_train, PATCH_SIZE)
x_val_patches = create_patches(x_val, PATCH_SIZE)
x_test_patches = create_patches(x_test, PATCH_SIZE)

# Train the model for up to 50 epochs; early stopping watches val_accuracy
# with a patience of 5 epochs and restores the best weights seen.
history = model.fit(
    x_train_patches, y_train,
    validation_data=(x_val_patches, y_val),
    batch_size=64,
    epochs=50,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)]
)

# Save the model
# NOTE(review): the model contains the custom SwinBlock layer, so reloading
# this file presumably needs it passed via custom_objects - confirm.
model.save('swin_transformer_model.h5')

# Evaluate the model on the held-out test split
test_loss, test_accuracy = model.evaluate(x_test_patches, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 0us/step
Epoch 1/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 464ms/step - accuracy: 0.2984 - loss: 1.8819 - val_accuracy: 0.4097 - val_loss: 1.6044
Epoch 2/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 474ms/step - accuracy: 0.4413 - loss: 1.5188 - val_accuracy: 0.4764 - val_loss: 1.4224
Epoch 3/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 460ms/step - accuracy: 0.4866 - loss: 1.4082 - val_accuracy: 0.5249 - val_loss: 1.3315
Epoch 4/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 462ms/step - accuracy: 0.5141 - loss: 1.3320 - val_accuracy: 0.5330 - val_loss: 1.2990
Epoch 5/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 461ms/step - accuracy: 0.5361 - loss: 1.2760 - val_accuracy: 0.5449 - val_loss: 1.2673
Epoc



[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 75ms/step - accuracy: 0.6404 - loss: 1.0790
Test Accuracy: 0.64


In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np

# Load and preprocess CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
# Cast to float32 before scaling: dividing the raw integer pixel arrays by a
# Python float would promote them to float64 and double their memory use.
x_train = x_train.astype('float32') / 255.0  # Normalize to [0, 1]
x_test = x_test.astype('float32') / 255.0
# Hold out the first 20% of the training samples for validation.
val_size = int(0.2 * len(x_train))
x_val, y_val = x_train[:val_size], y_train[:val_size]
x_train, y_train = x_train[val_size:], y_train[val_size:]

# Parameters
PATCH_SIZE = 4  # Side length of each square patch, in pixels
EMBED_DIM = 128  # Width of the token embedding
NUM_HEADS = 4  # Attention heads per transformer block
NUM_LAYERS = 3  # Number of transformer blocks
MLP_DIM = 256  # Hidden width of the feed-forward network in each block
DROPOUT_RATE = 0.1
NUM_CLASSES = 10
EPOCHS = 10  # Maximum number of training epochs

# Patch extraction
def extract_patches(images, patch_size):
    """Split images into non-overlapping square patches and flatten each one.

    Args:
        images: 4-D array or tensor laid out as (batch, height, width, channels).
        patch_size: Side length of each square patch, in pixels.

    Returns:
        Tensor of shape
        (batch, num_patches, patch_size * patch_size * channels).
    """
    batch_size = tf.shape(images)[0]
    # Stride equals the window size, so the patches tile the image without
    # overlap; 'VALID' padding drops any partial patch at the border.
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, patch_size, patch_size, 1],
        strides=[1, patch_size, patch_size, 1],
        rates=[1, 1, 1, 1],
        padding='VALID'
    )
    # Take the channel count from the input rather than assuming RGB, falling
    # back to the runtime shape when it is not statically known.
    channels = images.shape[-1]
    if channels is None:
        channels = tf.shape(images)[-1]
    patch_dim = patch_size * patch_size * channels
    patches = tf.reshape(patches, [batch_size, -1, patch_dim])
    return patches

# Positional Encoding
def positional_encoding(num_patches, dim):
    """Build a fixed sinusoidal position table.

    Even channels hold the sine and odd channels the cosine of
    ``position / 10000 ** (2 * (channel // 2) / dim)``.

    Args:
        num_patches: Number of positions (rows of the table).
        dim: Number of channels (columns of the table).

    Returns:
        float32 tensor of shape (num_patches, dim).
    """
    token_idx = np.arange(num_patches).reshape(-1, 1)
    channel_idx = np.arange(dim).reshape(1, -1)
    # Channels 2k and 2k + 1 share one frequency.
    exponents = (2 * (channel_idx // 2)) / np.float32(dim)
    table = token_idx * (1 / np.power(10000, exponents))
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return tf.cast(table, dtype=tf.float32)

# Transformer block
def transformer_block(x, num_heads, mlp_dim, dropout_rate):
    """Post-norm encoder block: self-attention, then a two-layer MLP.

    Each sub-block is wrapped as ``LayerNorm(input + sublayer(input))``.
    Dropout is applied only between the two MLP layers.

    Args:
        x: Symbolic token tensor whose last dimension is the model width.
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate inside the feed-forward network.

    Returns:
        Tensor with the same last dimension as ``x``.
    """
    width = x.shape[-1]

    # key_dim is the size of each head, so every head projects to the full
    # model width here.
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=width)
    attended = self_attention(x, x)
    normed = layers.LayerNormalization()(x + attended)

    hidden = layers.Dense(mlp_dim, activation='relu')(normed)
    hidden = layers.Dropout(dropout_rate)(hidden)
    projected = layers.Dense(width)(hidden)
    return layers.LayerNormalization()(normed + projected)

# Vision Transformer model
def create_vit(num_patches, patch_dim, embed_dim, num_heads, mlp_dim, num_layers, num_classes, dropout_rate):
    """Assemble a small Vision Transformer classifier over flattened patches.

    Pipeline: linear patch embedding, fixed sinusoidal position encoding,
    ``num_layers`` transformer blocks, mean pooling over tokens, softmax.

    Args:
        num_patches: Number of tokens per image.
        patch_dim: Size of each flattened input patch.
        embed_dim: Width of the token embedding.
        num_heads: Attention heads per block.
        mlp_dim: Hidden width of each block's feed-forward network.
        num_layers: Number of transformer blocks.
        num_classes: Number of output classes.
        dropout_rate: Dropout rate forwarded to every block.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    patch_input = layers.Input(shape=(num_patches, patch_dim))

    # Project raw patches to the model width, then inject position information.
    tokens = layers.Dense(embed_dim)(patch_input)
    tokens = tokens + positional_encoding(num_patches, embed_dim)

    for _block_idx in range(num_layers):
        tokens = transformer_block(tokens, num_heads, mlp_dim, dropout_rate)

    # Classification head: average all tokens, then predict class probabilities.
    pooled = layers.GlobalAveragePooling1D()(tokens)
    class_probs = layers.Dense(num_classes, activation='softmax')(pooled)
    return tf.keras.Model(patch_input, class_probs)

# Create model
num_patches = (32 // PATCH_SIZE) ** 2  # tokens per image, assuming 32x32 inputs
patch_dim = PATCH_SIZE * PATCH_SIZE * 3  # values per flattened RGB patch
model = create_vit(
    num_patches=num_patches,
    patch_dim=patch_dim,
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    mlp_dim=MLP_DIM,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES,
    dropout_rate=DROPOUT_RATE
)

# Compile model
# The learning rate starts at 1e-3 and is scaled by 0.9 per 10000 optimizer steps.
# NOTE(review): training below runs len(x_train) // 64 steps per epoch for
# EPOCHS epochs, which looks like fewer than 10000 steps in total - confirm
# the decay interval is intended.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=10000,
    decay_rate=0.9
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Data Augmentation: random rotations, horizontal/vertical shifts and
# horizontal flips applied to the training images.
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# Patch generator
def patch_generator(datagen, x_data, y_data, patch_size, batch_size=64):
    """Yield augmented image batches converted to patch sequences.

    Args:
        datagen: Object exposing ``flow(x, y, batch_size=...)`` that yields
            (images, labels) batches, e.g. an ``ImageDataGenerator``.
        x_data: Training images.
        y_data: Labels aligned with ``x_data``.
        patch_size: Side length of each square patch, in pixels.
        batch_size: Number of samples per yielded batch. Keep it in sync
            with the ``steps_per_epoch`` passed to ``fit``.

    Yields:
        Tuples ``(patches, labels)`` for as long as ``datagen.flow`` keeps
        producing batches.
    """
    for x_batch, y_batch in datagen.flow(x_data, y_data, batch_size=batch_size):
        yield extract_patches(x_batch, patch_size), y_batch

# Train the model
# Early stopping watches val_accuracy with a patience of 5 epochs and
# restores the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
history = model.fit(
    # Training batches are augmented and converted to patches on the fly.
    patch_generator(datagen, x_train, y_train, PATCH_SIZE),
    # Validation images are not augmented; they are only split into patches.
    validation_data=(extract_patches(x_val, PATCH_SIZE), y_val),
    # 64 matches the batch size used inside patch_generator.
    steps_per_epoch=len(x_train) // 64,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

# Evaluate on the held-out test split
test_patches = extract_patches(x_test, PATCH_SIZE)
test_loss, test_accuracy = model.evaluate(test_patches, y_test, verbose=2)
print(f"Test Accuracy: {test_accuracy:.2f}")


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m898s[0m 1s/step - accuracy: 0.2195 - loss: 2.1437 - val_accuracy: 0.3942 - val_loss: 1.6679
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m868s[0m 1s/step - accuracy: 0.3853 - loss: 1.6770 - val_accuracy: 0.4275 - val_loss: 1.5854
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m903s[0m 1s/step - accuracy: 0.4183 - loss: 1.6029 - val_accuracy: 0.4777 - val_loss: 1.4644
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m876s[0m 1s/step - accuracy: 0.4451 - loss: 1.5264 - val_accuracy: 0.4558 - val_loss: 1.5123
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m907s[0m 1s/step - accuracy: 0.4599 - loss: 1.4908 - val_accuracy: 0.4737 - val_loss: 1.4706
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m919s[0m 1s/step - accuracy: 0.4794 - loss: 1.4449 - val_accuracy: 0.4822 - val_loss: 1.4435
Epoch 7/10
[1m625/625

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Rescale pixel values to float32 in the range [0, 1]
x_train, x_test = x_train.astype('float32') / 255.0, x_test.astype('float32') / 255.0

# Hold out the first 20% of the training samples as the validation set
val_size = int(0.2 * x_train.shape[0])
x_val, x_train = x_train[:val_size], x_train[val_size:]
y_val, y_train = y_train[:val_size], y_train[val_size:]

# Side length, in pixels, of each square patch
PATCH_SIZE = 2  # Each patch is 2x2 pixels

# Function to split an image into patches
def extract_patches(images, patch_size):
    """Split images into non-overlapping square patches and flatten each one.

    Args:
        images: 4-D array or tensor laid out as (batch, height, width, channels).
        patch_size: Side length of each square patch, in pixels.

    Returns:
        Tensor of shape
        (batch, num_patches, patch_size * patch_size * channels).
    """
    batch_size = tf.shape(images)[0]
    # Stride equals the window size, so the patches tile the image without
    # overlap; 'VALID' padding drops any partial patch at the border.
    patches = tf.image.extract_patches(
        images=images,
        sizes=[1, patch_size, patch_size, 1],
        strides=[1, patch_size, patch_size, 1],
        rates=[1, 1, 1, 1],
        padding='VALID'
    )
    # Reshape patches to (batch_size, num_patches, patch_height * patch_width * channels).
    # The channel count comes from the input rather than assuming RGB, with a
    # fallback to the runtime shape when it is not statically known.
    channels = images.shape[-1]
    if channels is None:
        channels = tf.shape(images)[-1]
    patch_dim = patch_size * patch_size * channels
    patches = tf.reshape(patches, [batch_size, -1, patch_dim])
    return patches

# Apply patch extraction
# Same patching for every split, evaluated in train / val / test order.
train_patches, val_patches, test_patches = (
    extract_patches(split, PATCH_SIZE) for split in (x_train, x_val, x_test)
)

# Define positional encoding
def positional_encoding(num_patches, dim):
    """Return a fixed sinusoidal position table of shape (num_patches, dim).

    Column ``c`` of row ``p`` starts as ``p / 10000 ** (2 * (c // 2) / dim)``;
    even columns are then replaced by its sine and odd columns by its cosine.
    The result is a float32 tensor.
    """
    rows = np.arange(num_patches)[:, None]
    cols = np.arange(dim)[None, :]
    inv_freq = 1 / np.power(10000, (2 * (cols // 2)) / np.float32(dim))
    table = rows * inv_freq

    # Sine on even channel indices, cosine on odd ones.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    return tf.cast(table, dtype=tf.float32)

# Define TNT transformer block (Tokenized transformer block)
def tnt_transformer_block(inputs, num_heads, mlp_dim, dropout_rate):
    """Post-norm transformer encoder block.

    Despite the "TNT" name, the block is a single self-attention layer
    followed by a two-layer MLP; there is no nested inner transformer.

    Args:
        inputs: Symbolic token tensor; its last dimension is the model width.
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the MLP (L2-regularized with factor 1e-4).
        dropout_rate: Dropout rate after attention and inside the MLP.

    Returns:
        Tensor with the same last dimension as ``inputs``.
    """
    width = inputs.shape[-1]

    # Self-attention sub-block: attention -> dropout -> residual -> LayerNorm.
    attn_layer = layers.MultiHeadAttention(num_heads=num_heads, key_dim=width)
    attn = attn_layer(inputs, inputs)
    attn = layers.Dropout(dropout_rate)(attn)
    attn_normed = layers.LayerNormalization(epsilon=1e-6)(inputs + attn)

    # Feed-forward sub-block: expand -> dropout -> project -> residual -> LayerNorm.
    hidden = layers.Dense(
        mlp_dim,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(1e-4),
    )(attn_normed)
    hidden = layers.Dropout(dropout_rate)(hidden)
    projected = layers.Dense(width)(hidden)
    return layers.LayerNormalization(epsilon=1e-6)(attn_normed + projected)

# Define the TNT Vision Transformer
def tnt_vision_transformer(num_patches, patch_dim, num_heads, mlp_dim, num_layers, num_classes, dropout_rate):
    """Build a transformer classifier that works on raw flattened patches.

    There is no learned patch embedding: the sinusoidal position table is
    added straight to the ``patch_dim``-wide inputs, which then pass through
    ``num_layers`` encoder blocks, mean pooling and a softmax layer.

    Args:
        num_patches: Number of tokens per image.
        patch_dim: Size of each flattened patch; also the model width.
        num_heads: Attention heads per block.
        mlp_dim: Hidden width of each block's MLP.
        num_layers: Number of encoder blocks.
        num_classes: Number of output classes.
        dropout_rate: Dropout rate forwarded to every block.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    patch_input = layers.Input(shape=(num_patches, patch_dim))

    # Position table with a leading axis of size 1 so it broadcasts over the batch.
    position_table = tf.expand_dims(positional_encoding(num_patches, patch_dim), 0)
    tokens = patch_input + position_table

    for _block_idx in range(num_layers):
        tokens = tnt_transformer_block(
            tokens,
            num_heads=num_heads,
            mlp_dim=mlp_dim,
            dropout_rate=dropout_rate,
        )

    # Classification head
    pooled = layers.GlobalAveragePooling1D()(tokens)
    class_probs = layers.Dense(num_classes, activation='softmax')(pooled)

    return tf.keras.Model(inputs=patch_input, outputs=class_probs)

# Instantiate the TNT Vision Transformer
num_patches = (32 // PATCH_SIZE) ** 2  # 256 patches for 32x32 images with 2x2 patches
patch_dim = PATCH_SIZE * PATCH_SIZE * 3  # Each patch is 2x2x3 = 12 values
model = tnt_vision_transformer(
    num_patches=num_patches,
    patch_dim=patch_dim,
    num_heads=4,  # Number of attention heads per block
    mlp_dim=128,  # Hidden width of each block's MLP
    num_layers=3,  # Number of encoder blocks
    num_classes=10,
    dropout_rate=0.1  # Dropout rate used inside every block
)

# Compile the model
# The learning rate starts at 1e-3 and is scaled by 0.9 per 10000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=10000,
    decay_rate=0.9
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Print model summary
model.summary()

# Data Augmentation: random rotations, horizontal/vertical shifts and
# horizontal flips applied to the training images.
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)
# NOTE(review): no data-dependent option (e.g. featurewise_center) is enabled
# above, so this fit call looks unnecessary - confirm before removing.
datagen.fit(x_train)

# Wrap data augmentation for patches
def augmented_patch_generator(datagen, x_data, y_data, patch_size, batch_size=64):
    """Yield augmented image batches converted to patch sequences.

    Args:
        datagen: Object exposing ``flow(x, y, batch_size=...)`` that yields
            (images, labels) batches, e.g. an ``ImageDataGenerator``.
        x_data: Training images.
        y_data: Labels aligned with ``x_data``.
        patch_size: Side length of each square patch, in pixels.
        batch_size: Number of samples per yielded batch. Keep it in sync
            with the ``steps_per_epoch`` passed to ``fit``.

    Yields:
        Tuples ``(patches, labels)`` for as long as ``datagen.flow`` keeps
        producing batches.
    """
    for x_batch, y_batch in datagen.flow(x_data, y_data, batch_size=batch_size):
        yield extract_patches(x_batch, patch_size), y_batch

# Train the model
# NOTE(review): patience (10) equals the epoch count below, so early stopping
# cannot end training before the final epoch - confirm this is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
history = model.fit(
    # Training batches are augmented and converted to patches on the fly.
    augmented_patch_generator(datagen, x_train, y_train, PATCH_SIZE),
    # Validation patches were extracted from un-augmented images.
    validation_data=(val_patches, y_val),
    # 64 matches the batch size used inside augmented_patch_generator.
    steps_per_epoch=len(x_train) // 64,
    epochs=10,  # Maximum number of training epochs
    callbacks=[early_stopping]
)

# Save the model
model.save('tnt_vision_transformer_model_optimized.h5')

# Evaluate the model on the held-out test split
test_loss, test_accuracy = model.evaluate(test_patches, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1226s[0m 2s/step - accuracy: 0.1909 - loss: 2.1375 - val_accuracy: 0.2938 - val_loss: 1.9029
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1189s[0m 2s/step - accuracy: 0.2858 - loss: 1.9353 - val_accuracy: 0.3452 - val_loss: 1.8044
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1240s[0m 2s/step - accuracy: 0.3230 - loss: 1.8633 - val_accuracy: 0.3712 - val_loss: 1.7316
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1199s[0m 2s/step - accuracy: 0.3482 - loss: 1.8010 - val_accuracy: 0.3768 - val_loss: 1.7183
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1208s[0m 2s/step - accuracy: 0.3636 - loss: 1.7612 - val_accuracy: 0.4011 - val_loss: 1.6642
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1175s[0m 2s/step - accuracy: 0.3790 - loss: 1.7231 - val_accuracy: 0.4057 - val_loss: 1.6486
Epoch 7/10
[1m6



[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 409ms/step - accuracy: 0.4498 - loss: 1.5246
Test Accuracy: 0.45
