In [3]:
# ------------------------------------------------------------
# TASK 3: Define the hybrid model (CNN + Vision Transformer)
# ------------------------------------------------------------

class PatchEmbedding(layers.Layer):
    """Linearly project patch vectors and add a learned position embedding.

    Args:
        num_patches: Number of patches (sequence length); also the size of
            the position-embedding table.
        projection_dim: Width of both the Dense projection and the position
            embedding, so the two can be summed.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_patches=4, projection_dim=128, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.projection = layers.Dense(projection_dim)
        # Build the embedding once, here: a layer constructed inside call()
        # gets fresh weights on every invocation that this layer never owns,
        # so they would not be tracked, trained, or saved.
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patches):
        """Return the projected patches plus their position embeddings.

        Assumes the second-to-last axis of ``patches`` has length
        ``num_patches`` so the addition broadcasts -- TODO confirm at the
        call site.
        """
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        return self.projection(patches) + self.position_embedding(positions)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

class TransformerBlock(layers.Layer):
    """Post-norm Transformer encoder block.

    Applies self-attention and a two-layer feed-forward network, each
    followed by dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Width of the feed-forward output; also used as the
            attention ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = models.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Run the block on ``inputs``.

        The residual additions require the last dimension of ``inputs`` to
        equal ``embed_dim``.

        Args:
            inputs: Sequence tensor used as both query and value.
            training: Whether dropout is active.
        """
        # Self-attention: the same tensor is passed as query and value.
        # Forward the training flag so the sub-layer sees the same mode.
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

def build_cnn_vit_hybrid(num_classes=2, embed_dim=128, num_heads=4, ff_dim=256):
    """Build the hybrid classifier on top of the module-level ``base_model``.

    Two branches share a 1x1-convolved feature map from the frozen backbone:
    a Transformer branch (tokens -> TransformerBlock -> average pooling) and
    a dense branch (Flatten -> Dense). Their outputs are concatenated and fed
    to a softmax classifier. Called with no arguments, the layer sizes are
    the original hard-coded ones.

    Side effect: sets ``base_model.trainable = False``.

    NOTE(review): no positional information is added to the tokens; the
    ``PatchEmbedding`` layer in this file is not used here -- confirm
    whether that is intended.

    Args:
        num_classes: Number of units in the softmax output layer.
        embed_dim: Channel width after the 1x1 convolution; also the
            Transformer token width and the dense-branch width.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Hidden width of the Transformer feed-forward network.

    Returns:
        A ``models.Model`` named ``"CNN_ViT_Hybrid"`` mapping
        ``base_model.input`` to class probabilities.
    """
    base_model.trainable = False  # Freeze the backbone (VGG16 per the original note)

    # CNN feature extraction; original note gives the shape as (None, 2, 2, 512).
    cnn_features = base_model.output
    # 1x1 convolution reduces the channel depth to the token width.
    cnn_features = layers.Conv2D(embed_dim, (1, 1), activation='relu')(cnn_features)

    # Flatten the spatial grid into a token sequence. -1 lets Keras infer the
    # token count, so the model is not tied to a 2x2 feature map.
    patches = layers.Reshape((-1, embed_dim))(cnn_features)

    # Transformer encoder branch
    transformer = TransformerBlock(
        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
    )(patches)
    transformer_out = layers.GlobalAveragePooling1D()(transformer)

    # CNN dense branch
    cnn_out = layers.Flatten()(cnn_features)
    cnn_out = layers.Dense(embed_dim, activation='relu')(cnn_out)

    # Combine both branches and classify
    combined = layers.Concatenate()([cnn_out, transformer_out])
    outputs = layers.Dense(num_classes, activation='softmax')(combined)

    model = models.Model(inputs=base_model.input, outputs=outputs, name="CNN_ViT_Hybrid")
    return model

# Instantiate the hybrid network and print its layer summary
hybrid_model = build_cnn_vit_hybrid()
hybrid_model.summary()

# ------------------------------------------------------------
# TASK 4 & 5: Compile and define training config
# ------------------------------------------------------------
# The config dict is declared first so compile() reads its hyper-parameters
# from it instead of repeating the literals.
train_config = {
    "epochs": 5,
    "batch_size": 4,
    "optimizer": "Adam",
    "learning_rate": 0.0001,
    "loss_function": "categorical_crossentropy",
    "metrics": ["accuracy"]
}

hybrid_model.compile(
    optimizer=optimizers.Adam(learning_rate=train_config["learning_rate"]),
    loss=train_config["loss_function"],
    # Pass a copy so the config dict keeps its own list.
    metrics=list(train_config["metrics"]),
)

print("✅ Hybrid CNN-ViT model compiled successfully.")

print("✅ Training configuration defined:")
# One "key: value" line per config entry.
for k, v in train_config.items():
    print(k, v, sep=": ")


✅ Hybrid CNN-ViT model compiled successfully.
✅ Training configuration defined:
epochs: 5
batch_size: 4
optimizer: Adam
learning_rate: 0.0001
loss_function: categorical_crossentropy
metrics: ['accuracy']
