In [2]:
from typing import Optional

import torch
import torch.nn as nn

In [4]:
# Hyperparameters for the vision transformer. Values are unchanged; only the
# descriptions and the annotation on num_image_tokens were corrected.

# --- embedding / feed-forward widths ---
vector_dimension = 768  # length of each patch embedding (the model's hidden size)

linear_dimension = 3072  # width of the linear layer in the feed-forward part (not used by the cells shown here)

# --- input geometry ---
image_size = 224  # height and width of the square input image, in pixels

num_channels = 3  # number of channels in image (R, G, B)

# Side length, in pixels, of one square patch. It is used as both the kernel
# size and the stride of the patch convolution, so patches do not overlap.
# The patch count is (image_size // patch_size) ** 2 = (224 // 16) ** 2 = 196.
patch_size = 16

# --- encoder settings (not used by the cells shown here) ---
attention_heads = 12  # number of attention heads

num_layers = 12  # number of transformer block layers in the model

dropout = 0.4  # presumably the dropout probability -- TODO confirm where it is applied

# Passed as the second argument of nn.LayerNorm, i.e. its `eps`: a small value
# added to the variance for numerical stability (not a proportionality constant).
normalization_constant = 1e-6

# Number of tokens (patch embeddings) produced for each image. Left unset here,
# hence Optional[int] rather than int.
num_image_tokens: Optional[int] = None

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Show which backend was picked.
print(device)

In [None]:
# Embedding layer of the vision transformer model: images -> patch embeddings

class VisionEmbedding(nn.Module):
    """Convert a batch of images into a sequence of position-aware patch embeddings.

    A strided convolution (kernel size == stride == ``patch_size``) projects every
    non-overlapping patch to a ``vector_dimension``-long vector, and a learned
    positional embedding is added to each patch.

    Args:
        vector_dimension: number of output channels of the patch convolution and
            size of each positional embedding vector.
        patch_size: kernel size and stride of the patch convolution.
        num_channels: number of input channels of the images.
        image_size: image side length used to compute the number of patches,
            ``(image_size // patch_size) ** 2``.
    """

    def __init__(self, vector_dimension, patch_size, num_channels, image_size):
        super().__init__()

        # Kernel and stride are equal, so each patch is seen exactly once.
        self.convolutor = nn.Conv2d(
            num_channels,
            vector_dimension,
            kernel_size=patch_size,
            stride=patch_size,
        )

        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side ** 2

        # One learned vector per patch position.
        self.positional_embedding = nn.Embedding(
            num_embeddings=self.num_patches,
            embedding_dim=vector_dimension,
        )

        # Indices 0..num_patches-1 with a leading axis of size 1, stored as a
        # non-persistent buffer: it moves with the module across devices but is
        # not written to the state dict.
        position_ids = torch.arange(self.num_patches).expand((1, -1))
        self.register_buffer("positions", position_ids, persistent=False)

    def forward(self, image_tensors):
        """Embed ``image_tensors`` of shape [batch_size x num_channels x height x width].

        Returns a tensor of shape [batch_size x num_patches x vector_dimension].
        """
        # [ batch_size x vector_dimension x (height / patch_size) x (width / patch_size) ]
        patch_grid = self.convolutor(image_tensors)

        # Collapse the two spatial axes, then put the patch axis before the
        # feature axis: [ batch_size x num_patches x vector_dimension ]
        patch_sequence = patch_grid.flatten(2).transpose(1, 2)

        # [ 1 x num_patches x vector_dimension ], broadcast over the batch axis
        position_vectors = self.positional_embedding(self.positions)

        return patch_sequence + position_vectors

In [None]:
class VisionTransformer(nn.Module):
    """Vision transformer backbone: patch embedding -> encoder -> final LayerNorm.

    Args:
        vector_dimension: size of each patch embedding; also the normalized
            dimension of the final LayerNorm.
        normalization_constant: ``eps`` of the final LayerNorm.
        patch_size: forwarded to ``VisionEmbedding``.
        num_channels: forwarded to ``VisionEmbedding``.
        image_size: forwarded to ``VisionEmbedding``.
        encoder_layer: optional module applied to the embedded patches. It
            receives and must return a tensor of shape
            [batch_size x num_patches x vector_dimension]. When omitted, a
            pass-through ``nn.Identity`` is used.
    """

    def __init__(self, vector_dimension, normalization_constant, patch_size, num_channels, image_size,
                 encoder_layer=None):
        super().__init__()

        self.embedding_layer = VisionEmbedding(vector_dimension, patch_size, num_channels, image_size)

        # This used to be the bare expression `self.encoder_layer`, which only
        # reads a never-assigned attribute: nn.Module raises AttributeError, so
        # the model could not be constructed. Assign it instead.
        # TODO: pass the real transformer encoder stack once it is implemented;
        # nn.Identity only keeps the pipeline runnable in the meantime.
        self.encoder_layer = encoder_layer if encoder_layer is not None else nn.Identity()

        # Epsilon passed by keyword so it cannot be mistaken for another argument.
        self.post_normalization_layer = nn.LayerNorm(vector_dimension, eps=normalization_constant)

    def forward(self, image_data):
        """Encode ``image_data`` and return the normalized hidden states.

        ``image_data`` is passed straight to ``VisionEmbedding``; the result has
        shape [batch_size x num_patches x vector_dimension], provided the
        encoder keeps that shape.
        """
        hidden_state = self.embedding_layer(image_data)

        hidden_state = self.encoder_layer(hidden_state)

        hidden_state = self.post_normalization_layer(hidden_state)

        return hidden_state

NameError: name 'nn' is not defined