# Implementing Vision Transformer from Scratch

In [10]:
import os
import warnings

# Silence TensorFlow's native (C++) logging. The variable has to be in the
# environment before `import tensorflow` runs; setting it afterwards is too
# late to reliably take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras import layers
from keras.datasets import cifar100
import tensorflow_addons as tfa

# Hide Python-level warnings raised from here on.
warnings.filterwarnings('ignore')

# Hyperparameters

In [None]:
# Size of the batch dimension.
BATCH_SIZE = 64
# Presumably the number of training epochs -- its use is not visible in this section.
EPOCH = 2
# Side length of one square patch, in pixels. CIFAR images are 32x32, so this
# gives 32 / 4 = 8 patches per side, i.e. 8 * 8 = 64 patches per image
# (sequence length 64).
PATCH_SIZE = 4

# Data

### Load data

In [9]:
# Fetch CIFAR-100 and unpack the train / test splits.
train_split, test_split = cifar100.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

# Report the shape of every array, one per line.
print(
    f'X_train shape: {X_train.shape}',
    f'y_train shape: {y_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)

X_train shape: (50000, 32, 32, 3)
y_train shape: (50000, 1)
X_test shape: (10000, 32, 32, 3)
y_test shape: (10000, 1)


### Extract Patches

In [13]:
class Patches(layers.Layer):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length, in pixels, of each square patch. It is used
            both as the window size and as the stride, so patches tile the
            image without overlap.
    """

    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size

    def call(self, images):
        """Extract patches from `images`.

        Args:
            images: 4-D image tensor laid out as (batch, height, width,
                channels), as required by `tf.image.extract_patches`.

        Returns:
            A 3-D tensor of shape (batch, num_patches, patch_dim), where
            `patch_dim` is the flattened size of one patch
            (patch_size * patch_size * channels).
        """
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )

        # Take the batch dimension from the input itself rather than from a
        # global constant, so that a smaller final batch or a single image at
        # inference time is reshaped correctly.
        batch_size = tf.shape(images)[0]
        patch_dim = patches.shape[-1]

        # Collapse the 2-D grid of patches into a single sequence axis.
        return tf.reshape(patches, [batch_size, -1, patch_dim])