In [27]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import os
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.layers import *
from tensorflow.keras.models import *

In [28]:
class PatchEmbedding(tf.keras.layers.Layer):
    """Turn a batch of images into a sequence of embedded patch tokens.

    Each image is cut into non-overlapping ``size`` x ``size`` patches. Every
    patch is flattened and linearly projected to ``projection_dim``; the
    sequence is then prefixed with a learnable class token and a learned
    position embedding is added to every token.

    Args:
        size: Side length (in pixels) of one square patch.
        num_of_patches: Number of patches a single image is split into. The
            extra slot for the class token is added internally, so pass the
            patch count only.
        projection_dim: Width of every output token.
        **kwargs: Standard ``tf.keras.layers.Layer`` arguments such as ``name``.
    """

    def __init__(self, size, num_of_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.size = size
        # Sequence length handed to the encoder: one class token + all patches.
        self.num_of_patches = num_of_patches + 1
        self.projection_dim = projection_dim

        # Linear projection applied to every flattened patch.
        self.projection = tf.keras.layers.Dense(projection_dim)
        # Learnable class token. Created through add_weight so that Keras
        # registers it as a trainable weight of this layer.
        self.clsToken = self.add_weight(
            name='cls_token',
            shape=(1, 1, projection_dim),
            initializer=tf.keras.initializers.GlorotNormal(),
            trainable=True,
        )
        # One learned position vector per token, class token included.
        self.positionEmbedding = tf.keras.layers.Embedding(self.num_of_patches, projection_dim)

    def call(self, inputs):
        """Embed a batch of images.

        Args:
            inputs: 4-D image batch ``(batch, height, width, channels)``. The
                image size has to yield exactly the configured number of
                patches, otherwise the position embedding cannot be added.

        Returns:
            Tensor of shape ``(batch, patches + 1, projection_dim)``; index 0
            along the second axis is the class token.
        """
        batch_size = tf.shape(inputs)[0]

        patches = tf.image.extract_patches(
            inputs,
            sizes=[1, self.size, self.size, 1],
            strides=[1, self.size, self.size, 1],
            rates=[1, 1, 1, 1],
            padding='VALID',
        )
        # extract_patches already flattens each patch into the last axis
        # (size * size * channels). Reading the width from the tensor keeps the
        # layer usable for any channel count instead of RGB only.
        patch_dim = patches.shape[-1]
        if patch_dim is None:
            # Channel count unknown at trace time: keep the original RGB assumption.
            patch_dim = self.size * self.size * 3
        patches = tf.reshape(patches, (batch_size, -1, patch_dim))

        patches = self.projection(patches)

        # Prepend one copy of the class token to every sequence in the batch.
        clsToken = tf.repeat(self.clsToken, repeats=batch_size, axis=0)
        patches = tf.concat((clsToken, patches), axis=1)

        # Shape (1, tokens): broadcast over the batch when added below.
        positions = tf.range(0, self.num_of_patches, 1)[tf.newaxis, ...]
        positionalEmbedding = self.positionEmbedding(positions)
        patches = patches + positionalEmbedding
        return patches

In [29]:
# Smoke test: 144x144 RGB images with 16x16 patches give 9 * 9 = 81 patches,
# so the output sequence has 81 + 1 (class token) entries of width 128.
embedding = PatchEmbedding(size=16, num_of_patches=81, projection_dim=128)
result = embedding(
    tf.random.normal(shape=(32, 144, 144, 3))
)
print(result.shape)

(32, 82, 128)


In [30]:
class TransformerLayer(tf.keras.layers.Layer):
    """Pre-norm transformer encoder block: self-attention followed by an MLP.

    Both sub-blocks normalise their input first and add their result back onto
    the un-normalised input (residual connection).

    Args:
        d_model: Token width; equals the ``projection_dim`` of the patch embedding.
        heads: Number of attention heads; each head uses ``d_model // heads``
            features for keys and values.
        mlp_rate: Expansion factor of the hidden MLP layer
            (``d_model * mlp_rate`` units).
        dropout_rate: Dropout rate used inside the attention layer and after
            each dense layer of the MLP.
        **kwargs: Standard ``tf.keras.layers.Layer`` arguments such as ``name``.
    """

    def __init__(self, d_model, heads, mlp_rate, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.layernorm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=heads,
            key_dim=d_model // heads,
            value_dim=d_model // heads,
            dropout=dropout_rate,
        )

        self.layernorm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        # NOTE(review): the output projection also applies GELU. The reference
        # ViT MLP presumably uses a linear output layer; kept unchanged here so
        # the trained behaviour stays the same. Confirm whether it is intended.
        self.mlp = tf.keras.Sequential([
            tf.keras.layers.Dense(d_model * mlp_rate, activation='gelu'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(d_model, activation='gelu'),
            tf.keras.layers.Dropout(dropout_rate)
        ])

    def call(self, inputs, training=None):
        """Apply the attention and MLP sub-blocks to ``inputs``.

        Args:
            inputs: Token tensor whose last axis has ``d_model`` features.
            training: ``True`` enables dropout, ``False`` disables it. The
                default ``None`` leaves the decision to Keras, so calling the
                layer directly no longer forces dropout on (the previous
                default was ``True``).

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the normalised input serves as both query and value.
        out_1 = self.layernorm_1(inputs)
        out_1 = self.mha(out_1, out_1, training=training)
        out_1 = out_1 + inputs

        # Position-wise MLP with its own residual connection.
        out_2 = self.layernorm_2(out_1)
        out_2 = self.mlp(out_2, training=training)
        out_2 = out_1 + out_2
        return out_2

In [31]:
# Smoke test: a single block must keep the (batch, tokens, d_model) shape.
transformer = TransformerLayer(d_model=128, heads=2, mlp_rate=2)
transformer(result).shape    # transformer output


TensorShape([32, 82, 128])

In [32]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Stack of ``num_layers`` identically configured ``TransformerLayer`` blocks.

    Args:
        d_model: Token width, passed to every block.
        heads: Number of attention heads per block.
        mlp_rate: MLP expansion factor per block.
        num_layers: How many blocks are applied one after another.
        dropout_rate: Dropout rate passed to every block.
        **kwargs: Standard ``tf.keras.layers.Layer`` arguments such as ``name``.
    """

    def __init__(self, d_model, heads, mlp_rate, num_layers=1, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # stacking transformer layers
        self.encoders = [
            TransformerLayer(d_model, heads, mlp_rate, dropout_rate)
            for _ in range(num_layers)
        ]

    def call(self, inputs, training=None):
        """Run ``inputs`` through every block in order.

        Args:
            inputs: Token tensor accepted by ``TransformerLayer``.
            training: Forwarded unchanged to every block. The default ``None``
                leaves the training/inference decision to Keras instead of
                forcing training mode (the previous default was ``True``).

        Returns:
            Output of the last block.
        """
        x = inputs
        for layer in self.encoders:
            x = layer(x, training=training)
        return x

In [33]:
# Smoke test: four stacked blocks, output shape must match the input shape.
T_E = TransformerEncoder(d_model=128, heads=2, mlp_rate=2, num_layers=4)
T_E(result).shape


TensorShape([32, 82, 128])

In [34]:
class ViT(tf.keras.Model):
    """Vision Transformer classifier: patch embedding, encoder, class-token head.

    Args:
        num_classes: Number of output classes.
        patch_size: Side length (in pixels) of one square patch.
        num_of_patches: Number of patches per image (class token not included).
        d_model: Token width used throughout the model.
        heads: Number of attention heads per encoder block.
        num_layers: Number of encoder blocks.
        mlp_rate: Expansion factor of the hidden layers, used both in the
            encoder MLPs and in the classification head.
        dropout_rate: Dropout rate inside the encoder. The head uses a fixed
            rate of 0.3.
        **kwargs: Standard ``tf.keras.Model`` arguments such as ``name``.
    """

    def __init__(self, num_classes, patch_size, num_of_patches, d_model, heads, num_layers, mlp_rate,
                 dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.PatchEmbedding = PatchEmbedding(patch_size, num_of_patches, d_model)
        self.encoder = TransformerEncoder(d_model, heads, mlp_rate, num_layers, dropout_rate)
        # Classification head; ends in softmax, so the model outputs
        # probabilities rather than logits.
        self.prediction = tf.keras.Sequential([
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(mlp_rate * d_model, activation='gelu'),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(num_classes, activation='softmax'),
        ])

    def call(self, inputs, training=None):
        """Classify a batch of images.

        Args:
            inputs: Image batch accepted by ``PatchEmbedding``.
            training: ``True`` enables dropout, ``False`` disables it. The
                default ``None`` leaves the decision to Keras, so a direct
                ``model(x)`` call no longer forces training mode (the previous
                default was ``True``).

        Returns:
            Tensor of shape ``(batch, num_classes)`` with softmax probabilities.
        """
        patches = self.PatchEmbedding(inputs)
        encoderResult = self.encoder(patches, training=training)
        # Only the class token (position 0) is used for the prediction.
        clsResult = encoderResult[:, 0, :]
        prediction = self.prediction(clsResult, training=training)
        return prediction


In [35]:
# Smoke test of the full model on random 144x144 RGB images:
# (144 // 16) ** 2 = 81 patches of 16x16 pixels per image.
ViTclassifier = ViT(
    num_classes=10, patch_size=16, num_of_patches=(144 // 16) ** 2,
    d_model=128, heads=2, num_layers=4, mlp_rate=2, dropout_rate=0.1,
)

# One forward pass; the displayed shape should be (batch, num_classes).
ViTclassifier(
    tf.random.normal(shape=(32, 144, 144, 3))
).shape

TensorShape([32, 10])

In [36]:
# CIFAR-10 comes back as a (train, test) pair of (images, labels) tuples.
train_split, test_split = tf.keras.datasets.cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [37]:
# Deterministic preprocessing: normalise with statistics learned from the
# training images, then upscale to the 144x144 input size used by the ViT.
preprocessingModel = tf.keras.Sequential()
preprocessingModel.add(tf.keras.layers.Normalization())
preprocessingModel.add(tf.keras.layers.Resizing(144, 144))
# Second name for the same object; nothing in the visible notebook reads it,
# and despite the name it performs no augmentation.
data_augmentation = preprocessingModel
# Layer 0 is the Normalization layer: fit its statistics on the training set.
preprocessingModel.layers[0].adapt(x_train)

# Random augmentation, meant to be applied to training batches only.
augmentaionModel = tf.keras.Sequential()
augmentaionModel.add(tf.keras.layers.RandomFlip('horizontal'))
augmentaionModel.add(tf.keras.layers.RandomRotation(factor=0.2))
augmentaionModel.add(tf.keras.layers.RandomZoom(width_factor=0.2, height_factor=0.2))

In [38]:
def convert_to_dataset(data, batch_size, shuffle=False, augment=False):
    """Build a batched, preprocessed ``tf.data.Dataset`` from in-memory arrays.

    Args:
        data: ``(images, labels)`` pair that is sliced along the first axis.
        batch_size: Number of samples per batch. Incomplete trailing batches
            are dropped, so every batch has exactly this size.
        shuffle: Shuffle the samples (with a buffer covering the whole
            dataset) before batching.
        augment: Apply ``augmentaionModel`` to every batch.

    Returns:
        Dataset yielding ``(preprocessed_images, labels)`` batches, prefetched.
    """
    dataset = tf.data.Dataset.from_tensor_slices(data)

    if shuffle:
        # Shuffle the raw samples *before* preprocessing: the shuffle buffer
        # holds len(dataset) elements, so it should contain the small source
        # images rather than their resized floating-point versions.
        dataset = dataset.shuffle(len(dataset))
    dataset = dataset.batch(batch_size, drop_remainder=True)

    # Preprocess whole batches in one call instead of one image at a time.
    dataset = dataset.map(
        lambda x, y: (preprocessingModel(x), y),
        num_parallel_calls=tf.data.AUTOTUNE,
    )

    if augment:
        # training=True is passed so the random augmentation layers are active.
        dataset = dataset.map(
            lambda x, y: (augmentaionModel(x, training=True), y),
            num_parallel_calls=tf.data.AUTOTUNE,
        )

    return dataset.prefetch(tf.data.AUTOTUNE)

In [39]:
# Locate the TPU; the empty name is passed as-is to the resolver
# (presumably auto-detection of the runtime's TPU — TODO confirm).
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# Connect this runtime to the TPU worker(s) described by the resolver.
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
# NOTE(review): in this notebook the cell runs after earlier cells have already
# built Keras models; consider moving it to the top of the notebook.
tf.tpu.experimental.initialize_tpu_system(resolver)
# List the TPU devices that are now visible (the recorded run shows 8).
print("All devices: ", tf.config.list_logical_devices('TPU'))

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.






INFO:tensorflow:Initializing the TPU system: grpc://10.97.232.90:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.97.232.90:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [40]:
# Distribution strategy built on the resolver from the TPU setup cell; the
# model is later created and compiled inside strategy.scope().
strategy = tf.distribute.TPUStrategy(resolver)
# Number of replicas training in sync (the recorded run printed 8).
print(strategy.num_replicas_in_sync)

INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


8


In [41]:

# Training data: shuffled and augmented.
trainingData = convert_to_dataset(data=(x_train, y_train), batch_size=1024, shuffle=True, augment=True)

# Validation data is deliberately NOT shuffled. convert_to_dataset drops the
# incomplete last batch, so shuffling would evaluate a different subset of the
# test images on every epoch and make the validation metrics non-deterministic.
valData = convert_to_dataset(data=(x_test, y_test), batch_size=1024, shuffle=False, augment=False)

In [42]:
# The model, its optimizer and its metrics are all created inside the strategy
# scope so that their variables are placed by the TPU strategy.
with strategy.scope():
    ViTclassifier = ViT(
        num_classes=10,
        patch_size=16,
        num_of_patches=(144 // 16) ** 2,
        d_model=128,
        heads=2,
        num_layers=4,
        mlp_rate=2,
        dropout_rate=0.1,
    )

    ViTclassifier.compile(
        # The model ends in a softmax layer, so the loss works on
        # probabilities (from_logits is left at its default).
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer='adam',
        metrics=[
            tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
            # Real top-5 metric: the previous SparseCategoricalAccuracy under
            # this name only repeated the top-1 accuracy.
            tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy'),
        ],
    )

In [43]:
# Both datasets are already batched by convert_to_dataset, so no batch_size is
# passed to fit(). The History object is kept so the per-epoch metrics can be
# inspected or plotted afterwards.
history = ViTclassifier.fit(x=trainingData, validation_data=valData, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f087051bd10>

In [44]:
# Print the layer/parameter overview of the trained model.
ViTclassifier.summary()

Model: "vi_t_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 patch_embedding_6 (PatchEmb  multiple                 109056    
 edding)                                                         
                                                                 
 transformer_encoder_6 (Tran  multiple                 529920    
 sformerEncoder)                                                 
                                                                 
 sequential_38 (Sequential)  (128, 10)                 35594     
                                                                 
Total params: 674,570
Trainable params: 674,570
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Create (or re-initialise) a git repository in the current working directory;
# the recorded run reports an already existing repository in /content/.
!git init

Reinitialized existing Git repository in /content/.git/


In [53]:
# NOTE(review): not re-runnable — the recorded output shows this fails with
# "remote origin already exists" once the remote has been added.
!git remote add origin https://github.com/Ali-1329/ViT.git

fatal: remote origin already exists.


In [54]:
# NOTE(review): fails in the recorded run because no README.md exists in the
# working directory; create the file first or remove this cell.
!git add README.md

fatal: pathspec 'README.md' did not match any files


In [56]:
# NOTE(review): a personal e-mail address is hard-coded here and therefore
# stored in the shared notebook; consider configuring it outside the notebook.
!git config --global user.email 'jafariali1329@gmail.com'

In [60]:
# NOTE(review): stages everything matched by * in the working directory. The
# recorded commit contains only sample_data files, so presumably the notebook
# itself was not among the staged files — verify what should be committed.
!git add  *

In [62]:
# Commit the staged files (in the recorded run: six files under sample_data/).
!git commit -m 'first commit'

[master (root-commit) 4e95507] first commit
 6 files changed, 50070 insertions(+)
 create mode 100755 sample_data/README.md
 create mode 100755 sample_data/anscombe.json
 create mode 100644 sample_data/california_housing_test.csv
 create mode 100644 sample_data/california_housing_train.csv
 create mode 100644 sample_data/mnist_test.csv
 create mode 100644 sample_data/mnist_train_small.csv
