In [2]:
!pip install deepvision-toolkit

In [3]:
import deepvision
import torch
import tensorflow as tf
import numpy as np

import tensorflow.keras.backend as K

# TwoWayTransformerDecoder

In [4]:
from deepvision.layers import TwoWayTransformerDecoder

### PyTorch TwoWayTransformerDecoder

In [5]:
# Random stand-in inputs for the decoder: two (1, 256, 64, 64) maps and one
# (1, 7, 256) token sequence.
# NOTE(review): presumably image embedding, its positional encoding and the
# prompt tokens, in that order — confirm against the deepvision docs.
inp1 = torch.randn(1, 256, 64, 64)
inp2 = torch.randn(1, 256, 64, 64)
inp3 = torch.randn(1, 7, 256)

# Depth-8 decoder built on the PyTorch backend.
transformer = TwoWayTransformerDecoder(
    depth=8,
    project_dim=256,
    num_heads=8,
    mlp_dim=256,
    backend='pytorch',
)

In [6]:
# Bare expression: the notebook echoes the module's repr as the cell output.
transformer

__TwoWayTransformerDecoderPT(
  (layers): ModuleList(
    (0-7): 8 x __TwoWayAttentionBlockPT(
      (self_attn): __DownscalingMultiheadAttentionPT(
        (q_proj): Linear(in_features=256, out_features=256, bias=True)
        (k_proj): Linear(in_features=256, out_features=256, bias=True)
        (v_proj): Linear(in_features=256, out_features=256, bias=True)
        (out_proj): Linear(in_features=256, out_features=256, bias=True)
      )
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (cross_attn_token_to_image): __DownscalingMultiheadAttentionPT(
        (q_proj): Linear(in_features=256, out_features=128, bias=True)
        (k_proj): Linear(in_features=256, out_features=128, bias=True)
        (v_proj): Linear(in_features=256, out_features=128, bias=True)
        (out_proj): Linear(in_features=128, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): _MLPBlock(
        (lin1): Linear(in_featu

In [7]:
# Run the decoder once and report the shape of each tensor it returns.
outputs = transformer(inp1, inp2, inp3)
for tensor in outputs:
    print(tensor.shape)

torch.Size([1, 7, 256])
torch.Size([1, 4096, 256])


In [8]:
# Total number of elements across parameters that require gradients.
print(
    'Param count',
    sum(param.numel() for param in transformer.parameters() if param.requires_grad),
)

Param count 5414016


### TensorFlow TwoWayTransformerDecoder

In [9]:
# Same hyperparameters as the PyTorch decoder, built on the TensorFlow backend.
# Rebinds `transformer`, so the PyTorch instance is no longer reachable by name.
transformer = TwoWayTransformerDecoder(
    depth=8,
    project_dim=256,
    num_heads=8,
    mlp_dim=256,
    backend='tensorflow',
)

In [10]:
# Inputs with the same shapes as the PyTorch run, drawn uniformly at random.
inp1 = tf.random.uniform(shape=[1, 256, 64, 64])
inp2 = tf.random.uniform(shape=[1, 256, 64, 64])
inp3 = tf.random.uniform(shape=[1, 7, 256])

# Run the decoder once and report the shape of each tensor it returns.
outputs = transformer(inp1, inp2, inp3)
for tensor in outputs:
    print(tensor.shape)

(1, 7, 256)
(1, 4096, 256)


In [11]:
# Sums the element count of every entry in `.weights` (no trainable filter,
# unlike the PyTorch cell's `requires_grad` check).
print(
    'Param count',
    int(np.sum([K.count_params(weight) for weight in transformer.weights])),
)

Param count 5414016


# TwoWayAttentionBlock

In [12]:
from deepvision.layers import TwoWayAttentionBlock

### PyTorch TwoWayAttentionBlock

In [13]:
# Single two-way attention block on the PyTorch backend.
attn_block = TwoWayAttentionBlock(
    project_dim=256,
    num_heads=8,
    mlp_dim=2048,
    backend='pytorch',
)
# Trailing bare expression: echoes the module repr as the cell output.
attn_block

__TwoWayAttentionBlockPT(
  (self_attn): __DownscalingMultiheadAttentionPT(
    (q_proj): Linear(in_features=256, out_features=256, bias=True)
    (k_proj): Linear(in_features=256, out_features=256, bias=True)
    (v_proj): Linear(in_features=256, out_features=256, bias=True)
    (out_proj): Linear(in_features=256, out_features=256, bias=True)
  )
  (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (cross_attn_token_to_image): __DownscalingMultiheadAttentionPT(
    (q_proj): Linear(in_features=256, out_features=128, bias=True)
    (k_proj): Linear(in_features=256, out_features=128, bias=True)
    (v_proj): Linear(in_features=256, out_features=128, bias=True)
    (out_proj): Linear(in_features=128, out_features=256, bias=True)
  )
  (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (mlp): _MLPBlock(
    (lin1): Linear(in_features=256, out_features=2048, bias=True)
    (lin2): Linear(in_features=2048, out_features=256, bias=True)
    (act): GELU(approximate=

In [14]:
# 7 query tokens and 4096 key tokens, all 256-dimensional.
queries = torch.randn([1, 7, 256])
keys = torch.randn([1, 4096, 256])
key_pe = torch.randn([1, 4096, 256])
# The query positional encoding is the very same tensor object as `queries`,
# not an independent random draw.
query_pe = queries

outputs = attn_block(
    queries=queries,
    keys=keys,
    query_pe=query_pe,
    key_pe=key_pe,
)
for tensor in outputs:
    print(tensor.shape)

torch.Size([1, 7, 256])
torch.Size([1, 4096, 256])


In [15]:
# Total number of elements across parameters that require gradients.
print(
    'Param count',
    sum(param.numel() for param in attn_block.parameters() if param.requires_grad),
)

Param count 1579520


### TensorFlow TwoWayAttentionBlock

In [16]:
# Same hyperparameters as the PyTorch block, built on the TensorFlow backend.
attn_block = TwoWayAttentionBlock(
    project_dim=256,
    num_heads=8,
    mlp_dim=2048,
    backend='tensorflow',
)

In [17]:
# 7 query tokens and 4096 key tokens, all 256-dimensional.
queries = tf.random.uniform(shape=[1, 7, 256])
keys = tf.random.uniform(shape=[1, 4096, 256])
key_pe = tf.random.uniform(shape=[1, 4096, 256])
# The query positional encoding is the very same tensor object as `queries`,
# not an independent random draw.
query_pe = queries

outputs = attn_block(
    queries=queries,
    keys=keys,
    query_pe=query_pe,
    key_pe=key_pe,
)
for tensor in outputs:
    print(tensor.shape)

(1, 7, 256)
(1, 4096, 256)


In [18]:
# Sums the element count of every entry in `.weights` (no trainable filter,
# unlike the PyTorch cell's `requires_grad` check).
print(
    'Param count',
    int(np.sum([K.count_params(weight) for weight in attn_block.weights])),
)

Param count 1579520


# DownscalingMultiheadAttention

In [19]:
from deepvision.layers import DownscalingMultiheadAttention

### PyTorch DownscalingMultiheadAttention

In [20]:
# NOTE(review): the two positional arguments are presumably the projection
# width and the head count — confirm against the layer's signature.
# With downsample_rate=1 the echoed repr shows q/k/v projections kept at 256.
attn_layer = DownscalingMultiheadAttention(
    256,
    8,
    downsample_rate=1,
    backend="pytorch",
)
# Trailing bare expression: echoes the module repr as the cell output.
attn_layer

__DownscalingMultiheadAttentionPT(
  (q_proj): Linear(in_features=256, out_features=256, bias=True)
  (k_proj): Linear(in_features=256, out_features=256, bias=True)
  (v_proj): Linear(in_features=256, out_features=256, bias=True)
  (out_proj): Linear(in_features=256, out_features=256, bias=True)
)

In [21]:
# 7 query tokens attending over 4096 key/value tokens, all 256-dimensional.
q = torch.randn([1, 7, 256])
k = torch.randn([1, 4096, 256])
v = torch.randn([1, 4096, 256])

output = attn_layer(q=q, k=k, v=v)
# Trailing bare expression: echoes the output shape as the cell result.
output.shape

torch.Size([1, 7, 256])

In [22]:
# Total number of elements across parameters that require gradients.
print(
    'Param count',
    sum(param.numel() for param in attn_layer.parameters() if param.requires_grad),
)

Param count 263168


### TensorFlow DownscalingMultiheadAttention

In [23]:
# Use the directly imported class, as the PyTorch cell does, instead of the
# fully qualified `deepvision.layers.` path; the constructed object is the same.
# NOTE(review): the two positional arguments are presumably the projection
# width and the head count — confirm against the layer's signature.
attn_layer = DownscalingMultiheadAttention(
    256,
    8,
    downsample_rate=1,
    backend="tensorflow",
)

In [24]:
# 7 query tokens attending over 4096 key/value tokens, all 256-dimensional.
q = tf.random.uniform(shape=[1, 7, 256])
k = tf.random.uniform(shape=[1, 4096, 256])
v = tf.random.uniform(shape=[1, 4096, 256])

output = attn_layer(q=q, k=k, v=v)
print(output.shape)

(1, 7, 256)


In [25]:
# Sums the element count of every entry in `.weights` (no trainable filter,
# unlike the PyTorch cell's `requires_grad` check).
print(
    'Param count',
    int(np.sum([K.count_params(weight) for weight in attn_layer.weights])),
)

Param count 263168


# RelativePositionalTransformerEncoder

In [26]:
from deepvision.layers import RelativePositionalTransformerEncoder

### PyTorch RelativePositionalTransformerEncoder

In [27]:
# Channels-first image shape; only the spatial extent at index 1 is read below.
input_shape = (3, 1024, 1024)
patch_size = 14
# NOTE(review): 1024 // 14 == 73, yet the tensor passed to this encoder later
# in the notebook is a 64x64 grid (which would correspond to patch_size = 16).
# Confirm whether the mismatch is intentional.
input_size = input_shape[1] // patch_size

transformer = RelativePositionalTransformerEncoder(
    project_dim=768,
    num_heads=8,
    mlp_dim=2048,
    input_size=(input_size, input_size),
    window_size=0,
    backend='pytorch',
)

In [28]:
# Bare expression: the notebook echoes the module's repr as the cell output.
transformer

__RelativePositionalTransformerEncoderPT(
  (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): __RelativePositionalMultiheadAttentionPT(
    (qkv): Linear(in_features=768, out_features=2304, bias=True)
    (proj): Linear(in_features=768, out_features=768, bias=True)
  )
  (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): _MLPBlock(
    (lin1): Linear(in_features=768, out_features=2048, bias=True)
    (lin2): Linear(in_features=2048, out_features=768, bias=True)
    (act): GELU(approximate='none')
  )
)

In [29]:
# A 64x64 grid of 768-dimensional tokens; the echoed shape shows the encoder
# returns a tensor of the same shape.
inputs = torch.randn([1, 64, 64, 768])
outputs = transformer(inputs)
outputs.shape

torch.Size([1, 64, 64, 768])

In [30]:
# Total number of elements across parameters that require gradients.
print(
    'Param count',
    sum(param.numel() for param in transformer.parameters() if param.requires_grad),
)

Param count 5541824


### TensorFlow RelativePositionalTransformerEncoder

In [31]:
# Channels-last image shape; only the spatial extent at index 1 is read below.
input_shape = (1024, 1024, 3)
patch_size = 14
# NOTE(review): 1024 // 14 == 73, yet the tensor passed to this encoder later
# in the notebook is a 64x64 grid (which would correspond to patch_size = 16).
# Confirm whether the mismatch is intentional.
input_size = input_shape[1] // patch_size

transformer = RelativePositionalTransformerEncoder(
    project_dim=768,
    num_heads=8,
    mlp_dim=2048,
    input_size=(input_size, input_size),
    window_size=0,
    backend='tensorflow',
)

In [32]:
"""
k_size=20.
q_size=10.

tf.cast(tf.reshape(tf.range(q_size), [int(q_size), 1]), tf.float32) * tf.math.maximum(k_size / q_size, 1.0)
"""

'\nk_size=20.\nq_size=10.\n\ntf.cast(tf.reshape(tf.range(q_size), [int(q_size), 1]), tf.float32) * tf.math.maximum(k_size / q_size, 1.0)\n'

In [33]:
# A 64x64 grid of 768-dimensional tokens; the echoed shape shows the encoder
# returns a tensor of the same shape.
inputs = tf.random.uniform(shape=[1, 64, 64, 768])
outputs = transformer(inputs)
outputs.shape

TensorShape([1, 64, 64, 768])

In [34]:
# Sums the element count of every entry in `.weights` (no trainable filter,
# unlike the PyTorch cell's `requires_grad` check).
print(
    'Param count',
    int(np.sum([K.count_params(weight) for weight in transformer.weights])),
)

Param count 5541824


# RelativePositionalMultiheadAttention

In [35]:
from deepvision.layers import RelativePositionalMultiheadAttention

### PyTorch RelativePositionalMultiheadAttention

In [36]:
# Channels-first image shape; only the spatial extent at index 1 is read below.
input_shape = (3, 1024, 1024)
patch_size = 14
# NOTE(review): 1024 // 14 == 73, yet the tensor passed to this layer later in
# the notebook is a 64x64 grid (which would correspond to patch_size = 16).
# Confirm whether the mismatch is intentional.
input_size = input_shape[1] // patch_size

attn_layer = RelativePositionalMultiheadAttention(
    project_dim=768,
    num_heads=8,
    qkv_bias=True,
    use_rel_pos=True,
    input_size=(input_size, input_size),
    backend="pytorch",
)

In [37]:
# Bare expression: the notebook echoes the module's repr as the cell output.
attn_layer

__RelativePositionalMultiheadAttentionPT(
  (qkv): Linear(in_features=768, out_features=2304, bias=True)
  (proj): Linear(in_features=768, out_features=768, bias=True)
)

In [38]:
# A 64x64 grid of 768-dimensional tokens; the echoed shape shows the layer
# returns a tensor of the same shape.
inputs = torch.randn(1, 64, 64, 768)
outputs = attn_layer(inputs)
outputs.shape

torch.Size([1, 64, 64, 768])

In [39]:
# Total number of elements across parameters that require gradients.
print(
    'Param count',
    sum(param.numel() for param in attn_layer.parameters() if param.requires_grad),
)

Param count 2390208


### TensorFlow RelativePositionalMultiheadAttention

In [40]:
# Channels-last image shape, matching the TensorFlow encoder cell earlier in
# the notebook. Only the spatial extent at index 1 is read below, and it is
# 1024 in either layout, so the constructed layer is unchanged.
input_shape = (1024, 1024, 3)
patch_size = 14
# NOTE(review): 1024 // 14 == 73, yet the tensor passed to this layer later in
# the notebook is a 64x64 grid (which would correspond to patch_size = 16).
# Confirm whether the mismatch is intentional.
input_size = input_shape[1] // patch_size

attn_layer = RelativePositionalMultiheadAttention(
    project_dim=768,
    num_heads=8,
    qkv_bias=True,
    use_rel_pos=True,
    input_size=(input_size, input_size),
    backend="tensorflow",
)

In [41]:
# A 64x64 grid of 768-dimensional tokens; the echoed shape shows the layer
# returns a tensor of the same shape.
inputs = tf.random.uniform(shape=[1, 64, 64, 768])
outputs = attn_layer(inputs)
outputs.shape

TensorShape([1, 64, 64, 768])

In [42]:
# Sums the element count of every entry in `.weights` (no trainable filter,
# unlike the PyTorch cell's `requires_grad` check).
print(
    'Param count',
    int(np.sum([K.count_params(weight) for weight in attn_layer.weights])),
)

Param count 2390208
