In [1]:
# Environment check: report the visible GPU, driver/CUDA versions and current
# GPU memory usage before the model is loaded.
!nvidia-smi

Thu Jul 10 16:02:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.58.02              Driver Version: 555.58.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:0A:00.0 Off |                  Off |
|  0%   39C    P8             15W /  450W |     524MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
from torchvision.models import vit_h_14, ViT_H_14_Weights
from torchinfo import summary

# Build ViT-H/14 with the pretrained weights selected by the
# IMAGENET1K_SWAG_LINEAR_V1 entry of the torchvision weights enum.
weights = ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1
model = vit_h_14(weights=weights)

# torchinfo runs a forward pass on a dummy tensor, so it needs an example
# input shape: (batch_size, channels, height, width).
# The spatial size is read from the model instead of being hard-coded, so the
# summary keeps working if the weights entry above is swapped for a checkpoint
# that expects a different resolution (the model asserts on image size).
input_size = (1, 3, model.image_size, model.image_size)
summary(model, input_size=input_size)

Layer (type:depth-idx)                             Output Shape              Param #
VisionTransformer                                  [1, 1000]                 1,280
├─Conv2d: 1-1                                      [1, 1280, 16, 16]         753,920
├─Encoder: 1-2                                     [1, 257, 1280]            328,960
│    └─Dropout: 2-1                                [1, 257, 1280]            --
│    └─Sequential: 2-2                             [1, 257, 1280]            --
│    │    └─EncoderBlock: 3-1                      [1, 257, 1280]            19,677,440
│    │    └─EncoderBlock: 3-2                      [1, 257, 1280]            19,677,440
│    │    └─EncoderBlock: 3-3                      [1, 257, 1280]            19,677,440
│    │    └─EncoderBlock: 3-4                      [1, 257, 1280]            19,677,440
│    │    └─EncoderBlock: 3-5                      [1, 257, 1280]            19,677,440
│    │    └─EncoderBlock: 3-6                      [1, 257, 12

In [3]:
# Inspect the first encoder block (index 0 of model.encoder.layers) and print
# its sub-module structure.
starting_block = model.encoder.layers[0]
print(starting_block)

EncoderBlock(
  (ln_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (ln_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
  (mlp): MLPBlock(
    (0): Linear(in_features=1280, out_features=5120, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.0, inplace=False)
    (3): Linear(in_features=5120, out_features=1280, bias=True)
    (4): Dropout(p=0.0, inplace=False)
  )
)


In [4]:
# Number of entries in model.encoder.layers; the following cells use it to
# index the middle and the last encoder block.
num_blocks = len(model.encoder.layers)

In [5]:
# Inspect the block at index num_blocks // 2 (integer division), i.e. the one
# in the middle of the encoder stack, and print its sub-module structure.
middle_block = model.encoder.layers[num_blocks // 2]
print(middle_block)

EncoderBlock(
  (ln_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (ln_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
  (mlp): MLPBlock(
    (0): Linear(in_features=1280, out_features=5120, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.0, inplace=False)
    (3): Linear(in_features=5120, out_features=1280, bias=True)
    (4): Dropout(p=0.0, inplace=False)
  )
)


In [6]:
# Inspect the last encoder block. A negative index selects the final entry of
# the layer container directly, so no separate block count is needed here.
final_block = model.encoder.layers[-1]
print(final_block)

EncoderBlock(
  (ln_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (ln_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
  (mlp): MLPBlock(
    (0): Linear(in_features=1280, out_features=5120, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.0, inplace=False)
    (3): Linear(in_features=5120, out_features=1280, bias=True)
    (4): Dropout(p=0.0, inplace=False)
  )
)


In [7]:
# Print the whole encoder module tree (its dropout plus every block in
# `layers`). The output is long because each block's structure is listed.
print(model.encoder)

Encoder(
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): Sequential(
    (encoder_layer_0): EncoderBlock(
      (ln_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (self_attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
      )
      (dropout): Dropout(p=0.0, inplace=False)
      (ln_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): MLPBlock(
        (0): Linear(in_features=1280, out_features=5120, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=5120, out_features=1280, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (encoder_layer_1): EncoderBlock(
      (ln_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (self_attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
   