In [1]:
!pip install timm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
import os
from transformers import EfficientNetImageProcessor, EfficientNetForImageClassification
import timm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

import timm

model = timm.create_model("hf_hub:timm/maxvit_tiny_tf_224.in1k", pretrained=True)

model.to(device)  # Move model to GPU

# Define optimizer and scheduler
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)

# Define loss function
criterion = nn.CrossEntropyLoss()

Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m50.9 MB/s[0m eta 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/597 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/124M [00:00<?, ?B/s]



In [2]:
!pip install torchsummary



In [3]:
from torchsummary import summary

# Assuming the model variable contains your model
summary(model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 64, 112, 112]           1,792
          Identity-2         [-1, 64, 112, 112]               0
          GELUTanh-3         [-1, 64, 112, 112]               0
    BatchNormAct2d-4         [-1, 64, 112, 112]             128
            Conv2d-5         [-1, 64, 112, 112]          36,928
              Stem-6         [-1, 64, 112, 112]               0
     AvgPool2dSame-7           [-1, 64, 56, 56]               0
          Identity-8           [-1, 64, 56, 56]               0
      Downsample2d-9           [-1, 64, 56, 56]               0
         Identity-10         [-1, 64, 112, 112]               0
         Identity-11         [-1, 64, 112, 112]               0
   BatchNormAct2d-12         [-1, 64, 112, 112]             128
         Identity-13         [-1, 64, 112, 112]               0
           Conv2d-14        [-1, 256, 1

In [4]:
# Initialize a counter for the number of layers
num_layers = 0

# Iterate through the model's children and count the number of layers
for child in model.children():
    num_layers += 1

print("Number of layers in the model:", num_layers)

Number of layers in the model: 4


In [5]:
from torchsummary import summary

# Define a function to select the first three layers from the model
def select_first_three_layers(model):
    first_three_layers = []
    num_layers = 0
    for name, module in model.named_children():
        first_three_layers.append(module)
        num_layers += 1
        if num_layers == 3:
            break
    return nn.Sequential(*first_three_layers)

# Select the first three layers
first_three_layers_model = select_first_three_layers(model)

# Use summary to display information for the first three layers
summary(first_three_layers_model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 64, 112, 112]           1,792
          Identity-2         [-1, 64, 112, 112]               0
          GELUTanh-3         [-1, 64, 112, 112]               0
    BatchNormAct2d-4         [-1, 64, 112, 112]             128
            Conv2d-5         [-1, 64, 112, 112]          36,928
              Stem-6         [-1, 64, 112, 112]               0
     AvgPool2dSame-7           [-1, 64, 56, 56]               0
          Identity-8           [-1, 64, 56, 56]               0
      Downsample2d-9           [-1, 64, 56, 56]               0
         Identity-10         [-1, 64, 112, 112]               0
         Identity-11         [-1, 64, 112, 112]               0
   BatchNormAct2d-12         [-1, 64, 112, 112]             128
         Identity-13         [-1, 64, 112, 112]               0
           Conv2d-14        [-1, 256, 1

In [6]:
# Initialize a counter for the number of layers
num_layers = 0

# Iterate through the model's children and count the number of layers
for child in first_three_layers_model.children():
    num_layers += 1

print("Number of layers in the model:", num_layers)

Number of layers in the model: 3


In [7]:
# Initialize a list to store parameters of the first three layers
first_three_layers_params = []

# Define a counter to track the number of layers
num_layers = 0

# Iterate through the named parameters of the model
for name, param in model.named_parameters():
    # Check if the parameter belongs to one of the first three layers
    if num_layers < 3:
        first_three_layers_params.append((name, param))

    # Increment the layer counter
    num_layers += 1

# Print the parameters of the first three layers
for name, param in first_three_layers_params:
    print(name, param.size())

stem.conv1.weight torch.Size([64, 3, 3, 3])
stem.conv1.bias torch.Size([64])
stem.norm1.weight torch.Size([64])


In [17]:
import torch
import torch.nn as nn

class SpatialSelfAttention(nn.Module):
    def __init__(self, in_channels):
        super(SpatialSelfAttention, self).__init__()
        self.query_conv = nn.Conv2d(in_channels=in_channels, out_channels=in_channels // 8, kernel_size=1)
        self.key_conv = nn.Conv2d(in_channels=in_channels, out_channels=in_channels // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        batch_size, C, height, width = x.size()
        query = self.query_conv(x).view(batch_size, -1, height * width).permute(0, 2, 1)
        key = self.key_conv(x).view(batch_size, -1, height * width)
        value = self.value_conv(x).view(batch_size, -1, height * width)
        attention_scores = torch.matmul(query, key)
        attention_scores = self.softmax(attention_scores)
        out = torch.matmul(value, attention_scores.permute(0, 2, 1))
        out = out.view(batch_size, C, height, width)
        out = x + out
        return out


class SpatialSelfAttentionProcessor(nn.Module):
    def __init__(self, in_channels):
        super(SpatialSelfAttentionProcessor, self).__init__()
        self.ssa = SpatialSelfAttention(in_channels=in_channels)

    def forward(self, x):
        split_tensors = x.chunk(4, dim=2)
        split_tensors = [t.chunk(4, dim=3) for t in split_tensors]
        split_tensors = [item for sublist in split_tensors for item in sublist]

        output_tensors = []
        prev_output = None
        for i, split_input in enumerate(split_tensors):
            if i > 0:
                split_input = split_input + prev_output
            split_output = self.ssa(split_input)
            output_tensors.append(split_output)
            prev_output = split_output

        output_tensor = torch.cat(output_tensors, dim=3)
        output_tensor = torch.cat(output_tensor.chunk(4, dim=2), dim=1)

        return output_tensor


import torch.nn as nn

# Freeze layers up to the third layer to prevent updating their weights during training
for param in first_three_layers_model.parameters():
    param.requires_grad = False

# Create an instance of the Spatial Self-Attention Processor
spatial_self_attention_processor = SpatialSelfAttentionProcessor(in_channels=512)

# Combine the first three layers model with the Spatial Self-Attention Processor
combined_model = nn.Sequential(
    first_three_layers_model,  # Pre-trained layers up to the third layer
    spatial_self_attention_processor  # Spatial Self-Attention Processor to be attached after the third layer
)

# Move the combined model to the appropriate device for computation, e.g., GPU
combined_model.to(device)

Sequential(
  (0): Sequential(
    (0): Stem(
      (conv1): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
      (norm1): BatchNormAct2d(
        64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
        (drop): Identity()
        (act): GELUTanh()
      )
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (1): Sequential(
      (0): MaxxVitStage(
        (blocks): Sequential(
          (0): MaxxVitBlock(
            (conv): MbConvBlock(
              (shortcut): Downsample2d(
                (pool): AvgPool2dSame(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
                (expand): Identity()
              )
              (pre_norm): BatchNormAct2d(
                64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
                (drop): Identity()
                (act): Identity()
              )
              (down): Identity()
              (conv1_1x1): Conv2d(64, 256, kernel_size=(1, 1), stride

In [18]:
from torchsummary import summary

# Print summary of the combined model
summary(combined_model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 64, 112, 112]           1,792
          Identity-2         [-1, 64, 112, 112]               0
          GELUTanh-3         [-1, 64, 112, 112]               0
    BatchNormAct2d-4         [-1, 64, 112, 112]             128
            Conv2d-5         [-1, 64, 112, 112]          36,928
              Stem-6         [-1, 64, 112, 112]               0
     AvgPool2dSame-7           [-1, 64, 56, 56]               0
          Identity-8           [-1, 64, 56, 56]               0
      Downsample2d-9           [-1, 64, 56, 56]               0
         Identity-10         [-1, 64, 112, 112]               0
         Identity-11         [-1, 64, 112, 112]               0
   BatchNormAct2d-12         [-1, 64, 112, 112]             128
         Identity-13         [-1, 64, 112, 112]               0
           Conv2d-14        [-1, 256, 1