In [1]:
!pip install timm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
import os
from transformers import EfficientNetImageProcessor, EfficientNetForImageClassification
import timm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

import timm

model = timm.create_model("hf_hub:timm/maxvit_tiny_tf_224.in1k", pretrained=True)

model.to(device)  # Move model to GPU

# Define optimizer and scheduler
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)

# Define loss function
criterion = nn.CrossEntropyLoss()

Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m45.7 MB/s[0m eta 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/597 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/124M [00:00<?, ?B/s]



In [2]:
class DefConvLayer_red(nn.Module):
    def __init__(self, filters, strides, kernel_size=3):
        super(DefConvLayer_red, self).__init__()
        assert isinstance(kernel_size, int), "expect kernel_size to be of type 'int'"
        assert isinstance(strides, int), "expect strides to be of type int"
        self.N = kernel_size ** 2
        self.filters = filters
        self.strides = strides

        # Define learnable weights
        self.W = nn.Parameter(torch.randn(1, 1, 1, self.N, filters))

    def forward(self, input, offsets):
        # input: (m, n_H, n_W, n_C)
        # offsets: (m, n_H, n_W, 2*N)
        input_offsets = self.BLIN(input, offsets)  # (m, n_H, n_W, n_C, N)

        # Expand dimensions for broadcasting
        input_offsets = input_offsets.unsqueeze(-1)
        input_offsets = input_offsets.expand(-1, -1, -1, -1, -1, self.filters)

        # Reshape weights for broadcasting
        W = self.W.view(1, 1, 1, input.shape[-1], self.N, self.filters)

        # Perform element-wise multiplication
        output = input_offsets * W

        # Sum along the appropriate dimensions
        output = torch.sum(output, dim=-2)  # (m, n_H, n_W, n_C, filters)
        output = torch.sum(output, dim=-2)  # (m, n_H, n_W, filters)
        return output

    def BLIN(self, input, offsets_in):
        m, n_Hi, n_Wi, n_C = input.shape
        n_Ho, n_Wo, N = offsets_in.shape[1], offsets_in.shape[2], offsets_in.shape[3] // 2

        # Expand input into (m, n_Hi, n_Wi, n_C, N)
        input_offsets = input.unsqueeze(-1).expand(-1, -1, -1, -1, N)

        # Replicate offsets for each input channel
        offsets = offsets_in.view(m, n_Ho, n_Wo, 1, N, 2)
        offsets = offsets.expand(-1, -1, -1, n_C, -1, -1)

        # Generate index grid
        grid_m, grid_i, grid_j, grid_c, grid_N = torch.meshgrid(torch.arange(m), torch.arange(n_Hi),
                                                                torch.arange(n_Wi), torch.arange(n_C), torch.arange(N))

        # Adjust indices for strides
        ur_grid_m = grid_m[:, ::self.strides, ::self.strides, :, :].reshape(-1)
        ur_grid_i = grid_i[:, ::self.strides, ::self.strides, :, :].reshape(-1)
        ur_grid_j = grid_j[:, ::self.strides, ::self.strides, :, :].reshape(-1)
        ur_grid_c = grid_c[:, ::self.strides, ::self.strides, :, :].reshape(-1)
        ur_grid_N = grid_N[:, ::self.strides, ::self.strides, :, :].reshape(-1)
        ur_offsets = offsets.view(-1, 2)

        # Calculate adjusted coordinates
        coords_i = ur_grid_i.float() + ur_offsets[:, 0]
        coords_j = ur_grid_j.float() + ur_offsets[:, 1]

        # Clip coordinates to handle edges
        coords_i = torch.clamp(coords_i, 0, n_Hi - 1)
        coords_j = torch.clamp(coords_j, 0, n_Wi - 1)
        coords_2d = torch.stack([coords_i, coords_j], dim=-1)

        # Nearest indices
        coords_lt = torch.floor(coords_2d).long()
        coords_rb = torch.ceil(coords_2d).long()
        coords_lb = torch.stack((coords_rb[..., 0], coords_lt[..., 1]), dim=-1)
        coords_rt = torch.stack((coords_lt[..., 0], coords_rb[..., 1]), dim=-1)

        # Gather input values at specified locations
        indices_lt = torch.stack([ur_grid_m, coords_lt[..., 0], coords_lt[..., 1], ur_grid_c, ur_grid_N], dim=-1)
        indices_rb = torch.stack([ur_grid_m, coords_rb[..., 0], coords_rb[..., 1], ur_grid_c, ur_grid_N], dim=-1)
        indices_lb = torch.stack([ur_grid_m, coords_lb[..., 0], coords_lb[..., 1], ur_grid_c, ur_grid_N], dim=-1)
        indices_rt = torch.stack([ur_grid_m, coords_rt[..., 0], coords_rt[..., 1], ur_grid_c, ur_grid_N], dim=-1)

        vals_lt = torch.gather(input_offsets, dim=0, index=indices_lt.unsqueeze(1))
        vals_rb = torch.gather(input_offsets, dim=0, index=indices_rb.unsqueeze(1))
        vals_lb = torch.gather(input_offsets, dim=0, index=indices_lb.unsqueeze(1))
        vals_rt = torch.gather(input_offsets, dim=0, index=indices_rt.unsqueeze(1))

        # Linear interpolation
        ur_coords_offset_lt = coords_2d - coords_lt.float()
        vals_t = vals_lt + (vals_rt - vals_lt) * ur_coords_offset_lt[..., 1]
        vals_b = vals_lb + (vals_rb - vals_lb) * ur_coords_offset_lt[..., 1]
        input_offsets = vals_t + (vals_b - vals_t) * ur_coords_offset_lt[..., 0]

        # Reshape to output shape
        input_offsets = input_offsets.view(m, n_Ho, n_Wo, n_C, N)

        return input_offsets

class DefConv_full(nn.Module):
    def __init__(self, filters, kernel_size, strides=1):
        super(DefConv_full, self).__init__()
        self.offsets_layer = nn.Conv2d(in_channels=kernel_size ** 2,
                                       out_channels=2 * kernel_size ** 2,
                                       kernel_size=kernel_size,
                                       stride=strides,
                                       padding=kernel_size // 2)  # 'same' padding in TensorFlow
        self.defconv_red = DefConvLayer_red(filters, strides, kernel_size)

    def forward(self, input):
        offsets = self.offsets_layer(input)
        output = self.defconv_red(input, offsets)
        return output

In [3]:
class SSA(nn.Module):
    def __init__(self, filters):
        super(SSA, self).__init__()
        self.filters = filters

        # Convolutional layers
        self.conv_q = nn.Conv2d(filters, filters, kernel_size=1, padding='same')
        self.conv_k = nn.Conv2d(filters, filters, kernel_size=1, padding='same')
        self.conv_v = nn.Conv2d(filters, filters, kernel_size=1, padding='same')
        self.conv_final = nn.Conv2d(filters, filters, kernel_size=1, padding='same')

    def forward(self, inputs):
        # Calculate shapes
        batch_size, channels, height, width = inputs.size()
        a = height * width

        # Query, Key, and Value
        q = self.conv_q(inputs)
        k = self.conv_k(inputs)
        v = self.conv_v(inputs)

        # Reshape
        q = q.view(batch_size, a, channels)
        k = k.view(batch_size, a, channels).permute(0, 2, 1)
        v = v.view(batch_size, a, channels)

        # Dot product of Query and Key
        qk = torch.matmul(q, k)
        qk = F.softmax(qk, dim=-1)

        # Attention
        qkv = torch.matmul(qk, v)
        qkv = qkv.view(batch_size, height, width, channels)

        # Final convolution
        qkv = self.conv_final(qkv)

        return qkv

In [4]:
class CDSA(nn.Module):
    def __init__(self, fltr, nh):
        super(CDSA, self).__init__()
        self.attn = nn.ModuleList([SSA(fltr) for _ in range(nh)])
        self.conv = nn.Conv2d(fltr * nh, fltr, kernel_size=1, stride=1, padding=0)

    def forward(self, input):
        attn = []
        feature_split = torch.chunk(input, chunks=self.nh, dim=1)

        x = self.attn[0](feature_split[0])
        attn.append(x)

        for i in range(1, self.nh):
            x = feature_split[i] + x
            x = self.attn[i](x)
            attn.append(x)

        mh_lka_attn = torch.cat(attn, dim=1)
        mh_lka_attn = self.conv(mh_lka_attn)
        return mh_lka_attn

In [5]:
class CAL(nn.Module):
    def __init__(self, fltr, nh):
        super(CAL, self).__init__()
        self.conv1 = nn.Conv2d(fltr, fltr, kernel_size=3, padding=1)
        self.layernorm1 = nn.LayerNorm(fltr, eps=1e-6)
        self.cdsa = CDSA(fltr, nh)
        self.conv2 = nn.Conv2d(fltr, fltr, kernel_size=1, padding=0)
        self.layernorm2 = nn.LayerNorm(fltr, eps=1e-6)

    def forward(self, input):
        x = self.conv1(input)
        rs1 = x = x + input
        x = self.layernorm1(x)
        x = self.cdsa(x)
        rs2 = x = rs1 + x
        x = self.layernorm2(x)
        x = self.conv2(x)
        x = rs2 + x
        return x

In [6]:
import torch
import timm
from torchsummary import summary
import torch.nn as nn
import torch.nn.functional as F

class AkashModel(nn.Module):
    def __init__(self):
        super(AkashModel, self).__init__()

        # Load the pre-trained model
        self.model = timm.create_model("hf_hub:timm/maxvit_tiny_tf_224.in1k", pretrained=True)

        # Define a dictionary to store activations
        self.activation = {}

        # Define a function to register forward hook
        def get_activation(name):
            def hook(module, input, output):
                self.activation[name] = output.detach()
            return hook

        # Register the forward hook to the desired layer
        layer_name = 'stages.2.blocks.4.attn_grid.drop_path2'
        desired_layer = self.model.stages[2].blocks[4].attn_grid.drop_path2
        desired_layer.register_forward_hook(get_activation(layer_name))

    def forward(self, x):
        # Forward pass through the pre-trained model
        output = self.model(x)

        # Retrieve the activation output from the dictionary
        activation_output = self.activation['stages.2.blocks.4.attn_grid.drop_path2']

        # Pass the activation output through the Spatial Self-Attention Processor
        processed_output = self.ssa(activation_output)

        mn_output = Conv2D(fltr, 1, padding='same', activation='relu')(processed_output)
        num_splits = 2
        CAL_out = CAL(mn_output,fltr,nh)

        CAL_out = GlobalAveragePooling2D()(CAL_out)
        out=Dense(classes,activation='softmax')(CAL_out)

        return out


In [8]:
loss_fun= 'categorical_crossentropy'
gpu_num=1
k=5
lr1=0.005
lr2=0.0001
image_size=224
classes=8
ratio=8
fltr=256
nh=2  # number of splits
mag='40'

In [17]:
model = AkashModel()
summary(model, input_size=(3, 224, 224))

AttributeError: 'AkashModel' object has no attribute 'ssa'

In [15]:
import torch.nn as nn
import timm
from torchsummary import summary
class MyModel(nn.Module):
    def __init__(self, num_classes, fltr=256, mn_output=256, nh=128):
        super(MyModel, self).__init__()
        self.model = timm.create_model("hf_hub:timm/maxvit_tiny_tf_224.in1k", pretrained=True)
        self.activation = {}
        def get_activation(name):
            def hook(module, input, output):
                self.activation[name] = output.detach()
            return hook
        layer_name = 'stages.2.blocks.4.attn_grid.drop_path2'
        desired_layer = self.model.stages[2].blocks[4].attn_grid.drop_path2
        desired_layer.register_forward_hook(get_activation(layer_name))

        self.conv = nn.Conv2d(256, fltr, 1)
        self.CAL_out = CAL(mn_output, fltr, nh)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(fltr, num_classes)

    def forward(self, x):
        output = self.model(x)
        activation_output = self.activation['stages.2.blocks.4.attn_grid.drop_path2']
        mn_output = self.conv(activation_output)
        mn_output = self.CAL_out(mn_output, fltr, nh)
        mn_output = self.pool(mn_output)
        out = self.fc(mn_output)
        return out

model = MyModel(num_classes=classes)
summary(model, input_size=(3, 224, 224))

RuntimeError: Given groups=1, weight of size [256, 256, 1, 1], expected input[2, 14, 14, 256] to have 256 channels, but got 14 channels instead