In [15]:
!pip install -q git+https://github.com/huggingface/transformers.git

In [16]:
from transformers import UperNetForSemanticSegmentation
import torch
# Load the pretrained `openmmlab/upernet-swin-tiny` segmentation checkpoint.
CHECKPOINT = "openmmlab/upernet-swin-tiny"
model = UperNetForSemanticSegmentation.from_pretrained(CHECKPOINT)

In [17]:
# Peek at the first and last ten entry names of the model's state_dict.
model_dict = model.state_dict()
entry_names = list(model_dict)  # iterating a mapping yields its keys, in order
print(entry_names[:10])
print(entry_names[-10:])

['backbone.embeddings.patch_embeddings.projection.weight', 'backbone.embeddings.patch_embeddings.projection.bias', 'backbone.embeddings.norm.weight', 'backbone.embeddings.norm.bias', 'backbone.encoder.layers.0.blocks.0.layernorm_before.weight', 'backbone.encoder.layers.0.blocks.0.layernorm_before.bias', 'backbone.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table', 'backbone.encoder.layers.0.blocks.0.attention.self.relative_position_index', 'backbone.encoder.layers.0.blocks.0.attention.self.query.weight', 'backbone.encoder.layers.0.blocks.0.attention.self.query.bias']
['decode_head.fpn_bottleneck.batch_norm.running_var', 'decode_head.fpn_bottleneck.batch_norm.num_batches_tracked', 'auxiliary_head.convs.0.conv.weight', 'auxiliary_head.convs.0.batch_norm.weight', 'auxiliary_head.convs.0.batch_norm.bias', 'auxiliary_head.convs.0.batch_norm.running_mean', 'auxiliary_head.convs.0.batch_norm.running_var', 'auxiliary_head.convs.0.batch_norm.num_batches_tracked', 'auxiliary_

In [18]:
# Switch the model to evaluation mode before running inference.
model.eval()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Random stand-in batch laid out as (batch_size, channels, height, width);
# adjust the dimensions if the model expects something else.
dummy_input = torch.randn(1, 3, 512, 512).to(device)

# Run one forward pass with gradient tracking disabled.
with torch.no_grad():
    output = model(dummy_input)

# Show the module hierarchy (printing does not depend on the no_grad context).
print(model)

UperNetForSemanticSegmentation(
  (backbone): SwinBackbone(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0): SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_features=96, out_features=96, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): SwinSelfOutpu

In [19]:
# Print the state_dict keys whose tensors contain at least one non-zero element.
# A sum-based test is unreliable here: values of opposite sign can cancel to
# exactly 0 and a non-zero tensor would be reported as zero. Counting non-zero
# elements is exact, and an empty tensor has a count of 0, so it is skipped too.
for key, param in model.state_dict().items():
    if torch.count_nonzero(param).item() > 0:
        print(key)

backbone.embeddings.patch_embeddings.projection.weight
backbone.embeddings.patch_embeddings.projection.bias
backbone.embeddings.norm.weight
backbone.embeddings.norm.bias
backbone.encoder.layers.0.blocks.0.layernorm_before.weight
backbone.encoder.layers.0.blocks.0.layernorm_before.bias
backbone.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table
backbone.encoder.layers.0.blocks.0.attention.self.relative_position_index
backbone.encoder.layers.0.blocks.0.attention.self.query.weight
backbone.encoder.layers.0.blocks.0.attention.self.query.bias
backbone.encoder.layers.0.blocks.0.attention.self.key.weight
backbone.encoder.layers.0.blocks.0.attention.self.key.bias
backbone.encoder.layers.0.blocks.0.attention.self.value.weight
backbone.encoder.layers.0.blocks.0.attention.self.value.bias
backbone.encoder.layers.0.blocks.0.attention.output.dense.weight
backbone.encoder.layers.0.blocks.0.attention.output.dense.bias
backbone.encoder.layers.0.blocks.0.layernorm_after.weight
backbon

In [26]:
import torch
from transformers import UperNetForSemanticSegmentation

# Load the pretrained UperNet model
model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-swin-tiny")

# Number of output classes for the replacement classifier layers.
NUM_CLASSES = 4


def _resized_classifier(classifier, num_classes):
    """Build a new Conv2d like `classifier` but with `num_classes` output channels.

    The input channels, kernel size, stride and padding are copied from the
    existing layer; the returned layer is freshly initialised (no weights are
    carried over).

    Args:
        classifier: the existing `torch.nn.Conv2d` classifier layer to mirror.
        num_classes: number of output channels for the new layer.

    Returns:
        A new `torch.nn.Conv2d`.
    """
    return torch.nn.Conv2d(
        in_channels=classifier.in_channels,
        out_channels=num_classes,
        kernel_size=classifier.kernel_size,
        stride=classifier.stride,
        padding=classifier.padding,
    )


# Swap the final classifier of the main decode head and of the auxiliary head,
# in that order, using one shared helper instead of two copy-pasted constructions.
for head in (model.decode_head, model.auxiliary_head):
    head.classifier = _resized_classifier(head.classifier, NUM_CLASSES)

# Keep the config's label count in step with the new heads so that anything
# reading `model.config.num_labels` sees the same number of classes.
model.config.num_labels = NUM_CLASSES

In [21]:
# Evaluation mode for the inference check below.
model.eval()

# Pick the compute device: CUDA if it is available, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model.to(device)

# Dummy batch shaped (batch_size, channels, height, width); change the sizes if needed.
batch_shape = (1, 3, 512, 512)
dummy_input = torch.randn(*batch_shape).to(device)

# Forward the dummy batch without tracking gradients.
with torch.no_grad():
    output = model(dummy_input)

# Print the architecture; this is unaffected by the no_grad context.
print(model)

UperNetForSemanticSegmentation(
  (backbone): SwinBackbone(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0): SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_features=96, out_features=96, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): SwinSelfOutpu

In [27]:
# Load the pretrained model state_dict
pretrained_model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-swin-tiny")
pretrained_state_dict = pretrained_model.state_dict()

# Key prefixes of the two heads whose weights must NOT be copied over.
HEAD_PREFIXES = ("decode_head.", "auxiliary_head.")

# Filter out keys related to decode_head and auxiliary_head
# (str.startswith accepts a tuple and matches any of the prefixes).
backbone_state_dict = {
    k: v for k, v in pretrained_state_dict.items() if not k.startswith(HEAD_PREFIXES)
}

# Load only the filtered state_dict into the modified model. strict=False is
# needed because the head keys are deliberately absent, but it also hides
# genuine mismatches, so check the report explicitly.
load_result = model.load_state_dict(backbone_state_dict, strict=False)

assert not load_result.unexpected_keys, (
    f"unexpected keys while loading backbone weights: {load_result.unexpected_keys}"
)
non_head_missing = [k for k in load_result.missing_keys if not k.startswith(HEAD_PREFIXES)]
assert not non_head_missing, f"non-head weights were not loaded: {non_head_missing}"

# Leave the report as the cell's last expression so it is still displayed.
load_result





_IncompatibleKeys(missing_keys=['decode_head.classifier.weight', 'decode_head.classifier.bias', 'decode_head.psp_modules.0.1.conv.weight', 'decode_head.psp_modules.0.1.batch_norm.weight', 'decode_head.psp_modules.0.1.batch_norm.bias', 'decode_head.psp_modules.0.1.batch_norm.running_mean', 'decode_head.psp_modules.0.1.batch_norm.running_var', 'decode_head.psp_modules.1.1.conv.weight', 'decode_head.psp_modules.1.1.batch_norm.weight', 'decode_head.psp_modules.1.1.batch_norm.bias', 'decode_head.psp_modules.1.1.batch_norm.running_mean', 'decode_head.psp_modules.1.1.batch_norm.running_var', 'decode_head.psp_modules.2.1.conv.weight', 'decode_head.psp_modules.2.1.batch_norm.weight', 'decode_head.psp_modules.2.1.batch_norm.bias', 'decode_head.psp_modules.2.1.batch_norm.running_mean', 'decode_head.psp_modules.2.1.batch_norm.running_var', 'decode_head.psp_modules.3.1.conv.weight', 'decode_head.psp_modules.3.1.batch_norm.weight', 'decode_head.psp_modules.3.1.batch_norm.bias', 'decode_head.psp_modu

In [29]:
import torch

def _zero_head_(head):
    """Zero, in place, every sub-module `weight`/`bias` and BatchNorm2d running stat of `head`.

    Walks `head.modules()` (the head itself and all nested sub-modules). For each
    module, a `weight` or `bias` attribute that exists and is not None is zeroed.
    For `torch.nn.BatchNorm2d` modules, `running_mean` and `running_var` are
    zeroed as well when they are present.

    Args:
        head: a `torch.nn.Module` whose tensors should be zeroed.
    """
    # In-place edits of parameters must not be recorded by autograd.
    with torch.no_grad():
        for module in head.modules():
            for attr in ("weight", "bias"):
                tensor = getattr(module, attr, None)
                if tensor is not None:
                    tensor.zero_()

            # Handle BatchNorm running statistics (if applicable). They are
            # None when the layer does not track running stats.
            if isinstance(module, torch.nn.BatchNorm2d):
                for stat in (module.running_mean, module.running_var):
                    if stat is not None:
                        stat.zero_()


# Zero out all parameters in the decode_head
_zero_head_(model.decode_head)

# Zero out all parameters in the auxiliary_head (if it exists). The attribute
# may be present but set to None, which a plain hasattr() check would not catch.
aux_head = getattr(model, 'auxiliary_head', None)
if aux_head is not None:
    _zero_head_(aux_head)


In [30]:
# Print the state_dict keys whose tensors still contain a non-zero element.
# Testing `sum != 0` can misreport a non-zero tensor as zero when positive and
# negative values cancel, so count the non-zero elements instead. An empty
# tensor has a count of 0 and is therefore skipped as before.
for key, param in model.state_dict().items():
    if torch.count_nonzero(param).item() > 0:
        print(key)

backbone.embeddings.patch_embeddings.projection.weight
backbone.embeddings.patch_embeddings.projection.bias
backbone.embeddings.norm.weight
backbone.embeddings.norm.bias
backbone.encoder.layers.0.blocks.0.layernorm_before.weight
backbone.encoder.layers.0.blocks.0.layernorm_before.bias
backbone.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table
backbone.encoder.layers.0.blocks.0.attention.self.relative_position_index
backbone.encoder.layers.0.blocks.0.attention.self.query.weight
backbone.encoder.layers.0.blocks.0.attention.self.query.bias
backbone.encoder.layers.0.blocks.0.attention.self.key.weight
backbone.encoder.layers.0.blocks.0.attention.self.key.bias
backbone.encoder.layers.0.blocks.0.attention.self.value.weight
backbone.encoder.layers.0.blocks.0.attention.self.value.bias
backbone.encoder.layers.0.blocks.0.attention.output.dense.weight
backbone.encoder.layers.0.blocks.0.attention.output.dense.bias
backbone.encoder.layers.0.blocks.0.layernorm_after.weight
backbon

In [25]:
# Persist just the state dictionary (tensors keyed by name), not the pickled model object.
checkpoint_path = "upernet-swin-tiny-backbone.pth"
torch.save(model.state_dict(), checkpoint_path)
