# InternImage Encoder Code Explanation

In [None]:
import torch
import torch.nn as nn
from collections import OrderedDict
import torch.utils.checkpoint as checkpoint
import torch.distributed as dist
import torch.nn.functional as F
import math
import logging

from .supporting_scripts.checkpoint import _load_checkpoint
from .supporting_scripts.dropPath import trunc_normal_, DropPath
from .supporting_scripts.logging import get_root_logger
from .supporting_scripts.weight_init import constant_init, trunc_normal_init

from . import *

The following two classes are small PyTorch modules that convert tensors between the two common layouts used in deep learning: channels first (NCHW) and channels last (NHWC).

The `to_channels_first` class converts a tensor from channels last format (NHWC) to channels first format (NCHW). In the forward method, it takes an input tensor x and rearranges its dimensions using the permute function to swap the channel dimension from the last position (index 3) to the second position (index 1), ensuring the channels are in the correct order for the channels first format.

Conversely, the `to_channels_last` class performs the opposite transformation, converting a tensor from channels first format (NCHW) to channels last format (NHWC). In its forward method, it rearranges the dimensions of the input tensor x using permute, moving the channel dimension from the second position (index 1) to the last position (index 3), thereby converting the tensor to channels last format.

In [None]:
class to_channels_first(nn.Module):
    """Reorder a 4-D tensor from channels-last (N, H, W, C) to
    channels-first (N, C, H, W)."""

    # Source axis feeding each output axis: batch stays put, the trailing
    # channel axis (3) is moved to position 1.
    _AXES = (0, 3, 1, 2)

    def forward(self, x):
        """Return ``x`` with its last axis moved to position 1."""
        return x.permute(*self._AXES)


class to_channels_last(nn.Module):
    """Reorder a 4-D tensor from channels-first (N, C, H, W) to
    channels-last (N, H, W, C)."""

    # Source axis feeding each output axis: batch stays put, the channel
    # axis (1) is moved to the end.
    _AXES = (0, 2, 3, 1)

    def forward(self, x):
        """Return ``x`` with axis 1 moved to the last position."""
        return x.permute(*self._AXES)

This function, `build_norm_layer`, is responsible for constructing a normalization layer based on specified parameters. Let's break it down:

The function takes several arguments:

* `dim`: The number of channels in the input tensor.
* `norm_layer`: The type of normalization layer to be used, which can be either "BN" (Batch Normalization) or "LN" (Layer Normalization).
* `in_format` and `out_format`: The format of the input and output tensors, which can be either "channels_first" (NCHW) or "channels_last" (NHWC). These parameters determine whether tensor format conversions need to be applied before or after applying the normalization layer.
* `eps`: A small value added to the denominator for numerical stability in normalization computations.

The function first initializes an empty list called layers to store the components of the normalization layer.

Next, it checks the type of normalization layer specified (BN or LN). If norm_layer is set to 'BN', it constructs a sequence of layers for batch normalization. Depending on the input and output tensor formats, it may append instances of `to_channels_first()` or `to_channels_last()` to ensure the tensor is in the correct format before and after applying batch normalization.

Similarly, if `norm_layer` is set to 'LN', it constructs a sequence of layers for layer normalization. Again, depending on the input and output tensor formats, it may append instances of `to_channels_first()` or `to_channels_last()` to ensure the tensor is in the correct format before and after applying layer normalization.

If the specified `norm_layer` is not supported, the function raises a NotImplementedError.

Finally, the function returns a nn.Sequential container containing all the layers constructed based on the specified parameters.

In [None]:
def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    """Build a normalization module wrapped with any needed layout changes.

    Args:
        dim (int): number of channels to normalize.
        norm_layer (str): ``'BN'`` for ``nn.BatchNorm2d`` or ``'LN'`` for
            ``nn.LayerNorm``.
        in_format (str): layout of the incoming tensor, ``'channels_last'``
            or ``'channels_first'``.
        out_format (str): layout the returned module should produce.
        eps (float): epsilon passed to ``nn.LayerNorm``; it is not used for
            the ``'BN'`` case.

    Returns:
        nn.Sequential: optional input permute, the norm, optional output
        permute.

    Raises:
        NotImplementedError: if ``norm_layer`` is neither ``'BN'`` nor
            ``'LN'``.
    """
    if norm_layer not in ('BN', 'LN'):
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')

    use_bn = norm_layer == 'BN'
    # BatchNorm2d works on channels-first data and LayerNorm on
    # channels-last data; `foreign` is the layout that needs a permute.
    foreign = 'channels_last' if use_bn else 'channels_first'

    pipeline = []
    if in_format == foreign:
        pipeline.append(to_channels_first() if use_bn else to_channels_last())
    if use_bn:
        pipeline.append(nn.BatchNorm2d(dim))
    else:
        pipeline.append(nn.LayerNorm(dim, eps=eps))
    if out_format == foreign:
        pipeline.append(to_channels_last() if use_bn else to_channels_first())
    return nn.Sequential(*pipeline)

This function, `build_act_layer`, is responsible for constructing an activation layer based on the specified activation function type. 

The function takes a single argument:
* `act_layer`: The type of activation layer to be used, which can be one of the following: "ReLU", "SiLU" (Sigmoid Linear Unit), or "GELU" (Gaussian Error Linear Unit).

The function first checks the type of activation layer specified (`ReLU`, `SiLU`, or `GELU`).

If `act_layer` is set to `'ReLU'`, it returns an instance of `nn.ReLU` activation function with `inplace=True`, meaning it modifies the input tensor in-place, which can save memory. If `act_layer` is set to `'SiLU'`, it returns an instance of `nn.SiLU` activation function with `inplace=True`. If `act_layer` is set to `'GELU'`, it returns an instance of `nn.GELU` activation function. If the specified `act_layer` is not supported, the function raises a `NotImplementedError`.

In [None]:
def build_act_layer(act_layer):
    """Return a new activation module selected by name.

    Args:
        act_layer (str): one of ``'ReLU'``, ``'SiLU'`` or ``'GELU'``.

    Returns:
        nn.Module: ``nn.ReLU(inplace=True)``, ``nn.SiLU(inplace=True)`` or
        ``nn.GELU()``.

    Raises:
        NotImplementedError: for any other ``act_layer`` value.
    """
    # (name, constructor, constructor kwargs) for every supported activation.
    supported = (
        ('ReLU', nn.ReLU, {'inplace': True}),
        ('SiLU', nn.SiLU, {'inplace': True}),
        ('GELU', nn.GELU, {}),
    )
    for name, ctor, ctor_kwargs in supported:
        if act_layer == name:
            return ctor(**ctor_kwargs)

    raise NotImplementedError(f'build_act_layer does not support {act_layer}')

This class, `StemLayer`, is used as the initial processing stage of the InternImage network, responsible for extracting basic features from the input images.

The class constructor `__init__` takes the following arguments:
- `in_chans`: The number of input channels to the stem layer. Default is 3, assuming RGB images.
- `out_chans`: The number of output channels from the stem layer. Default is 96.
- `act_layer`: The type of activation layer to be used within the stem layer. Default is 'GELU' (Gaussian Error Linear Unit).
- `norm_layer`: The type of normalization layer to be used within the stem layer. Default is 'BN' (Batch Normalization).

Inside the constructor, the stem layer is defined as follows:

1. `self.conv1`: A 2D convolutional layer (`nn.Conv2d`) with parameters specified by `in_chans`, `out_chans // 2` (half of the output channels), kernel size 3x3, stride 2, and padding 1. This layer reduces the spatial dimensions of the input tensor while increasing its depth.
2. `self.norm1`: The normalization layer applied after the first convolution. It is constructed using the `build_norm_layer` function with parameters derived from `out_chans // 2` (the number of channels output by the first convolution), `norm_layer`, and input and output formats specified as 'channels_first'.
3. `self.act`: The activation layer specified by `act_layer`. It is constructed using the `build_act_layer` function.
4. `self.conv2`: Another 2D convolutional layer with parameters similar to `self.conv1`, but now operating on the output channels from the first convolutional layer.
5. `self.norm2`: The normalization layer applied after the second convolution. It is constructed using the `build_norm_layer` function with parameters derived from `out_chans` (the total number of output channels), `norm_layer`, and input and output formats specified as 'channels_first' and 'channels_last', respectively.

The `forward` method defines the forward pass of the stem layer. It applies each layer sequentially:
- Convolution 1
- Normalization 1
- Activation
- Convolution 2
- Normalization 2

Finally, it returns the output tensor `x`.

In [None]:
class StemLayer(nn.Module):
    r""" Stem layer of InternImage

    Two 3x3, stride-2 convolutions, each followed by a normalization layer,
    with one activation between them. The input is consumed channels-first
    and the result is emitted channels-last (see ``norm2``).

    Args:
        in_chans (int): number of input channels
        out_chans (int): number of output channels; the first convolution
            produces ``out_chans // 2`` channels
        act_layer (str): activation layer name, see ``build_act_layer``
        norm_layer (str): normalization layer name, see ``build_norm_layer``
    """

    def __init__(self,
                 in_chans=3,
                 out_chans=96,
                 act_layer='GELU',
                 norm_layer='BN'):
        super().__init__()
        mid_chans = out_chans // 2
        # Both convolutions share the same geometry.
        conv_cfg = dict(kernel_size=3, stride=2, padding=1)

        self.conv1 = nn.Conv2d(in_chans, mid_chans, **conv_cfg)
        self.norm1 = build_norm_layer(mid_chans,
                                      norm_layer,
                                      in_format='channels_first',
                                      out_format='channels_first')
        self.act = build_act_layer(act_layer)
        self.conv2 = nn.Conv2d(mid_chans, out_chans, **conv_cfg)
        # The last norm also switches the layout to channels-last.
        self.norm2 = build_norm_layer(out_chans,
                                      norm_layer,
                                      in_format='channels_first',
                                      out_format='channels_last')

    def forward(self, x):
        """Apply conv1 -> norm1 -> act -> conv2 -> norm2 to ``x``."""
        pipeline = (self.conv1, self.norm1, self.act, self.conv2, self.norm2)
        for stage in pipeline:
            x = stage(x)
        return x


This class, `DownsampleLayer`, is used to reduce the spatial dimensions of the feature maps in the InternImage network, facilitating hierarchical feature extraction and increasing computational efficiency. 

The class constructor `__init__` takes the following arguments:
- `channels`: The number of input channels to the downsample layer.
- `norm_layer`: The type of normalization layer to be used within the downsample layer. Default is 'LN' (Layer Normalization).

Inside the constructor, the downsample layer is defined as follows:

1. `self.conv`: A 2D convolutional layer (`nn.Conv2d`) with parameters specified by `channels`, `2 * channels` (twice the number of input channels), kernel size 3x3, stride 2, padding 1, and no bias. This layer reduces the spatial dimensions of the input tensor by a factor of 2 while increasing its depth.
2. `self.norm`: The normalization layer applied after the convolution. It is constructed using the `build_norm_layer` function with parameters derived from `2 * channels` (the number of output channels from the convolution), `norm_layer`, and input and output formats specified as 'channels_first' and 'channels_last', respectively.

The `forward` method defines the forward pass of the downsample layer. It applies each layer sequentially:
- Permute the dimensions of the input tensor to convert it from 'channels_last' format to 'channels_first' format.
- Convolution
- Normalization

Finally, it returns the output tensor `x`.

In [None]:
class DownsampleLayer(nn.Module):
    r""" Downsample layer of InternImage

    A bias-free 3x3, stride-2 convolution that doubles the channel count,
    followed by a normalization layer that returns channels-last data.

    Args:
        channels (int): number of input channels
        norm_layer (str): normalization layer name, see ``build_norm_layer``
    """

    def __init__(self, channels, norm_layer='LN'):
        super().__init__()
        out_channels = 2 * channels
        self.conv = nn.Conv2d(channels,
                              out_channels,
                              kernel_size=3,
                              stride=2,
                              padding=1,
                              bias=False)
        self.norm = build_norm_layer(out_channels,
                                     norm_layer,
                                     in_format='channels_first',
                                     out_format='channels_last')

    def forward(self, x):
        """Downsample ``x``, which is permuted (0, 3, 1, 2) before the conv."""
        # Move the trailing channel axis to position 1 for nn.Conv2d.
        channels_first = x.permute(0, 3, 1, 2)
        return self.norm(self.conv(channels_first))

This class, `MLPLayer`, represents a Multilayer Perceptron (MLP)/Feed-Forward Network (FFN) layer used in the InternImage model. 

The class constructor `__init__` takes the following arguments:
- `in_features`: The number of input features to the MLP layer.
- `hidden_features`: The number of hidden features in the MLP layer. If not provided, defaults to `in_features`.
- `out_features`: The number of output features from the MLP layer. If not provided, defaults to `in_features`.
- `act_layer`: The type of activation layer to be used within the MLP layer. Default is 'GELU' (Gaussian Error Linear Unit).
- `drop`: The dropout rate to be applied to the output of the MLP layer. Default is 0.0, meaning no dropout is applied.

Inside the constructor, the MLP layer is defined as follows:

1. `self.fc1`: A fully connected (linear) layer (`nn.Linear`) mapping the input features to the hidden features.
2. `self.act`: The activation layer specified by `act_layer`. It is constructed using the `build_act_layer` function.
3. `self.fc2`: Another fully connected layer mapping the hidden features to the output features.
4. `self.drop`: A dropout layer (`nn.Dropout`), with dropout rate specified by `drop`, applied once after the activation and once more after the second fully connected layer.

The `forward` method defines the forward pass of the MLP layer. It applies each layer sequentially:
- Fully connected layer 1 (`self.fc1`)
- Activation layer (`self.act`)
- Dropout (`self.drop`)
- Fully connected layer 2 (`self.fc2`)
- Dropout (`self.drop`)

Finally, it returns the output tensor `x`.

In [None]:
class MLPLayer(nn.Module):
    r""" MLP layer of InternImage

    Linear -> activation -> dropout -> Linear -> dropout. The same dropout
    module is used at both positions.

    Args:
        in_features (int): number of input features
        hidden_features (int): number of hidden features; falls back to
            ``in_features`` when falsy
        out_features (int): number of output features; falls back to
            ``in_features`` when falsy
        act_layer (str): activation layer name, see ``build_act_layer``
        drop (float): dropout rate
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer='GELU',
                 drop=0.):
        super().__init__()
        hidden_dim = hidden_features if hidden_features else in_features
        out_dim = out_features if out_features else in_features

        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.act = build_act_layer(act_layer)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Run ``x`` through fc1, act, drop, fc2 and drop, in that order."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

In [None]:
channels=64,
depths=[4, 4, 18, 4],
groups=[4, 8, 16, 32],
mlp_ratio=4.,
drop_rate=0.,
drop_path_rate=0.2,
drop_path_type='linear',
act_layer='GELU',
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
with_cp=False,
dw_kernel_size=None,  # for InternImage-H/G
level2_post_norm=False,  # for InternImage-H/G
level2_post_norm_block_ids=None,  # for InternImage-H/G
res_post_norm=False,  # for InternImage-H/G
center_feature_scale=False,  # for InternImage-H/G
out_indices=(0, 1, 2, 3),
init_cfg=None,
**kwargs):

This class, `InternImageLayer`, is a fundamental building block of the InternImage model, incorporating various operations and techniques to enhance feature extraction and modeling capabilities. 

The class constructor `__init__` takes the following arguments:
- `core_op`: The core operation of the InternImage layer, `DCNv3_pytorch`.
- `channels`: The number of input channels to the layer.
- `groups`: Groups of each block.
- `mlp_ratio`: The ratio of MLP hidden features to input channels. Default is 4.0.
- `drop`: The dropout rate to be applied within the layer. Default is 0.0.
- `drop_path`: The drop path rate. Default is 0.0.
- `act_layer`: The type of activation layer to be used within the layer. Default is 'GELU' (Gaussian Error Linear Unit).
- `norm_layer`: The type of normalization layer to be used within the layer. Default is 'LN' (Layer Normalization).
- `post_norm`: Whether to use post normalization. Default is False.
- `layer_scale`: Initial value of the learnable layer-scale parameters (`gamma1`, `gamma2`). Layer scale is disabled when this is `None`. Default is None.
- `offset_scale`: The offset scale. Default is 1.0.
- `with_cp`: Whether to use checkpoint. Default is False.
- `dw_kernel_size`: For InternImage-H/G. Default is None.
- `res_post_norm`: For InternImage-H/G. Default is False.
- `center_feature_scale`: For InternImage-H/G. Default is False.

Inside the constructor, the InternImage layer is defined as follows:

1. `self.norm1`: The normalization layer applied before the core operation.
2. `self.post_norm`: A boolean indicating whether post normalization is used.
3. `self.dcn`: The core operation, `DCNv3_pytorch`.
4. `self.drop_path`: The drop path layer.
5. `self.norm2`: The normalization layer applied after the core operation.
6. `self.mlp`: An MLP layer applied after the core operation.
7. `self.layer_scale`: A boolean indicating whether layer scale is applied.
8. `self.gamma1`, `self.gamma2`: Parameters for layer scaling.
9. `self.res_post_norm`: A boolean indicating whether residual post normalization is used.

The `forward` method defines the forward pass of the InternImage layer. It applies the core operation, normalization layers, drop path, and optionally layer scale and checkpointing.

The input tensor `x` is processed by an inner function, `_inner_forward(x)`, which holds the core computational logic of the layer.

If `self.layer_scale` is not enabled, indicating that layer scaling is not applied, the forward pass proceeds with the following steps:
1. The core operation `self.dcn` is applied. With pre-normalization (the default), the input is first normalized by `self.norm1` and then passed to `self.dcn`. With post-normalization (`post_norm=True`), `self.dcn` is applied to the un-normalized input and its output is then normalized by `self.norm1`. With `res_post_norm=True` (and `post_norm=False`), the pre-normalized result is additionally normalized by `self.res_post_norm1`.
2. The result is passed through the `self.drop_path` layer to apply drop path regularization and is added back to the input as a residual connection.
3. The MLP operation `self.mlp` is applied in the same way, using `self.norm2` (and `self.res_post_norm2`): normalization comes before the MLP in the pre-normalization case and after it in the post-normalization case.
4. The output of the MLP branch is again passed through the `self.drop_path` layer and added back as a residual connection.

If `self.layer_scale` is enabled, indicating that layer scaling is applied, an additional step is performed:
1. The output of the core-operation branch (after `self.norm1` in the post-normalization case) is multiplied by the learnable per-channel parameter `self.gamma1` before drop path and the residual addition. Similarly, the output of the MLP branch is scaled by `self.gamma2`. The `res_post_norm` layers are not used on this path.

If `self.with_cp` is enabled and the input tensor `x` requires gradient computation, the forward pass is wrapped in a checkpointing function using `checkpoint.checkpoint(_inner_forward, x)`. This reduces memory use during training: intermediate activations are not stored in the forward pass and are instead recomputed during backpropagation, at the cost of extra computation.

Finally, the processed tensor `x` is returned as the output of the forward pass through the layer. 

In [2]:
class InternImageLayer(nn.Module):
    r""" Basic layer of InternImage

    Two residual branches applied in sequence: a DCNv3 branch followed by an
    MLP branch. Each branch output goes through drop path before being added
    back to its input.

    Args:
        core_op (nn.Module): core operation of InternImage. Accepted for
            interface compatibility; this class always builds
            ``DCNv3_pytorch`` and does not read this argument.
        channels (int): number of input channels
        groups (int): group count passed to the DCN operator
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate used inside the MLP
        drop_path (float): drop path rate; ``nn.Identity`` is used when it
            is not greater than 0
        act_layer (str): activation layer name
        norm_layer (str): normalization layer name, forwarded to the DCN
            operator only; the layer's own norms are always ``'LN'``
        post_norm (bool): whether to normalize after each operator instead
            of before it
        layer_scale (float): initial value of ``gamma1``/``gamma2``; layer
            scale is disabled when ``None``
        offset_scale (float): offset scale passed to the DCN operator
        with_cp (bool): whether to use checkpoint
        dw_kernel_size (int): passed to the DCN operator (InternImage-H/G)
        res_post_norm (bool): add an extra LN after each pre-normed operator
            (InternImage-H/G); only used when ``post_norm`` is False and
            layer scale is disabled
        center_feature_scale (bool): passed to the DCN operator
            (InternImage-H/G)
    """

    def __init__(self,
                 core_op,
                 channels,
                 groups,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 layer_scale=None,
                 offset_scale=1.0,
                 with_cp=False,
                 dw_kernel_size=None, # for InternImage-H/G
                 res_post_norm=False, # for InternImage-H/G
                 center_feature_scale=False): # for InternImage-H/G
        super().__init__()
        # Plain configuration flags.
        self.channels = channels
        self.groups = groups
        self.mlp_ratio = mlp_ratio
        self.with_cp = with_cp
        self.post_norm = post_norm
        self.res_post_norm = res_post_norm
        self.layer_scale = layer_scale is not None

        # Sub-modules, registered in the order norm1, dcn, drop_path, norm2,
        # mlp so that the module ordering stays stable.
        self.norm1 = build_norm_layer(channels, 'LN')
        dcn_kwargs = dict(
            channels=channels,
            kernel_size=3,
            stride=1,
            pad=1,
            dilation=1,
            group=groups,
            offset_scale=offset_scale,
            act_layer=act_layer,
            norm_layer=norm_layer,
            dw_kernel_size=dw_kernel_size, # for InternImage-H/G
            center_feature_scale=center_feature_scale, # for InternImage-H/G
        )
        self.dcn = DCNv3_pytorch(**dcn_kwargs)
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = build_norm_layer(channels, 'LN')
        self.mlp = MLPLayer(in_features=channels,
                            hidden_features=int(channels * mlp_ratio),
                            act_layer=act_layer,
                            drop=drop)

        if self.layer_scale:
            # One learnable per-channel scale for each residual branch.
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
                                       requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
                                       requires_grad=True)
        if res_post_norm:
            self.res_post_norm1 = build_norm_layer(channels, 'LN')
            self.res_post_norm2 = build_norm_layer(channels, 'LN')

    def forward(self, x):
        """Apply the DCN branch and then the MLP branch to ``x``."""

        def _inner_forward(x):
            # Branch 1 is the DCN and branch 2 the MLP; both follow the
            # same recipe and differ only in the modules they use.
            branches = ((self.dcn, self.norm1, 1), (self.mlp, self.norm2, 2))
            for op, norm, idx in branches:
                if self.post_norm:
                    out = norm(op(x))
                else:
                    out = op(norm(x))
                    # The extra residual post-norm applies only on the
                    # pre-norm path without layer scale.
                    if self.res_post_norm and not self.layer_scale:
                        out = getattr(self, f'res_post_norm{idx}')(out)
                if self.layer_scale:
                    out = getattr(self, f'gamma{idx}') * out
                x = x + self.drop_path(out)
            return x

        use_checkpoint = self.with_cp and x.requires_grad
        if use_checkpoint:
            return checkpoint.checkpoint(_inner_forward, x)
        return _inner_forward(x)


NameError: name 'nn' is not defined

The `InternImageBlock` class represents the basic block of the InternImage model. It is designed to process input tensors, from the Stem block, through a sequence of InternImageLayer instances. Each of these InternImageLayer instances, has customizable parameters necessary to extract hierarchical features from the input data.

The class constructor `__init__` takes several previously discussed arguments as well as the following:
- `depth`: The number of InternImage layers (deformable convolution, followed by LN, FFN and LN) to arrange sequentially.
- `downsample`: Whether to apply downsampling. Default is True.

The `forward` method of the `InternImageBlock` iterates over each layer in `blocks`, applying the forward pass of each `InternImageLayer` instance to the input tensor `x`; if `post_norm_block_ids` is set and the layer index is in that list, the matching layer from `post_norms` is applied as well. After all layers have been processed, if post normalization is disabled or center feature scale is enabled, the output tensor `x` is normalized using the `norm` layer. If the `return_wo_downsample` flag is set to True, this tensor is saved as `x_` before any downsampling. If downsampling is enabled, `x` is then passed through the `DownsampleLayer` instance `downsample` to reduce its spatial dimensions. Finally, the function returns either the (possibly downsampled) tensor `x` alone or both `x` and the pre-downsampling tensor `x_`, depending on the value of `return_wo_downsample`.

Let's break down the `InternImageBlock` class forward method:

The forward loop in the `InternImageBlock` class iterates over the blocks of `InternImageLayer` instances contained within the `blocks` module list. Each block is applied sequentially to the input tensor `x`.

Within the loop (steps 1-2) and after it (steps 3-5):
1. For each block, the input tensor `x` is passed through the `InternImageLayer` instance `blk`. This involves applying the core operation, normalization layers, and optionally drop path regularization within the block.
2. If `post_norm_block_ids` is not None and the current block index `i` is in that list, post-normalization is applied to the output tensor `x` using the corresponding post-normalization layer from the `post_norms` module list.
3. After the loop, if post-normalization is not enabled or if `center_feature_scale` is True, the output tensor `x` is normalized using the `norm` layer.
4. If downsampling is enabled (`downsample` is not None), the output tensor `x` is passed through the `DownsampleLayer` instance `downsample` to reduce its spatial dimensions.
5. If `return_wo_downsample` is True, the output tensor `x` from step 3 is saved as `x_` before the downsampling of step 4 is applied.

Finally, the function returns either the downsampled output tensor `x` or both the downsampled output tensor `x` and the original output tensor `x_`, depending on the value of `return_wo_downsample`.

In [None]:
class InternImageBlock(nn.Module):
    r""" Block of InternImage

    A stack of ``depth`` ``InternImageLayer`` modules, optionally followed
    by a LayerNorm and a ``DownsampleLayer``.

    Args:
        core_op (nn.Module): core operation of InternImage, forwarded to
            every layer
        channels (int): number of input channels
        depth (int): number of layers in this block
        groups (int): group count forwarded to every layer
        downsample (bool): whether to append a ``DownsampleLayer``
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate
        drop_path (float | list): drop path rate; a list is indexed per
            layer, any other value is shared by all layers
        act_layer (str): activation layer name
        norm_layer (str): normalization layer name, forwarded to the layers
            and to the downsample layer; the block's own norms are ``'LN'``
        post_norm (bool): whether to use post normalization
        offset_scale (float): offset scale
        layer_scale (float): layer scale
        with_cp (bool): whether to use checkpoint
        dw_kernel_size (int): forwarded to every layer (InternImage-H/G)
        post_norm_block_ids (list): indices of layers that are followed by
            an extra LN (InternImage-H/G)
        res_post_norm (bool): forwarded to every layer (InternImage-H/G)
        center_feature_scale (bool): forwarded to every layer; also forces
            the block-level norm to be created (InternImage-H/G)
    """

    def __init__(self,
                 core_op,
                 channels,
                 depth,
                 groups,
                 downsample=True,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 offset_scale=1.0,
                 layer_scale=None,
                 with_cp=False,
                 dw_kernel_size=None, # for InternImage-H/G
                 post_norm_block_ids=None, # for InternImage-H/G
                 res_post_norm=False, # for InternImage-H/G
                 center_feature_scale=False): # for InternImage-H/G
        super().__init__()
        self.channels = channels
        self.depth = depth
        self.post_norm = post_norm
        self.center_feature_scale = center_feature_scale
        self.post_norm_block_ids = post_norm_block_ids

        # Settings shared by every layer; only drop_path may vary per layer.
        shared_cfg = dict(
            core_op=core_op,
            channels=channels,
            groups=groups,
            mlp_ratio=mlp_ratio,
            drop=drop,
            act_layer=act_layer,
            norm_layer=norm_layer,
            post_norm=post_norm,
            layer_scale=layer_scale,
            offset_scale=offset_scale,
            with_cp=with_cp,
            dw_kernel_size=dw_kernel_size, # for InternImage-H/G
            res_post_norm=res_post_norm, # for InternImage-H/G
            center_feature_scale=center_feature_scale, # for InternImage-H/G
        )
        per_layer_rate = isinstance(drop_path, list)
        self.blocks = nn.ModuleList([
            InternImageLayer(
                drop_path=drop_path[i] if per_layer_rate else drop_path,
                **shared_cfg)
            for i in range(depth)
        ])

        if center_feature_scale or not self.post_norm:
            self.norm = build_norm_layer(channels, 'LN')
        if post_norm_block_ids is not None: # for InternImage-H/G
            extra_norms = [
                build_norm_layer(channels, 'LN', eps=1e-6)
                for _ in post_norm_block_ids
            ]
            self.post_norms = nn.ModuleList(extra_norms)
        if downsample:
            self.downsample = DownsampleLayer(channels=channels,
                                              norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, return_wo_downsample=False):
        """Run the stack on ``x``.

        Returns the final tensor, or ``(final, before_downsample)`` when
        ``return_wo_downsample`` is true.
        """
        norm_ids = self.post_norm_block_ids
        for idx, layer in enumerate(self.blocks):
            x = layer(x)
            if norm_ids is None or idx not in norm_ids:
                continue
            # for InternImage-H/G: extra norm after selected layers
            x = self.post_norms[norm_ids.index(idx)](x)

        if self.center_feature_scale or not self.post_norm:
            x = self.norm(x)

        before_downsample = x
        if self.downsample is not None:
            x = self.downsample(x)

        return (x, before_downsample) if return_wo_downsample else x

The `InternImage` class is an implementation of the InternImage model and represents the InternImage Encoder. It returns a list of tensors, each corresponding to one stage of the model.

The model architecture consists of a series of stages, each containing multiple levels represented by `InternImageBlock` instances. These blocks are designed to process input tensors through a combination of deformable convolutions, normalization, activation, and downsampling operations. The model utilizes the specified core operator, activation layer, normalization layer, and dropout settings to learn hierarchical representations from the input data.

The class constructor `__init__` takes several arguments:
- `core_op` (str): Specifies the core operator for the model. Default: 'DCNv3_pytorch'.
- `channels` (int): Number of channels in the first stage of the model. Default: 64.
- `depths` (list): Depth of each block within the model. Default: [4, 4, 18, 4].
- `groups` (list): Number of groups for each block. Default: [4, 8, 16, 32].
- `mlp_ratio` (float): Ratio of MLP hidden dimension to embedding dimension. Default: 4.0.
- `drop_rate` (float): Dropout probability. Default: 0.0.
- `drop_path_rate` (float): Stochastic depth rate. Default: 0.2.
- `drop_path_type` (str): Type of drop path strategy ('linear' or 'uniform'). Default: 'linear'.
- `act_layer` (str): Activation layer. Default: 'GELU'.
- `norm_layer` (str): Normalization layer. Default: 'LN'.
- `layer_scale` (bool): Whether to use layer scale. Default: None.
- `offset_scale` (float): Scale factor for offsets in deformable convolutions. Default: 1.0.
- `post_norm` (bool): Whether to use post normalization. Default: False.
- `with_cp` (bool): Whether to use checkpointing during training. Default: False.
- `dw_kernel_size` (int): Size of the depthwise convolution kernel. Default: None.
- `level2_post_norm` (bool): Whether to use level 2 post normalization. Default: False.
- `level2_post_norm_block_ids` (list): Indexes of post normalization blocks for level 2. Default: None.
- `res_post_norm` (bool): Whether to use residual post normalization. Default: False.
- `center_feature_scale` (bool): Whether to use center feature scale. Default: False.
- `out_indices` (tuple): Indexes of levels to output features from. Default: (0, 1, 2, 3).
- `init_cfg` (dict): Configuration for weight initialization. Default: None.
- Additional keyword arguments (`**kwargs`) for flexibility and extensibility.

The forward pass in the `InternImage` class involves embedding the input image patches, applying positional dropout, processing the embedded features through each level of the model, collecting features from specified levels, and returning the collected features for further analysis or downstream tasks. Let's outline the steps for the forward pass more clearly: 

1. **Patch Embedding:** 
   - The input image tensor `x` is passed through the `patch_embed` module, which performs patch embedding to extract features from the image. 
   - The output is a tensor representing the embedded patches.

2. **Positional Dropout:**
   - The embedded patches undergo positional dropout, which randomly zeroes out elements in the tensor based on a dropout probability specified by `drop_rate`.
   - This helps regularize the model and prevent overfitting by adding noise to the embedded features.

3. **Processing through Levels:**
   - The embedded and dropout-applied tensor is sequentially processed through each level of the model.
   - For each level, the tensor is passed through an `InternImageBlock` instance, which applies a series of operations including deformable convolutions, normalization, activation, and optionally downsampling.
   - The output of each level is collected and stored for further processing.

4. **Output Collection:**
   - Features from specified levels, as indicated by `out_indices`, are collected and stored in `seq_out`.
   - These features represent hierarchical representations of the input image at different scales, capturing both low-level and high-level information.

5. **Return:**
   - The collected features (`seq_out`) are returned as the output of the forward pass.
   - These features can be used for downstream tasks such as classification, object detection, or segmentation.


Overall, the `InternImage` class encapsulates the architecture and functionality of the InternImage model, providing a versatile and scalable solution for various computer vision tasks.

In [None]:
class InternImage(nn.Module):
    r""" InternImage
        A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions`  -
          https://arxiv.org/abs/2211.05778
    Args:
        core_op (str): Name of the core operator class looked up in
            `modules.dcnv3_pytorch`. Default: 'DCNv3_pytorch'
        channels (int): Number of channels produced by the stem; level `i`
            uses `channels * 2**i`. Default: 64
        depths (sequence[int]): Depth of each level. Default: (4, 4, 18, 4)
        groups (sequence[int]): Groups of each level. Default: (4, 8, 16, 32)
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Probability of an element to be zeroed. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2
        drop_path_type (str): 'linear' ramps the drop-path rate from 0 to
            `drop_path_rate` over all blocks; 'uniform' uses `drop_path_rate`
            for every block. Default: 'linear'
        act_layer (str): Activation layer. Default: 'GELU'
        norm_layer (str): Normalization layer. Default: 'LN'
        layer_scale: Layer-scale setting forwarded unchanged to every
            `InternImageBlock`. Default: None
        offset_scale (float): Forwarded unchanged to every
            `InternImageBlock`. Default: 1.0
        post_norm (bool): Forwarded unchanged to every `InternImageBlock`.
            Default: False
        with_cp (bool): Use checkpoint or not; forwarded to every
            `InternImageBlock` (presumably trades compute for memory via
            `torch.utils.checkpoint` -- confirm in the block). Default: False
        dw_kernel_size (int): Size of the dwconv. Default: None
        level2_post_norm (bool): Whether to use level2 post norm. Default: False
        level2_post_norm_block_ids (list): Indexes of post norm blocks;
            only passed to the level with index 2, and only when
            `level2_post_norm` is True. Default: None
        res_post_norm (bool): Whether to use res post norm. Default: False
        center_feature_scale (bool): Whether to use center feature scale. Default: False
        out_indices (tuple[int]): Indexes of the levels whose features are
            returned by `forward`. Default: (0, 1, 2, 3)
        init_cfg: Optional config containing a 'checkpoint' entry that
            `init_weights` loads pre-trained weights from. Default: None
        **kwargs: Accepted for config compatibility and ignored.
    """

    def __init__(self,
                 core_op='DCNv3_pytorch',
                 channels=64,
                 depths=(4, 4, 18, 4),
                 groups=(4, 8, 16, 32),
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.2,
                 drop_path_type='linear',
                 act_layer='GELU',
                 norm_layer='LN',
                 layer_scale=None,
                 offset_scale=1.0,
                 post_norm=False,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 level2_post_norm=False,  # for InternImage-H/G
                 level2_post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False,  # for InternImage-H/G
                 out_indices=(0, 1, 2, 3),
                 init_cfg=None,
                 **kwargs):
        super().__init__()
        # Defaults are immutable tuples (no shared mutable default); the
        # attributes are still stored as lists, as before.
        depths = list(depths)
        groups = list(groups)

        self.core_op = core_op
        self.num_levels = len(depths)
        self.depths = depths
        self.channels = channels
        self.num_features = int(channels * 2**(self.num_levels - 1))
        self.post_norm = post_norm
        self.mlp_ratio = mlp_ratio
        self.init_cfg = init_cfg
        self.out_indices = out_indices
        self.level2_post_norm_block_ids = level2_post_norm_block_ids
        logger = get_root_logger()
        logger.info(f'using core type: {core_op}')
        logger.info(f'using activation layer: {act_layer}')
        logger.info(f'using main norm layer: {norm_layer}')
        logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
        logger.info(f"level2_post_norm: {level2_post_norm}")
        logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
        logger.info(f"res_post_norm: {res_post_norm}")

        in_chans = 3
        self.patch_embed = StemLayer(in_chans=in_chans,
                                     out_chans=channels,
                                     act_layer=act_layer,
                                     norm_layer=norm_layer)
        self.pos_drop = nn.Dropout(p=drop_rate)

        # One drop-path rate per block, over all levels.
        total_depth = sum(depths)
        if drop_path_type == 'uniform':
            dpr = [drop_path_rate] * total_depth
        else:
            dpr = [
                x.item()
                for x in torch.linspace(0, drop_path_rate, total_depth)
            ]

        self.levels = nn.ModuleList()
        block_start = 0  # index into dpr of the current level's first block
        for i, depth in enumerate(depths):
            block_stop = block_start + depth
            post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
                i == 2) else None  # for InternImage-H/G
            level = InternImageBlock(
                core_op=core_op,
                channels=int(channels * 2**i),
                depth=depth,
                groups=groups[i],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[block_start:block_stop],
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                # every level except the last one downsamples
                downsample=(i < self.num_levels - 1),
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                post_norm_block_ids=post_norm_block_ids,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            )
            self.levels.append(level)
            block_start = block_stop

        self.num_layers = len(depths)
        self.apply(self._init_weights)
        self.apply(self._init_deform_weights)

    @staticmethod
    def _strip_state_dict_prefixes(raw_state_dict):
        """Return a copy of `raw_state_dict` with wrapper prefixes removed.

        A leading 'backbone.' is removed from every key that has it. Then,
        if the first remaining key starts with 'module.', that prefix is
        removed from every key that carries it. An empty mapping yields an
        empty result instead of raising.
        """
        backbone_prefix = 'backbone.'
        module_prefix = 'module.'

        state_dict = OrderedDict()
        for k, v in raw_state_dict.items():
            if k.startswith(backbone_prefix):
                state_dict[k[len(backbone_prefix):]] = v
            else:
                state_dict[k] = v

        first_key = next(iter(state_dict), None)
        if first_key is not None and first_key.startswith(module_prefix):
            state_dict = OrderedDict(
                (k[len(module_prefix):] if k.startswith(module_prefix) else k,
                 v) for k, v in state_dict.items())
        return state_dict

    def init_weights(self):
        """Initialise from scratch, or load the checkpoint named in `init_cfg`.

        With `init_cfg=None`, every `nn.Linear` is re-initialised with
        `trunc_normal_init` and every `nn.LayerNorm` with `constant_init`.
        Otherwise the checkpoint at `init_cfg['checkpoint']` is loaded on
        CPU, its keys are cleaned of 'backbone.' / 'module.' prefixes, and
        the weights are loaded non-strictly; the load result is logged.
        """
        logger = get_root_logger()
        if self.init_cfg is None:
            # `Logger.warn` is a deprecated alias of `Logger.warning`.
            logger.warning(f'No pre-trained weights for '
                           f'{self.__class__.__name__}, '
                           f'training start from scratch')
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    trunc_normal_init(m, std=.02, bias=0.)
                elif isinstance(m, nn.LayerNorm):
                    constant_init(m, 1.0)
            return

        assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                              f'specify `Pretrained` in ' \
                                              f'`init_cfg` in ' \
                                              f'{self.__class__.__name__} '
        cfg = self.init_cfg
        # A plain dict has no `.checkpoint` attribute; index it instead.
        # Other config objects keep the original attribute access.
        ckpt_path = cfg['checkpoint'] if isinstance(cfg, dict) \
            else cfg.checkpoint
        ckpt = _load_checkpoint(ckpt_path,
                                logger=logger,
                                map_location='cpu')
        if 'state_dict' in ckpt:
            _state_dict = ckpt['state_dict']
        elif 'model' in ckpt:
            _state_dict = ckpt['model']
        else:
            _state_dict = ckpt

        state_dict = self._strip_state_dict_prefixes(_state_dict)

        # load state_dict (strict=False: missing/unexpected keys are only
        # reported in the logged result)
        meg = self.load_state_dict(state_dict, False)
        logger.info(meg)

    def _init_weights(self, m):
        """Default init for one module; used with `self.apply`.

        `nn.Linear`: truncated-normal weight (std 0.02) and zero bias.
        `nn.LayerNorm`: zero bias and unit weight. Other modules are left
        untouched.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _init_deform_weights(self, m):
        """Reset the parameters of `m` if it is the configured core operator.

        Used with `self.apply`, so it runs once per sub-module.
        """
        import os
        import sys

        # Assuming the 'modules' directory is one level up from this file.
        parent_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..'))
        # This method runs for every sub-module; only register the
        # directory once instead of growing sys.path on each call.
        if parent_dir not in sys.path:
            sys.path.append(parent_dir)

        # Now dcnv3_pytorch can be imported as opsm
        import modules.dcnv3_pytorch as opsm
        if isinstance(m, getattr(opsm, self.core_op)):
            m._reset_parameters()

    def forward(self, x):
        """Run the backbone and return the features of the selected levels.

        Args:
            x: Input passed to `patch_embed` (the stem is built with
                `in_chans=3`).

        Returns:
            list: One tensor per level index found in `out_indices`, in
            level order. Each is the level's output without downsampling,
            permuted with `(0, 3, 1, 2)` and made contiguous (i.e.
            channels-first, assuming the levels emit channels-last tensors).
        """
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        seq_out = []
        for level_idx, level in enumerate(self.levels):
            # x feeds the next level; x_ is the output before downsampling.
            x, x_ = level(x, return_wo_downsample=True)
            if level_idx in self.out_indices:
                seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
        return seq_out

## Using these modules:

### Intern Image Encoder

In [None]:
# Define the input tensor shape
batch_size = 5
channels = 3
height = 224
width = 224

# Create a random input tensor
input_tensor = torch.randn(batch_size, channels, height, width)

# Initialize the Encoder/backbone of our model.
# NOTE: the former `feature_channels=[64, 128, 256, 512]` argument was removed.
# InternImage.__init__ has no such parameter, so it was silently absorbed by
# **kwargs and had no effect; per-level widths are derived from `channels`.
backbone = InternImage(
    channels=64,
    depths=[4, 4, 18, 4],
    groups=[4, 8, 16, 32],
    mlp_ratio=4.,
    drop_path_rate=0.2,
    norm_layer='LN',
    offset_scale=1.0,
    post_norm=False,
    with_cp=False,
    out_indices=(0, 1, 2, 3),
)

# Forward pass through the model. Only the output shapes are inspected here,
# so skip building the autograd graph.
with torch.no_grad():
    output = backbone(input_tensor)

# Print the shapes of input and output
print("Input shape:", input_tensor.shape)

# Print the shapes of output features from each level
for i, features in enumerate(output):
    print(f"Level {i}: {features.shape}")