<b>Назначение:</b> <br>
Отладка используемых архитектур нейронных сетей

In [34]:
import torch.nn as nn
import torch
import numpy as np
from transformers import AutoImageProcessor, ResNetForImageClassification, Mask2FormerForUniversalSegmentation
import albumentations as A
import cv2
import matplotlib.pyplot as plt

### ResNet

In [2]:
RESNET_MODEL_NAME = "./base_models/microsoft_resnet50"
CLASSES = 14

In [3]:
class ResNet50Classifier(nn.Module):
    def __init__(self, num_classes):
        super(ResNet50Classifier, self).__init__()
        
        #
        model = ResNetForImageClassification.from_pretrained(RESNET_MODEL_NAME)
        model.requires_grad_(False)

        # ResNet backbone
        self.backbone = model.resnet

        #
        self.fc = nn.Sequential(
            nn.Flatten(start_dim=1, end_dim=-1),
            nn.Linear(2048, 512),
            nn.Dropout(0.2),
            nn.SiLU(),
            nn.Linear(512, num_classes)
        )
    
    def forward(self, x):
        encoder_output = self.backbone(x)
        out = self.fc(encoder_output)
        return out


In [4]:
model = ResNet50Classifier(CLASSES)

In [5]:
sum([ p.numel() for p in model.parameters() if p.requires_grad])

1056270

### MaskFormerNet

In [8]:
M2F_MODEL_NAME = "./base_models/facebook-m2f_swin_large"

In [12]:
class Mask2FormerClassifier(nn.Module):
    def __init__(self, num_classes):
        super(Mask2FormerClassifier, self).__init__()
        
        #
        m2f = Mask2FormerForUniversalSegmentation.from_pretrained(M2F_MODEL_NAME)
        m2f.requires_grad_(False)
        self.bb_features = 1536

        # M2F backbone
        self.embeddings = m2f.model.pixel_level_module.encoder.embeddings
        self.encoder = m2f.model.pixel_level_module.encoder.encoder
        self.layernorm = nn.LayerNorm(self.bb_features)
        self.pooler = nn.AdaptiveAvgPool1d(1)

        #
        self.fc = nn.Sequential(
            nn.Linear(self.bb_features, 512),
            nn.Dropout(0.1),
            nn.SiLU(),
            nn.Linear(self.bb_features, 256),
            nn.Dropout(0.1),
            nn.SiLU(),
            nn.Linear(256, num_classes)
    )

    def forward(self, x):
        embedding_output, input_dimensions = self.embeddings(x)
        encoder_outputs = self.encoder(embedding_output, input_dimensions)
        
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)

        pooled_output = self.pooler(sequence_output.transpose(1, 2))
        pooled_output = torch.flatten(pooled_output, 1)

        out = self.fc(pooled_output)

        return out

In [13]:
model = Mask2FormerClassifier(CLASSES)

In [14]:
sum([ p.numel() for p in model.parameters() if p.requires_grad])

1187086

### ConvNet (my)

In [49]:
TRANSFORM_MYCNN = A.Compose([
    A.Resize(height=224, width=224)
    ])

In [55]:
class MyCNNClassifier(nn.Module):
    def __init__(self, num_classes):
        super(MyCNNClassifier, self).__init__()
        self.conv_layers = nn.Sequential(
            # Bx3x224x224
            nn.BatchNorm2d(3),
            nn.Conv2d(3, 32, kernel_size=4, stride=2),
            # Bx32x111x111
            nn.SiLU(),
            nn.BatchNorm2d(32),
            nn.AvgPool2d(kernel_size=3, stride=2, padding=1),
            # Bx32x56x56
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            # Bx64x27x27
            nn.SiLU(),
            nn.BatchNorm2d(64),
            nn.AvgPool2d(kernel_size=3, stride=2),
            # Bx64x13x13
            nn.Conv2d(64, 128, kernel_size=4, stride=3),
            # Bx128x4x4
            nn.SiLU(),
            nn.BatchNorm2d(128),
            nn.AvgPool2d(kernel_size=2, stride=2),
            # Bx128x2x2
        )

        self.linear_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=512, out_features=64),
            # Bx64
            nn.SiLU(),
            nn.Linear(in_features=64, out_features=num_classes)
            # Bxnum_classes
        )
    
    def forward(self, x):
        conv_out = self.conv_layers(x)
        out = self.linear_layers(conv_out)
        return out


In [53]:
model = MyCNNClassifier(CLASSES, 14)

In [52]:
sum([ p.numel() for p in model.parameters() if p.requires_grad])

205428