# PyTorch Model Inspection:

In [1]:
from torch.utils.data import random_split, DataLoader, Subset, TensorDataset
from torchvision import datasets, transforms
from ultralytics import YOLO
import torch
from mmpose.apis import MMPoseInferencer

In [None]:
# Instantiate the MMPose inferencer for the model identified by this string.
mmpose_model = MMPoseInferencer('rtmw-m_8xb1024-270e_cocktail14-256x192')

### Get Model output for every layer:

In [2]:
# Preprocessing applied to every image before it is handed to the model.
data_transforms = transforms.Compose([
    transforms.Resize((640, 480)), # Resize images to (height, width) = (640, 480)
    transforms.ToTensor() # Convert to tensor
])

In [3]:
# Load the images found under `data_path`, applying `data_transforms` to each.
data_path = 'example_images'
own_dataset = datasets.ImageFolder(root=data_path, transform=data_transforms)

train_loader = DataLoader(own_dataset, batch_size=1, shuffle=False)

# Fetch only the first batch instead of materialising the whole dataset.
# `image` is the batch as produced by the loader; later cells index it with
# `image[0]` to get the image tensor.
image = next(iter(train_loader))


In [5]:
# Load the 'yolov8n-pose.pt' checkpoint through the ultralytics YOLO wrapper.
yolo_model = YOLO('yolov8n-pose.pt')
# Module-level counter kept for any other cell that still reads it;
# print_size tracks nesting through its own `depth` argument instead.
depth = 0


def print_size(module, input, output, depth=0):
    """Forward hook that prints the size of every tensor a module returns.

    Args:
        module: The module whose forward pass just finished; only its class
            name is used, as a label.
        input: The inputs passed to the module (unused, required by the
            forward-hook signature).
        output: The module's output. Tuples and lists are walked
            recursively; tensors are printed; anything else is reported by
            type name instead of raising.
        depth: Number of enclosing tuples around `output`. PyTorch calls the
            hook with three arguments, so this starts at 0 and is only set
            by the recursive calls below.

    Returns:
        None, so the module's output is left unchanged by the hook.
    """
    if isinstance(output, tuple):
        # Every element of the same tuple is reported at the same depth.
        for element in output:
            print_size(module, input, element, depth + 1)
    elif isinstance(output, list):
        # Lists are treated as transparent containers and add no depth.
        for element in output:
            print_size(module, input, element, depth)
    elif isinstance(output, torch.Tensor):
        print(f"depth: {depth}, {module.__class__.__name__}: {output.size()}")
    else:
        # Non-tensor outputs (None, dicts, ...) have no size to print.
        print(f"depth: {depth}, {module.__class__.__name__}: {type(output).__name__}")

# Attach `print_size` to every submodule of `yolo_model` so that each layer
# reports its output size during the forward pass below.
hook_handles = [
    layer.register_forward_hook(print_size) for layer in yolo_model.modules()
]

try:
    results = yolo_model(image[0])
finally:
    # Detach the hooks again: otherwise every re-run of this cell stacks one
    # more copy of the hook on each module, and later forward passes keep
    # printing.
    for handle in hook_handles:
        handle.remove()

# Last expression of the cell, so the notebook still displays the results.
results


depth: 0, SiLU: torch.Size([1, 16, 320, 240])
depth: 0, Conv: torch.Size([1, 16, 320, 240])
depth: 0, SiLU: torch.Size([1, 32, 160, 120])
depth: 0, Conv: torch.Size([1, 32, 160, 120])
depth: 0, SiLU: torch.Size([1, 32, 160, 120])
depth: 0, Conv: torch.Size([1, 32, 160, 120])
depth: 0, SiLU: torch.Size([1, 16, 160, 120])
depth: 0, Conv: torch.Size([1, 16, 160, 120])
depth: 0, SiLU: torch.Size([1, 16, 160, 120])
depth: 0, Conv: torch.Size([1, 16, 160, 120])
depth: 0, Bottleneck: torch.Size([1, 16, 160, 120])
depth: 0, SiLU: torch.Size([1, 32, 160, 120])
depth: 0, Conv: torch.Size([1, 32, 160, 120])
depth: 0, C2f: torch.Size([1, 32, 160, 120])
depth: 0, SiLU: torch.Size([1, 64, 80, 60])
depth: 0, Conv: torch.Size([1, 64, 80, 60])
depth: 0, SiLU: torch.Size([1, 64, 80, 60])
depth: 0, Conv: torch.Size([1, 64, 80, 60])
depth: 0, SiLU: torch.Size([1, 32, 80, 60])
depth: 0, Conv: torch.Size([1, 32, 80, 60])
depth: 0, SiLU: torch.Size([1, 32, 80, 60])
depth: 0, Conv: torch.Size([1, 32, 80, 60]

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: ultralytics.engine.results.Keypoints object
 masks: None
 names: {0: 'person'}
 obb: None
 orig_img: array([[[221, 191, 157],
         [221, 191, 157],
         [220, 190, 156],
         ...,
         [198, 176, 151],
         [198, 176, 152],
         [198, 176, 152]],
 
        [[224, 194, 160],
         [224, 194, 160],
         [224, 194, 160],
         ...,
         [198, 176, 151],
         [198, 176, 152],
         [198, 176, 152]],
 
        [[226, 196, 161],
         [225, 195, 160],
         [225, 195, 160],
         ...,
         [198, 176, 152],
         [197, 175, 151],
         [198, 177, 151]],
 
        ...,
 
        [[230, 164, 100],
         [231, 164, 101],
         [229, 162,  99],
         ...,
         [190, 146,  96],
         [189, 145,  96],
         [190, 146,  97]],
 
        [[224, 156,  93],
         [224, 156,  93],
         [223, 155,

#### print the summary of the YOLO model

**Warning:** calling `summary` on the `YOLO` wrapper object itself starts the training!

In [None]:
from torchinfo import summary
# Summarise the underlying nn.Module (`yolo_model.model`), not the YOLO
# wrapper, and give the input size including the batch dimension.
summary(yolo_model.model, input_size=(1, 3, 320, 320))

## Alternative:

- inspect the output just before YOLO's pose head
- cut the pose head off the YOLO model


In [None]:
height = 320
width = 320

# 0-based index of the last layer to execute (i.e. the 21st layer).
last_layer_index = 20

# Random stand-in for a single RGB image of the chosen size.
dummy_input = torch.randn(1, 3, height, width)

# `yolo_model` is the ultralytics wrapper; the individual layers live in the
# first child of `yolo_model.model`. Iterating `yolo_model.children()` would
# only yield the whole network as a single module.
yolo_layers = yolo_model.model.model
# Indices of layers whose outputs are consumed again by later layers.
saved_indices = set(yolo_model.model.save)

# Running activation and the per-layer outputs that later layers may need.
temp_output = dummy_input
layer_outputs = []

# Forward pass through the model up to and including `last_layer_index`.
with torch.no_grad():
    for i, module in enumerate(yolo_layers):
        # `module.f` names the layer(s) feeding this one: -1 means "the
        # previous layer", anything else refers to stored earlier outputs.
        source = module.f
        if source != -1:
            if isinstance(source, int):
                temp_output = layer_outputs[source]
            else:
                temp_output = [
                    temp_output if j == -1 else layer_outputs[j] for j in source
                ]
        temp_output = module(temp_output)
        # Keep only the outputs that are needed again, to limit memory use.
        layer_outputs.append(temp_output if i in saved_indices else None)
        if i == last_layer_index:
            break

# Report the size of whatever the partial forward pass produced.

def _describe(value):
    """Return the size of a tensor, or the type name of any other object."""
    if isinstance(value, torch.Tensor):
        return value.size()
    return type(value).__name__


# `output` ends up holding the last element that was reported.
if isinstance(temp_output, tuple):
    print("is tuple")
    for i, elem in enumerate(temp_output):
        if isinstance(elem, (list, tuple)):
            # Nested container: report each inner element with its own
            # index `j`, followed by the outer index `i`.
            for j, inner in enumerate(elem):
                output = inner
                print("j: ", j, _describe(inner), i)
        else:
            output = elem
            print("i: ", i, _describe(elem))
else:
    output = temp_output
    print("Output size of layer 21: ", _describe(temp_output))

### Model Cutting and Fusing:

In [4]:
import torch
import torch.nn as nn
from torchinfo import summary

In [5]:
# Hyper-parameters passed to ClassificHead when the head is instantiated.
num_classes = 3  # size of the head's final output layer
num_features = 4*6300  # flattened input size; NOTE(review): presumably matches the upstream feature shape — confirm
h1 = 512  # width of the first hidden layer
h2 = 256  # width of the second hidden layer


class ClassificHead(nn.Module):
    """Fully connected classification head.

    The input is flattened and passed through three hidden layers of widths
    `h1`, `h2` and `h2 // 2` (ReLU after each, dropout after the first two)
    before a final linear layer produces `num_classes` raw scores.

    Args:
        input_features: Number of features per sample after flattening.
        num_classes: Size of the output layer.
        h1: Width of the first hidden layer.
        h2: Width of the second hidden layer; the third uses `h2 // 2`.
        dropout: Drop probability of both dropout layers (default 0.5).
    """

    def __init__(
        self,
        input_features: int,
        num_classes: int,
        h1: int,
        h2: int,
        dropout: float = 0.5,
    ) -> None:
        super().__init__()
        # Layer order and indices are kept stable so that existing
        # state dicts keyed on `network.<index>` still load.
        self.network = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_features, h1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(h2, h2 // 2),
            nn.ReLU(),
            nn.Linear(h2 // 2, num_classes),
        )

    def forward(self, x):
        """Return the raw class scores for the batch `x`."""
        return self.network(x)

In [6]:
# Instantiate the classification head with the hyper-parameters defined above.
classification_head=ClassificHead(num_features, num_classes, h1, h2)

In [39]:
# Display the first 22 layers (indices 0-21) of the wrapped network; `model`
# is the first child of `yolo_model.model`.
yolo_model.model.model[:22]

Sequential(
  (0): Conv(
    (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
  (1): Conv(
    (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
  (2): C2f(
    (cv1): Conv(
      (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (cv2): Conv(
      (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (m): ModuleList(
      (0): Bottleneck(
        (cv1): Conv(
        

The submodules of a PyTorch model can be accessed with:

- .children()    - direct children only
- .modules()    - recursive call
- .named_modules()    - recursive call, yields (name, module) pairs

In [88]:
# List every (name, module) pair of the wrapped network, root module included.
list(yolo_model.model.named_modules())

[('',
  PoseModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
   

In [73]:
# The first entry yielded by `.modules()` is the root module itself, so this
# wraps the complete wrapped network in a one-element Sequential.
model2 = nn.Sequential(yolo_model.model)
# The forward pass returns two elements for the image batch.
el1, el2 = model2(image[0])
# el1.size(), el2.size()

In [15]:
# Reload the checkpoint and split the network into backbone and pose head.
yolo_model = YOLO("yolov8n-pose.pt")
yolo_layers = list(yolo_model.model.children())[0]

# Layers 0-21, i.e. everything in front of the pose head (layer 22).
yolo_backbone = nn.Sequential(*yolo_layers[:22])
pose_head = nn.Sequential(*yolo_layers[22:23])

# Pose head without its last child module.
# Alternative: nn.Sequential(pose_head[0].cv2, pose_head[0].cv3, pose_head[0].dfl)
pose_stem = nn.Sequential(*list(pose_head[0].children())[:-1])

# NOTE(review): a plain Sequential feeds each layer only the previous layer's
# output — confirm that this matches how the original network routes data.
yolo_pose = nn.Sequential(*yolo_backbone, pose_stem)

# Optional inspection:
#   display(summary(nn.Sequential(pose_stem)))
#   summary(pose_head)
#   display(summary(yolo_model))
#   summary(yolo_pose)


In [16]:
class CustomYOLOPose(nn.Module):
    """Chain a backbone module and a pose stem module into a single model."""

    def __init__(self, yolo_backbone, pose_stem):
        """Store both sub-networks as submodules.

        Args:
            yolo_backbone: Module applied to the input first.
            pose_stem: Module applied to the backbone's output.
        """
        super().__init__()
        self.pose_stem = pose_stem
        self.yolo_backbone = yolo_backbone

    def forward(self, x):
        """Return `pose_stem(yolo_backbone(x))`."""
        features = self.yolo_backbone(x)
        # The backbone output is passed on unchanged. If it turns out to be a
        # tuple, pick or combine its elements here first, e.g. `features[0]`
        # or `torch.cat(features, dim=1)`.
        return self.pose_stem(features)


In [20]:
# Combine the backbone and the pose stem into a single module.
model = CustomYOLOPose(yolo_backbone, pose_stem)