In [6]:
import torch
from torchvision import models
import torchvision.transforms as transforms
import torchvision.datasets as datasets

Download labeling from GitHub - https://github.com/tzutalin/labelImg


`!pip install pyqt5`

`!pip install lxml`


Installation guide - https://github.com/heartexlabs/labelImg#installation

Run this command in the terminal

`pyrcc5 -o libs/resources.py resources.qrc`

In [13]:
import os
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Use a pre-trained model
model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True)

# Change the number of classes
num_classes = 2  # 1 class (person) + background
model.head.classification_head.cls_logits = nn.Conv2d(256, num_classes, kernel_size=3, stride=1, padding=1)

# Send the model to the device
model = model.to(device)

class YOLODataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform if transform else transforms.ToTensor()
        self.imgs = list(sorted(os.listdir(os.path.join(root_dir, "images"))))
        self.labels = list(sorted(os.listdir(os.path.join(root_dir, "labels"))))

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, "images", self.imgs[idx])
        label_path = os.path.join(self.root_dir, "labels", self.labels[idx])

        img = Image.open(img_path).convert("RGB")
        img_width, img_height = img.size

        # Apply transformation after getting original size
        img = self.transform(img)

        # Read YOLO label file
        with open(label_path, "r") as file:
            lines = file.read().splitlines()

        boxes = []
        labels = []
        for line in lines:
            class_id, x_center, y_center, width, height = map(float, line.split())
            labels.append(int(class_id))

            x_min = img_width * (x_center - width / 2)
            y_min = img_height * (y_center - height / 2)
            x_max = img_width * (x_center + width / 2)
            y_max = img_height * (y_center + height / 2)
            boxes.append([x_min, y_min, x_max, y_max])

        target = {}
        target['boxes'] = torch.tensor(boxes, dtype=torch.float32)
        target['labels'] = torch.tensor(labels, dtype=torch.int64)

        return img, target

# Define your own paths here
train_dataset = YOLODataset("data/train set")
valid_dataset = YOLODataset("data/validation set")

def collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    return images, targets

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)


In [15]:
model.train()
num_epochs = 25

for epoch in range(num_epochs):
    for i, (images, targets) in enumerate(train_dataloader):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Debug: print out the targets to check their values
        if i == 0 and epoch == 0:
            print("Debug output:")
            for target in targets:
                print(f"Boxes: {target['boxes']}")
                print(f"Labels: {target['labels']}")

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    print(f"Epoch: {epoch + 1}, Loss: {losses.item()}")

Debug output:
Boxes: tensor([[1.5000e+01, 5.0500e+02, 5.4000e+01, 5.4100e+02],
        [5.2001e+01, 4.9600e+02, 8.8001e+01, 5.3400e+02],
        [1.2100e+02, 4.8000e+02, 1.5800e+02, 5.1800e+02],
        [1.5200e+02, 4.7500e+02, 1.8900e+02, 5.1000e+02],
        [1.8500e+02, 4.7200e+02, 2.2100e+02, 5.0300e+02],
        [2.1500e+02, 4.6600e+02, 2.5300e+02, 4.9600e+02],
        [2.4900e+02, 4.5900e+02, 2.8100e+02, 4.8800e+02],
        [2.7500e+02, 4.5200e+02, 3.1000e+02, 4.8000e+02],
        [4.5500e+02, 3.7700e+02, 4.8700e+02, 4.0200e+02],
        [4.8500e+02, 3.6400e+02, 5.2400e+02, 3.8500e+02],
        [4.6600e+02, 4.1500e+02, 5.0300e+02, 4.4400e+02],
        [1.0010e+00, 5.7000e+02, 3.5000e+01, 6.0600e+02],
        [3.7000e+01, 5.6000e+02, 7.6001e+01, 5.9800e+02],
        [7.8001e+01, 5.5400e+02, 1.1200e+02, 5.8800e+02],
        [1.1200e+02, 5.3900e+02, 1.5400e+02, 5.7900e+02],
        [1.4900e+02, 5.3100e+02, 1.9000e+02, 5.6500e+02],
        [1.8800e+02, 5.2200e+02, 2.2400e+02, 5.5400

RuntimeError: shape '[2, -1, 91, 96, 168]' is invalid for input of size 64512