In [4]:
import os

import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import DataLoader, Dataset
import torchvision
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [2]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

num_classes = 3  # 1 class (person) + background

in_features = model.roi_heads.box_predictor.cls_score.in_features

model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [3]:
model.eval()
pred = model(torch.ones([2, 3, 800, 800]))

In [5]:
class BBoxDataset(Dataset):
    """
    PyTorch datasets for object detection.

    Parameters
    ----------
    images : list of str
        Image's path

    bboxes: list of torch.Tensor
        Each tensor's shape: [[x0, y0, x1, y1], [x0, y0, x1, y1], ...]

    labels: list of torch.Tensor
        Each tensor's shape: [] or [0, 3] or [0, 4 ,5] ...

    transform: torchvision.transforms
    """

    def __init__(self, images, bboxes, labels, transform):
        self.images = images
        self.bboxes = bboxes
        self.labels = labels
        self.transform = transform

    def _path_to_tensor(self, path):
        img = Image.open(path).convert('RGB')
        return self.transform(img)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img = self._path_to_tensor(self.images[idx])
        bbox = self.bboxes[idx]
        label = self.labels[idx]
        target = {'boxes': bbox.float(), 'labels': label.int()}
        return img, target

In [6]:
# IMG_DIR = './PennFudanPed/PNGImages/'
df = pd.read_csv('annotation.csv')

class_to_idx = {label: idx for idx, label in enumerate(df['label'].unique())}
images = df['filename'].tolist()

In [7]:
class_to_idx

{'PASpersonWalking': 0, 'PASpersonStanding': 1}

In [8]:
groupby_image = df.groupby('filename')

bboxes = []
labels = []
for img in images:
    rows = groupby_image.get_group(img)

    # Bouding Box
    x_min = rows['x_min'].values
    y_min = rows['y_min'].values
    x_max = rows['x_max'].values
    y_max = rows['y_max'].values
    bbox = np.stack([x_min, y_min, x_max, y_max]).reshape(-1, 4)
    bbox = torch.from_numpy(bbox)
    bboxes.append(bbox)

    # Labels
    label = rows['label'].map(class_to_idx).values
    label = torch.from_numpy(label).view(-1)
    labels.append(label)

In [9]:
transform = transforms.Compose([
    transforms.Resize((800, 800)),
    transforms.ToTensor(),
])
dataset = BBoxDataset(images, bboxes, labels, transform)

In [10]:
def bbox_collate_fn(batch):
    images = []
    targets = []
    for sample in batch:
        image, target = sample
        images.append(image)
        targets.append(target)
    images = torch.stack(images, dim=0)
    return images, targets

In [20]:
loader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True, collate_fn=bbox_collate_fn)

In [21]:
targets

[{'boxes': tensor([[246., 337.,  37., 436.],
          [170., 175., 129., 157.],
          [340., 426., 181., 501.],
          [462., 464., 492., 402.]]),
  'labels': tensor([0, 0, 0, 0], dtype=torch.int32)},
 {'boxes': tensor([[ 42., 127., 187., 299.],
          [426.,  60.,  83.,  76.],
          [ 76.,  68., 165., 201.],
          [309., 381., 527., 350.],
          [359., 369., 400., 343.]]),
  'labels': tensor([0, 0, 0, 0, 0], dtype=torch.int32)}]

In [None]:
device = 'cpu' # 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.train()
# model.eval()

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

for i, (inputs, targets) in enumerate(loader):
    inputs = inputs.to(device)

    optimizer.zero_grad()

    outputs = model(inputs, targets)
    loss_classifier = outputs['loss_classifier']
    loss_box_reg = outputs['loss_box_reg']
    loss_objectness = outputs['loss_objectness']
    
    loss = loss_classifier + loss_box_reg
    loss.backward()
    print(loss)
    
    optimizer.step()

tensor(0.0283, grad_fn=<AddBackward0>)
tensor(0.0122, grad_fn=<AddBackward0>)
tensor(0.0133, grad_fn=<AddBackward0>)
tensor(0.0013, grad_fn=<AddBackward0>)
tensor(0.0007, grad_fn=<AddBackward0>)
tensor(0.0068, grad_fn=<AddBackward0>)
tensor(0.0012, grad_fn=<AddBackward0>)
tensor(0.0012, grad_fn=<AddBackward0>)
tensor(0.0018, grad_fn=<AddBackward0>)
tensor(0.0843, grad_fn=<AddBackward0>)
tensor(0.0723, grad_fn=<AddBackward0>)
tensor(0.0068, grad_fn=<AddBackward0>)
tensor(0.0011, grad_fn=<AddBackward0>)
tensor(0.0028, grad_fn=<AddBackward0>)
tensor(0.0016, grad_fn=<AddBackward0>)
tensor(0.0012, grad_fn=<AddBackward0>)
tensor(0.0058, grad_fn=<AddBackward0>)
tensor(0.0560, grad_fn=<AddBackward0>)
