In [1]:
import torch
import torch.utils.data as data
from PIL import Image
import os
import os.path
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision import transforms
from pycocotools.coco import COCO

os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")
import utils
from engine import train_one_epoch, evaluate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
class CoCoDataset(torch.utils.data.Dataset):
    def __init__(self, root, annotation, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annotation)
        self.ids = list(sorted(self.coco.imgs.keys()))

    def __getitem__(self, index):
        # Own coco file
        coco = self.coco
        # Image ID
        img_id = self.ids[index]
        # List: get annotation id from coco
        ann_ids = coco.getAnnIds(imgIds=img_id)
        # Dictionary: target coco_annotation file for an image
        coco_annotation = coco.loadAnns(ann_ids)
        # path for input image
        path = coco.loadImgs(img_id)[0]['file_name']
        # open the input image
        img = Image.open(os.path.join(self.root, path))

        # number of objects in the image
        num_objs = len(coco_annotation)

        # Bounding boxes for objects
        # In coco format, bbox = [xmin, ymin, width, height]
        # In pytorch, the input should be [xmin, ymin, xmax, ymax]
        boxes = []
        for i in range(num_objs):
            xmin = coco_annotation[i]['bbox'][0]
            ymin = coco_annotation[i]['bbox'][1]
            xmax = xmin + coco_annotation[i]['bbox'][2]
            ymax = ymin + coco_annotation[i]['bbox'][3]
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # Labels (In my case, I only one class: target class or background)
        labels = torch.ones((num_objs,), dtype=torch.int64)
        # Tensorise img_id
        img_id = torch.tensor([img_id])
        # Size of bbox (Rectangular)
        areas = []
        for i in range(num_objs):
            areas.append(coco_annotation[i]['area'])
        areas = torch.as_tensor(areas, dtype=torch.float32)
        # Iscrowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        # Annotation is in dictionary format
        my_annotation = {}
        my_annotation["boxes"] = boxes
        my_annotation["labels"] = labels
        my_annotation["image_id"] = self.ids[index]
        my_annotation["area"] = areas
        my_annotation["iscrowd"] = iscrowd

        if self.transforms is not None:
            img = self.transforms(img)

        return img, my_annotation

    def __len__(self):
        return len(self.ids)

# Since inputs for a PyTorch model must be in tensor format.
def get_transform():
    custom_transforms = []
    custom_transforms.append(torchvision.transforms.ToTensor())
    return torchvision.transforms.Compose(custom_transforms)

In [9]:
train = CoCoDataset("/content/drive/MyDrive/dataset/train/images", "/content/drive/MyDrive/dataset/train/anno.json", transforms=get_transform())
val = CoCoDataset("/content/drive/MyDrive/dataset/val/images", "/content/drive/MyDrive/dataset/val/anno.json", transforms=get_transform())

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [10]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

num_classes = 5  # 4 class (plane, drone, helicopter, bird) + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [11]:
# collate_fn needs for batch
def collate_fn(batch):
    return tuple(zip(*batch))

# define training and validation data loaders
data_loader = data.DataLoader(
    train,
    batch_size=1,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn
)

data_loader_test = data.DataLoader(
    val,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn
)

In [13]:
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# let's train it for 5 epochs
num_epochs = 15

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

Epoch: [0]  [ 0/57]  eta: 0:00:20  lr: 0.000094  loss: 0.0435 (0.0435)  loss_classifier: 0.0137 (0.0137)  loss_box_reg: 0.0286 (0.0286)  loss_objectness: 0.0002 (0.0002)  loss_rpn_box_reg: 0.0011 (0.0011)  time: 0.3583  data: 0.0842  max mem: 1449
Epoch: [0]  [10/57]  eta: 0:00:10  lr: 0.000986  loss: 0.0433 (0.0459)  loss_classifier: 0.0127 (0.0129)  loss_box_reg: 0.0275 (0.0310)  loss_objectness: 0.0003 (0.0007)  loss_rpn_box_reg: 0.0011 (0.0014)  time: 0.2129  data: 0.0110  max mem: 1449
Epoch: [0]  [20/57]  eta: 0:00:07  lr: 0.001878  loss: 0.0379 (0.0420)  loss_classifier: 0.0127 (0.0134)  loss_box_reg: 0.0225 (0.0267)  loss_objectness: 0.0003 (0.0005)  loss_rpn_box_reg: 0.0011 (0.0014)  time: 0.1985  data: 0.0035  max mem: 1449
Epoch: [0]  [30/57]  eta: 0:00:05  lr: 0.002770  loss: 0.0387 (0.0474)  loss_classifier: 0.0139 (0.0150)  loss_box_reg: 0.0225 (0.0302)  loss_objectness: 0.0002 (0.0008)  loss_rpn_box_reg: 0.0009 (0.0013)  time: 0.1988  data: 0.0033  max mem: 1449
Epoch: [