### Training an object detection neural network

First, upload your data to Google Drive. Then mount your Google Drive so that Colab can access the files stored in it.

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem so that later cells can
# reach the files stored there.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Copy your dataset folder from its location in your Google Drive to the current working directory in Colab.

In [2]:
# Replace the "<<UPDATE THIS>>" placeholder with the path of your dataset
# folder; it is copied recursively into the current working directory.
!cp -R "<<UPDATE THIS>>" .

Install Detecto. We use Detecto because its source code is very readable and it hides the nitty-gritty details of a typical PyTorch training workflow.

In [None]:
!pip3 install detecto

In [1]:
from detecto import core, utils, config
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision import transforms
import matplotlib.pyplot as plt

In [2]:
class FreezableModel(core.Model):
  """
    Detecto ``core.Model`` subclass that lets the caller choose how many
    backbone layers of the underlying torchvision detector stay trainable.

    The constructor does not call ``core.Model.__init__``; it sets up the
    ``_device``, ``_model``, ``_disable_normalize``, ``_classes`` and
    ``_int_mapping`` attributes itself so that ``trainable_backbone_layers``
    can be forwarded to the torchvision model builder.
  """
  DEFAULT = 'fasterrcnn_resnet50_fpn'
  MOBILENET = 'fasterrcnn_mobilenet_v3_large_fpn'
  MOBILENET_320 = 'fasterrcnn_mobilenet_v3_large_320_fpn'

  def __init__(self, classes=None, device=None, pretrained=True,
                model_name=DEFAULT, trainable_backbone_layers=None):
    """
    Build the detector and, when custom classes are given, replace its head.

    Args:
        classes: Sequence of label names (without the background class). When
            empty or None, the default class list from the Detecto config is
            used and the pre-trained prediction head is kept.
        device: Device the model is moved to; falls back to the Detecto
            config's default device.
        pretrained: Forwarded to the torchvision model builder.
        model_name: One of ``DEFAULT``, ``MOBILENET`` or ``MOBILENET_320``.
        trainable_backbone_layers: Forwarded to the torchvision model builder.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # `config` is the detecto.config *module*; the settings dict lives at config.config
    self._device = device if device else config.config['default_device']

    # Pick the torchvision builder first, then call it once with shared arguments.
    # Attribute lookup stays inside each branch so only the selected builder
    # has to exist in the installed torchvision.
    detection = torchvision.models.detection
    if model_name == self.DEFAULT:
        build_model = detection.fasterrcnn_resnet50_fpn
    elif model_name == self.MOBILENET:
        build_model = detection.fasterrcnn_mobilenet_v3_large_fpn
    elif model_name == self.MOBILENET_320:
        build_model = detection.fasterrcnn_mobilenet_v3_large_320_fpn
    else:
        raise ValueError(f'Invalid value {model_name} for model_name. ' +
                          f'Please choose between {self.DEFAULT}, {self.MOBILENET}, and {self.MOBILENET_320}.')

    # Load a model pre-trained on COCO (when `pretrained` is true)
    self._model = build_model(pretrained=pretrained,
                              trainable_backbone_layers=trainable_backbone_layers)

    if classes:
        # Get the number of input features for the classifier
        in_features = self._model.roi_heads.box_predictor.cls_score.in_features
        # Replace the pre-trained head with a new one (note: +1 because of the __background__ class)
        self._model.roi_heads.box_predictor = FastRCNNPredictor(in_features, len(classes) + 1)
        self._disable_normalize = False
    else:
        # Fixed: the original subscripted the module itself (config['default_classes']),
        # which raises TypeError; the dict is config.config, as used for the device above.
        classes = config.config['default_classes']
        self._disable_normalize = True

    self._model.to(self._device)

    # Mappings to convert from string labels to ints and vice versa.
    # list(...) lets callers pass a tuple as well as a list.
    self._classes = ['__background__'] + list(classes)
    self._int_mapping = {label: index for index, label in enumerate(self._classes)}

In [None]:
# Convert the XML annotation folders into CSV label files.
# Adjust the folder paths below to match where your train and val labels live.
TRAIN_XML_DIR = 'dataset/train_labels/'
TRAIN_LABELS_CSV = 'train_labels.csv'
VAL_XML_DIR = 'dataset/val_labels/'
VAL_LABELS_CSV = 'val_labels.csv'

utils.xml_to_csv(TRAIN_XML_DIR, TRAIN_LABELS_CSV)
utils.xml_to_csv(VAL_XML_DIR, VAL_LABELS_CSV)

In [27]:
# Augmentation pipeline applied to the training dataset.
# The image is converted to PIL first, resized and photometrically augmented,
# and only then turned back into a tensor and normalized.
augmentation_steps = [
    transforms.ToPILImage(),
    transforms.Resize(500),
    transforms.ColorJitter(brightness=[0.9,1.1], contrast=[0.9,1.1], saturation=0.3),
    transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 0.3)),
    transforms.RandomPosterize(bits=6),
    transforms.RandomAdjustSharpness(sharpness_factor=2),
    transforms.RandomAutocontrast(),
    transforms.RandomEqualize(),
    transforms.ToTensor(),
    utils.normalize_transform(),
]
custom_transforms = transforms.Compose(augmentation_steps)

In [28]:
# Build the datasets from the CSV label files rather than the XML files, for
# faster Dataset initialization. Update the image folders below to match how
# you have placed your images.
TRAIN_IMAGE_DIR = 'dataset/train_images/'
VAL_IMAGE_DIR = 'dataset/val_images/'

# Training dataset: uses the custom augmentation pipeline
dataset = core.Dataset('train_labels.csv', TRAIN_IMAGE_DIR, transform=custom_transforms)
# Validation dataset for training: no custom transform is passed
val_dataset = core.Dataset('val_labels.csv', VAL_IMAGE_DIR)

In [30]:
# Wrap the training dataset in a DataLoader with custom batching options
BATCH_SIZE = 4
loader = core.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

Visualize images and labels from the data loader

In [31]:
import cv2
import numpy as np

In [None]:
# Draw the annotated boxes and label names on one batch from the loader to
# sanity-check the data before training.
images, labels = next(iter(loader))
for img, label in zip(images, labels):
  bboxes = label['boxes']
  names = label['labels']
  # The training transforms end with utils.normalize_transform(), so undo the
  # normalization first; otherwise values outside [0, 1] wrap around when the
  # image is converted to 8-bit below.
  img = utils.reverse_normalize(img)
  # Move the channel axis last, as expected by OpenCV and matplotlib
  img = img.permute(1,2,0).numpy()
  # Clip before the uint8 cast so residual out-of-range values saturate
  img = np.ascontiguousarray(np.clip(img*255, 0, 255).astype(np.uint8))
  for bbox, name in zip(bboxes, names):
    print(bbox, name)
    # OpenCV drawing functions need plain Python ints for pixel coordinates
    x_min, y_min, x_max, y_max = (int(coord) for coord in bbox.tolist())
    img = cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (0,255,255), 5)
    img = cv2.putText(img, name, (x_min-10, y_min-10),
                      cv2.FONT_HERSHEY_SIMPLEX, 2, (255,0,255), 5, cv2.LINE_AA)

  plt.imshow(img)
  plt.show()

In [34]:
# Collect the distinct values of the 'class' column of the training labels
class_column = dataset._csv['class']
class_names = np.unique(class_column)
print(f"Classes in dataset: {class_names}")

In [38]:
# Build the detector for the dataset's classes with zero trainable backbone layers
model = FreezableModel(
    classes=list(class_names),
    model_name='fasterrcnn_resnet50_fpn',
    trainable_backbone_layers=0,
)

In [None]:
# List every model parameter together with its requires_grad flag to verify
# which parts of the network will be trained.
for param_name, param in model._model.named_parameters():
  print(param_name, param.requires_grad)

In [None]:
# Train the model; the returned losses are plotted below
losses = model.fit(
    loader,
    val_dataset,
    epochs=10,
    learning_rate=0.008,
    verbose=True,
)

# Visualize loss throughout training
fig, ax = plt.subplots()
ax.plot(losses)
plt.show()

# Save model to a file.
# Update this path so that the weights are saved on your Google Drive
model.save("<<UPDATE THIS>>/model_weights.pth")

In [None]:
# Directly access underlying torchvision model for even more control.
# `torch_model` is not referenced by the other cells shown in this notebook;
# it is kept available for custom experimentation.
torch_model = model.get_internal_model()

## Take a photo from the webcam and perform inference with the trained model

In [40]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
  """
  Capture a single frame from the browser's webcam and save it as a JPEG.

  A small JavaScript snippet is displayed in the cell output; it shows the
  video stream with a "Capture" button and, once clicked, returns the frame
  as a base64 data URL which is decoded and written to ``filename``.

  Args:
    filename: Path of the file the JPEG bytes are written to.
    quality: Quality value passed to ``canvas.toDataURL('image/jpeg', ...)``.

  Returns:
    The ``filename`` that was written.
  """
  capture_script = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(capture_script)

  # The script returns a data URL; the base64 payload follows the first comma
  data_url = eval_js('takePhoto({})'.format(quality))
  encoded_jpeg = data_url.split(',')[1]
  jpeg_bytes = b64decode(encoded_jpeg)

  with open(filename, 'wb') as photo_file:
    photo_file.write(jpeg_bytes)
  return filename

In [None]:
from IPython.display import Image

# Capture a webcam photo and preview it. A failure is reported as a plain
# message: errors are thrown when the user has no webcam or does not grant
# the page permission to access it.
try:
  filename = take_photo()
  print('Saved to {}'.format(filename))

  # Show the image which was just taken.
  display(Image(filename))
except Exception as capture_error:
  print(capture_error)

## Perform inference

In [42]:
from detecto import visualize

In [None]:
# Reload the saved weights and run the detector on the captured photo
model = FreezableModel.load("<<UPDATE THIS>>/model_weights.pth", list(class_names))
image = utils.read_image('/content/photo.jpg')
# NOTE(review): presumably this turns off Detecto's input normalization at
# prediction time, while the training transforms applied
# utils.normalize_transform() - confirm this mismatch is intentional.
model._disable_normalize = True
labels, boxes, scores = model.predict(image)  # Get all predictions on an image

if len(scores) == 0:
  # np.argmax raises ValueError on an empty array, so report the
  # "nothing detected" case instead of crashing the cell.
  print('No objects were detected in the image.')
else:
  # Get top prediction
  idx_score = np.argmax(scores.numpy())
  box = boxes[idx_score]
  label = labels[idx_score]

  visualize.show_labeled_image(image, box, label)  # Plot top prediction

## Fine-tuning some later stages of the backbone

In [46]:
# Number of entries, counted from the head side of the backbone, to fine-tune.
# The next cell validates that this value lies in the range [0, 5].
trainable_layers = 3

In [47]:
# Choose which backbone parameter groups are fine-tuned: the first
# `trainable_layers` entries of the list below stay trainable, everything
# else in the backbone is frozen.
if trainable_layers < 0 or trainable_layers > 5:
    raise ValueError(f"Trainable layers should be in the range [0,5], got {trainable_layers}")
layers_to_train = ["fpn", "body.layer4", "body.layer3", "body.layer2", "body.layer1", "body.conv1"][:trainable_layers]
if trainable_layers == 5:
    # NOTE(review): the other entries carry a "body." prefix, so plain "bn1"
    # likely matches no backbone parameter name; also, with "fpn" occupying the
    # first slot, "body.conv1" cannot be selected within [0,5] - confirm intent.
    layers_to_train.append("bn1")

# str.startswith accepts a tuple of prefixes (an empty tuple never matches)
trainable_prefixes = tuple(layers_to_train)
for name, parameter in model._model.backbone.named_parameters():
    # Set the flag in BOTH directions. The original only ever set it to False,
    # so layers frozen at model creation (trainable_backbone_layers=0) were
    # never unfrozen and this fine-tuning stage trained nothing new.
    parameter.requires_grad = name.startswith(trainable_prefixes)

In [None]:
# Show the requires_grad flag of every backbone parameter to verify the
# freeze/unfreeze selection made above.
for param_name, param in model._model.backbone.named_parameters():
  print(param_name, param.requires_grad)

In [None]:
# Train again after the backbone requires_grad flags were adjusted
losses = model.fit(
    loader,
    val_dataset,
    epochs=5,
    learning_rate=0.003,
    verbose=True,
)

# Visualize loss throughout training
fig, ax = plt.subplots()
ax.plot(losses)
plt.show()

# Save fine-tuned model to a file
model.save("<<UPDATE THIS>>/fine_tuned_model_weights.pth")